597 files changed, 39784 insertions, 17478 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index c295ba786edd..f0c7de78e205 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -41,8 +41,8 @@ extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
-extern struct dentry_operations v9fs_dentry_operations;
-extern struct dentry_operations v9fs_cached_dentry_operations;
+extern const struct dentry_operations v9fs_dentry_operations;
+extern const struct dentry_operations v9fs_cached_dentry_operations;
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 06dcc7c4f234..d74325295b1e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -104,12 +104,12 @@ void v9fs_dentry_release(struct dentry *dentry)
 	}
 }
 
-struct dentry_operations v9fs_cached_dentry_operations = {
+const struct dentry_operations v9fs_cached_dentry_operations = {
 	.d_delete = v9fs_cached_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
 
-struct dentry_operations v9fs_dentry_operations = {
+const struct dentry_operations v9fs_dentry_operations = {
 	.d_delete = v9fs_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 93212e40221a..5f8ab8adb5f5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -168,8 +168,9 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	p9stat_free(st);
 	kfree(st);
 
-P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n");
-	return simple_set_mnt(mnt, sb);
+P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 release_sb:
 	if (sb) {
diff --git a/fs/Kconfig b/fs/Kconfig
index 93945dd0b1ae..86b203fc3c56 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -56,61 +56,7 @@ endif # BLOCK
 
 source "fs/notify/Kconfig"
 
-config QUOTA
-	bool "Quota support"
-	help
-	  If you say Y here, you will be able to set per user limits for disk
-	  usage (also called disk quotas). Currently, it works for the
-	  ext2, ext3, and reiserfs file system. ext3 also supports journalled
-	  quotas for which you don't need to run quotacheck(8) after an unclean
-	  shutdown.
-	  For further details, read the Quota mini-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, or the documentation provided
-	  with the quota tools. Probably the quota support is only useful for
-	  multi user systems. If unsure, say N.
-
-config QUOTA_NETLINK_INTERFACE
-	bool "Report quota messages through netlink interface"
-	depends on QUOTA && NET
-	help
-	  If you say Y here, quota warnings (about exceeding softlimit, reaching
-	  hardlimit, etc.) will be reported through netlink interface. If unsure,
-	  say Y.
-
-config PRINT_QUOTA_WARNING
-	bool "Print quota warnings to console (OBSOLETE)"
-	depends on QUOTA
-	default y
-	help
-	  If you say Y here, quota warnings (about exceeding softlimit, reaching
-	  hardlimit, etc.) will be printed to the process' controlling terminal.
-	  Note that this behavior is currently deprecated and may go away in
-	  future. Please use notification via netlink socket instead.
-
-# Generic support for tree structured quota files. Seleted when needed.
-config QUOTA_TREE
-	 tristate
-
-config QFMT_V1
-	tristate "Old quota format support"
-	depends on QUOTA
-	help
-	  This quota format was (is) used by kernels earlier than 2.4.22. If
-	  you have quota working and you don't want to convert to new quota
-	  format say Y here.
-
-config QFMT_V2
-	tristate "Quota format v2 support"
-	depends on QUOTA
-	select QUOTA_TREE
-	help
-	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-	  need this functionality say Y here.
-
-config QUOTACTL
-	bool
-	depends on XFS_QUOTA || QUOTA
-	default y
+source "fs/quota/Kconfig"
 
 source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
@@ -120,6 +66,13 @@ config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -223,6 +176,8 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 
+source "fs/exofs/Kconfig"
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 38bc735c67ad..70b2aed87133 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o
+		stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -51,11 +51,7 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
-obj-$(CONFIG_QUOTA)		+= dquot.o
-obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
-obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
-obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
-obj-$(CONFIG_QUOTACTL)		+= quota.o
+obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
@@ -67,12 +63,15 @@ obj-$(CONFIG_PROFILING)		+= dcookies.o
 obj-$(CONFIG_DLM)		+= dlm/
  
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4
+obj-$(CONFIG_EXT2_FS)		+= ext2/
+# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
+# unless explicitly requested by rootfstype
+obj-$(CONFIG_EXT4_FS)		+= ext4/
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
-obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
@@ -118,7 +117,9 @@ obj-$(CONFIG_AFS_FS)		+= afs/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
+obj-$(CONFIG_CACHEFILES)	+= cachefiles/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
+obj-$(CONFIG_EXOFS_FS)          += exofs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 831157502d5a..e0a85dbeeb88 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -86,7 +86,7 @@ void __adfs_error(struct super_block *sb, const char *function,
 /* dir_*.c */
 extern const struct inode_operations adfs_dir_inode_operations;
 extern const struct file_operations adfs_dir_operations;
-extern struct dentry_operations adfs_dentry_operations;
+extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
 
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 85a30e929800..e867ccf37246 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -263,7 +263,7 @@ adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name)
 	return 0;
 }
 
-struct dentry_operations adfs_dentry_operations = {
+const struct dentry_operations adfs_dentry_operations = {
 	.d_hash		= adfs_hash,
 	.d_compare	= adfs_compare,
 };
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7e..dd9becca4241 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
-	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = dentry->d_sb->s_blocksize;
-	buf->f_blocks  = asb->s_size;
-	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = sbi->s_size;
+	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(dentry->d_sb);
+	buf->f_bfree   = adfs_map_free(sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e9ec915f7553..1a2d5e3c7f4e 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -199,8 +199,7 @@ extern const struct address_space_operations	 affs_symlink_aops;
 extern const struct address_space_operations	 affs_aops;
 extern const struct address_space_operations	 affs_aops_ofs;
 
-extern struct dentry_operations	 affs_dentry_operations;
-extern struct dentry_operations	 affs_dentry_operations_intl;
+extern const struct dentry_operations	 affs_dentry_operations;
 
 static inline void
 affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 805573005de6..7d0f0a30f7a3 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -179,14 +179,18 @@ affs_remove_link(struct dentry *dentry)
 		affs_lock_dir(dir);
 		affs_fix_dcache(dentry, link_ino);
 		retval = affs_remove_hash(dir, link_bh);
-		if (retval)
+		if (retval) {
+			affs_unlock_dir(dir);
 			goto done;
+		}
 		mark_buffer_dirty_inode(link_bh, inode);
 
 		memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32);
 		retval = affs_insert_hash(dir, bh);
-		if (retval)
+		if (retval) {
+			affs_unlock_dir(dir);
 			goto done;
+		}
 		mark_buffer_dirty_inode(bh, inode);
 
 		affs_unlock_dir(dir);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index cfcf1b6cf82b..960d336ec694 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -19,12 +19,12 @@ static int	 affs_intl_toupper(int ch);
 static int	 affs_intl_hash_dentry(struct dentry *, struct qstr *);
 static int       affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
 
-struct dentry_operations affs_dentry_operations = {
+const struct dentry_operations affs_dentry_operations = {
 	.d_hash		= affs_hash_dentry,
 	.d_compare	= affs_compare_dentry,
 };
 
-static struct dentry_operations affs_intl_dentry_operations = {
+static const struct dentry_operations affs_intl_dentry_operations = {
 	.d_hash		= affs_intl_hash_dentry,
 	.d_compare	= affs_intl_compare_dentry,
 };
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582aa..5ce695e707fe 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	int		 free;
+	u64		 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
 	     AFFS_SB(sb)->s_reserved);
@@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks  = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
 	buf->f_bfree   = free;
 	buf->f_bavail  = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 30;
 	return 0;
 }
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e1..5c4e61d3c772 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
 	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cfb..4f64b95d57bd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+	$(afs-cache-y) \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69edc..e2b1d3f16519 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name			= "cell_ix",
-	.data_size		= sizeof(struct afs_cache_cell),
-	.keys[0]		= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match			= afs_cell_cache_match,
-	.update			= afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name			= "afs",
+	.version		= 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name		= "AFS.cell",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_cell_cache_get_key,
+	.get_aux	= afs_cell_cache_get_aux,
+	.check_aux	= afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name			= "AFS.vldb",
+	.type			= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key		= afs_vlocation_cache_get_key,
+	.get_aux		= afs_vlocation_cache_get_aux,
+	.check_aux		= afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name		= "AFS.volume",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name			= "AFS.vnode",
+	.type			= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key		= afs_vnode_cache_get_key,
+	.get_attr		= afs_vnode_cache_get_attr,
+	.get_aux		= afs_vnode_cache_get_aux,
+	.check_aux		= afs_vnode_cache_check_aux,
+	.now_uncached		= afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%s},{%s}", ccell->name, cell->name);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	memcpy(buffer, cell->name, klen);
+	return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxilliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("%p,%p", source, entry);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
 
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name		= "vldb",
-	.data_size	= sizeof(struct afs_cache_vlocation),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_vlocation_cache_match,
-	.update		= afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
 
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+	memcpy(buffer, vlocation->vldb.name, klen);
 
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
 		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		} else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
 		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
 
-	_enter("");
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
 
-	*vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name		= "volume",
-	.data_size	= sizeof(struct afs_cache_vhash),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match		= afs_volume_cache_match,
-	.update		= afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
 
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = %u", klen);
+	return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
 {
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
+	const struct afs_vnode *vnode = cookie_netfs_data;
 
-	_enter("");
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
 
-	vhash->vtype = volume->type;
+	*size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name		= "vnode",
-	.data_size	= sizeof(struct afs_cache_vnode),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match		= afs_vnode_cache_match,
-	.update		= afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
-
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+		   ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+		   ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
 	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
 
-	_enter("");
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 
-	cvnode->vnode_id	= vnode->fid.vnode;
-	cvnode->vnode_unique	= vnode->fid.unique;
-	cvnode->data_version	= vnode->status.version;
+	_leave("");
 }
-#endif
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90e..5c4f6b499e90 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b1..e19c13f059ed 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
 	if (ret < 0)
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* put it up for caching */
-	cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-			       &afs_vlocation_cache_index_def,
-			       cell,
-			       &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell);
 #endif
 
 	/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	list_del_init(&cell->proc_link);
 	up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 99cf390641f7..9bd757774c9e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -62,7 +62,7 @@ const struct inode_operations afs_dir_inode_operations = {
 	.setattr	= afs_setattr,
 };
 
-static struct dentry_operations afs_fs_dentry_operations = {
+static const struct dentry_operations afs_fs_dentry_operations = {
 	.d_revalidate	= afs_d_revalidate,
 	.d_delete	= afs_d_delete,
 	.d_release	= afs_d_release,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96c..7a1d942ef68d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
 	.release	= afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_readpage,
+	.readpages	= afs_readpages,
 	.set_page_dirty	= afs_set_page_dirty,
 	.launder_page	= afs_launder_page,
 	.releasepage	= afs_releasepage,
@@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file)
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-				       struct page *page,
-				       void *data,
-				       int error)
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
 {
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
+	_enter("%p,%p,%d", page, data, error);
 
-	if (error)
-		SetPageError(page);
-	else
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
-
 }
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-					struct page *page,
-					void *data,
-					int error)
-{
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-	unlock_page(page);
-}
-#endif
 
 /*
  * AFS read page from file, directory or symlink
@@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page)
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
 	/* is it cached? */
-	ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
 					 page,
 					 afs_file_readpage_read_complete,
 					 NULL,
@@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
 	ret = -ENOBUFS;
 #endif
-
 	switch (ret) {
-		/* read BIO submitted and wb-journal entry found */
-	case 1:
-		BUG(); // TODO - handle wb-journal match
-
 		/* read BIO submitted (page in cache) */
 	case 0:
 		break;
 
-		/* no page available in cache */
-	case -ENOBUFS:
+		/* page not yet cached */
 	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+		/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
 	default:
+	go_on:
 		offset = page->index << PAGE_CACHE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page)
 				set_bit(AFS_VNODE_DELETED, &vnode->flags);
 				ret = -ESTALE;
 			}
-#ifdef AFS_CACHING_SUPPORT
-			cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
 #endif
+			BUG_ON(PageFsCache(page));
 			goto error;
 		}
 
 		SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-		if (cachefs_write_page(vnode->cache,
-				       page,
-				       afs_file_readpage_write_complete,
-				       NULL,
-				       GFP_KERNEL) != 0
-		    ) {
-			cachefs_uncache_page(vnode->cache, page);
-			unlock_page(page);
+		/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page) &&
+		    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+			fscache_uncache_page(vnode->cache, page);
+			BUG_ON(PageFsCache(page));
 		}
-#else
-		unlock_page(page);
 #endif
+		unlock_page(page);
 	}
 
 	_leave(" = 0");
@@ -232,34 +216,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
 {
-	int ret = 1;
+	struct afs_vnode *vnode;
+	int ret = 0;
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-	BUG_ON(!PageLocked(page));
+	vnode = AFS_FS_I(mapping->host);
+	if (vnode->flags & AFS_VNODE_DELETED) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
 
-	if (PagePrivate(page)) {
-		/* We release buffers only if the entire page is being
-		 * invalidated.
-		 * The get_block cached value has been unconditionally
-		 * invalidated, so real IO is not possible anymore.
-		 */
-		if (offset == 0) {
-			BUG_ON(!PageLocked(page));
-
-			ret = 0;
-			if (!PageWriteback(page))
-				ret = page->mapping->a_ops->releasepage(page,
-									0);
-			/* possibly should BUG_ON(!ret); - neilb */
-		}
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+		/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+		/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+		/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	_leave(" = %d", ret);
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
 }
 
 /*
@@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%lu", page->index, offset);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+			ClearPageFsCache(page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	struct afs_writeback *wb;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vnode->cache, page)) {
+			if (!(gfp_flags & __GFP_WAIT)) {
+				_leave(" = F [cache busy]");
+				return 0;
+			}
+			fscache_wait_on_page_write(vnode->cache, page);
+		}
+
+		fscache_uncache_page(vnode->cache, page);
+		ClearPageFsCache(page);
+	}
+#endif
+
 	if (PagePrivate(page)) {
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-		set_page_private(page, 0);
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
 		ClearPagePrivate(page);
-		afs_put_writeback(wb);
 	}
 
-	_leave(" = 0");
-	return 0;
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
 }
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a18..c048f0658751 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		return -EBADMSG;
 	}
 
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
 	inode->i_nlink		= vnode->status.nlink;
 	inode->i_uid		= vnode->status.owner;
 	inode->i_gid		= 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		return inode;
 	}
 
-#ifdef AFS_CACHING_SUPPORT
-	/* set up caching before reading the status, as fetch-status reads the
-	 * first page of symlinks to see if they're really mntpts */
-	cachefs_acquire_cookie(vnode->volume->cache,
-			       NULL,
-			       vnode,
-			       &vnode->cache);
-#endif
-
 	if (!status) {
 		/* it's a remotely extant inode */
 		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		}
 	}
 
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode);
+#endif
+
 	ret = afs_inode_map_status(vnode, key);
 	if (ret < 0)
 		goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	/* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
 	iget_failed(inode);
 	_leave(" = %d [bad]", ret);
 	return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
 	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd6..106be66dafd2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
 	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 
 	/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
 	struct list_head	grave;		/* link in master graveyard list */
 	struct list_head	update;		/* link in master update list */
 	struct afs_cell		*cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_cache_vlocation vldb;	/* volume information DB record */
 	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
 	atomic_t		usage;
 	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
 	struct afs_vlocation	*vlocation;	/* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	afs_volid_t		vid;		/* volume ID */
 	afs_voltype_t		type;		/* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
 	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_permits	*permits;	/* cache of permits so far obtained */
 	struct mutex		permits_lock;	/* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
 
 /*****************************************************************************/
 /*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
  */
 extern struct rw_semaphore afs_proc_cells_sem;
 extern struct list_head afs_proc_cells;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_cache_cell_index_def;
-#endif
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
  * main.c
  */
 extern struct afs_uuid afs_uuid;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_netfs afs_cache_netfs;
-#endif
 
 /*
  * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
 /*
  * vlclient.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
 extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
 				    const char *, struct afs_cache_vlocation *,
 				    const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
 /*
  * vnode.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
 static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
 {
 	return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 /*
  * volume.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
 #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
 
 extern void afs_put_volume(struct afs_volume *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f7..66d54d348c55 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
 /* AFS client file system
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
-	.get_page_cookie	= afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
-	.name			= "afs",
-	.version		= 0,
-	.ops			= &afs_cache_ops,
-};
-#endif
-
 struct afs_uuid afs_uuid;
 
 /*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
-	ret = cachefs_register_netfs(&afs_cache_netfs,
-				     &afs_cache_cell_index_def);
+	ret = fscache_register_netfs(&afs_cache_netfs);
 	if (ret < 0)
 		goto error_cache;
 #endif
@@ -142,8 +129,8 @@ error_fs:
 error_open_socket:
 error_vl_update_init:
 error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
 	afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
 	afs_vlocation_purge();
 	flush_scheduled_work();
 	afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
 	rcu_barrier();
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a800..2b9e2d03a390 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	if (PageError(page))
 		goto error;
 
-	buf = kmap(page);
+	buf = kmap_atomic(page, KM_USER0);
 	memcpy(devname, buf, size);
-	kunmap(page);
+	kunmap_atomic(buf, KM_USER0);
 	page_cache_release(page);
 	page = NULL;
 
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 7578c1ab9e0b..8630615e57fe 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -146,7 +146,6 @@ int afs_proc_init(void)
 	proc_afs = proc_mkdir("fs/afs", NULL);
 	if (!proc_afs)
 		goto error_dir;
-	proc_afs->owner = THIS_MODULE;
 
 	p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops);
 	if (!p)
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb5..ec2a7431e458 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
 
 	vl->vldb = *vldb;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* update volume entry in local cache */
-	cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
 #endif
 }
 
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	memset(&vldb, 0, sizeof(vldb));
 
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(cell->cache,
-			       &afs_volume_cache_index_def,
-			       vlocation,
-			       &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	vl->cache = fscache_acquire_cookie(vl->cell->cache,
+					   &afs_vlocation_cache_index_def, vl);
 #endif
 
 	if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
 	spin_unlock(&vl->lock);
 	wake_up(&vl->waitq);
 
+	/* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+
 	/* schedule for regular updates */
 	afs_vlocation_queue_for_updates(vl);
 	goto success;
@@ -465,7 +467,7 @@ found_in_memory:
 	spin_unlock(&vl->lock);
 
 success:
-	_leave(" = %p",vl);
+	_leave(" = %p", vl);
 	return vl;
 
 error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
 	_enter("%p", vl);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vl->cache, 0);
 #endif
-
 	afs_put_cell(vl->cell);
 	kfree(vl);
 }
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f9..a353e69e2391 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	}
 
 	/* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(vlocation->cache,
-			       &afs_vnode_cache_index_def,
-			       volume,
-			       &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	volume->cache = fscache_acquire_cookie(vlocation->cache,
+					       &afs_volume_cache_index_def,
+					       volume);
 #endif
-
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
 	up_write(&vlocation->cell->vl_sem);
 
 	/* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(volume->cache, 0);
 #endif
 	afs_put_vlocation(vlocation);
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d433621..c2e7a7ff0080 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	_leave(" = %d", ret);
 	return ret;
 }
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+	_enter("{{%x:%u}},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	/* wait for the page to be written to the cache before we allow it to
+	 * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+	_leave(" = 0");
+	return 0;
+}
diff --git a/fs/aio.c b/fs/aio.c
index 8fa77e233944..76da12537956 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
-	req->ki_eventfd = ERR_PTR(-EINVAL);
+	req->ki_eventfd = NULL;
 
 	/* Check if the completion queue has enough free space to
 	 * accept an event from this io.
@@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
-	if (!IS_ERR(req->ki_eventfd))
-		fput(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data)
 		list_del(&req->ki_list);
 		spin_unlock_irq(&fput_lock);
 
-		/* Complete the fput */
-		__fput(req->ki_filp);
+		/* Complete the fput(s) */
+		if (req->ki_filp != NULL)
+			__fput(req->ki_filp);
+		if (req->ki_eventfd != NULL)
+			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data)
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
+	int schedule_putreq = 0;
+
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	req->ki_users --;
+	req->ki_users--;
 	BUG_ON(req->ki_users < 0);
 	if (likely(req->ki_users))
 		return 0;
@@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 
-	/* Must be done under the lock to serialise against cancellation.
-	 * Call this aio_fput as it duplicates fput via the fput_work.
+	/*
+	 * Try to optimize the aio and eventfd file* puts, by avoiding to
+	 * schedule work in case it is not __fput() time. In normal cases,
+	 * we would not be holding the last reference to the file*, so
+	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
+		schedule_putreq++;
+	else
+		req->ki_filp = NULL;
+	if (req->ki_eventfd != NULL) {
+		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
+			schedule_putreq++;
+		else
+			req->ki_eventfd = NULL;
+	}
+	if (unlikely(schedule_putreq)) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -571,7 +587,7 @@ int aio_put_req(struct kiocb *req)
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
 	struct mm_struct *mm = current->mm;
-	struct kioctx *ctx = NULL;
+	struct kioctx *ctx, *ret = NULL;
 	struct hlist_node *n;
 
 	rcu_read_lock();
@@ -579,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
 		if (ctx->user_id == ctx_id && !ctx->dead) {
 			get_ioctx(ctx);
+			ret = ctx;
 			break;
 		}
 	}
 
 	rcu_read_unlock();
-	return ctx;
+	return ret;
 }
 
 /*
@@ -1009,7 +1026,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
-	if (!IS_ERR(iocb->ki_eventfd))
+	if (iocb->ki_eventfd != NULL)
 		eventfd_signal(iocb->ki_eventfd, 1);
 
 put_rq:
@@ -1608,6 +1625,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
+			req->ki_eventfd = NULL;
 			goto out_put_req;
 		}
 	}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 3bbdb9d02376..1dd96d4406c0 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -48,7 +48,7 @@ static struct file_system_type anon_inode_fs_type = {
 	.get_sb		= anon_inodefs_get_sb,
 	.kill_sb	= kill_anon_super,
 };
-static struct dentry_operations anon_inodefs_dentry_operations = {
+static const struct dentry_operations anon_inodefs_dentry_operations = {
 	.d_delete	= anon_inodefs_delete_dentry,
 };
 
diff --git a/fs/attr.c b/fs/attr.c
index f4360192a938..9fe1b1bd30a8 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -173,7 +173,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 		if (!error) {
 			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-				error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+				error = vfs_dq_transfer(inode, attr) ?
+					-EDQUOT : 0;
 			if (!error)
 				error = inode_setattr(inode, attr);
 		}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8aacade56956..4a1401cea0a1 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -192,7 +192,7 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	return 1;
 }
 
-static struct dentry_operations autofs_dentry_operations = {
+static const struct dentry_operations autofs_dentry_operations = {
 	.d_revalidate	= autofs_revalidate,
 };
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d06..b7ff33c63101 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *,
 			struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffea..9e5ae8a4f5c8 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 				   struct autofs_sb_info *sbi,
 				   struct autofs_dev_ioctl *param)
 {
-	struct dentry *dentry;
 	struct vfsmount *mnt;
-	int err = -EAGAIN;
 	int how;
 
 	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
-	else
-		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
-
-	if (dentry) {
-		struct autofs_info *ino = autofs4_dentry_ino(dentry);
-
-		/*
-		 * This is synchronous because it makes the daemon a
-		 * little easier
-		*/
-		err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
-
-		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-			sbi->sb->s_root->d_mounted++;
-		}
-		ino->flags &= ~AUTOFS_INF_EXPIRING;
-		complete_all(&ino->expire_complete);
-		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-	}
-
-	return err;
+	return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
 }
 
 /* Check if autofs mount point is in use */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9e..75f7ddacf7d6 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			struct autofs_sb_info *sbi, int __user *arg)
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when)
 {
 	struct dentry *dentry;
 	int ret = -EAGAIN;
-	int do_now = 0;
-
-	if (arg && get_user(do_now, arg))
-		return -EFAULT;
 
 	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_direct(sb, mnt, sbi, when);
 	else
-		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
 
 	if (dentry) {
 		struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+   more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			struct autofs_sb_info *sbi, int __user *arg)
+{
+	int do_now = 0;
+
+	if (arg && get_user(do_now, arg))
+		return -EFAULT;
+
+	return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
+
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 716e12b627b2..69c8142da838 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -310,7 +310,7 @@ static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
 	return ino;
 }
 
-static struct dentry_operations autofs4_sb_dentry_operations = {
+static const struct dentry_operations autofs4_sb_dentry_operations = {
 	.d_release      = autofs4_dentry_release,
 };
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2a41c2a7fc52..e383bf0334f1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -349,13 +349,13 @@ void autofs4_dentry_release(struct dentry *de)
 }
 
 /* For dentries of directories in the root dir */
-static struct dentry_operations autofs4_root_dentry_operations = {
+static const struct dentry_operations autofs4_root_dentry_operations = {
 	.d_revalidate	= autofs4_revalidate,
 	.d_release	= autofs4_dentry_release,
 };
 
 /* For other dentries */
-static struct dentry_operations autofs4_dentry_operations = {
+static const struct dentry_operations autofs4_dentry_operations = {
 	.d_revalidate	= autofs4_revalidate,
 	.d_release	= autofs4_dentry_release,
 };
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
-	if (expiring) {
-		/*
-		 * If we are racing with expire the request might not
-		 * be quite complete but the directory has been removed
-		 * so it must have been successful, so just wait for it.
-		 */
-		ino = autofs4_dentry_ino(expiring);
-		autofs4_expire_wait(expiring);
-		spin_lock(&sbi->lookup_lock);
-		if (!list_empty(&ino->expiring))
-			list_del_init(&ino->expiring);
-		spin_unlock(&sbi->lookup_lock);
-		dput(expiring);
-	}
-
 	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
 	if (unhashed)
 		dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	}
 
 	if (!oz_mode) {
+		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(sbi,
+						   dentry->d_parent,
+						   &dentry->d_name);
+		if (expiring) {
+			/*
+			 * If we are racing with expire the request might not
+			 * be quite complete but the directory has been removed
+			 * so it must have been successful, so just wait for it.
+			 */
+			ino = autofs4_dentry_ino(expiring);
+			autofs4_expire_wait(expiring);
+			spin_lock(&sbi->lookup_lock);
+			if (!list_empty(&ino->expiring))
+				list_del_init(&ino->expiring);
+			spin_unlock(&sbi->lookup_lock);
+			dput(expiring);
+		}
+
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
 		spin_unlock(&dentry->d_lock);
-		if (dentry->d_op && dentry->d_op->d_revalidate) {
-			mutex_unlock(&dir->i_mutex);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
 			(dentry->d_op->d_revalidate)(dentry, nd);
-			mutex_lock(&dir->i_mutex);
-		}
+		mutex_lock(&dir->i_mutex);
 	}
 
 	/*
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad02..76afd0d6b86c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
 befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	befs_debug(sb, "---> befs_statfs()");
 
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0;	/* UNKNOWN */
 	buf->f_ffree = 0;	/* UNKNOWN */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = BEFS_NAME_LEN;
 
 	befs_debug(sb, "<--- befs_statfs()");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853b..40381df34869 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
@@ -21,20 +19,15 @@
 #include <linux/binfmts.h>
 #include <linux/string.h>
 #include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
 #include <linux/slab.h>
-#include <linux/shm.h>
 #include <linux/personality.h>
 #include <linux/elfcore.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
-#include <linux/smp.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
-	int elf_exec_fileno;
 	int retval, i;
 	unsigned int size;
 	unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto out_free_ph;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free_ph;
-	get_file(bprm->file);
-	fd_install(elf_exec_fileno = retval, bprm->file);
-
 	elf_ppnt = elf_phdata;
 	elf_bss = 0;
 	elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX || 
 			    elf_ppnt->p_filesz < 2)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
 						  GFP_KERNEL);
 			if (!elf_interpreter)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
 					     elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	kfree(elf_phdata);
 
-	sys_close(elf_exec_fileno);
-
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
 		fput(interpreter);
 out_free_interp:
 	kfree(elf_interpreter);
-out_free_file:
-	sys_close(elf_exec_fileno);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f5..70cfc4b84ae0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 			params->elfhdr_addr = seg->addr;
 
 		/* clear any space allocated but not loaded */
-		if (phdr->p_filesz < phdr->p_memsz)
-			clear_user((void *) (seg->addr + phdr->p_filesz),
-				   phdr->p_memsz - phdr->p_filesz);
+		if (phdr->p_filesz < phdr->p_memsz) {
+			ret = clear_user((void *) (seg->addr + phdr->p_filesz),
+					 phdr->p_memsz - phdr->p_filesz);
+			if (ret)
+				return ret;
+		}
 
 		if (mm) {
 			if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset;
+	int loop, dvset, ret;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			clear_user((void __user *) maddr, disp);
+			ret = clear_user((void __user *) maddr, disp);
+			if (ret)
+				return ret;
 			maddr += disp;
 		}
 
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void __user *) maddr + phdr->p_filesz,
-				   excess1);
+			ret = clear_user((void __user *) maddr + phdr->p_filesz,
+					 excess1);
+			if (ret)
+				return ret;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			clear_user((void *) maddr + phdr->p_filesz, excess);
+			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
+			if (ret)
+				return ret;
 		}
 #endif
 
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616e..eff74b9c9e77 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
 static int
 load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 {
-	int som_exec_fileno;
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out_free;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free;
-	get_file(bprm->file);
-	fd_install(som_exec_fileno = retval, bprm->file);
-
 	/* Flush all traces of the currently running executable */
 	retval = flush_old_exec(bprm);
 	if (retval)
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 549b0144da11..31c46a241bac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -26,23 +26,23 @@
 #include <linux/workqueue.h>
 
 static struct kmem_cache *bio_integrity_slab __read_mostly;
+static mempool_t *bio_integrity_pool;
+static struct bio_set *integrity_bio_set;
 static struct workqueue_struct *kintegrityd_wq;
 
 /**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
  * @bio:	bio to attach integrity metadata to
  * @gfp_mask:	Memory allocation mask
  * @nr_vecs:	Number of integrity metadata scatter-gather elements
- * @bs:		bio_set to allocate from
  *
  * Description: This function prepares a bio for attaching integrity
  * metadata.  nr_vecs specifies the maximum number of pages containing
  * integrity metadata that can be attached.
  */
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
-							 gfp_t gfp_mask,
-							 unsigned int nr_vecs,
-							 struct bio_set *bs)
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
 {
 	struct bio_integrity_payload *bip;
 	struct bio_vec *iv;
@@ -50,7 +50,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	BUG_ON(bio == NULL);
 
-	bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+	bip = mempool_alloc(bio_integrity_pool, gfp_mask);
 	if (unlikely(bip == NULL)) {
 		printk(KERN_ERR "%s: could not alloc bip\n", __func__);
 		return NULL;
@@ -58,10 +58,10 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	memset(bip, 0, sizeof(*bip));
 
-	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
+	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set);
 	if (unlikely(iv == NULL)) {
 		printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
-		mempool_free(bip, bs->bio_integrity_pool);
+		mempool_free(bip, bio_integrity_pool);
 		return NULL;
 	}
 
@@ -72,35 +72,16 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	return bip;
 }
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio:	bio to attach integrity metadata to
- * @gfp_mask:	Memory allocation mask
- * @nr_vecs:	Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata.  nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
-						  gfp_t gfp_mask,
-						  unsigned int nr_vecs)
-{
-	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
 EXPORT_SYMBOL(bio_integrity_alloc);
 
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
- * @bs:		bio_set this bio was allocated from
  *
  * Description: Used to free the integrity portion of a bio. Usually
  * called from bio_free().
  */
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+void bio_integrity_free(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 
@@ -111,8 +92,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
-	bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
-	mempool_free(bip, bs->bio_integrity_pool);
+	bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool);
+	mempool_free(bip, bio_integrity_pool);
 
 	bio->bi_integrity = NULL;
 }
@@ -685,19 +666,18 @@ EXPORT_SYMBOL(bio_integrity_split);
  * bio_integrity_clone - Callback for cloning bios with integrity metadata
  * @bio:	New bio
  * @bio_src:	Original bio
- * @bs:		bio_set to allocate bip from
+ * @gfp_mask:	Memory allocation mask
  *
  * Description:	Called to allocate a bip when cloning a bio
  */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
-			struct bio_set *bs)
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
 {
 	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
 
-	bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
 
 	if (bip == NULL)
 		return -EIO;
@@ -713,37 +693,25 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 }
 EXPORT_SYMBOL(bio_integrity_clone);
 
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
+static int __init bio_integrity_init(void)
 {
-	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
-							  bio_integrity_slab);
-	if (!bs->bio_integrity_pool)
-		return -1;
-
-	return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
+	kintegrityd_wq = create_workqueue("kintegrityd");
 
-void bioset_integrity_free(struct bio_set *bs)
-{
-	if (bs->bio_integrity_pool)
-		mempool_destroy(bs->bio_integrity_pool);
-}
-EXPORT_SYMBOL(bioset_integrity_free);
+	if (!kintegrityd_wq)
+		panic("Failed to create kintegrityd\n");
 
-void __init bio_integrity_init_slab(void)
-{
 	bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
 					SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-}
 
-static int __init integrity_init(void)
-{
-	kintegrityd_wq = create_workqueue("kintegrityd");
+	bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE,
+						      bio_integrity_slab);
+	if (!bio_integrity_pool)
+		panic("bio_integrity: can't allocate bip pool\n");
 
-	if (!kintegrityd_wq)
-		panic("Failed to create kintegrityd\n");
+	integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+	if (!integrity_bio_set)
+		panic("bio_integrity: can't allocate bio_set\n");
 
 	return 0;
 }
-subsys_initcall(integrity_init);
+subsys_initcall(bio_integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 062299acbccd..e0c9e545bbfa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -248,7 +248,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
 	if (bio_integrity(bio))
-		bio_integrity_free(bio, bs);
+		bio_integrity_free(bio);
 
 	/*
 	 * If we have front padding, adjust the bio pointer before freeing
@@ -301,47 +301,51 @@ void bio_init(struct bio *bio)
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	struct bio_vec *bvl = NULL;
 	struct bio *bio = NULL;
+	unsigned long idx = 0;
+	void *p = NULL;
 
 	if (bs) {
-		void *p = mempool_alloc(bs->bio_pool, gfp_mask);
-
-		if (p)
-			bio = p + bs->front_pad;
-	} else
+		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		if (!p)
+			goto err;
+		bio = p + bs->front_pad;
+	} else {
 		bio = kmalloc(sizeof(*bio), gfp_mask);
+		if (!bio)
+			goto err;
+	}
 
-	if (likely(bio)) {
-		struct bio_vec *bvl = NULL;
-
-		bio_init(bio);
-		if (likely(nr_iovecs)) {
-			unsigned long uninitialized_var(idx);
-
-			if (nr_iovecs <= BIO_INLINE_VECS) {
-				idx = 0;
-				bvl = bio->bi_inline_vecs;
-				nr_iovecs = BIO_INLINE_VECS;
-			} else {
-				bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
-							bs);
-				nr_iovecs = bvec_nr_vecs(idx);
-			}
-			if (unlikely(!bvl)) {
-				if (bs)
-					mempool_free(bio, bs->bio_pool);
-				else
-					kfree(bio);
-				bio = NULL;
-				goto out;
-			}
-			bio->bi_flags |= idx << BIO_POOL_OFFSET;
-			bio->bi_max_vecs = nr_iovecs;
-		}
-		bio->bi_io_vec = bvl;
+	bio_init(bio);
+
+	if (unlikely(!nr_iovecs))
+		goto out_set;
+
+	if (nr_iovecs <= BIO_INLINE_VECS) {
+		bvl = bio->bi_inline_vecs;
+		nr_iovecs = BIO_INLINE_VECS;
+	} else {
+		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+		if (unlikely(!bvl))
+			goto err_free;
+
+		nr_iovecs = bvec_nr_vecs(idx);
 	}
-out:
+	bio->bi_flags |= idx << BIO_POOL_OFFSET;
+	bio->bi_max_vecs = nr_iovecs;
+out_set:
+	bio->bi_io_vec = bvl;
+
 	return bio;
+
+err_free:
+	if (bs)
+		mempool_free(p, bs->bio_pool);
+	else
+		kfree(bio);
+err:
+	return NULL;
 }
 
 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
@@ -462,10 +466,12 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 	if (bio_integrity(bio)) {
 		int ret;
 
-		ret = bio_integrity_clone(b, bio, fs_bio_set);
+		ret = bio_integrity_clone(b, bio, gfp_mask);
 
-		if (ret < 0)
+		if (ret < 0) {
+			bio_put(b);
 			return NULL;
+		}
 	}
 
 	return b;
@@ -1414,8 +1420,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
 }
 
 /*
- * split a bio - only worry about a bio with a single page
- * in it's iovec
+ * split a bio - only worry about a bio with a single page in its iovec
  */
 struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
@@ -1523,7 +1528,6 @@ void bioset_free(struct bio_set *bs)
 	if (bs->bio_pool)
 		mempool_destroy(bs->bio_pool);
 
-	bioset_integrity_free(bs);
 	biovec_free_pools(bs);
 	bio_put_slab(bs);
 
@@ -1564,9 +1568,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (bioset_integrity_create(bs, pool_size))
-		goto bad;
-
 	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
@@ -1583,6 +1584,13 @@ static void __init biovec_init_slabs(void)
 		int size;
 		struct biovec_slab *bvs = bvec_slabs + i;
 
+#ifndef CONFIG_BLK_DEV_INTEGRITY
+		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
+			bvs->slab = NULL;
+			continue;
+		}
+#endif
+
 		size = bvs->nr_vecs * sizeof(struct bio_vec);
 		bvs->slab = kmem_cache_create(bvs->name, size, 0,
                                 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
@@ -1597,7 +1605,6 @@ static int __init init_bio(void)
 	if (!bio_slabs)
 		panic("bio: can't allocate bios\n");
 
-	bio_integrity_init_slab();
 	biovec_init_slabs();
 
 	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b3c1efff5e1d..f45dbc18dd17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/buffer_head.h>
+#include <linux/pagevec.h>
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
@@ -174,6 +175,152 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 				iov, offset, nr_segs, blkdev_get_blocks, NULL);
 }
 
+/*
+ * Write out and wait upon all the dirty data associated with a block
+ * device via its mapping.  Does not take the superblock lock.
+ */
+int sync_blockdev(struct block_device *bdev)
+{
+	int ret = 0;
+
+	if (bdev)
+		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
+	return ret;
+}
+EXPORT_SYMBOL(sync_blockdev);
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * device.   Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_bdev(struct block_device *bdev)
+{
+	struct super_block *sb = get_super(bdev);
+	if (sb) {
+		int res = fsync_super(sb);
+		drop_super(sb);
+		return res;
+	}
+	return sync_blockdev(bdev);
+}
+EXPORT_SYMBOL(fsync_bdev);
+
+/**
+ * freeze_bdev  --  lock a filesystem and force it into a consistent state
+ * @bdev:	blockdevice to lock
+ *
+ * This takes the block device bd_mount_sem to make sure no new mounts
+ * happen on bdev until thaw_bdev() is called.
+ * If a superblock is found on this device, we take the s_umount semaphore
+ * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can unfreeze the frozen filesystem actually when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * actually.
+ */
+struct super_block *freeze_bdev(struct block_device *bdev)
+{
+	struct super_block *sb;
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		bdev->bd_fsfreeze_count++;
+		sb = get_super(bdev);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+	bdev->bd_fsfreeze_count++;
+
+	down(&bdev->bd_mount_sem);
+	sb = get_super(bdev);
+	if (sb && !(sb->s_flags & MS_RDONLY)) {
+		sb->s_frozen = SB_FREEZE_WRITE;
+		smp_wmb();
+
+		__fsync_super(sb);
+
+		sb->s_frozen = SB_FREEZE_TRANS;
+		smp_wmb();
+
+		sync_blockdev(sb->s_bdev);
+
+		if (sb->s_op->freeze_fs) {
+			error = sb->s_op->freeze_fs(sb);
+			if (error) {
+				printk(KERN_ERR
+					"VFS:Filesystem freeze failed\n");
+				sb->s_frozen = SB_UNFROZEN;
+				drop_super(sb);
+				up(&bdev->bd_mount_sem);
+				bdev->bd_fsfreeze_count--;
+				mutex_unlock(&bdev->bd_fsfreeze_mutex);
+				return ERR_PTR(error);
+			}
+		}
+	}
+
+	sync_blockdev(bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+
+	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
+}
+EXPORT_SYMBOL(freeze_bdev);
+
+/**
+ * thaw_bdev  -- unlock filesystem
+ * @bdev:	blockdevice to unlock
+ * @sb:		associated superblock
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ */
+int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+{
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (!bdev->bd_fsfreeze_count) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return -EINVAL;
+	}
+
+	bdev->bd_fsfreeze_count--;
+	if (bdev->bd_fsfreeze_count > 0) {
+		if (sb)
+			drop_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return 0;
+	}
+
+	if (sb) {
+		BUG_ON(sb->s_bdev != bdev);
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (sb->s_op->unfreeze_fs) {
+				error = sb->s_op->unfreeze_fs(sb);
+				if (error) {
+					printk(KERN_ERR
+						"VFS:Filesystem thaw failed\n");
+					sb->s_frozen = SB_FREEZE_TRANS;
+					bdev->bd_fsfreeze_count++;
+					mutex_unlock(&bdev->bd_fsfreeze_mutex);
+					return error;
+				}
+			}
+			sb->s_frozen = SB_UNFROZEN;
+			smp_wmb();
+			wake_up(&sb->s_wait_unfrozen);
+		}
+		drop_super(sb);
+	}
+
+	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(thaw_bdev);
+
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, blkdev_get_block, wbc);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..9adf5e4f7e96 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-	   compression.o
+	   compression.o delayed-ref.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..7fdd184a528d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 		}
 
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
 				if (!list_empty(&worker->pending))
 					continue;
 
+				if (kthread_should_stop())
+					break;
+
 				/* still no more work?, sleep for real */
 				spin_lock_irq(&worker->lock);
 				set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
 				worker->working = 0;
 				spin_unlock_irq(&worker->lock);
 
-				schedule();
+				if (!kthread_should_stop())
+					schedule();
 			}
 			__set_current_state(TASK_RUNNING);
 		}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693b75ac..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,15 @@ struct btrfs_inode {
 	 */
 	struct list_head delalloc_inodes;
 
+	/*
+	 * list for tracking inodes that must be sent to disk before a
+	 * rename or truncate commit
+	 */
+	struct list_head ordered_operations;
+
+	/* the space_info for where this inode's data allocations are done */
+	struct btrfs_space_info *space_info;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
@@ -83,17 +92,16 @@ struct btrfs_inode {
 	 */
 	u64 logged_trans;
 
-	/*
-	 * trans that last made a change that should be fully fsync'd.  This
-	 * gets reset to zero each time the inode is logged
-	 */
-	u64 log_dirty_trans;
-
 	/* total number of bytes pending delalloc, used by stat to calc the
 	 * real block usage of the file
 	 */
 	u64 delalloc_bytes;
 
+	/* total number of bytes that may be used for this inode for
+	 * delalloc
+	 */
+	u64 reserved_bytes;
+
 	/*
 	 * the size of the file stored in the metadata on disk.  data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -113,6 +121,25 @@ struct btrfs_inode {
 	/* the start of block group preferred for allocations. */
 	u64 block_group;
 
+	/* the fsync log has some corner cases that mean we have to check
+	 * directories to see if any unlinks have been done before
+	 * the directory was logged.  See tree-log.c for all the
+	 * details
+	 */
+	u64 last_unlink_trans;
+
+	/*
+	 * ordered_data_close is set by truncate when a file that used
+	 * to have good data has been truncated to zero.  When it is set
+	 * the btrfs file release call will add this inode to the
+	 * ordered operations list so that we make sure to flush out any
+	 * new data the application may have written before commit.
+	 *
+	 * yes, its silly to have a single bitflag, but we might grow more
+	 * of these.
+	 */
+	unsigned ordered_data_close:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35443cc4b9a9..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
-inline void btrfs_init_path(struct btrfs_path *p)
-{
-	memset(p, 0, sizeof(*p));
-}
-
 struct btrfs_path *btrfs_alloc_path(void)
 {
 	struct btrfs_path *path;
-	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
-	if (path) {
-		btrfs_init_path(path);
+	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+	if (path)
 		path->reada = 1;
-	}
 	return path;
 }
 
@@ -69,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 
 /*
  * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path.  You can safely use NULL
+ * for held
  */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+					struct extent_buffer *held)
 {
 	int i;
-	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/* lockdep really cares that we take all of these spinlocks
+	 * in the right order.  If any of the locks in the path are not
+	 * currently blocking, it is going to complain.  So, make really
+	 * really sure by forcing the path to blocking before we clear
+	 * the path blocking.
+	 */
+	if (held)
+		btrfs_set_lock_blocking(held);
+	btrfs_set_path_blocking(p);
+#endif
+
+	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
 		if (p->nodes[i] && p->locks[i])
 			btrfs_clear_lock_blocking(p->nodes[i]);
 	}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (held)
+		btrfs_clear_lock_blocking(held);
+#endif
 }
 
 /* this also releases the path */
@@ -237,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
  * empty_size -- a hint that you plan on doing more cow.  This is the size in
  * bytes the allocator should try to find free next to the block it returns.
  * This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
  */
 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
 			     struct extent_buffer **cow_ret,
-			     u64 search_start, u64 empty_size,
-			     u64 prealloc_dest)
+			     u64 search_start, u64 empty_size)
 {
 	u64 parent_start;
 	struct extent_buffer *cow;
@@ -260,7 +272,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 
 	if (parent)
 		parent_start = parent->start;
@@ -274,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 
-	if (prealloc_dest) {
-		struct btrfs_key ins;
-
-		ins.objectid = prealloc_dest;
-		ins.offset = buf->len;
-		ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
-						  root->root_key.objectid,
-						  trans->transid, level, &ins);
-		BUG_ON(ret);
-		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-					    buf->len);
-	} else {
-		cow = btrfs_alloc_free_block(trans, root, buf->len,
-					     parent_start,
-					     root->root_key.objectid,
-					     trans->transid, level,
-					     search_start, empty_size);
-	}
+	cow = btrfs_alloc_free_block(trans, root, buf->len,
+				     parent_start, root->root_key.objectid,
+				     trans->transid, level,
+				     search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -396,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
-		    struct extent_buffer **cow_ret, u64 prealloc_dest)
+		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
 	int ret;
@@ -419,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	    btrfs_header_owner(buf) == root->root_key.objectid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
-		WARN_ON(prealloc_dest);
 		return 0;
 	}
 
@@ -430,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	btrfs_set_lock_blocking(buf);
 
 	ret = __btrfs_cow_block(trans, root, buf, parent,
-				 parent_slot, cow_ret, search_start, 0,
-				 prealloc_dest);
+				 parent_slot, cow_ret, search_start, 0);
 	return ret;
 }
 
@@ -600,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
-					    (end_slot - i) * blocksize), 0);
+					    (end_slot - i) * blocksize));
 		if (err) {
 			btrfs_tree_unlock(cur);
 			free_extent_buffer(cur);
@@ -917,10 +911,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
+		BUG_ON(!child);
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
-		BUG_ON(!child);
-		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
 		BUG_ON(ret);
 
 		spin_lock(&root->node_lock);
@@ -928,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		spin_unlock(&root->node_lock);
 
 		ret = btrfs_update_extent_ref(trans, root, child->start,
+					      child->len,
 					      mid->start, child->start,
 					      root->root_key.objectid,
 					      trans->transid, level - 1);
@@ -954,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
+	if (trans->transaction->delayed_refs.flushing &&
+	    btrfs_header_nritems(mid) > 2)
+		return 0;
+
 	if (btrfs_header_nritems(mid) < 2)
 		err_on_enospc = 1;
 
@@ -962,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(left);
 		btrfs_set_lock_blocking(left);
 		wret = btrfs_cow_block(trans, root, left,
-				       parent, pslot - 1, &left, 0);
+				       parent, pslot - 1, &left);
 		if (wret) {
 			ret = wret;
 			goto enospc;
@@ -973,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(right);
 		btrfs_set_lock_blocking(right);
 		wret = btrfs_cow_block(trans, root, right,
-				       parent, pslot + 1, &right, 0);
+				       parent, pslot + 1, &right);
 		if (wret) {
 			ret = wret;
 			goto enospc;
@@ -1154,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			wret = 1;
 		} else {
 			ret = btrfs_cow_block(trans, root, left, parent,
-					      pslot - 1, &left, 0);
+					      pslot - 1, &left);
 			if (ret)
 				wret = 1;
 			else {
@@ -1205,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		} else {
 			ret = btrfs_cow_block(trans, root, right,
 					      parent, pslot + 1,
-					      &right, 0);
+					      &right);
 			if (ret)
 				wret = 1;
 			else {
@@ -1245,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves, finding things that are close
  * to the block in 'slot', and triggering ra on them.
  */
-static noinline void reada_for_search(struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      int level, int slot, u64 objectid)
+static void reada_for_search(struct btrfs_root *root,
+			     struct btrfs_path *path,
+			     int level, int slot, u64 objectid)
 {
 	struct extent_buffer *node;
 	struct btrfs_disk_key disk_key;
@@ -1448,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 }
 
 /*
+ * helper function for btrfs_search_slot.  The goal is to find a block
+ * in cache without setting the path to blocking.  If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada.  -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *p,
+		       struct extent_buffer **eb_ret, int level, int slot,
+		       struct btrfs_key *key)
+{
+	u64 blocknr;
+	u64 gen;
+	u32 blocksize;
+	struct extent_buffer *b = *eb_ret;
+	struct extent_buffer *tmp;
+
+	blocknr = btrfs_node_blockptr(b, slot);
+	gen = btrfs_node_ptr_generation(b, slot);
+	blocksize = btrfs_level_size(root, level - 1);
+
+	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+		*eb_ret = tmp;
+		return 0;
+	}
+
+	/*
+	 * reduce lock contention at high levels
+	 * of the btree by dropping locks before
+	 * we read.
+	 */
+	btrfs_release_path(NULL, p);
+	if (tmp)
+		free_extent_buffer(tmp);
+	if (p->reada)
+		reada_for_search(root, p, level, slot, key->objectid);
+
+	tmp = read_tree_block(root, blocknr, blocksize, gen);
+	if (tmp)
+		free_extent_buffer(tmp);
+	return -EAGAIN;
+}
+
+/*
+ * helper function for btrfs_search_slot.  This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned.  If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *p,
+		       struct extent_buffer *b, int level, int ins_len)
+{
+	int ret;
+	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+		int sret;
+
+		sret = reada_for_balance(root, p, level);
+		if (sret)
+			goto again;
+
+		btrfs_set_path_blocking(p);
+		sret = split_node(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL);
+
+		BUG_ON(sret > 0);
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+	} else if (ins_len < 0 && btrfs_header_nritems(b) <
+		   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+		int sret;
+
+		sret = reada_for_balance(root, p, level);
+		if (sret)
+			goto again;
+
+		btrfs_set_path_blocking(p);
+		sret = balance_level(trans, root, p, level);
+		btrfs_clear_path_blocking(p, NULL);
+
+		if (sret) {
+			ret = sret;
+			goto done;
+		}
+		b = p->nodes[level];
+		if (!b) {
+			btrfs_release_path(NULL, p);
+			goto again;
+		}
+		BUG_ON(btrfs_header_nritems(b) == 1);
+	}
+	return 0;
+
+again:
+	ret = -EAGAIN;
+done:
+	return ret;
+}
+
+/*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
  * level of the path (level 0)
@@ -1465,17 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct extent_buffer *b;
-	struct extent_buffer *tmp;
 	int slot;
 	int ret;
 	int level;
-	int should_reada = p->reada;
 	int lowest_unlock = 1;
-	int blocksize;
 	u8 lowest_level = 0;
-	u64 blocknr;
-	u64 gen;
-	struct btrfs_key prealloc_block;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
@@ -1484,8 +1588,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ins_len < 0)
 		lowest_unlock = 2;
 
-	prealloc_block.objectid = 0;
-
 again:
 	if (p->skip_locking)
 		b = btrfs_root_node(root);
@@ -1506,50 +1608,21 @@ again:
 		if (cow) {
 			int wret;
 
-			/* is a cow on this block not required */
+			/*
+			 * if we don't really need to cow this block
+			 * then we don't want to set the path blocking,
+			 * so we test it here
+			 */
 			if (btrfs_header_generation(b) == trans->transid &&
 			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
 				goto cow_done;
 			}
-
-			/* ok, we have to cow, is our old prealloc the right
-			 * size?
-			 */
-			if (prealloc_block.objectid &&
-			    prealloc_block.offset != b->len) {
-				btrfs_release_path(root, p);
-				btrfs_free_reserved_extent(root,
-					   prealloc_block.objectid,
-					   prealloc_block.offset);
-				prealloc_block.objectid = 0;
-				goto again;
-			}
-
-			/*
-			 * for higher level blocks, try not to allocate blocks
-			 * with the block and the parent locks held.
-			 */
-			if (level > 0 && !prealloc_block.objectid) {
-				u32 size = b->len;
-				u64 hint = b->start;
-
-				btrfs_release_path(root, p);
-				ret = btrfs_reserve_extent(trans, root,
-							   size, size, 0,
-							   hint, (u64)-1,
-							   &prealloc_block, 0);
-				BUG_ON(ret);
-				goto again;
-			}
-
 			btrfs_set_path_blocking(p);
 
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
-					       p->slots[level + 1],
-					       &b, prealloc_block.objectid);
-			prealloc_block.objectid = 0;
+					       p->slots[level + 1], &b);
 			if (wret) {
 				free_extent_buffer(b);
 				ret = wret;
@@ -1566,7 +1639,7 @@ cow_done:
 		if (!p->skip_locking)
 			p->locks[level] = 1;
 
-		btrfs_clear_path_blocking(p);
+		btrfs_clear_path_blocking(p, NULL);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -1594,51 +1667,15 @@ cow_done:
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if ((p->search_for_split || ins_len > 0) &&
-			    btrfs_header_nritems(b) >=
-			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-				int sret;
-
-				sret = reada_for_balance(root, p, level);
-				if (sret)
-					goto again;
-
-				btrfs_set_path_blocking(p);
-				sret = split_node(trans, root, p, level);
-				btrfs_clear_path_blocking(p);
-
-				BUG_ON(sret > 0);
-				if (sret) {
-					ret = sret;
-					goto done;
-				}
-				b = p->nodes[level];
-				slot = p->slots[level];
-			} else if (ins_len < 0 &&
-				   btrfs_header_nritems(b) <
-				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
-				int sret;
-
-				sret = reada_for_balance(root, p, level);
-				if (sret)
-					goto again;
-
-				btrfs_set_path_blocking(p);
-				sret = balance_level(trans, root, p, level);
-				btrfs_clear_path_blocking(p);
+			ret = setup_nodes_for_search(trans, root, p, b, level,
+						     ins_len);
+			if (ret == -EAGAIN)
+				goto again;
+			else if (ret)
+				goto done;
+			b = p->nodes[level];
+			slot = p->slots[level];
 
-				if (sret) {
-					ret = sret;
-					goto done;
-				}
-				b = p->nodes[level];
-				if (!b) {
-					btrfs_release_path(NULL, p);
-					goto again;
-				}
-				slot = p->slots[level];
-				BUG_ON(btrfs_header_nritems(b) == 1);
-			}
 			unlock_up(p, level, lowest_unlock);
 
 			/* this is only true while dropping a snapshot */
@@ -1647,54 +1684,21 @@ cow_done:
 				goto done;
 			}
 
-			blocknr = btrfs_node_blockptr(b, slot);
-			gen = btrfs_node_ptr_generation(b, slot);
-			blocksize = btrfs_level_size(root, level - 1);
+			ret = read_block_for_search(trans, root, p,
+						    &b, level, slot, key);
+			if (ret == -EAGAIN)
+				goto again;
 
-			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-				b = tmp;
-			} else {
-				/*
-				 * reduce lock contention at high levels
-				 * of the btree by dropping locks before
-				 * we read.
-				 */
-				if (level > 0) {
-					btrfs_release_path(NULL, p);
-					if (tmp)
-						free_extent_buffer(tmp);
-					if (should_reada)
-						reada_for_search(root, p,
-								 level, slot,
-								 key->objectid);
-
-					tmp = read_tree_block(root, blocknr,
-							 blocksize, gen);
-					if (tmp)
-						free_extent_buffer(tmp);
-					goto again;
-				} else {
-					btrfs_set_path_blocking(p);
-					if (tmp)
-						free_extent_buffer(tmp);
-					if (should_reada)
-						reada_for_search(root, p,
-								 level, slot,
-								 key->objectid);
-					b = read_node_slot(root, b, slot);
-				}
-			}
 			if (!p->skip_locking) {
 				int lret;
 
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 				lret = btrfs_try_spin_lock(b);
 
 				if (!lret) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p);
+					btrfs_clear_path_blocking(p, b);
 				}
 			}
 		} else {
@@ -1706,7 +1710,7 @@ cow_done:
 				btrfs_set_path_blocking(p);
 				sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p);
+				btrfs_clear_path_blocking(p, NULL);
 
 				BUG_ON(sret > 0);
 				if (sret) {
@@ -1725,12 +1729,8 @@ done:
 	 * we don't really know what they plan on doing with the path
 	 * from here on, so for now just mark it as blocking
 	 */
-	btrfs_set_path_blocking(p);
-	if (prealloc_block.objectid) {
-		btrfs_free_reserved_extent(root,
-			   prealloc_block.objectid,
-			   prealloc_block.offset);
-	}
+	if (!p->leave_spinning)
+		btrfs_set_path_blocking(p);
 	return ret;
 }
 
@@ -1751,7 +1751,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 	int ret;
 
 	eb = btrfs_lock_root_node(root);
-	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
 	BUG_ON(ret);
 
 	btrfs_set_lock_blocking(eb);
@@ -1809,7 +1809,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 			}
 
 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
-					      &eb, 0);
+					      &eb);
 			BUG_ON(ret);
 
 			if (root->root_key.objectid ==
@@ -2122,7 +2122,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	spin_unlock(&root->node_lock);
 
 	ret = btrfs_update_extent_ref(trans, root, lower->start,
-				      lower->start, c->start,
+				      lower->len, lower->start, c->start,
 				      root->root_key.objectid,
 				      trans->transid, level - 1);
 	BUG_ON(ret);
@@ -2157,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(!path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
-	if (slot > nritems)
-		BUG();
+	BUG_ON(slot > nritems);
 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
 		BUG();
 	if (slot != nritems) {
@@ -2204,7 +2203,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
-	} else {
+	} else if (!trans->transaction->delayed_refs.flushing) {
 		ret = push_nodes_for_insert(trans, root, path, level);
 		c = path->nodes[level];
 		if (!ret && btrfs_header_nritems(c) <
@@ -2312,66 +2311,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 	return ret;
 }
 
-/*
- * push some data in the path leaf to the right, trying to free up at
- * least data_size bytes.  returns zero if the push worked, nonzero otherwise
- *
- * returns 1 if the push failed because the other node didn't have enough
- * room, 0 if everything worked out and < 0 if there were major errors.
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int data_size,
-			   int empty)
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      int data_size, int empty,
+				      struct extent_buffer *right,
+				      int free_space, u32 left_nritems)
 {
 	struct extent_buffer *left = path->nodes[0];
-	struct extent_buffer *right;
-	struct extent_buffer *upper;
+	struct extent_buffer *upper = path->nodes[1];
 	struct btrfs_disk_key disk_key;
 	int slot;
 	u32 i;
-	int free_space;
 	int push_space = 0;
 	int push_items = 0;
 	struct btrfs_item *item;
-	u32 left_nritems;
 	u32 nr;
 	u32 right_nritems;
 	u32 data_end;
 	u32 this_item_size;
 	int ret;
 
-	slot = path->slots[1];
-	if (!path->nodes[1])
-		return 1;
-
-	upper = path->nodes[1];
-	if (slot >= btrfs_header_nritems(upper) - 1)
-		return 1;
-
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
-
-	right = read_node_slot(root, upper, slot + 1);
-	btrfs_tree_lock(right);
-	btrfs_set_lock_blocking(right);
-
-	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size)
-		goto out_unlock;
-
-	/* cow and double check */
-	ret = btrfs_cow_block(trans, root, right, upper,
-			      slot + 1, &right, 0);
-	if (ret)
-		goto out_unlock;
-
-	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size)
-		goto out_unlock;
-
-	left_nritems = btrfs_header_nritems(left);
-	if (left_nritems == 0)
-		goto out_unlock;
-
 	if (empty)
 		nr = 0;
 	else
@@ -2380,6 +2340,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (path->slots[0] >= left_nritems)
 		push_space += data_size;
 
+	slot = path->slots[1];
 	i = left_nritems - 1;
 	while (i >= nr) {
 		item = btrfs_item_nr(left, i);
@@ -2511,24 +2472,82 @@ out_unlock:
 }
 
 /*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct btrfs_path *path, int data_size,
+			   int empty)
+{
+	struct extent_buffer *left = path->nodes[0];
+	struct extent_buffer *right;
+	struct extent_buffer *upper;
+	int slot;
+	int free_space;
+	u32 left_nritems;
+	int ret;
+
+	if (!path->nodes[1])
+		return 1;
+
+	slot = path->slots[1];
+	upper = path->nodes[1];
+	if (slot >= btrfs_header_nritems(upper) - 1)
+		return 1;
+
+	btrfs_assert_tree_locked(path->nodes[1]);
+
+	right = read_node_slot(root, upper, slot + 1);
+	btrfs_tree_lock(right);
+	btrfs_set_lock_blocking(right);
+
+	free_space = btrfs_leaf_free_space(root, right);
+	if (free_space < data_size)
+		goto out_unlock;
+
+	/* cow and double check */
+	ret = btrfs_cow_block(trans, root, right, upper,
+			      slot + 1, &right);
+	if (ret)
+		goto out_unlock;
+
+	free_space = btrfs_leaf_free_space(root, right);
+	if (free_space < data_size)
+		goto out_unlock;
+
+	left_nritems = btrfs_header_nritems(left);
+	if (left_nritems == 0)
+		goto out_unlock;
+
+	return __push_leaf_right(trans, root, path, data_size, empty,
+				right, free_space, left_nritems);
+out_unlock:
+	btrfs_tree_unlock(right);
+	free_extent_buffer(right);
+	return 1;
+}
+
+/*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
  */
-static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, int data_size,
-			  int empty)
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct btrfs_path *path, int data_size,
+				     int empty, struct extent_buffer *left,
+				     int free_space, int right_nritems)
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
-	struct extent_buffer *left;
 	int slot;
 	int i;
-	int free_space;
 	int push_space = 0;
 	int push_items = 0;
 	struct btrfs_item *item;
 	u32 old_left_nritems;
-	u32 right_nritems;
 	u32 nr;
 	int ret = 0;
 	int wret;
@@ -2536,41 +2555,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 old_left_item_size;
 
 	slot = path->slots[1];
-	if (slot == 0)
-		return 1;
-	if (!path->nodes[1])
-		return 1;
-
-	right_nritems = btrfs_header_nritems(right);
-	if (right_nritems == 0)
-		return 1;
-
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
-
-	left = read_node_slot(root, path->nodes[1], slot - 1);
-	btrfs_tree_lock(left);
-	btrfs_set_lock_blocking(left);
-
-	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size) {
-		ret = 1;
-		goto out;
-	}
-
-	/* cow and double check */
-	ret = btrfs_cow_block(trans, root, left,
-			      path->nodes[1], slot - 1, &left, 0);
-	if (ret) {
-		/* we hit -ENOSPC, but it isn't fatal here */
-		ret = 1;
-		goto out;
-	}
-
-	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size) {
-		ret = 1;
-		goto out;
-	}
 
 	if (empty)
 		nr = right_nritems;
@@ -2738,6 +2722,154 @@ out:
 }
 
 /*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, int data_size,
+			  int empty)
+{
+	struct extent_buffer *right = path->nodes[0];
+	struct extent_buffer *left;
+	int slot;
+	int free_space;
+	u32 right_nritems;
+	int ret = 0;
+
+	slot = path->slots[1];
+	if (slot == 0)
+		return 1;
+	if (!path->nodes[1])
+		return 1;
+
+	right_nritems = btrfs_header_nritems(right);
+	if (right_nritems == 0)
+		return 1;
+
+	btrfs_assert_tree_locked(path->nodes[1]);
+
+	left = read_node_slot(root, path->nodes[1], slot - 1);
+	btrfs_tree_lock(left);
+	btrfs_set_lock_blocking(left);
+
+	free_space = btrfs_leaf_free_space(root, left);
+	if (free_space < data_size) {
+		ret = 1;
+		goto out;
+	}
+
+	/* cow and double check */
+	ret = btrfs_cow_block(trans, root, left,
+			      path->nodes[1], slot - 1, &left);
+	if (ret) {
+		/* we hit -ENOSPC, but it isn't fatal here */
+		ret = 1;
+		goto out;
+	}
+
+	free_space = btrfs_leaf_free_space(root, left);
+	if (free_space < data_size) {
+		ret = 1;
+		goto out;
+	}
+
+	return __push_leaf_left(trans, root, path, data_size,
+			       empty, left, free_space, right_nritems);
+out:
+	btrfs_tree_unlock(left);
+	free_extent_buffer(left);
+	return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_path *path,
+			       struct extent_buffer *l,
+			       struct extent_buffer *right,
+			       int slot, int mid, int nritems)
+{
+	int data_copy_size;
+	int rt_data_off;
+	int i;
+	int ret = 0;
+	int wret;
+	struct btrfs_disk_key disk_key;
+
+	nritems = nritems - mid;
+	btrfs_set_header_nritems(right, nritems);
+	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+			   btrfs_item_nr_offset(mid),
+			   nritems * sizeof(struct btrfs_item));
+
+	copy_extent_buffer(right, l,
+		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+		     data_copy_size, btrfs_leaf_data(l) +
+		     leaf_data_end(root, l), data_copy_size);
+
+	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+		      btrfs_item_end_nr(l, mid);
+
+	for (i = 0; i < nritems; i++) {
+		struct btrfs_item *item = btrfs_item_nr(right, i);
+		u32 ioff;
+
+		if (!right->map_token) {
+			map_extent_buffer(right, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&right->map_token, &right->kaddr,
+					&right->map_start, &right->map_len,
+					KM_USER1);
+		}
+
+		ioff = btrfs_item_offset(right, item);
+		btrfs_set_item_offset(right, item, ioff + rt_data_off);
+	}
+
+	if (right->map_token) {
+		unmap_extent_buffer(right, right->map_token, KM_USER1);
+		right->map_token = NULL;
+	}
+
+	btrfs_set_header_nritems(l, mid);
+	ret = 0;
+	btrfs_item_key(right, &disk_key, 0);
+	wret = insert_ptr(trans, root, path, &disk_key, right->start,
+			  path->slots[1] + 1, 1);
+	if (wret)
+		ret = wret;
+
+	btrfs_mark_buffer_dirty(right);
+	btrfs_mark_buffer_dirty(l);
+	BUG_ON(path->slots[0] != slot);
+
+	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+	BUG_ON(ret);
+
+	if (mid <= slot) {
+		btrfs_tree_unlock(path->nodes[0]);
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = right;
+		path->slots[0] -= mid;
+		path->slots[1] += 1;
+	} else {
+		btrfs_tree_unlock(right);
+		free_extent_buffer(right);
+	}
+
+	BUG_ON(path->slots[0] < 0);
+
+	return ret;
+}
+
+/*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
  *
@@ -2754,17 +2886,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int mid;
 	int slot;
 	struct extent_buffer *right;
-	int data_copy_size;
-	int rt_data_off;
-	int i;
 	int ret = 0;
 	int wret;
 	int double_split;
 	int num_doubles = 0;
-	struct btrfs_disk_key disk_key;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
+	    !trans->transaction->delayed_refs.flushing) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
 		if (wret < 0)
 			return wret;
@@ -2813,11 +2942,14 @@ again:
 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
+
 	if (mid <= slot) {
 		if (nritems == 1 ||
 		    leaf_space_used(l, mid, nritems - mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (slot >= nritems) {
+				struct btrfs_disk_key disk_key;
+
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
@@ -2845,6 +2977,8 @@ again:
 		if (leaf_space_used(l, 0, mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (!extend && data_size && slot == 0) {
+				struct btrfs_disk_key disk_key;
+
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
@@ -2877,76 +3011,16 @@ again:
 			}
 		}
 	}
-	nritems = nritems - mid;
-	btrfs_set_header_nritems(right, nritems);
-	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
-
-	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
-			   btrfs_item_nr_offset(mid),
-			   nritems * sizeof(struct btrfs_item));
-
-	copy_extent_buffer(right, l,
-		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
-		     data_copy_size, btrfs_leaf_data(l) +
-		     leaf_data_end(root, l), data_copy_size);
-
-	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
-		      btrfs_item_end_nr(l, mid);
-
-	for (i = 0; i < nritems; i++) {
-		struct btrfs_item *item = btrfs_item_nr(right, i);
-		u32 ioff;
-
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
-
-		ioff = btrfs_item_offset(right, item);
-		btrfs_set_item_offset(right, item, ioff + rt_data_off);
-	}
-
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
 
-	btrfs_set_header_nritems(l, mid);
-	ret = 0;
-	btrfs_item_key(right, &disk_key, 0);
-	wret = insert_ptr(trans, root, path, &disk_key, right->start,
-			  path->slots[1] + 1, 1);
-	if (wret)
-		ret = wret;
-
-	btrfs_mark_buffer_dirty(right);
-	btrfs_mark_buffer_dirty(l);
-	BUG_ON(path->slots[0] != slot);
-
-	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
 	BUG_ON(ret);
 
-	if (mid <= slot) {
-		btrfs_tree_unlock(path->nodes[0]);
-		free_extent_buffer(path->nodes[0]);
-		path->nodes[0] = right;
-		path->slots[0] -= mid;
-		path->slots[1] += 1;
-	} else {
-		btrfs_tree_unlock(right);
-		free_extent_buffer(right);
-	}
-
-	BUG_ON(path->slots[0] < 0);
-
 	if (double_split) {
 		BUG_ON(num_doubles != 0);
 		num_doubles++;
 		goto again;
 	}
+
 	return ret;
 }
 
@@ -3004,26 +3078,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 		return -EAGAIN;
 	}
 
+	btrfs_set_path_blocking(path);
 	ret = split_leaf(trans, root, &orig_key, path,
 			 sizeof(struct btrfs_item), 1);
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
+	btrfs_unlock_up_safe(path, 1);
+	leaf = path->nodes[0];
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
 	/*
 	 * make sure any changes to the path from split_leaf leave it
 	 * in a blocking state
 	 */
 	btrfs_set_path_blocking(path);
 
-	leaf = path->nodes[0];
-	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-
-split:
 	item = btrfs_item_nr(leaf, path->slots[0]);
 	orig_offset = btrfs_item_offset(leaf, item);
 	item_size = btrfs_item_size(leaf, item);
 
-
 	buf = kmalloc(item_size, GFP_NOFS);
 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
 			    path->slots[0]), item_size);
@@ -3428,39 +3503,27 @@ out:
 }
 
 /*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
  */
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr)
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, struct btrfs_path *path,
+		      struct btrfs_key *cpu_key, u32 *data_size,
+		      u32 total_data, u32 total_size, int nr)
 {
-	struct extent_buffer *leaf;
 	struct btrfs_item *item;
-	int ret = 0;
-	int slot;
-	int slot_orig;
 	int i;
 	u32 nritems;
-	u32 total_size = 0;
-	u32 total_data = 0;
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
+	int ret;
+	struct extent_buffer *leaf;
+	int slot;
 
-	for (i = 0; i < nr; i++)
-		total_data += data_size[i];
-
-	total_size = total_data + (nr * sizeof(struct btrfs_item));
-	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-	if (ret == 0)
-		return -EEXIST;
-	if (ret < 0)
-		goto out;
-
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
+	slot = path->slots[0];
 
 	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
@@ -3472,9 +3535,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 
-	slot = path->slots[0];
-	BUG_ON(slot < 0);
-
 	if (slot != nritems) {
 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
@@ -3530,21 +3590,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		data_end -= data_size[i];
 		btrfs_set_item_size(leaf, item, data_size[i]);
 	}
+
 	btrfs_set_header_nritems(leaf, nritems + nr);
-	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
 	if (slot == 0) {
+		struct btrfs_disk_key disk_key;
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	}
+	btrfs_unlock_up_safe(path, 1);
+	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(root, leaf) < 0) {
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
+	return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    int nr)
+{
+	struct extent_buffer *leaf;
+	int ret = 0;
+	int slot;
+	int i;
+	u32 total_size = 0;
+	u32 total_data = 0;
+
+	for (i = 0; i < nr; i++)
+		total_data += data_size[i];
+
+	total_size = total_data + (nr * sizeof(struct btrfs_item));
+	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+	if (ret == 0)
+		return -EEXIST;
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	BUG_ON(slot < 0);
+
+	ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+			       total_data, total_size, nr);
+
 out:
-	btrfs_unlock_up_safe(path, 1);
 	return ret;
 }
 
@@ -3732,7 +3831,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+		    !trans->transaction->delayed_refs.flushing) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
@@ -3740,6 +3840,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			slot = path->slots[1];
 			extent_buffer_get(leaf);
 
+			btrfs_set_path_blocking(path);
 			wret = push_leaf_left(trans, root, path, 1, 1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
@@ -3926,7 +4027,6 @@ find_next_key:
 				btrfs_release_path(root, path);
 				goto again;
 			} else {
-				btrfs_clear_path_blocking(path);
 				goto out;
 			}
 		}
@@ -3946,7 +4046,7 @@ find_next_key:
 		path->locks[level - 1] = 1;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
-		btrfs_clear_path_blocking(path);
+		btrfs_clear_path_blocking(path, NULL);
 	}
 out:
 	if (ret == 0)
@@ -4026,28 +4126,44 @@ next:
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	int slot;
-	int level = 1;
+	int level;
 	struct extent_buffer *c;
-	struct extent_buffer *next = NULL;
+	struct extent_buffer *next;
 	struct btrfs_key key;
 	u32 nritems;
 	int ret;
+	int old_spinning = path->leave_spinning;
+	int force_blocking = 0;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	if (nritems == 0)
 		return 1;
 
-	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+	/*
+	 * we take the blocks in an order that upsets lockdep.  Using
+	 * blocking mode is the only way around it.
+	 */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	force_blocking = 1;
+#endif
 
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+	level = 1;
+	next = NULL;
 	btrfs_release_path(root, path);
+
 	path->keep_locks = 1;
+
+	if (!force_blocking)
+		path->leave_spinning = 1;
+
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
 
 	if (ret < 0)
 		return ret;
 
-	btrfs_set_path_blocking(path);
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
 	 * by releasing the path above we dropped all our locks.  A balance
@@ -4057,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	 */
 	if (nritems > 0 && path->slots[0] < nritems - 1) {
 		path->slots[0]++;
+		ret = 0;
 		goto done;
 	}
 
 	while (level < BTRFS_MAX_LEVEL) {
-		if (!path->nodes[level])
-			return 1;
+		if (!path->nodes[level]) {
+			ret = 1;
+			goto done;
+		}
 
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL)
-				return 1;
+			if (level == BTRFS_MAX_LEVEL) {
+				ret = 1;
+				goto done;
+			}
 			continue;
 		}
 
@@ -4078,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
-		/* the path was set to blocking above */
-		if (level == 1 && (path->locks[1] || path->skip_locking) &&
-		    path->reada)
-			reada_for_search(root, path, level, slot, 0);
+		next = c;
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    slot, &key);
+		if (ret == -EAGAIN)
+			goto again;
 
-		next = read_node_slot(root, c, slot);
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(c));
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
+			ret = btrfs_try_spin_lock(next);
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(next);
+				if (!force_blocking)
+					btrfs_clear_path_blocking(path, next);
+			}
+			if (force_blocking)
+				btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -4097,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (path->locks[level])
 			btrfs_tree_unlock(c);
+
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!path->skip_locking)
 			path->locks[level] = 1;
+
 		if (!level)
 			break;
 
-		btrfs_set_path_blocking(path);
-		if (level == 1 && path->locks[1] && path->reada)
-			reada_for_search(root, path, level, slot, 0);
-		next = read_node_slot(root, next, 0);
+		ret = read_block_for_search(NULL, root, path, &next, level,
+					    0, &key);
+		if (ret == -EAGAIN)
+			goto again;
+
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
+			btrfs_assert_tree_locked(path->nodes[level]);
+			ret = btrfs_try_spin_lock(next);
+			if (!ret) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(next);
+				if (!force_blocking)
+					btrfs_clear_path_blocking(path, next);
+			}
+			if (force_blocking)
+				btrfs_set_lock_blocking(next);
 		}
 	}
+	ret = 0;
 done:
 	unlock_up(path, 0, 1);
-	return 0;
+	path->leave_spinning = old_spinning;
+	if (!old_spinning)
+		btrfs_set_path_blocking(path);
+
+	return ret;
 }
 
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 531db112c8bd..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,14 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-#ifdef CONFIG_LOCKDEP
-# define BTRFS_MAX_LEVEL 7
-#else
-# define BTRFS_MAX_LEVEL 8
-#endif
+#define BTRFS_MAX_LEVEL 8
+
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
 
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -140,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_MAX		9
 
 /*
- * the key defines the order in the tree, and so it also defines (optimal)
- * block layout.  objectid corresonds to the inode number.  The flags
- * tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with flags of 1 might refer to the inode
- * data, flags of 2 may point to file data in the btree and flags == 3
- * may point to extents.
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
  *
  * offset is the starting byte offset for this key in the stream.
  *
@@ -197,7 +203,7 @@ struct btrfs_dev_item {
 
 	/*
 	 * starting byte of this partition on the device,
-	 * to allowr for stripe alignment in the future
+	 * to allow for stripe alignment in the future
 	 */
 	__le64 start_offset;
 
@@ -405,15 +411,16 @@ struct btrfs_path {
 	int locks[BTRFS_MAX_LEVEL];
 	int reada;
 	/* keep some upper locks as we walk down */
-	int keep_locks;
-	int skip_locking;
 	int lowest_level;
 
 	/*
 	 * set by btrfs_split_item, tells search_slot to keep all locks
 	 * and to force calls to keep space in the nodes
 	 */
-	int search_for_split;
+	unsigned int search_for_split:1;
+	unsigned int keep_locks:1;
+	unsigned int skip_locking:1;
+	unsigned int leave_spinning:1;
 };
 
 /*
@@ -600,13 +607,27 @@ struct btrfs_block_group_item {
 
 struct btrfs_space_info {
 	u64 flags;
-	u64 total_bytes;
-	u64 bytes_used;
-	u64 bytes_pinned;
-	u64 bytes_reserved;
-	u64 bytes_readonly;
-	int full;
-	int force_alloc;
+
+	u64 total_bytes;	/* total bytes in the space */
+	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
+				   transaction finishes */
+	u64 bytes_reserved;	/* total bytes the allocator has reserved for
+				   current allocations */
+	u64 bytes_readonly;	/* total bytes that are read only */
+
+	/* delalloc accounting */
+	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
+				   this space is not necessarily reserved yet
+				   by the allocator */
+	u64 bytes_may_use;	/* number of bytes that may be used for
+				   delalloc */
+
+	int full;		/* indicates that we cannot allocate any more
+				   chunks for this space */
+	int force_alloc;	/* set if we need to force a chunk alloc for
+				   this space */
+
 	struct list_head list;
 
 	/* for block groups in our same type */
@@ -615,18 +636,35 @@ struct btrfs_space_info {
 	struct rw_semaphore groups_sem;
 };
 
-struct btrfs_free_space {
-	struct rb_node bytes_index;
-	struct rb_node offset_index;
-	u64 offset;
-	u64 bytes;
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes.  They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+	spinlock_t lock;
+	spinlock_t refill_lock;
+	struct rb_root root;
+
+	/* largest extent in this cluster */
+	u64 max_size;
+
+	/* first extent starting offset */
+	u64 window_start;
+
+	struct btrfs_block_group_cache *block_group;
+	/*
+	 * when a cluster is allocated from a block group, we put the
+	 * cluster onto a list in the block group so that it can
+	 * be freed before the block group is freed.
+	 */
+	struct list_head block_group_list;
 };
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
-	struct mutex alloc_mutex;
 	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
@@ -638,6 +676,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_space_info *space_info;
 
 	/* free space cache stuff */
+	spinlock_t tree_lock;
 	struct rb_root free_space_bytes;
 	struct rb_root free_space_offset;
 
@@ -649,6 +688,11 @@ struct btrfs_block_group_cache {
 
 	/* usage count */
 	atomic_t count;
+
+	/* List of struct btrfs_free_clusters for this block group.
+	 * Today it will only have one thing on it, but that may change
+	 */
+	struct list_head cluster_list;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -678,15 +722,18 @@ struct btrfs_fs_info {
 	struct rb_root block_group_cache_tree;
 
 	struct extent_io_tree pinned_extents;
-	struct extent_io_tree pending_del;
-	struct extent_io_tree extent_ins;
 
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
 
 	u64 generation;
 	u64 last_trans_committed;
-	u64 last_trans_new_blockgroup;
+
+	/*
+	 * this is updated to the current trans every time a full commit
+	 * is required instead of the faster short fsync log commits
+	 */
+	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
 	u64 max_extent;
@@ -707,12 +754,20 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex extent_ins_mutex;
-	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
+
+	/*
+	 * this protects the ordered operations list only while we are
+	 * processing all of the entries on it.  This way we make
+	 * sure the commit code doesn't find the list temporarily empty
+	 * because another function happens to be doing non-waiting preflush
+	 * before jumping into the main commit.
+	 */
+	struct mutex ordered_operations_mutex;
+
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -727,10 +782,29 @@ struct btrfs_fs_info {
 	 * ordered extents
 	 */
 	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
 	struct list_head ordered_extents;
+
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
 	struct list_head delalloc_inodes;
 
 	/*
+	 * special rename and truncate targets that must be on disk before
+	 * we're allowed to commit.  This is basically the ext3 style
+	 * data=ordered list.
+	 */
+	struct list_head ordered_operations;
+
+	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads.  This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
@@ -771,15 +845,31 @@ struct btrfs_fs_info {
 	atomic_t throttle_gen;
 
 	u64 total_pinned;
+
+	/* protected by the delalloc lock, used to keep from writing
+	 * metadata until there is a nice batch
+	 */
+	u64 dirty_metadata_bytes;
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * the space_info list is almost entirely read only.  It only changes
+	 * when we add a new raid type to the FS, and that happens
+	 * very rarely.  RCU is used to protect it.
+	 */
 	struct list_head space_info;
+
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
-	u64 last_alloc;
-	u64 last_data_alloc;
+
+	/* data_alloc_cluster is only used in ssd mode */
+	struct btrfs_free_cluster data_alloc_cluster;
+
+	/* all metadata allocations go through this cluster */
+	struct btrfs_free_cluster meta_alloc_cluster;
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
@@ -871,7 +961,6 @@ struct btrfs_root {
 };
 
 /*
-
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -902,7 +991,7 @@ struct btrfs_root {
 #define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
- * root items point to tree roots.  There are typically in the root
+ * root items point to tree roots.  They are typically in the root
  * tree used by the super block to find all the other trees
  */
 #define BTRFS_ROOT_ITEM_KEY	132
@@ -949,6 +1038,8 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
+#define BTRFS_MOUNT_NOTREELOG           (1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1687,18 +1778,16 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
@@ -1715,7 +1804,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize);
+					    u64 bytenr, u32 blocksize,
+					    int level);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1759,7 +1849,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 root_objectid, u64 ref_generation,
 			 u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
+			    struct btrfs_root *root, u64 bytenr, u64 num_bytes,
 			    u64 orig_parent, u64 parent,
 			    u64 root_objectid, u64 ref_generation,
 			    u64 owner_objectid);
@@ -1785,6 +1875,18 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes);
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes);
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				 u64 bytes);
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
@@ -1808,7 +1910,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
-		    struct extent_buffer **cow_ret, u64 prealloc_dest);
+		    struct extent_buffer **cow_ret);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
@@ -1834,9 +1936,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
-void btrfs_init_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2032,9 +2132,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del);
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
@@ -2107,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-/* free-space-cache.c */
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 bytenr, u64 size);
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-			      u64 offset, u64 bytes);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 bytenr, u64 size);
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-				 u64 offset, u64 bytes);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
-				   *block_group);
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-					       *block_group, u64 offset,
-					       u64 bytes);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
-			   u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..d6c01c096a40
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking.  For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing.   This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
+ */
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+		      u64 bytenr, u64 parent)
+{
+	if (bytenr < ref->bytenr)
+		return -1;
+	if (bytenr > ref->bytenr)
+		return 1;
+	if (parent < ref->parent)
+		return -1;
+	if (parent > ref->parent)
+		return 1;
+	return 0;
+}
+
+/*
+ * insert a new ref into the rbtree.  This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+						  u64 bytenr, u64 parent,
+						  struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent_node = NULL;
+	struct btrfs_delayed_ref_node *entry;
+	int cmp;
+
+	while (*p) {
+		parent_node = *p;
+		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+				 rb_node);
+
+		cmp = comp_entry(entry, bytenr, parent);
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else if (cmp > 0)
+			p = &(*p)->rb_right;
+		else
+			return entry;
+	}
+
+	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+	rb_link_node(node, parent_node, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * find an entry based on (bytenr,parent).  This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
+ */
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+				  u64 bytenr, u64 parent,
+				  struct btrfs_delayed_ref_node **last)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_delayed_ref_node *entry;
+	int cmp;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		WARN_ON(!entry->in_tree);
+		if (last)
+			*last = entry;
+
+		cmp = comp_entry(entry, bytenr, parent);
+		if (cmp < 0)
+			n = n->rb_left;
+		else if (cmp > 0)
+			n = n->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_head *head)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	assert_spin_locked(&delayed_refs->lock);
+	if (mutex_trylock(&head->mutex))
+		return 0;
+
+	atomic_inc(&head->node.refs);
+	spin_unlock(&delayed_refs->lock);
+
+	mutex_lock(&head->mutex);
+	spin_lock(&delayed_refs->lock);
+	if (!head->node.in_tree) {
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(&head->node);
+		return -EAGAIN;
+	}
+	btrfs_put_delayed_ref(&head->node);
+	return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+			   struct list_head *cluster, u64 start)
+{
+	int count = 0;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *node;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_head *head;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	if (start == 0) {
+		node = rb_first(&delayed_refs->root);
+	} else {
+		ref = NULL;
+		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+		if (ref) {
+			struct btrfs_delayed_ref_node *tmp;
+
+			node = rb_prev(&ref->rb_node);
+			while (node) {
+				tmp = rb_entry(node,
+					       struct btrfs_delayed_ref_node,
+					       rb_node);
+				if (tmp->bytenr < start)
+					break;
+				ref = tmp;
+				node = rb_prev(&ref->rb_node);
+			}
+			node = &ref->rb_node;
+		} else
+			node = rb_first(&delayed_refs->root);
+	}
+again:
+	while (node && count < 32) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		if (btrfs_delayed_ref_is_head(ref)) {
+			head = btrfs_delayed_node_to_head(ref);
+			if (list_empty(&head->cluster)) {
+				list_add_tail(&head->cluster, cluster);
+				delayed_refs->run_delayed_start =
+					head->node.bytenr;
+				count++;
+
+				WARN_ON(delayed_refs->num_heads_ready == 0);
+				delayed_refs->num_heads_ready--;
+			} else if (count) {
+				/* the goal of the clustering is to find extents
+				 * that are likely to end up in the same extent
+				 * leaf on disk.  So, we don't want them spread
+				 * all over the tree.  Stop now if we've hit
+				 * a head that was already in use
+				 */
+				break;
+			}
+		}
+		node = rb_next(node);
+	}
+	if (count) {
+		return 0;
+	} else if (start) {
+		/*
+		 * we've gone to the end of the rbtree without finding any
+		 * clusters.  start from the beginning and try again
+		 */
+		start = 0;
+		node = rb_first(&delayed_refs->root);
+		goto again;
+	}
+	return 1;
+}
+
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr.  It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree.  There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *prev_node;
+	int ret = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	if (ref) {
+		prev_node = rb_prev(&ref->rb_node);
+		if (!prev_node)
+			goto out;
+		ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+			       rb_node);
+		if (ref->bytenr == bytenr)
+			ret = 1;
+	}
+out:
+	spin_unlock(&delayed_refs->lock);
+	return ret;
+}
+
+/*
+ * helper function to lookup reference count
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree.  This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs)
+{
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+	u32 num_refs;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+	delayed_refs = &trans->transaction->delayed_refs;
+again:
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_extent_item);
+		num_refs = btrfs_extent_refs(leaf, ei);
+	} else {
+		num_refs = 0;
+		ret = 0;
+	}
+
+	spin_lock(&delayed_refs->lock);
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	if (ref) {
+		head = btrfs_delayed_node_to_head(ref);
+		if (mutex_trylock(&head->mutex)) {
+			num_refs += ref->ref_mod;
+			mutex_unlock(&head->mutex);
+			*refs = num_refs;
+			goto out;
+		}
+
+		atomic_inc(&ref->refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(root->fs_info->extent_root, path);
+
+		mutex_lock(&head->mutex);
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(ref);
+		goto again;
+	} else {
+		*refs = num_refs;
+	}
+out:
+	spin_unlock(&delayed_refs->lock);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree.  existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+		    struct btrfs_delayed_ref_root *delayed_refs,
+		    struct btrfs_delayed_ref_node *existing,
+		    struct btrfs_delayed_ref_node *update)
+{
+	struct btrfs_delayed_ref *existing_ref;
+	struct btrfs_delayed_ref *ref;
+
+	existing_ref = btrfs_delayed_node_to_ref(existing);
+	ref = btrfs_delayed_node_to_ref(update);
+
+	if (ref->pin)
+		existing_ref->pin = 1;
+
+	if (ref->action != existing_ref->action) {
+		/*
+		 * this is effectively undoing either an add or a
+		 * drop.  We decrement the ref_mod, and if it goes
+		 * down to zero we just delete the entry without
+		 * every changing the extent allocation tree.
+		 */
+		existing->ref_mod--;
+		if (existing->ref_mod == 0) {
+			rb_erase(&existing->rb_node,
+				 &delayed_refs->root);
+			existing->in_tree = 0;
+			btrfs_put_delayed_ref(existing);
+			delayed_refs->num_entries--;
+			if (trans->delayed_ref_updates)
+				trans->delayed_ref_updates--;
+		}
+	} else {
+		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+			/* if we're adding refs, make sure all the
+			 * details match up.  The extent could
+			 * have been totally freed and reallocated
+			 * by a different owner before the delayed
+			 * ref entries were removed.
+			 */
+			existing_ref->owner_objectid = ref->owner_objectid;
+			existing_ref->generation = ref->generation;
+			existing_ref->root = ref->root;
+			existing->num_bytes = update->num_bytes;
+		}
+		/*
+		 * the action on the existing ref matches
+		 * the action on the ref we're trying to add.
+		 * Bump the ref_mod by one so the backref that
+		 * is eventually added/removed has the correct
+		 * reference count
+		 */
+		existing->ref_mod += update->ref_mod;
+	}
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+			 struct btrfs_delayed_ref_node *update)
+{
+	struct btrfs_delayed_ref_head *existing_ref;
+	struct btrfs_delayed_ref_head *ref;
+
+	existing_ref = btrfs_delayed_node_to_head(existing);
+	ref = btrfs_delayed_node_to_head(update);
+
+	if (ref->must_insert_reserved) {
+		/* if the extent was freed and then
+		 * reallocated before the delayed ref
+		 * entries were processed, we can end up
+		 * with an existing head ref without
+		 * the must_insert_reserved flag set.
+		 * Set it again here
+		 */
+		existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+		/*
+		 * update the num_bytes so we make sure the accounting
+		 * is done correctly
+		 */
+		existing->num_bytes = update->num_bytes;
+
+	}
+
+	/*
+	 * update the reference mod on the head to reflect this new operation
+	 */
+	existing->ref_mod += update->ref_mod;
+}
+
+/*
+ * helper function to actually insert a delayed ref into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
+ */
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  struct btrfs_delayed_ref_node *ref,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_ref *full_ref;
+	struct btrfs_delayed_ref_head *head_ref = NULL;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int count_mod = 1;
+	int must_insert_reserved = 0;
+
+	/*
+	 * the head node stores the sum of all the mods, so dropping a ref
+	 * should drop the sum in the head node by one.
+	 */
+	if (parent == (u64)-1) {
+		if (action == BTRFS_DROP_DELAYED_REF)
+			count_mod = -1;
+		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+			count_mod = 0;
+	}
+
+	/*
+	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+	 * the reserved accounting when the extent is finally added, or
+	 * if a later modification deletes the delayed ref without ever
+	 * inserting the extent into the extent allocation tree.
+	 * ref->must_insert_reserved is the flag used to record
+	 * that accounting mods are required.
+	 *
+	 * Once we record must_insert_reserved, switch the action to
+	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+	 */
+	if (action == BTRFS_ADD_DELAYED_EXTENT) {
+		must_insert_reserved = 1;
+		action = BTRFS_ADD_DELAYED_REF;
+	} else {
+		must_insert_reserved = 0;
+	}
+
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
+	ref->parent = parent;
+	ref->ref_mod = count_mod;
+	ref->in_tree = 1;
+	ref->num_bytes = num_bytes;
+
+	if (btrfs_delayed_ref_is_head(ref)) {
+		head_ref = btrfs_delayed_node_to_head(ref);
+		head_ref->must_insert_reserved = must_insert_reserved;
+		INIT_LIST_HEAD(&head_ref->cluster);
+		mutex_init(&head_ref->mutex);
+	} else {
+		full_ref = btrfs_delayed_node_to_ref(ref);
+		full_ref->root = ref_root;
+		full_ref->generation = ref_generation;
+		full_ref->owner_objectid = owner_objectid;
+		full_ref->pin = pin;
+		full_ref->action = action;
+	}
+
+	existing = tree_insert(&delayed_refs->root, bytenr,
+			       parent, &ref->rb_node);
+
+	if (existing) {
+		if (btrfs_delayed_ref_is_head(ref))
+			update_existing_head_ref(existing, ref);
+		else
+			update_existing_ref(trans, delayed_refs, existing, ref);
+
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kfree(ref);
+	} else {
+		if (btrfs_delayed_ref_is_head(ref)) {
+			delayed_refs->num_heads++;
+			delayed_refs->num_heads_ready++;
+		}
+		delayed_refs->num_entries++;
+		trans->delayed_ref_updates++;
+	}
+	return 0;
+}
+
+/*
+ * add a delayed ref to the tree.  This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin)
+{
+	struct btrfs_delayed_ref *ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	/*
+	 * the parent = 0 case comes from cases where we don't actually
+	 * know the parent yet.  It will get updated later via a add/drop
+	 * pair.
+	 */
+	if (parent == 0)
+		parent = bytenr;
+
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref) {
+		kfree(ref);
+		return -ENOMEM;
+	}
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	/*
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
+	 */
+	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+				      (u64)-1, 0, 0, 0, action, pin);
+	BUG_ON(ret);
+
+	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+				      parent, ref_root, ref_generation,
+				      owner_objectid, action, pin);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	if (ref)
+		return btrfs_delayed_node_to_head(ref);
+	return NULL;
+}
+
+/*
+ * add a delayed ref to the tree.  This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ *
+ * The main point of this call is to add and remove a backreference in a single
+ * shot, taking the lock only once, and only searching for the head node once.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ */
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 orig_parent,
+			  u64 parent, u64 orig_ref_root, u64 ref_root,
+			  u64 orig_ref_generation, u64 ref_generation,
+			  u64 owner_objectid, int pin)
+{
+	struct btrfs_delayed_ref *ref;
+	struct btrfs_delayed_ref *old_ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
+	if (!old_ref) {
+		kfree(ref);
+		return -ENOMEM;
+	}
+
+	/*
+	 * the parent = 0 case comes from cases where we don't actually
+	 * know the parent yet.  It will get updated later via a add/drop
+	 * pair.
+	 */
+	if (parent == 0)
+		parent = bytenr;
+	if (orig_parent == 0)
+		orig_parent = bytenr;
+
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref) {
+		kfree(ref);
+		kfree(old_ref);
+		return -ENOMEM;
+	}
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	/*
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
+	 */
+	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+				      (u64)-1, 0, 0, 0,
+				      BTRFS_UPDATE_DELAYED_HEAD, 0);
+	BUG_ON(ret);
+
+	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+				      parent, ref_root, ref_generation,
+				      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
+	BUG_ON(ret);
+
+	ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
+				      orig_parent, orig_ref_root,
+				      orig_ref_generation, owner_objectid,
+				      BTRFS_DROP_DELAYED_REF, pin);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref->action */
+#define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+	struct rb_node rb_node;
+
+	/* the starting bytenr of the extent */
+	u64 bytenr;
+
+	/* the parent our backref will point to */
+	u64 parent;
+
+	/* the size of the extent */
+	u64 num_bytes;
+
+	/* ref count on this data structure */
+	atomic_t refs;
+
+	/*
+	 * how many refs is this entry adding or deleting.  For
+	 * head refs, this may be a negative number because it is keeping
+	 * track of the total mods done to the reference count.
+	 * For individual refs, this will always be a positive number
+	 *
+	 * It may be more than one, since it is possible for a single
+	 * parent to have more than one ref on an extent
+	 */
+	int ref_mod;
+
+	/* is this node still in the rbtree? */
+	unsigned int in_tree:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent.  They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+	struct btrfs_delayed_ref_node node;
+
+	/*
+	 * the mutex is held while running the refs, and it is also
+	 * held when checking the sum of reference modifications.
+	 */
+	struct mutex mutex;
+
+	struct list_head cluster;
+
+	/*
+	 * when a new extent is allocated, it is just reserved in memory
+	 * The actual extent isn't inserted into the extent allocation tree
+	 * until the delayed ref is processed.  must_insert_reserved is
+	 * used to flag a delayed ref so the accounting can be updated
+	 * when a full insert is done.
+	 *
+	 * It is possible the extent will be freed before it is ever
+	 * inserted into the extent allocation tree.  In this case
+	 * we need to update the in ram accounting to properly reflect
+	 * the free has happened.
+	 */
+	unsigned int must_insert_reserved:1;
+};
+
+struct btrfs_delayed_ref {
+	struct btrfs_delayed_ref_node node;
+
+	/* the root objectid our ref will point to */
+	u64 root;
+
+	/* the generation for the backref */
+	u64 generation;
+
+	/* owner_objectid of the backref  */
+	u64 owner_objectid;
+
+	/* operation done by this entry in the rbtree */
+	u8 action;
+
+	/* if pin == 1, when the extent is freed it will be pinned until
+	 * transaction commit
+	 */
+	unsigned int pin:1;
+};
+
+struct btrfs_delayed_ref_root {
+	struct rb_root root;
+
+	/* this spin lock protects the rbtree and the entries inside */
+	spinlock_t lock;
+
+	/* how many delayed ref updates we've queued, used by the
+	 * throttling code
+	 */
+	unsigned long num_entries;
+
+	/* total number of head nodes in tree */
+	unsigned long num_heads;
+
+	/* total number of head nodes ready for processing */
+	unsigned long num_heads_ready;
+
+	/*
+	 * set when the tree is flushing before a transaction commit,
+	 * used by the throttling code to decide if new updates need
+	 * to be run right away
+	 */
+	int flushing;
+
+	u64 run_delayed_start;
+};
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+	WARN_ON(atomic_read(&ref->refs) == 0);
+	if (atomic_dec_and_test(&ref->refs)) {
+		WARN_ON(ref->in_tree);
+		kfree(ref);
+	}
+}
+
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid, int action,
+			  int pin);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+			  u64 bytenr, u64 num_bytes, u64 orig_parent,
+			  u64 parent, u64 orig_ref_root, u64 ref_root,
+			  u64 orig_ref_generation, u64 ref_generation,
+			  u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+			   struct list_head *cluster, u64 search_start);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+	return node->parent == (u64)-1;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_ref, node);
+
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(!btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_ref_head, node);
+
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	key.offset = btrfs_name_hash(name, name_len);
+
 	path = btrfs_alloc_path();
+	path->leave_spinning = 1;
+
 	data_size = sizeof(*dir_item) + name_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
 					name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5aebddd71193..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
 #include "locking.h"
 #include "ref-cache.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -75,6 +76,40 @@ struct async_submit_bio {
 	struct btrfs_work work;
 };
 
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
+ * code needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+	/* leaf */
+	"btrfs-extent-00",
+	"btrfs-extent-01",
+	"btrfs-extent-02",
+	"btrfs-extent-03",
+	"btrfs-extent-04",
+	"btrfs-extent-05",
+	"btrfs-extent-06",
+	"btrfs-extent-07",
+	/* highest possible level */
+	"btrfs-extent-08",
+};
+#endif
+
 /*
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
@@ -347,6 +382,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+	lockdep_set_class_and_name(&eb->lock,
+			   &btrfs_eb_class[level],
+			   btrfs_eb_name[level]);
+}
+#endif
+
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -392,6 +436,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	found_level = btrfs_header_level(eb);
 
+	btrfs_set_buffer_lockdep_class(eb, found_level);
+
 	ret = csum_tree_block(root, eb, 1);
 	if (ret)
 		ret = -EIO;
@@ -623,14 +669,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	struct extent_buffer *eb;
+	int was_dirty;
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	if (!(current->flags & PF_MEMALLOC)) {
+		return extent_write_full_page(tree, page,
+					      btree_get_extent, wbc);
+	}
 
-	if (current->flags & PF_MEMALLOC) {
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
+	redirty_page_for_writepage(wbc, page);
+	eb = btrfs_find_tree_block(root, page_offset(page),
+				      PAGE_CACHE_SIZE);
+	WARN_ON(!eb);
+
+	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+	if (!was_dirty) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+		spin_unlock(&root->fs_info->delalloc_lock);
 	}
-	return extent_write_full_page(tree, page, btree_get_extent, wbc);
+	free_extent_buffer(eb);
+
+	unlock_page(page);
+	return 0;
 }
 
 static int btree_writepages(struct address_space *mapping,
@@ -639,15 +702,15 @@ static int btree_writepages(struct address_space *mapping,
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
+		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
 		u64 num_dirty;
-		u64 start = 0;
 		unsigned long thresh = 32 * 1024 * 1024;
 
 		if (wbc->for_kupdate)
 			return 0;
 
-		num_dirty = count_range_bits(tree, &start, (u64)-1,
-					     thresh, EXTENT_DIRTY);
+		/* this is a bit racy, but that's ok */
+		num_dirty = root->fs_info->dirty_metadata_bytes;
 		if (num_dirty < thresh)
 			return 0;
 	}
@@ -812,11 +875,19 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
-		WARN_ON(!btrfs_tree_locked(buf));
+		btrfs_assert_tree_locked(buf);
+
+		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+			spin_lock(&root->fs_info->delalloc_lock);
+			if (root->fs_info->dirty_metadata_bytes >= buf->len)
+				root->fs_info->dirty_metadata_bytes -= buf->len;
+			else
+				WARN_ON(1);
+			spin_unlock(&root->fs_info->delalloc_lock);
+		}
 
-		/* ugh, clear_extent_buffer_dirty can be expensive */
+		/* ugh, clear_extent_buffer_dirty needs to lock the page */
 		btrfs_set_lock_blocking(buf);
-
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	}
@@ -1342,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
 
 	ret = extent_range_uptodate(io_tree, start + length,
 				    start + buf_len - 1);
-	if (ret == 1)
-		return ret;
 	return ret;
 }
 
@@ -1426,12 +1495,6 @@ static int transaction_kthread(void *arg)
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-			printk(KERN_INFO "btrfs: total reference cache "
-			       "size %llu\n",
-			       root->fs_info->total_ref_cache_size);
-		}
-
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
@@ -1448,6 +1511,7 @@ static int transaction_kthread(void *arg)
 		mutex_unlock(&root->fs_info->trans_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		ret = btrfs_commit_transaction(trans, root);
+
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1507,6 +1571,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
@@ -1566,10 +1631,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	extent_io_tree_init(&fs_info->pinned_extents,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->pending_del,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->extent_ins,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
 	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1582,15 +1643,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);
 
 	mutex_init(&fs_info->trans_mutex);
+	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->extent_ins_mutex);
-	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+
+	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1777,7 +1841,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	dev_root->track_dirty = 1;
-
 	if (ret)
 		goto fail_extent_root;
 
@@ -2314,10 +2377,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	int was_dirty;
 
-	btrfs_set_lock_blocking(buf);
-
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
@@ -2326,7 +2388,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			(unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+	was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+					    buf);
+	if (!was_dirty) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->dirty_metadata_bytes += buf->len;
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2341,7 +2409,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	unsigned long thresh = 32 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
-	if (current_is_pdflush() || current->flags & PF_MEMALLOC)
+	if (current->flags & PF_MEMALLOC)
 		return;
 
 	num_dirty = count_range_bits(tree, &start, (u64)-1,
@@ -2366,6 +2434,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_buffer *eb;
 	unsigned long len;
@@ -2381,6 +2450,16 @@ int btree_lock_page_hook(struct page *page)
 
 	btrfs_tree_lock(eb);
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		if (root->fs_info->dirty_metadata_bytes >= eb->len)
+			root->fs_info->dirty_metadata_bytes -= eb->len;
+		else
+			WARN_ON(1);
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
+
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 494a56eb2986..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
@@ -101,4 +102,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+						 int level)
+{
+}
+#endif
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7527523c2d2d..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -20,6 +20,7 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/sort.h>
+#include <linux/rcupdate.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -30,6 +31,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "free-space-cache.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -48,17 +50,27 @@ struct pending_extent_op {
 	int del;
 };
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root, int all);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 bytenr, u64 num_bytes, int is_data);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root, u64 parent,
+					 u64 root_objectid, u64 ref_generation,
+					 u64 owner, struct btrfs_key *ins,
+					 int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+				   u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					u64 bytenr, u64 num_bytes, u64 parent,
+					u64 root_objectid, u64 ref_generation,
+					u64 owner_objectid, int pin,
+					int ref_to_drop);
+
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 alloc_bytes,
+			  u64 flags, int force);
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
@@ -155,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;
 
-	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -181,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
-	mutex_unlock(&info->pinned_mutex);
 
 	return 0;
 }
@@ -280,8 +290,8 @@ next:
 			   block_group->key.objectid +
 			   block_group->key.offset);
 
-	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
+	remove_sb_from_cache(root, block_group);
 	ret = 0;
 err:
 	btrfs_free_path(path);
@@ -315,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 	return cache;
 }
 
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
 	if (atomic_dec_and_test(&cache->count))
 		kfree(cache);
@@ -326,13 +336,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 {
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
-	list_for_each_entry(found, head, list) {
-		if (found->flags == flags)
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags == flags) {
+			rcu_read_unlock();
 			return found;
+		}
 	}
+	rcu_read_unlock();
 	return NULL;
 }
 
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list)
+		found->full = 0;
+	rcu_read_unlock();
+}
+
 static u64 div_factor(u64 num, int factor)
 {
 	if (factor == 10)
@@ -368,12 +398,12 @@ again:
 			    div_factor(cache->key.offset, factor)) {
 				group_start = cache->key.objectid;
 				spin_unlock(&cache->lock);
-				put_block_group(cache);
+				btrfs_put_block_group(cache);
 				goto found;
 			}
 		}
 		spin_unlock(&cache->lock);
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		cond_resched();
 	}
 	if (!wrapped) {
@@ -529,262 +559,13 @@ out:
 	return ret;
 }
 
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *extent_root,
-				    struct btrfs_path *path,
-				    struct list_head *update_list)
-{
-	struct btrfs_key key;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct pending_extent_op *op;
-	struct extent_buffer *leaf;
-	int ret = 0;
-	struct list_head *cur = update_list->next;
-	u64 ref_objectid;
-	u64 ref_root = extent_root->root_key.objectid;
-
-	op = list_entry(cur, struct pending_extent_op, list);
-
-search:
-	key.objectid = op->bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = op->orig_parent;
-
-	ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
-	BUG_ON(ret);
-
-	leaf = path->nodes[0];
-
-loop:
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
-	ref_objectid = btrfs_ref_objectid(leaf, ref);
-
-	if (btrfs_ref_root(leaf, ref) != ref_root ||
-	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
-	    (ref_objectid != op->level &&
-	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
-		       "root %llu, owner %u\n",
-		       (unsigned long long)op->bytenr,
-		       (unsigned long long)op->orig_parent,
-		       (unsigned long long)ref_root, op->level);
-		btrfs_print_leaf(extent_root, leaf);
-		BUG();
-	}
-
-	key.objectid = op->bytenr;
-	key.offset = op->parent;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
-	BUG_ON(ret);
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-	btrfs_set_ref_generation(leaf, ref, op->generation);
-
-	cur = cur->next;
-
-	list_del_init(&op->list);
-	unlock_extent(&info->extent_ins, op->bytenr,
-		      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-	kfree(op);
-
-	if (cur == update_list) {
-		btrfs_mark_buffer_dirty(path->nodes[0]);
-		btrfs_release_path(extent_root, path);
-		goto out;
-	}
-
-	op = list_entry(cur, struct pending_extent_op, list);
-
-	path->slots[0]++;
-	while (path->slots[0] < btrfs_header_nritems(leaf)) {
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid == op->bytenr &&
-		    key.type == BTRFS_EXTENT_REF_KEY)
-			goto loop;
-		path->slots[0]++;
-	}
-
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-	btrfs_release_path(extent_root, path);
-	goto search;
-
-out:
-	return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *extent_root,
-				   struct btrfs_path *path,
-				   struct list_head *insert_list, int nr)
-{
-	struct btrfs_key *keys;
-	u32 *data_size;
-	struct pending_extent_op *op;
-	struct extent_buffer *leaf;
-	struct list_head *cur = insert_list->next;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	u64 ref_root = extent_root->root_key.objectid;
-	int i = 0, last = 0, ret;
-	int total = nr * 2;
-
-	if (!nr)
-		return 0;
-
-	keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
-	if (!keys)
-		return -ENOMEM;
-
-	data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
-	if (!data_size) {
-		kfree(keys);
-		return -ENOMEM;
-	}
-
-	list_for_each_entry(op, insert_list, list) {
-		keys[i].objectid = op->bytenr;
-		keys[i].offset = op->num_bytes;
-		keys[i].type = BTRFS_EXTENT_ITEM_KEY;
-		data_size[i] = sizeof(struct btrfs_extent_item);
-		i++;
-
-		keys[i].objectid = op->bytenr;
-		keys[i].offset = op->parent;
-		keys[i].type = BTRFS_EXTENT_REF_KEY;
-		data_size[i] = sizeof(struct btrfs_extent_ref);
-		i++;
-	}
-
-	op = list_entry(cur, struct pending_extent_op, list);
-	i = 0;
-	while (i < total) {
-		int c;
-		ret = btrfs_insert_some_items(trans, extent_root, path,
-					      keys+i, data_size+i, total-i);
-		BUG_ON(ret < 0);
-
-		if (last && ret > 1)
-			BUG();
-
-		leaf = path->nodes[0];
-		for (c = 0; c < ret; c++) {
-			int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
-			/*
-			 * if the first item we inserted was a backref, then
-			 * the EXTENT_ITEM will be the odd c's, else it will
-			 * be the even c's
-			 */
-			if ((ref_first && (c % 2)) ||
-			    (!ref_first && !(c % 2))) {
-				struct btrfs_extent_item *itm;
-
-				itm = btrfs_item_ptr(leaf, path->slots[0] + c,
-						     struct btrfs_extent_item);
-				btrfs_set_extent_refs(path->nodes[0], itm, 1);
-				op->del++;
-			} else {
-				struct btrfs_extent_ref *ref;
-
-				ref = btrfs_item_ptr(leaf, path->slots[0] + c,
-						     struct btrfs_extent_ref);
-				btrfs_set_ref_root(leaf, ref, ref_root);
-				btrfs_set_ref_generation(leaf, ref,
-							 op->generation);
-				btrfs_set_ref_objectid(leaf, ref, op->level);
-				btrfs_set_ref_num_refs(leaf, ref, 1);
-				op->del++;
-			}
-
-			/*
-			 * using del to see when its ok to free up the
-			 * pending_extent_op.  In the case where we insert the
-			 * last item on the list in order to help do batching
-			 * we need to not free the extent op until we actually
-			 * insert the extent_item
-			 */
-			if (op->del == 2) {
-				unlock_extent(&info->extent_ins, op->bytenr,
-					      op->bytenr + op->num_bytes - 1,
-					      GFP_NOFS);
-				cur = cur->next;
-				list_del_init(&op->list);
-				kfree(op);
-				if (cur != insert_list)
-					op = list_entry(cur,
-						struct pending_extent_op,
-						list);
-			}
-		}
-		btrfs_mark_buffer_dirty(leaf);
-		btrfs_release_path(extent_root, path);
-
-		/*
-		 * Ok backref's and items usually go right next to eachother,
-		 * but if we could only insert 1 item that means that we
-		 * inserted on the end of a leaf, and we have no idea what may
-		 * be on the next leaf so we just play it safe.  In order to
-		 * try and help this case we insert the last thing on our
-		 * insert list so hopefully it will end up being the last
-		 * thing on the leaf and everything else will be before it,
-		 * which will let us insert a whole bunch of items at the same
-		 * time.
-		 */
-		if (ret == 1 && !last && (i + ret < total)) {
-			/*
-			 * last: where we will pick up the next time around
-			 * i: our current key to insert, will be total - 1
-			 * cur: the current op we are screwing with
-			 * op: duh
-			 */
-			last = i + ret;
-			i = total - 1;
-			cur = insert_list->prev;
-			op = list_entry(cur, struct pending_extent_op, list);
-		} else if (last) {
-			/*
-			 * ok we successfully inserted the last item on the
-			 * list, lets reset everything
-			 *
-			 * i: our current key to insert, so where we left off
-			 *    last time
-			 * last: done with this
-			 * cur: the op we are messing with
-			 * op: duh
-			 * total: since we inserted the last key, we need to
-			 *        decrement total so we dont overflow
-			 */
-			i = last;
-			last = 0;
-			total--;
-			if (i < total) {
-				cur = insert_list->next;
-				op = list_entry(cur, struct pending_extent_op,
-						list);
-			}
-		} else {
-			i += ret;
-		}
-
-		cond_resched();
-	}
-	ret = 0;
-	kfree(keys);
-	kfree(data_size);
-	return ret;
-}
-
 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
 					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid)
+					  u64 owner_objectid,
+					  int refs_to_add)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -804,9 +585,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 		btrfs_set_ref_root(leaf, ref, ref_root);
 		btrfs_set_ref_generation(leaf, ref, ref_generation);
 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-		btrfs_set_ref_num_refs(leaf, ref, 1);
+		btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
 	} else if (ret == -EEXIST) {
 		u64 existing_owner;
+
 		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
 		leaf = path->nodes[0];
 		ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -820,7 +602,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 
 		num_refs = btrfs_ref_num_refs(leaf, ref);
 		BUG_ON(num_refs == 0);
-		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+		btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
 
 		existing_owner = btrfs_ref_objectid(leaf, ref);
 		if (existing_owner != owner_objectid &&
@@ -832,6 +614,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 	} else {
 		goto out;
 	}
+	btrfs_unlock_up_safe(path, 1);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
@@ -840,7 +623,8 @@ out:
 
 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
-					  struct btrfs_path *path)
+					  struct btrfs_path *path,
+					  int refs_to_drop)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_extent_ref *ref;
@@ -850,8 +634,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
 	num_refs = btrfs_ref_num_refs(leaf, ref);
-	BUG_ON(num_refs == 0);
-	num_refs -= 1;
+	BUG_ON(num_refs < refs_to_drop);
+	num_refs -= refs_to_drop;
 	if (num_refs == 0) {
 		ret = btrfs_del_item(trans, root, path);
 	} else {
@@ -902,332 +686,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
 
-static noinline int free_extents(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root,
-				 struct list_head *del_list)
-{
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_path *path;
-	struct btrfs_key key, found_key;
-	struct extent_buffer *leaf;
-	struct list_head *cur;
-	struct pending_extent_op *op;
-	struct btrfs_extent_item *ei;
-	int ret, num_to_del, extent_slot = 0, found_extent = 0;
-	u32 refs;
-	u64 bytes_freed = 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = 1;
-
-search:
-	/* search for the backref for the current ref we want to delete */
-	cur = del_list->next;
-	op = list_entry(cur, struct pending_extent_op, list);
-	ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
-				    op->orig_parent,
-				    extent_root->root_key.objectid,
-				    op->orig_generation, op->level, 1);
-	if (ret) {
-		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
-		       "root %llu gen %llu owner %u\n",
-		       (unsigned long long)op->bytenr,
-		       (unsigned long long)extent_root->root_key.objectid,
-		       (unsigned long long)op->orig_generation, op->level);
-		btrfs_print_leaf(extent_root, path->nodes[0]);
-		WARN_ON(1);
-		goto out;
-	}
-
-	extent_slot = path->slots[0];
-	num_to_del = 1;
-	found_extent = 0;
-
-	/*
-	 * if we aren't the first item on the leaf we can move back one and see
-	 * if our ref is right next to our extent item
-	 */
-	if (likely(extent_slot)) {
-		extent_slot--;
-		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-				      extent_slot);
-		if (found_key.objectid == op->bytenr &&
-		    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-		    found_key.offset == op->num_bytes) {
-			num_to_del++;
-			found_extent = 1;
-		}
-	}
-
-	/*
-	 * if we didn't find the extent we need to delete the backref and then
-	 * search for the extent item key so we can update its ref count
-	 */
-	if (!found_extent) {
-		key.objectid = op->bytenr;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = op->num_bytes;
-
-		ret = remove_extent_backref(trans, extent_root, path);
-		BUG_ON(ret);
-		btrfs_release_path(extent_root, path);
-		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-		BUG_ON(ret);
-		extent_slot = path->slots[0];
-	}
-
-	/* this is where we update the ref count for the extent */
-	leaf = path->nodes[0];
-	ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
-	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs == 0);
-	refs--;
-	btrfs_set_extent_refs(leaf, ei, refs);
-
-	btrfs_mark_buffer_dirty(leaf);
-
-	/*
-	 * This extent needs deleting.  The reason cur_slot is extent_slot +
-	 * num_to_del is because extent_slot points to the slot where the extent
-	 * is, and if the backref was not right next to the extent we will be
-	 * deleting at least 1 item, and will want to start searching at the
-	 * slot directly next to extent_slot.  However if we did find the
-	 * backref next to the extent item them we will be deleting at least 2
-	 * items and will want to start searching directly after the ref slot
-	 */
-	if (!refs) {
-		struct list_head *pos, *n, *end;
-		int cur_slot = extent_slot+num_to_del;
-		u64 super_used;
-		u64 root_used;
-
-		path->slots[0] = extent_slot;
-		bytes_freed = op->num_bytes;
-
-		mutex_lock(&info->pinned_mutex);
-		ret = pin_down_bytes(trans, extent_root, op->bytenr,
-				     op->num_bytes, op->level >=
-				     BTRFS_FIRST_FREE_OBJECTID);
-		mutex_unlock(&info->pinned_mutex);
-		BUG_ON(ret < 0);
-		op->del = ret;
-
-		/*
-		 * we need to see if we can delete multiple things at once, so
-		 * start looping through the list of extents we are wanting to
-		 * delete and see if their extent/backref's are right next to
-		 * eachother and the extents only have 1 ref
-		 */
-		for (pos = cur->next; pos != del_list; pos = pos->next) {
-			struct pending_extent_op *tmp;
-
-			tmp = list_entry(pos, struct pending_extent_op, list);
-
-			/* we only want to delete extent+ref at this stage */
-			if (cur_slot >= btrfs_header_nritems(leaf) - 1)
-				break;
-
-			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
-			if (found_key.objectid != tmp->bytenr ||
-			    found_key.type != BTRFS_EXTENT_ITEM_KEY ||
-			    found_key.offset != tmp->num_bytes)
-				break;
-
-			/* check to make sure this extent only has one ref */
-			ei = btrfs_item_ptr(leaf, cur_slot,
-					    struct btrfs_extent_item);
-			if (btrfs_extent_refs(leaf, ei) != 1)
-				break;
-
-			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
-			if (found_key.objectid != tmp->bytenr ||
-			    found_key.type != BTRFS_EXTENT_REF_KEY ||
-			    found_key.offset != tmp->orig_parent)
-				break;
-
-			/*
-			 * the ref is right next to the extent, we can set the
-			 * ref count to 0 since we will delete them both now
-			 */
-			btrfs_set_extent_refs(leaf, ei, 0);
-
-			/* pin down the bytes for this extent */
-			mutex_lock(&info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
-					     tmp->num_bytes, tmp->level >=
-					     BTRFS_FIRST_FREE_OBJECTID);
-			mutex_unlock(&info->pinned_mutex);
-			BUG_ON(ret < 0);
-
-			/*
-			 * use the del field to tell if we need to go ahead and
-			 * free up the extent when we delete the item or not.
-			 */
-			tmp->del = ret;
-			bytes_freed += tmp->num_bytes;
-
-			num_to_del += 2;
-			cur_slot += 2;
-		}
-		end = pos;
-
-		/* update the free space counters */
-		spin_lock(&info->delalloc_lock);
-		super_used = btrfs_super_bytes_used(&info->super_copy);
-		btrfs_set_super_bytes_used(&info->super_copy,
-					   super_used - bytes_freed);
-
-		root_used = btrfs_root_used(&extent_root->root_item);
-		btrfs_set_root_used(&extent_root->root_item,
-				    root_used - bytes_freed);
-		spin_unlock(&info->delalloc_lock);
-
-		/* delete the items */
-		ret = btrfs_del_items(trans, extent_root, path,
-				      path->slots[0], num_to_del);
-		BUG_ON(ret);
-
-		/*
-		 * loop through the extents we deleted and do the cleanup work
-		 * on them
-		 */
-		for (pos = cur, n = pos->next; pos != end;
-		     pos = n, n = pos->next) {
-			struct pending_extent_op *tmp;
-			tmp = list_entry(pos, struct pending_extent_op, list);
-
-			/*
-			 * remember tmp->del tells us wether or not we pinned
-			 * down the extent
-			 */
-			ret = update_block_group(trans, extent_root,
-						 tmp->bytenr, tmp->num_bytes, 0,
-						 tmp->del);
-			BUG_ON(ret);
-
-			list_del_init(&tmp->list);
-			unlock_extent(&info->extent_ins, tmp->bytenr,
-				      tmp->bytenr + tmp->num_bytes - 1,
-				      GFP_NOFS);
-			kfree(tmp);
-		}
-	} else if (refs && found_extent) {
-		/*
-		 * the ref and extent were right next to eachother, but the
-		 * extent still has a ref, so just free the backref and keep
-		 * going
-		 */
-		ret = remove_extent_backref(trans, extent_root, path);
-		BUG_ON(ret);
-
-		list_del_init(&op->list);
-		unlock_extent(&info->extent_ins, op->bytenr,
-			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-		kfree(op);
-	} else {
-		/*
-		 * the extent has multiple refs and the backref we were looking
-		 * for was not right next to it, so just unlock and go next,
-		 * we're good to go
-		 */
-		list_del_init(&op->list);
-		unlock_extent(&info->extent_ins, op->bytenr,
-			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-		kfree(op);
-	}
-
-	btrfs_release_path(extent_root, path);
-	if (!list_empty(del_list))
-		goto search;
-
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, u64 bytenr,
+				     u64 num_bytes,
 				     u64 orig_parent, u64 parent,
 				     u64 orig_root, u64 ref_root,
 				     u64 orig_generation, u64 ref_generation,
 				     u64 owner_objectid)
 {
 	int ret;
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct btrfs_path *path;
-
-	if (root == root->fs_info->extent_root) {
-		struct pending_extent_op *extent_op;
-		u64 num_bytes;
-
-		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
-		num_bytes = btrfs_level_size(root, (int)owner_objectid);
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-			u64 priv;
-			ret = get_state_private(&root->fs_info->extent_ins,
-						bytenr, &priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-							(unsigned long)priv;
-			BUG_ON(extent_op->parent != orig_parent);
-			BUG_ON(extent_op->generation != orig_generation);
+	int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
 
-			extent_op->parent = parent;
-			extent_op->generation = ref_generation;
-		} else {
-			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-			BUG_ON(!extent_op);
-
-			extent_op->type = PENDING_BACKREF_UPDATE;
-			extent_op->bytenr = bytenr;
-			extent_op->num_bytes = num_bytes;
-			extent_op->parent = parent;
-			extent_op->orig_parent = orig_parent;
-			extent_op->generation = ref_generation;
-			extent_op->orig_generation = orig_generation;
-			extent_op->level = (int)owner_objectid;
-			INIT_LIST_HEAD(&extent_op->list);
-			extent_op->del = 0;
-
-			set_extent_bits(&root->fs_info->extent_ins,
-					bytenr, bytenr + num_bytes - 1,
-					EXTENT_WRITEBACK, GFP_NOFS);
-			set_state_private(&root->fs_info->extent_ins,
-					  bytenr, (unsigned long)extent_op);
-		}
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		return 0;
-	}
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	ret = lookup_extent_backref(trans, extent_root, path,
-				    bytenr, orig_parent, orig_root,
-				    orig_generation, owner_objectid, 1);
-	if (ret)
-		goto out;
-	ret = remove_extent_backref(trans, extent_root, path);
-	if (ret)
-		goto out;
-	ret = insert_extent_backref(trans, extent_root, path, bytenr,
-				    parent, ref_root, ref_generation,
-				    owner_objectid);
+	ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+				       orig_parent, parent, orig_root,
+				       ref_root, orig_generation,
+				       ref_generation, owner_objectid, pin);
 	BUG_ON(ret);
-	finish_current_insert(trans, extent_root, 0);
-	del_pending_extents(trans, extent_root, 0);
-out:
-	btrfs_free_path(path);
 	return ret;
 }
 
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
-			    u64 orig_parent, u64 parent,
+			    u64 num_bytes, u64 orig_parent, u64 parent,
 			    u64 ref_root, u64 ref_generation,
 			    u64 owner_objectid)
 {
@@ -1235,20 +715,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
-					parent, ref_root, ref_root,
-					ref_generation, ref_generation,
-					owner_objectid);
+
+	ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+					orig_parent, parent, ref_root,
+					ref_root, ref_generation,
+					ref_generation, owner_objectid);
 	return ret;
 }
-
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root, u64 bytenr,
+				  u64 num_bytes,
 				  u64 orig_parent, u64 parent,
 				  u64 orig_root, u64 ref_root,
 				  u64 orig_generation, u64 ref_generation,
 				  u64 owner_objectid)
 {
+	int ret;
+
+	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+				    ref_generation, owner_objectid,
+				    BTRFS_ADD_DELAYED_REF, 0);
+	BUG_ON(ret);
+	return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 bytenr,
+			  u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid,
+			  int refs_to_add)
+{
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
@@ -1261,17 +757,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->reada = 1;
+	path->leave_spinning = 1;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = (u64)-1;
+	key.offset = num_bytes;
 
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-				0, 1);
-	if (ret < 0)
+	/* first find the extent item and update its reference count */
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+				path, 0, 1);
+	if (ret < 0) {
+		btrfs_set_path_blocking(path);
 		return ret;
-	BUG_ON(ret == 0 || path->slots[0] == 0);
+	}
 
-	path->slots[0]--;
+	if (ret > 0) {
+		WARN_ON(1);
+		btrfs_free_path(path);
+		return -EIO;
+	}
 	l = path->nodes[0];
 
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1285,21 +788,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
 
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
 	refs = btrfs_extent_refs(l, item);
-	btrfs_set_extent_refs(l, item, refs + 1);
+	btrfs_set_extent_refs(l, item, refs + refs_to_add);
+	btrfs_unlock_up_safe(path, 1);
+
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	btrfs_release_path(root->fs_info->extent_root, path);
 
 	path->reada = 1;
+	path->leave_spinning = 1;
+
+	/* now insert the actual backref */
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
 				    path, bytenr, parent,
 				    ref_root, ref_generation,
-				    owner_objectid);
+				    owner_objectid, refs_to_add);
 	BUG_ON(ret);
-	finish_current_insert(trans, root->fs_info->extent_root, 0);
-	del_pending_extents(trans, root->fs_info->extent_root, 0);
-
 	btrfs_free_path(path);
 	return 0;
 }
@@ -1314,51 +820,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+
+	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
 				     0, ref_root, 0, ref_generation,
 				     owner_objectid);
 	return ret;
 }
 
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root)
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_delayed_ref_node *node)
+{
+	int ret = 0;
+	struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+	BUG_ON(node->ref_mod == 0);
+	ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+				  node->parent, ref->root, ref->generation,
+				  ref->owner_objectid, ref->pin, node->ref_mod);
+
+	return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_delayed_ref_node *node,
+					int insert_reserved)
 {
-	finish_current_insert(trans, root->fs_info->extent_root, 1);
-	del_pending_extents(trans, root->fs_info->extent_root, 1);
+	int ret;
+	struct btrfs_delayed_ref *ref;
+
+	if (node->parent == (u64)-1) {
+		struct btrfs_delayed_ref_head *head;
+		/*
+		 * we've hit the end of the chain and we were supposed
+		 * to insert this extent into the tree.  But, it got
+		 * deleted before we ever needed to insert it, so all
+		 * we have to do is clean up the accounting
+		 */
+		if (insert_reserved) {
+			update_reserved_extents(root, node->bytenr,
+						node->num_bytes, 0);
+		}
+		head = btrfs_delayed_node_to_head(node);
+		mutex_unlock(&head->mutex);
+		return 0;
+	}
+
+	ref = btrfs_delayed_node_to_ref(node);
+	if (ref->action == BTRFS_ADD_DELAYED_REF) {
+		if (insert_reserved) {
+			struct btrfs_key ins;
+
+			ins.objectid = node->bytenr;
+			ins.offset = node->num_bytes;
+			ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+			/* record the full extent allocation */
+			ret = __btrfs_alloc_reserved_extent(trans, root,
+					node->parent, ref->root,
+					ref->generation, ref->owner_objectid,
+					&ins, node->ref_mod);
+			update_reserved_extents(root, node->bytenr,
+						node->num_bytes, 0);
+		} else {
+			/* just add one backref */
+			ret = add_extent_ref(trans, root, node->bytenr,
+				     node->num_bytes,
+				     node->parent, ref->root, ref->generation,
+				     ref->owner_objectid, node->ref_mod);
+		}
+		BUG_ON(ret);
+	} else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+		WARN_ON(insert_reserved);
+		ret = drop_delayed_ref(trans, root, node);
+	}
 	return 0;
 }
 
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs)
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
-	struct btrfs_path *path;
+	struct rb_node *node;
+	struct btrfs_delayed_ref_node *ref;
+	int action = BTRFS_ADD_DELAYED_REF;
+again:
+	/*
+	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+	 * this prevents ref count from going down to zero when
+	 * there still are pending delayed ref.
+	 */
+	node = rb_prev(&head->node.rb_node);
+	while (1) {
+		if (!node)
+			break;
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				rb_node);
+		if (ref->bytenr != head->node.bytenr)
+			break;
+		if (btrfs_delayed_node_to_ref(ref)->action == action)
+			return ref;
+		node = rb_prev(node);
+	}
+	if (action == BTRFS_ADD_DELAYED_REF) {
+		action = BTRFS_DROP_DELAYED_REF;
+		goto again;
+	}
+	return NULL;
+}
+
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct list_head *cluster)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	int ret;
-	struct btrfs_key key;
-	struct extent_buffer *l;
-	struct btrfs_extent_item *item;
+	int count = 0;
+	int must_insert_reserved = 0;
 
-	WARN_ON(num_bytes < root->sectorsize);
-	path = btrfs_alloc_path();
-	path->reada = 1;
-	key.objectid = bytenr;
-	key.offset = num_bytes;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-				0, 0);
-	if (ret < 0)
-		goto out;
-	if (ret != 0) {
-		btrfs_print_leaf(root, path->nodes[0]);
-		printk(KERN_INFO "btrfs failed to find block number %llu\n",
-		       (unsigned long long)bytenr);
-		BUG();
+	delayed_refs = &trans->transaction->delayed_refs;
+	while (1) {
+		if (!locked_ref) {
+			/* pick a new head ref from the cluster list */
+			if (list_empty(cluster))
+				break;
+
+			locked_ref = list_entry(cluster->next,
+				     struct btrfs_delayed_ref_head, cluster);
+
+			/* grab the lock that says we are going to process
+			 * all the refs for this head */
+			ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+			/*
+			 * we may have dropped the spin lock to get the head
+			 * mutex lock, and that might have given someone else
+			 * time to free the head.  If that's true, it has been
+			 * removed from our list and we can move on.
+			 */
+			if (ret == -EAGAIN) {
+				locked_ref = NULL;
+				count++;
+				continue;
+			}
+		}
+
+		/*
+		 * record the must insert reserved flag before we
+		 * drop the spin lock.
+		 */
+		must_insert_reserved = locked_ref->must_insert_reserved;
+		locked_ref->must_insert_reserved = 0;
+
+		/*
+		 * locked_ref is the head node, so we have to go one
+		 * node back for any delayed ref updates
+		 */
+		ref = select_delayed_ref(locked_ref);
+		if (!ref) {
+			/* All delayed refs have been processed, Go ahead
+			 * and send the head node to run_one_delayed_ref,
+			 * so that any accounting fixes can happen
+			 */
+			ref = &locked_ref->node;
+			list_del_init(&locked_ref->cluster);
+			locked_ref = NULL;
+		}
+
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+		spin_unlock(&delayed_refs->lock);
+
+		ret = run_one_delayed_ref(trans, root, ref,
+					  must_insert_reserved);
+		BUG_ON(ret);
+		btrfs_put_delayed_ref(ref);
+
+		count++;
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+	return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct list_head cluster;
+	int ret;
+	int run_all = count == (unsigned long)-1;
+	int run_most = 0;
+
+	if (root == root->fs_info->extent_root)
+		root = root->fs_info->tree_root;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	INIT_LIST_HEAD(&cluster);
+again:
+	spin_lock(&delayed_refs->lock);
+	if (count == 0) {
+		count = delayed_refs->num_entries * 2;
+		run_most = 1;
+	}
+	while (1) {
+		if (!(run_all || run_most) &&
+		    delayed_refs->num_heads_ready < 64)
+			break;
+
+		/*
+		 * go find something we can process in the rbtree.  We start at
+		 * the beginning of the tree, and then build a cluster
+		 * of refs to process starting at the first one we are able to
+		 * lock
+		 */
+		ret = btrfs_find_ref_cluster(trans, &cluster,
+					     delayed_refs->run_delayed_start);
+		if (ret)
+			break;
+
+		ret = run_clustered_refs(trans, root, &cluster);
+		BUG_ON(ret < 0);
+
+		count -= min_t(unsigned long, ret, count);
+
+		if (count == 0)
+			break;
+	}
+
+	if (run_all) {
+		node = rb_first(&delayed_refs->root);
+		if (!node)
+			goto out;
+		count = (unsigned long)-1;
+
+		while (node) {
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			if (btrfs_delayed_ref_is_head(ref)) {
+				struct btrfs_delayed_ref_head *head;
+
+				head = btrfs_delayed_node_to_head(ref);
+				atomic_inc(&ref->refs);
+
+				spin_unlock(&delayed_refs->lock);
+				mutex_lock(&head->mutex);
+				mutex_unlock(&head->mutex);
+
+				btrfs_put_delayed_ref(ref);
+				cond_resched();
+				goto again;
+			}
+			node = rb_next(node);
+		}
+		spin_unlock(&delayed_refs->lock);
+		schedule_timeout(1);
+		goto again;
 	}
-	l = path->nodes[0];
-	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	*refs = btrfs_extent_refs(l, item);
 out:
-	btrfs_free_path(path);
+	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
 
@@ -1582,7 +1315,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 	int refi = 0;
 	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
 
 	ref_root = btrfs_header_owner(buf);
 	ref_generation = btrfs_header_generation(buf);
@@ -1654,12 +1387,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 
 		if (level == 0) {
 			btrfs_item_key_to_cpu(buf, &key, slot);
+			fi = btrfs_item_ptr(buf, slot,
+					    struct btrfs_file_extent_item);
+
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)
+				continue;
 
 			ret = process_func(trans, root, bytenr,
-					   orig_buf->start, buf->start,
-					   orig_root, ref_root,
-					   orig_generation, ref_generation,
-					   key.objectid);
+				   btrfs_file_extent_disk_num_bytes(buf, fi),
+				   orig_buf->start, buf->start,
+				   orig_root, ref_root,
+				   orig_generation, ref_generation,
+				   key.objectid);
 
 			if (ret) {
 				faili = slot;
@@ -1667,7 +1407,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 				goto fail;
 			}
 		} else {
-			ret = process_func(trans, root, bytenr,
+			ret = process_func(trans, root, bytenr, buf->len,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
@@ -1744,17 +1484,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			if (bytenr == 0)
 				continue;
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    orig_buf->start, buf->start,
-					    orig_root, ref_root,
-					    orig_generation, ref_generation,
-					    key.objectid);
+				    btrfs_file_extent_disk_num_bytes(buf, fi),
+				    orig_buf->start, buf->start,
+				    orig_root, ref_root, orig_generation,
+				    ref_generation, key.objectid);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, slot);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    orig_buf->start, buf->start,
-					    orig_root, ref_root,
+					    buf->len, orig_buf->start,
+					    buf->start, orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    level - 1);
 			if (ret)
@@ -1773,7 +1513,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 				 struct btrfs_block_group_cache *cache)
 {
 	int ret;
-	int pending_ret;
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	unsigned long bi;
 	struct extent_buffer *leaf;
@@ -1789,12 +1528,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(extent_root, path);
 fail:
-	finish_current_insert(trans, extent_root, 0);
-	pending_ret = del_pending_extents(trans, extent_root, 0);
 	if (ret)
 		return ret;
-	if (pending_ret)
-		return pending_ret;
 	return 0;
 
 }
@@ -1858,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
 	if (!block_group || block_group->ro)
 		readonly = 1;
 	if (block_group)
-		put_block_group(block_group);
+		btrfs_put_block_group(block_group);
 	return readonly;
 }
 
@@ -1882,7 +1617,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	list_add(&found->list, &info->space_info);
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
@@ -1892,9 +1626,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
+	found->bytes_delalloc = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
+	list_add_rcu(&found->list, &info->space_info);
 	return 0;
 }
 
@@ -1955,6 +1691,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	u64 alloc_profile;
+
+	if (data) {
+		alloc_profile = info->avail_data_alloc_bits &
+			info->data_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
+	} else if (root == root->fs_info->chunk_root) {
+		alloc_profile = info->avail_system_alloc_bits &
+			info->system_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+	} else {
+		alloc_profile = info->avail_metadata_alloc_bits &
+			info->metadata_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+	}
+
+	return btrfs_reduce_alloc_profile(root, data);
+}
+
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+{
+	u64 alloc_target;
+
+	alloc_target = btrfs_get_alloc_profile(root, 1);
+	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+						       alloc_target);
+}
+
+/*
+ * for now this just makes sure we have at least 5% of our metadata space free
+ * for use.
+ */
+int btrfs_check_metadata_free_space(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *meta_sinfo;
+	u64 alloc_target, thresh;
+	int committed = 0, ret;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	meta_sinfo = __find_space_info(info, alloc_target);
+
+again:
+	spin_lock(&meta_sinfo->lock);
+	if (!meta_sinfo->full)
+		thresh = meta_sinfo->total_bytes * 80;
+	else
+		thresh = meta_sinfo->total_bytes * 95;
+
+	do_div(thresh, 100);
+
+	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+		struct btrfs_trans_handle *trans;
+		if (!meta_sinfo->full) {
+			meta_sinfo->force_alloc = 1;
+			spin_unlock(&meta_sinfo->lock);
+
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     2 * 1024 * 1024, alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			goto again;
+		}
+		spin_unlock(&meta_sinfo->lock);
+
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+		return -ENOSPC;
+	}
+	spin_unlock(&meta_sinfo->lock);
+
+	return 0;
+}
+
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+	int ret = 0, committed = 0;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+again:
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+	    data_sinfo->bytes_may_use < bytes) {
+		struct btrfs_trans_handle *trans;
+
+		/*
+		 * if we don't have enough free bytes in this space then we need
+		 * to alloc a new chunk.
+		 */
+		if (!data_sinfo->full) {
+			u64 alloc_target;
+
+			data_sinfo->force_alloc = 1;
+			spin_unlock(&data_sinfo->lock);
+
+			alloc_target = btrfs_get_alloc_profile(root, 1);
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     bytes + 2 * 1024 * 1024,
+					     alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+		spin_unlock(&data_sinfo->lock);
+
+		/* commit the current transaction and try again */
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+
+		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+		       ", %llu bytes_used, %llu bytes_reserved, "
+		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
+		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
+		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
+		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+		return -ENOSPC;
+	}
+	data_sinfo->bytes_may_use += bytes;
+	BTRFS_I(inode)->reserved_bytes += bytes;
+	spin_unlock(&data_sinfo->lock);
+
+	return btrfs_check_metadata_free_space(root);
+}
+
+/*
+ * if there was an error for whatever reason after calling
+ * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_may_use -= bytes;
+	BTRFS_I(inode)->reserved_bytes -= bytes;
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are adding a delalloc extent to the inode's io_tree */
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				  u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* get the space info for where this inode will be storing its data */
+	data_sinfo = BTRFS_I(inode)->space_info;
+
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_delalloc += bytes;
+
+	/*
+	 * we are adding a delalloc extent without calling
+	 * btrfs_check_data_free_space first.  This happens on a weird
+	 * writepage condition, but shouldn't hurt our accounting
+	 */
+	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
+		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
+		BTRFS_I(inode)->reserved_bytes = 0;
+	} else {
+		data_sinfo->bytes_may_use -= bytes;
+		BTRFS_I(inode)->reserved_bytes -= bytes;
+	}
+
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are clearing an delalloc extent from the inode's io_tree */
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes)
+{
+	struct btrfs_space_info *info;
+
+	info = BTRFS_I(inode)->space_info;
+
+	spin_lock(&info->lock);
+	info->bytes_delalloc -= bytes;
+	spin_unlock(&info->lock);
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
@@ -2054,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				WARN_ON(ret);
 			}
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
@@ -2071,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 		return 0;
 
 	bytenr = cache->key.objectid;
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 
 	return bytenr;
 }
@@ -2083,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2091,6 +2053,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		clear_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
 	}
+
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
 		BUG_ON(!cache);
@@ -2115,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			if (cache->cached)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2146,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2161,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
-	mutex_lock(&root->fs_info->pinned_mutex);
 	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
@@ -2170,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -2182,7 +2143,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 end;
 	int ret;
 
-	mutex_lock(&root->fs_info->pinned_mutex);
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -2191,215 +2151,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_discard_extent(root, start, end + 1 - start);
 
+		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
-		if (need_resched()) {
-			mutex_unlock(&root->fs_info->pinned_mutex);
-			cond_resched();
-			mutex_lock(&root->fs_info->pinned_mutex);
-		}
+		cond_resched();
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return ret;
 }
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root, int all)
-{
-	u64 start;
-	u64 end;
-	u64 priv;
-	u64 search = 0;
-	u64 skipped = 0;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_path *path;
-	struct pending_extent_op *extent_op, *tmp;
-	struct list_head insert_list, update_list;
-	int ret;
-	int num_inserts = 0, max_inserts;
-
-	path = btrfs_alloc_path();
-	INIT_LIST_HEAD(&insert_list);
-	INIT_LIST_HEAD(&update_list);
-
-	max_inserts = extent_root->leafsize /
-		(2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
-		 sizeof(struct btrfs_extent_ref) +
-		 sizeof(struct btrfs_extent_item));
-again:
-	mutex_lock(&info->extent_ins_mutex);
-	while (1) {
-		ret = find_first_extent_bit(&info->extent_ins, search, &start,
-					    &end, EXTENT_WRITEBACK);
-		if (ret) {
-			if (skipped && all && !num_inserts &&
-			    list_empty(&update_list)) {
-				skipped = 0;
-				search = 0;
-				continue;
-			}
-			mutex_unlock(&info->extent_ins_mutex);
-			break;
-		}
-
-		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
-		if (!ret) {
-			skipped = 1;
-			search = end + 1;
-			if (need_resched()) {
-				mutex_unlock(&info->extent_ins_mutex);
-				cond_resched();
-				mutex_lock(&info->extent_ins_mutex);
-			}
-			continue;
-		}
-
-		ret = get_state_private(&info->extent_ins, start, &priv);
-		BUG_ON(ret);
-		extent_op = (struct pending_extent_op *)(unsigned long) priv;
-
-		if (extent_op->type == PENDING_EXTENT_INSERT) {
-			num_inserts++;
-			list_add_tail(&extent_op->list, &insert_list);
-			search = end + 1;
-			if (num_inserts == max_inserts) {
-				mutex_unlock(&info->extent_ins_mutex);
-				break;
-			}
-		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
-			list_add_tail(&extent_op->list, &update_list);
-			search = end + 1;
-		} else {
-			BUG();
-		}
-	}
-
-	/*
-	 * process the update list, clear the writeback bit for it, and if
-	 * somebody marked this thing for deletion then just unlock it and be
-	 * done, the free_extents will handle it
-	 */
-	mutex_lock(&info->extent_ins_mutex);
-	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
-		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-				  extent_op->bytenr + extent_op->num_bytes - 1,
-				  EXTENT_WRITEBACK, GFP_NOFS);
-		if (extent_op->del) {
-			list_del_init(&extent_op->list);
-			unlock_extent(&info->extent_ins, extent_op->bytenr,
-				      extent_op->bytenr + extent_op->num_bytes
-				      - 1, GFP_NOFS);
-			kfree(extent_op);
-		}
-	}
-	mutex_unlock(&info->extent_ins_mutex);
-
-	/*
-	 * still have things left on the update list, go ahead an update
-	 * everything
-	 */
-	if (!list_empty(&update_list)) {
-		ret = update_backrefs(trans, extent_root, path, &update_list);
-		BUG_ON(ret);
-	}
-
-	/*
-	 * if no inserts need to be done, but we skipped some extents and we
-	 * need to make sure everything is cleaned then reset everything and
-	 * go back to the beginning
-	 */
-	if (!num_inserts && all && skipped) {
-		search = 0;
-		skipped = 0;
-		INIT_LIST_HEAD(&update_list);
-		INIT_LIST_HEAD(&insert_list);
-		goto again;
-	} else if (!num_inserts) {
-		goto out;
-	}
-
-	/*
-	 * process the insert extents list.  Again if we are deleting this
-	 * extent, then just unlock it, pin down the bytes if need be, and be
-	 * done with it.  Saves us from having to actually insert the extent
-	 * into the tree and then subsequently come along and delete it
-	 */
-	mutex_lock(&info->extent_ins_mutex);
-	list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
-		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-				  extent_op->bytenr + extent_op->num_bytes - 1,
-				  EXTENT_WRITEBACK, GFP_NOFS);
-		if (extent_op->del) {
-			u64 used;
-			list_del_init(&extent_op->list);
-			unlock_extent(&info->extent_ins, extent_op->bytenr,
-				      extent_op->bytenr + extent_op->num_bytes
-				      - 1, GFP_NOFS);
-
-			mutex_lock(&extent_root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root,
-					     extent_op->bytenr,
-					     extent_op->num_bytes, 0);
-			mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-			spin_lock(&info->delalloc_lock);
-			used = btrfs_super_bytes_used(&info->super_copy);
-			btrfs_set_super_bytes_used(&info->super_copy,
-					used - extent_op->num_bytes);
-			used = btrfs_root_used(&extent_root->root_item);
-			btrfs_set_root_used(&extent_root->root_item,
-					used - extent_op->num_bytes);
-			spin_unlock(&info->delalloc_lock);
-
-			ret = update_block_group(trans, extent_root,
-						 extent_op->bytenr,
-						 extent_op->num_bytes,
-						 0, ret > 0);
-			BUG_ON(ret);
-			kfree(extent_op);
-			num_inserts--;
-		}
-	}
-	mutex_unlock(&info->extent_ins_mutex);
-
-	ret = insert_extents(trans, extent_root, path, &insert_list,
-			     num_inserts);
-	BUG_ON(ret);
-
-	/*
-	 * if we broke out of the loop in order to insert stuff because we hit
-	 * the maximum number of inserts at a time we can handle, then loop
-	 * back and pick up where we left off
-	 */
-	if (num_inserts == max_inserts) {
-		INIT_LIST_HEAD(&insert_list);
-		INIT_LIST_HEAD(&update_list);
-		num_inserts = 0;
-		goto again;
-	}
-
-	/*
-	 * again, if we need to make absolutely sure there are no more pending
-	 * extent operations left and we know that we skipped some, go back to
-	 * the beginning and do it all again
-	 */
-	if (all && skipped) {
-		INIT_LIST_HEAD(&insert_list);
-		INIT_LIST_HEAD(&update_list);
-		search = 0;
-		skipped = 0;
-		num_inserts = 0;
-		goto again;
-	}
-out:
-	btrfs_free_path(path);
-	return 0;
-}
-
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
-			  u64 bytenr, u64 num_bytes, int is_data)
+			  struct btrfs_path *path,
+			  u64 bytenr, u64 num_bytes, int is_data,
+			  struct extent_buffer **must_clean)
 {
 	int err = 0;
 	struct extent_buffer *buf;
@@ -2422,17 +2187,18 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 		u64 header_transid = btrfs_header_generation(buf);
 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
 		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+		    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
 		    header_transid == trans->transid &&
 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-			clean_tree_block(NULL, root, buf);
-			btrfs_tree_unlock(buf);
-			free_extent_buffer(buf);
+			*must_clean = buf;
 			return 1;
 		}
 		btrfs_tree_unlock(buf);
 	}
 	free_extent_buffer(buf);
 pinit:
+	btrfs_set_path_blocking(path);
+	/* unlocks the pinned mutex */
 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
 	BUG_ON(err < 0);
@@ -2446,7 +2212,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, int pin, int mark_free)
+			 u64 owner_objectid, int pin, int mark_free,
+			 int refs_to_drop)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -2468,6 +2235,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->reada = 1;
+	path->leave_spinning = 1;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, parent, root_objectid,
 				    ref_generation, owner_objectid, 1);
@@ -2489,9 +2257,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 				break;
 		}
 		if (!found_extent) {
-			ret = remove_extent_backref(trans, extent_root, path);
+			ret = remove_extent_backref(trans, extent_root, path,
+						    refs_to_drop);
 			BUG_ON(ret);
 			btrfs_release_path(extent_root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
 			if (ret) {
@@ -2507,8 +2277,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-		       "root %llu gen %llu owner %llu\n",
+		       "parent %llu root %llu gen %llu owner %llu\n",
 		       (unsigned long long)bytenr,
+		       (unsigned long long)parent,
 		       (unsigned long long)root_objectid,
 		       (unsigned long long)ref_generation,
 		       (unsigned long long)owner_objectid);
@@ -2518,17 +2289,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs == 0);
-	refs -= 1;
-	btrfs_set_extent_refs(leaf, ei, refs);
 
+	/*
+	 * we're not allowed to delete the extent item if there
+	 * are other delayed ref updates pending
+	 */
+
+	BUG_ON(refs < refs_to_drop);
+	refs -= refs_to_drop;
+	btrfs_set_extent_refs(leaf, ei, refs);
 	btrfs_mark_buffer_dirty(leaf);
 
-	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+	if (refs == 0 && found_extent &&
+	    path->slots[0] == extent_slot + 1) {
 		struct btrfs_extent_ref *ref;
 		ref = btrfs_item_ptr(leaf, path->slots[0],
 				     struct btrfs_extent_ref);
-		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+		BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
 		/* if the back ref and the extent are next to each other
 		 * they get deleted below in one shot
 		 */
@@ -2536,11 +2313,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		num_to_del = 2;
 	} else if (found_extent) {
 		/* otherwise delete the extent back ref */
-		ret = remove_extent_backref(trans, extent_root, path);
+		ret = remove_extent_backref(trans, extent_root, path,
+					    refs_to_drop);
 		BUG_ON(ret);
 		/* if refs are 0, we need to setup the path for deletion */
 		if (refs == 0) {
 			btrfs_release_path(extent_root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_search_slot(trans, extent_root, &key, path,
 						-1, 1);
 			BUG_ON(ret);
@@ -2550,16 +2329,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	if (refs == 0) {
 		u64 super_used;
 		u64 root_used;
+		struct extent_buffer *must_clean = NULL;
 
 		if (pin) {
-			mutex_lock(&root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
-				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
-			mutex_unlock(&root->fs_info->pinned_mutex);
+			ret = pin_down_bytes(trans, root, path,
+				bytenr, num_bytes,
+				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+				&must_clean);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
 		}
+
 		/* block accounting for super block */
 		spin_lock(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2571,14 +2352,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_root_used(&root->root_item,
 					   root_used - num_bytes);
 		spin_unlock(&info->delalloc_lock);
+
+		/*
+		 * it is going to be very rare for someone to be waiting
+		 * on the block we're freeing.  del_items might need to
+		 * schedule, so rather than get fancy, just force it
+		 * to blocking here
+		 */
+		if (must_clean)
+			btrfs_set_lock_blocking(must_clean);
+
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
 		btrfs_release_path(extent_root, path);
 
+		if (must_clean) {
+			clean_tree_block(NULL, root, must_clean);
+			btrfs_tree_unlock(must_clean);
+			free_extent_buffer(must_clean);
+		}
+
 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
+		} else {
+			invalidate_mapping_pages(info->btree_inode->i_mapping,
+			     bytenr >> PAGE_CACHE_SHIFT,
+			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
 		}
 
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2586,216 +2387,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root, 0);
 	return ret;
 }
 
 /*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
+ * remove an extent from the root, returns 0 on success
  */
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root, int all)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					u64 bytenr, u64 num_bytes, u64 parent,
+					u64 root_objectid, u64 ref_generation,
+					u64 owner_objectid, int pin,
+					int refs_to_drop)
 {
-	int ret;
-	int err = 0;
-	u64 start;
-	u64 end;
-	u64 priv;
-	u64 search = 0;
-	int nr = 0, skipped = 0;
-	struct extent_io_tree *pending_del;
-	struct extent_io_tree *extent_ins;
-	struct pending_extent_op *extent_op;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct list_head delete_list;
-
-	INIT_LIST_HEAD(&delete_list);
-	extent_ins = &extent_root->fs_info->extent_ins;
-	pending_del = &extent_root->fs_info->pending_del;
-
-again:
-	mutex_lock(&info->extent_ins_mutex);
-	while (1) {
-		ret = find_first_extent_bit(pending_del, search, &start, &end,
-					    EXTENT_WRITEBACK);
-		if (ret) {
-			if (all && skipped && !nr) {
-				search = 0;
-				skipped = 0;
-				continue;
-			}
-			mutex_unlock(&info->extent_ins_mutex);
-			break;
-		}
-
-		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
-		if (!ret) {
-			search = end+1;
-			skipped = 1;
-
-			if (need_resched()) {
-				mutex_unlock(&info->extent_ins_mutex);
-				cond_resched();
-				mutex_lock(&info->extent_ins_mutex);
-			}
-
-			continue;
-		}
-		BUG_ON(ret < 0);
-
-		ret = get_state_private(pending_del, start, &priv);
-		BUG_ON(ret);
-		extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
-				  GFP_NOFS);
-		if (!test_range_bit(extent_ins, start, end,
-				    EXTENT_WRITEBACK, 0)) {
-			list_add_tail(&extent_op->list, &delete_list);
-			nr++;
-		} else {
-			kfree(extent_op);
-
-			ret = get_state_private(&info->extent_ins, start,
-						&priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-						(unsigned long)priv;
-
-			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_WRITEBACK, GFP_NOFS);
-
-			if (extent_op->type == PENDING_BACKREF_UPDATE) {
-				list_add_tail(&extent_op->list, &delete_list);
-				search = end + 1;
-				nr++;
-				continue;
-			}
-
-			mutex_lock(&extent_root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root, start,
-					     end + 1 - start, 0);
-			mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-			ret = update_block_group(trans, extent_root, start,
-						end + 1 - start, 0, ret > 0);
-
-			unlock_extent(extent_ins, start, end, GFP_NOFS);
-			BUG_ON(ret);
-			kfree(extent_op);
-		}
-		if (ret)
-			err = ret;
-
-		search = end + 1;
-
-		if (need_resched()) {
-			mutex_unlock(&info->extent_ins_mutex);
-			cond_resched();
-			mutex_lock(&info->extent_ins_mutex);
-		}
-	}
+	WARN_ON(num_bytes < root->sectorsize);
 
-	if (nr) {
-		ret = free_extents(trans, extent_root, &delete_list);
-		BUG_ON(ret);
-	}
+	/*
+	 * if metadata always pin
+	 * if data pin when any transaction has committed this
+	 */
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+	    ref_generation != trans->transid)
+		pin = 1;
 
-	if (all && skipped) {
-		INIT_LIST_HEAD(&delete_list);
-		search = 0;
-		nr = 0;
-		goto again;
-	}
+	if (ref_generation != trans->transid)
+		pin = 1;
 
-	return err;
+	return __free_extent(trans, root, bytenr, num_bytes, parent,
+			    root_objectid, ref_generation,
+			    owner_objectid, pin, pin == 0, refs_to_drop);
 }
 
 /*
- * remove an extent from the root, returns 0 on success
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
  */
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 root_objectid, u64 ref_generation,
-			       u64 owner_objectid, int pin)
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root, u64 bytenr)
 {
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	int pending_ret;
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct rb_node *node;
 	int ret;
 
-	WARN_ON(num_bytes < root->sectorsize);
-	if (root == extent_root) {
-		struct pending_extent_op *extent_op = NULL;
-
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-			u64 priv;
-			ret = get_state_private(&root->fs_info->extent_ins,
-						bytenr, &priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-						(unsigned long)priv;
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (!head)
+		goto out;
 
-			extent_op->del = 1;
-			if (extent_op->type == PENDING_EXTENT_INSERT) {
-				mutex_unlock(&root->fs_info->extent_ins_mutex);
-				return 0;
-			}
-		}
+	node = rb_prev(&head->node.rb_node);
+	if (!node)
+		goto out;
 
-		if (extent_op) {
-			ref_generation = extent_op->orig_generation;
-			parent = extent_op->orig_parent;
-		}
+	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
-
-		extent_op->type = PENDING_EXTENT_DELETE;
-		extent_op->bytenr = bytenr;
-		extent_op->num_bytes = num_bytes;
-		extent_op->parent = parent;
-		extent_op->orig_parent = parent;
-		extent_op->generation = ref_generation;
-		extent_op->orig_generation = ref_generation;
-		extent_op->level = (int)owner_objectid;
-		INIT_LIST_HEAD(&extent_op->list);
-		extent_op->del = 0;
-
-		set_extent_bits(&root->fs_info->pending_del,
-				bytenr, bytenr + num_bytes - 1,
-				EXTENT_WRITEBACK, GFP_NOFS);
-		set_state_private(&root->fs_info->pending_del,
-				  bytenr, (unsigned long)extent_op);
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		return 0;
-	}
-	/* if metadata always pin */
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-			mutex_lock(&root->fs_info->pinned_mutex);
-			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-			mutex_unlock(&root->fs_info->pinned_mutex);
-			update_reserved_extents(root, bytenr, num_bytes, 0);
-			return 0;
-		}
-		pin = 1;
-	}
+	/* there are still entries for this ref, we can't drop it */
+	if (ref->bytenr == bytenr)
+		goto out;
 
-	/* if data pin when any transaction has committed this */
-	if (ref_generation != trans->transid)
-		pin = 1;
+	/*
+	 * waiting for the lock here would deadlock.  If someone else has it
+	 * locked they are already in the process of dropping it anyway
+	 */
+	if (!mutex_trylock(&head->mutex))
+		goto out;
 
-	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
-			    root_objectid, ref_generation,
-			    owner_objectid, pin, pin == 0);
+	/*
+	 * at this point we have a head with no other entries.  Go
+	 * ahead and process it.
+	 */
+	head->node.in_tree = 0;
+	rb_erase(&head->node.rb_node, &delayed_refs->root);
 
-	finish_current_insert(trans, root->fs_info->extent_root, 0);
-	pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
-	return ret ? ret : pending_ret;
+	delayed_refs->num_entries--;
+
+	/*
+	 * we don't take a ref on the node because we're removing it from the
+	 * tree, so we just steal the ref the tree was holding.
+	 */
+	delayed_refs->num_heads--;
+	if (list_empty(&head->cluster))
+		delayed_refs->num_heads_ready--;
+
+	list_del_init(&head->cluster);
+	spin_unlock(&delayed_refs->lock);
+
+	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+				  &head->node, head->must_insert_reserved);
+	BUG_ON(ret);
+	btrfs_put_delayed_ref(&head->node);
+	return 0;
+out:
+	spin_unlock(&delayed_refs->lock);
+	return 0;
 }
 
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -2806,9 +2494,28 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
-				  root_objectid, ref_generation,
-				  owner_objectid, pin);
+	/*
+	 * tree log blocks never actually go into the extent allocation
+	 * tree, just update pinning info and exit early.
+	 *
+	 * data extents referenced by the tree log do need to have
+	 * their reference counts bumped.
+	 */
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		/* unlocks the pinned mutex */
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+		update_reserved_extents(root, bytenr, num_bytes, 0);
+		ret = 0;
+	} else {
+		ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+				       root_objectid, ref_generation,
+				       owner_objectid,
+				       BTRFS_DROP_DELAYED_REF, 1);
+		BUG_ON(ret);
+		ret = check_ref_cleanup(trans, root, bytenr);
+		BUG_ON(ret);
+	}
 	return ret;
 }
 
@@ -2837,227 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
-	u64 total_needed = num_bytes;
-	u64 *last_ptr = NULL;
-	u64 last_wanted = 0;
+	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
-	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
-	struct list_head *head = NULL, *cur = NULL;
-	int loop = 0;
-	int extra_loop = 0;
 	struct btrfs_space_info *space_info;
+	int last_ptr_loop = 0;
+	int loop = 0;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 	ins->objectid = 0;
 	ins->offset = 0;
 
+	space_info = __find_space_info(root->fs_info, data);
+
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
-		last_ptr = &root->fs_info->last_alloc;
-		empty_cluster = 64 * 1024;
+		last_ptr = &root->fs_info->meta_alloc_cluster;
+		if (!btrfs_test_opt(root, SSD))
+			empty_cluster = 64 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-		last_ptr = &root->fs_info->last_data_alloc;
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+		last_ptr = &root->fs_info->data_alloc_cluster;
+	}
 
 	if (last_ptr) {
-		if (*last_ptr) {
-			hint_byte = *last_ptr;
-			last_wanted = *last_ptr;
-		} else
-			empty_size += empty_cluster;
-	} else {
-		empty_cluster = 0;
+		spin_lock(&last_ptr->lock);
+		if (last_ptr->block_group)
+			hint_byte = last_ptr->window_start;
+		spin_unlock(&last_ptr->lock);
 	}
+
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
 
-	if (last_wanted && search_start != last_wanted) {
-		last_wanted = 0;
-		empty_size += empty_cluster;
+	if (!last_ptr) {
+		empty_cluster = 0;
+		loop = 1;
 	}
 
-	total_needed += empty_size;
-	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
-	if (!block_group)
-		block_group = btrfs_lookup_first_block_group(root->fs_info,
-							     search_start);
-	space_info = __find_space_info(root->fs_info, data);
+	if (search_start == hint_byte) {
+		block_group = btrfs_lookup_block_group(root->fs_info,
+						       search_start);
+		if (block_group && block_group_bits(block_group, data)) {
+			down_read(&space_info->groups_sem);
+			goto have_block_group;
+		} else if (block_group) {
+			btrfs_put_block_group(block_group);
+		}
+	}
 
+search:
 	down_read(&space_info->groups_sem);
-	while (1) {
-		struct btrfs_free_space *free_space;
-		/*
-		 * the only way this happens if our hint points to a block
-		 * group thats not of the proper type, while looping this
-		 * should never happen
-		 */
-		if (empty_size)
-			extra_loop = 1;
+	list_for_each_entry(block_group, &space_info->block_groups, list) {
+		u64 offset;
 
-		if (!block_group)
-			goto new_group_no_lock;
+		atomic_inc(&block_group->count);
+		search_start = block_group->key.objectid;
 
+have_block_group:
 		if (unlikely(!block_group->cached)) {
 			mutex_lock(&block_group->cache_mutex);
 			ret = cache_block_group(root, block_group);
 			mutex_unlock(&block_group->cache_mutex);
-			if (ret)
+			if (ret) {
+				btrfs_put_block_group(block_group);
 				break;
+			}
 		}
 
-		mutex_lock(&block_group->alloc_mutex);
-		if (unlikely(!block_group_bits(block_group, data)))
-			goto new_group;
-
 		if (unlikely(block_group->ro))
-			goto new_group;
+			goto loop;
 
-		free_space = btrfs_find_free_space(block_group, search_start,
-						   total_needed);
-		if (free_space) {
-			u64 start = block_group->key.objectid;
-			u64 end = block_group->key.objectid +
-				block_group->key.offset;
+		if (last_ptr) {
+			/*
+			 * the refill lock keeps out other
+			 * people trying to start a new cluster
+			 */
+			spin_lock(&last_ptr->refill_lock);
+			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
+						 num_bytes, search_start);
+			if (offset) {
+				/* we have a block, we're done */
+				spin_unlock(&last_ptr->refill_lock);
+				goto checks;
+			}
 
-			search_start = stripe_align(root, free_space->offset);
+			spin_lock(&last_ptr->lock);
+			/*
+			 * whoops, this cluster doesn't actually point to
+			 * this block group.  Get a ref on the block
+			 * group is does point to and try again
+			 */
+			if (!last_ptr_loop && last_ptr->block_group &&
+			    last_ptr->block_group != block_group) {
+
+				btrfs_put_block_group(block_group);
+				block_group = last_ptr->block_group;
+				atomic_inc(&block_group->count);
+				spin_unlock(&last_ptr->lock);
+				spin_unlock(&last_ptr->refill_lock);
+
+				last_ptr_loop = 1;
+				search_start = block_group->key.objectid;
+				goto have_block_group;
+			}
+			spin_unlock(&last_ptr->lock);
 
-			/* move on to the next group */
-			if (search_start + num_bytes >= search_end)
-				goto new_group;
+			/*
+			 * this cluster didn't work out, free it and
+			 * start over
+			 */
+			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
-			/* move on to the next group */
-			if (search_start + num_bytes > end)
-				goto new_group;
+			last_ptr_loop = 0;
 
-			if (last_wanted && search_start != last_wanted) {
-				total_needed += empty_cluster;
-				empty_size += empty_cluster;
-				last_wanted = 0;
+			/* allocate a cluster in this block group */
+			ret = btrfs_find_space_cluster(trans,
+					       block_group, last_ptr,
+					       offset, num_bytes,
+					       empty_cluster + empty_size);
+			if (ret == 0) {
 				/*
-				 * if search_start is still in this block group
-				 * then we just re-search this block group
+				 * now pull our allocation out of this
+				 * cluster
 				 */
-				if (search_start >= start &&
-				    search_start < end) {
-					mutex_unlock(&block_group->alloc_mutex);
-					continue;
+				offset = btrfs_alloc_from_cluster(block_group,
+						  last_ptr, num_bytes,
+						  search_start);
+				if (offset) {
+					/* we found one, proceed */
+					spin_unlock(&last_ptr->refill_lock);
+					goto checks;
 				}
-
-				/* else we go to the next block group */
-				goto new_group;
 			}
-
-			if (exclude_nr > 0 &&
-			    (search_start + num_bytes > exclude_start &&
-			     search_start < exclude_start + exclude_nr)) {
-				search_start = exclude_start + exclude_nr;
-				/*
-				 * if search_start is still in this block group
-				 * then we just re-search this block group
-				 */
-				if (search_start >= start &&
-				    search_start < end) {
-					mutex_unlock(&block_group->alloc_mutex);
-					last_wanted = 0;
-					continue;
-				}
-
-				/* else we go to the next block group */
-				goto new_group;
+			/*
+			 * at this point we either didn't find a cluster
+			 * or we weren't able to allocate a block from our
+			 * cluster.  Free the cluster we've been trying
+			 * to use, and go to the next block group
+			 */
+			if (loop < 2) {
+				btrfs_return_cluster_to_free_space(NULL,
+								   last_ptr);
+				spin_unlock(&last_ptr->refill_lock);
+				goto loop;
 			}
+			spin_unlock(&last_ptr->refill_lock);
+		}
 
-			ins->objectid = search_start;
-			ins->offset = num_bytes;
+		offset = btrfs_find_space_for_alloc(block_group, search_start,
+						    num_bytes, empty_size);
+		if (!offset)
+			goto loop;
+checks:
+		search_start = stripe_align(root, offset);
 
-			btrfs_remove_free_space_lock(block_group, search_start,
-						     num_bytes);
-			/* we are all good, lets return */
-			mutex_unlock(&block_group->alloc_mutex);
-			break;
+		/* move on to the next group */
+		if (search_start + num_bytes >= search_end) {
+			btrfs_add_free_space(block_group, offset, num_bytes);
+			goto loop;
 		}
-new_group:
-		mutex_unlock(&block_group->alloc_mutex);
-		put_block_group(block_group);
-		block_group = NULL;
-new_group_no_lock:
-		/* don't try to compare new allocations against the
-		 * last allocation any more
-		 */
-		last_wanted = 0;
 
-		/*
-		 * Here's how this works.
-		 * loop == 0: we were searching a block group via a hint
-		 *		and didn't find anything, so we start at
-		 *		the head of the block groups and keep searching
-		 * loop == 1: we're searching through all of the block groups
-		 *		if we hit the head again we have searched
-		 *		all of the block groups for this space and we
-		 *		need to try and allocate, if we cant error out.
-		 * loop == 2: we allocated more space and are looping through
-		 *		all of the block groups again.
-		 */
-		if (loop == 0) {
-			head = &space_info->block_groups;
-			cur = head->next;
-			loop++;
-		} else if (loop == 1 && cur == head) {
-			int keep_going;
-
-			/* at this point we give up on the empty_size
-			 * allocations and just try to allocate the min
-			 * space.
-			 *
-			 * The extra_loop field was set if an empty_size
-			 * allocation was attempted above, and if this
-			 * is try we need to try the loop again without
-			 * the additional empty_size.
+		/* move on to the next group */
+		if (search_start + num_bytes >
+		    block_group->key.objectid + block_group->key.offset) {
+			btrfs_add_free_space(block_group, offset, num_bytes);
+			goto loop;
+		}
+
+		if (exclude_nr > 0 &&
+		    (search_start + num_bytes > exclude_start &&
+		     search_start < exclude_start + exclude_nr)) {
+			search_start = exclude_start + exclude_nr;
+
+			btrfs_add_free_space(block_group, offset, num_bytes);
+			/*
+			 * if search_start is still in this block group
+			 * then we just re-search this block group
 			 */
-			total_needed -= empty_size;
-			empty_size = 0;
-			keep_going = extra_loop;
-			loop++;
+			if (search_start >= block_group->key.objectid &&
+			    search_start < (block_group->key.objectid +
+					    block_group->key.offset))
+				goto have_block_group;
+			goto loop;
+		}
 
-			if (allowed_chunk_alloc && !chunk_alloc_done) {
-				up_read(&space_info->groups_sem);
-				ret = do_chunk_alloc(trans, root, num_bytes +
-						     2 * 1024 * 1024, data, 1);
-				down_read(&space_info->groups_sem);
-				if (ret < 0)
-					goto loop_check;
-				head = &space_info->block_groups;
-				/*
-				 * we've allocated a new chunk, keep
-				 * trying
-				 */
-				keep_going = 1;
-				chunk_alloc_done = 1;
-			} else if (!allowed_chunk_alloc) {
-				space_info->force_alloc = 1;
-			}
-loop_check:
-			if (keep_going) {
-				cur = head->next;
-				extra_loop = 0;
-			} else {
-				break;
-			}
-		} else if (cur == head) {
-			break;
+		ins->objectid = search_start;
+		ins->offset = num_bytes;
+
+		if (offset < search_start)
+			btrfs_add_free_space(block_group, offset,
+					     search_start - offset);
+		BUG_ON(offset > search_start);
+
+		/* we are all good, lets return */
+		break;
+loop:
+		btrfs_put_block_group(block_group);
+	}
+	up_read(&space_info->groups_sem);
+
+	/* loop == 0, try to find a clustered alloc in every block group
+	 * loop == 1, try again after forcing a chunk allocation
+	 * loop == 2, set empty_size and empty_cluster to 0 and try again
+	 */
+	if (!ins->objectid && loop < 3 &&
+	    (empty_size || empty_cluster || allowed_chunk_alloc)) {
+		if (loop >= 2) {
+			empty_size = 0;
+			empty_cluster = 0;
 		}
 
-		block_group = list_entry(cur, struct btrfs_block_group_cache,
-					 list);
-		atomic_inc(&block_group->count);
+		if (allowed_chunk_alloc) {
+			ret = do_chunk_alloc(trans, root, num_bytes +
+					     2 * 1024 * 1024, data, 1);
+			allowed_chunk_alloc = 0;
+		} else {
+			space_info->force_alloc = 1;
+		}
 
-		search_start = block_group->key.objectid;
-		cur = cur->next;
+		if (loop < 3) {
+			loop++;
+			goto search;
+		}
+		ret = -ENOSPC;
+	} else if (!ins->objectid) {
+		ret = -ENOSPC;
 	}
 
 	/* we found what we needed */
@@ -3065,21 +2782,10 @@ loop_check:
 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
 			trans->block_group = block_group->key.objectid;
 
-		if (last_ptr)
-			*last_ptr = ins->objectid + ins->offset;
+		btrfs_put_block_group(block_group);
 		ret = 0;
-	} else if (!ret) {
-		printk(KERN_ERR "btrfs searching for %llu bytes, "
-		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
-		       (unsigned long long)total_needed,
-		       (unsigned long long)num_bytes,
-		       loop, allowed_chunk_alloc);
-		ret = -ENOSPC;
 	}
-	if (block_group)
-		put_block_group(block_group);
 
-	up_read(&space_info->groups_sem);
 	return ret;
 }
 
@@ -3091,6 +2797,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
 				    info->bytes_pinned - info->bytes_reserved),
 	       (info->full) ? "" : "not ");
+	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+	       " may_use=%llu, used=%llu\n", info->total_bytes,
+	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
+	       info->bytes_used);
 
 	down_read(&info->groups_sem);
 	list_for_each_entry(cache, &info->block_groups, list) {
@@ -3117,24 +2827,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 search_start = 0;
-	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
 
-	if (data) {
-		alloc_profile = info->avail_data_alloc_bits &
-			info->data_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-	} else if (root == root->fs_info->chunk_root) {
-		alloc_profile = info->avail_system_alloc_bits &
-			info->system_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-	} else {
-		alloc_profile = info->avail_metadata_alloc_bits &
-			info->metadata_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-	}
+	data = btrfs_get_alloc_profile(root, data);
 again:
-	data = btrfs_reduce_alloc_profile(root, data);
 	/*
 	 * the only place that sets empty_size is btrfs_realloc_node, which
 	 * is not called recursively on allocations
@@ -3194,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 	ret = btrfs_discard_extent(root, start, len);
 
 	btrfs_add_free_space(cache, start, len);
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 	update_reserved_extents(root, start, len, 0);
 
 	return ret;
@@ -3218,10 +2914,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root, u64 parent,
 					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, struct btrfs_key *ins)
+					 u64 owner, struct btrfs_key *ins,
+					 int ref_mod)
 {
 	int ret;
-	int pending_ret;
 	u64 super_used;
 	u64 root_used;
 	u64 num_bytes = ins->offset;
@@ -3246,33 +2942,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 	spin_unlock(&info->delalloc_lock);
 
-	if (root == extent_root) {
-		struct pending_extent_op *extent_op;
-
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
-
-		extent_op->type = PENDING_EXTENT_INSERT;
-		extent_op->bytenr = ins->objectid;
-		extent_op->num_bytes = ins->offset;
-		extent_op->parent = parent;
-		extent_op->orig_parent = 0;
-		extent_op->generation = ref_generation;
-		extent_op->orig_generation = 0;
-		extent_op->level = (int)owner;
-		INIT_LIST_HEAD(&extent_op->list);
-		extent_op->del = 0;
-
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
-				ins->objectid + ins->offset - 1,
-				EXTENT_WRITEBACK, GFP_NOFS);
-		set_state_private(&root->fs_info->extent_ins,
-				  ins->objectid, (unsigned long)extent_op);
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		goto update_block;
-	}
-
 	memcpy(&keys[0], ins, sizeof(*ins));
 	keys[1].objectid = ins->objectid;
 	keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3283,37 +2952,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
 				       sizes, 2);
 	BUG_ON(ret);
 
 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_extent_item);
-	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+	btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_extent_ref);
 
 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+	btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	trans->alloc_exclude_start = 0;
 	trans->alloc_exclude_nr = 0;
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root, 0);
-	pending_ret = del_pending_extents(trans, extent_root, 0);
 
 	if (ret)
 		goto out;
-	if (pending_ret) {
-		ret = pending_ret;
-		goto out;
-	}
 
-update_block:
 	ret = update_block_group(trans, root, ins->objectid,
 				 ins->offset, 1, 0);
 	if (ret) {
@@ -3335,9 +2998,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins);
-	update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+	ret = btrfs_add_delayed_ref(trans, ins->objectid,
+				    ins->offset, parent, root_objectid,
+				    ref_generation, owner,
+				    BTRFS_ADD_DELAYED_EXTENT, 0);
+	BUG_ON(ret);
 	return ret;
 }
 
@@ -3362,9 +3028,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_remove_free_space(block_group, ins->objectid,
 				      ins->offset);
 	BUG_ON(ret);
-	put_block_group(block_group);
+	btrfs_put_block_group(block_group);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins);
+					    ref_generation, owner, ins, 1);
 	return ret;
 }
 
@@ -3383,26 +3049,25 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
 	int ret;
-
 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
 	BUG_ON(ret);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-					root_objectid, ref_generation,
-					owner_objectid, ins);
+		ret = btrfs_add_delayed_ref(trans, ins->objectid,
+					    ins->offset, parent, root_objectid,
+					    ref_generation, owner_objectid,
+					    BTRFS_ADD_DELAYED_EXTENT, 0);
 		BUG_ON(ret);
-
-	} else {
-		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	}
+	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	return ret;
 }
 
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize)
+					    u64 bytenr, u32 blocksize,
+					    int level)
 {
 	struct extent_buffer *buf;
 
@@ -3410,6 +3075,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_set_buffer_lockdep_class(buf, level);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
 
@@ -3453,7 +3119,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	}
 
-	buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+				    blocksize, level);
 	return buf;
 }
 
@@ -3529,7 +3196,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 
 		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 
-		ret = __btrfs_free_extent(trans, root, disk_bytenr,
+		ret = btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, 0);
@@ -3569,7 +3236,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	 */
 	for (i = 0; i < ref->nritems; i++) {
 		info = ref->extents + sorted[i].slot;
-		ret = __btrfs_free_extent(trans, root, info->bytenr,
+		ret = btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
 					  info->objectid, 0);
@@ -3586,12 +3253,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, u64 start,
 				     u64 len, u32 *refs)
 {
 	int ret;
 
-	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+	ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
 	BUG_ON(ret);
 
 #if 0 /* some debugging code in case we see problems here */
@@ -3699,7 +3367,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
 		 * we just decrement it below and don't update any
 		 * of the refs the leaf points to.
 		 */
-		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		ret = drop_snap_lookup_refcount(trans, root, bytenr,
+						blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1)
 			continue;
@@ -3750,7 +3419,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
 	 */
 	for (i = 0; i < refi; i++) {
 		bytenr = sorted[i].bytenr;
-		ret = __btrfs_free_extent(trans, root, bytenr,
+		ret = btrfs_free_extent(trans, root, bytenr,
 					blocksize, eb->start,
 					root_owner, root_gen, 0, 1);
 		BUG_ON(ret);
@@ -3793,7 +3462,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+	ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
 				path->nodes[*level]->len, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
@@ -3844,7 +3513,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
-		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		ret = drop_snap_lookup_refcount(trans, root, bytenr,
+						blocksize, &refs);
 		BUG_ON(ret);
 
 		/*
@@ -3859,7 +3529,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
 
-			ret = __btrfs_free_extent(trans, root, bytenr,
+			ret = btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
 						root_owner, root_gen,
 						*level - 1, 1);
@@ -3905,7 +3575,7 @@ out:
 	 * cleanup and free the reference on the last node
 	 * we processed
 	 */
-	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
 	free_extent_buffer(path->nodes[*level]);
@@ -4094,6 +3764,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_path *path;
 	int i;
 	int orig_level;
+	int update_count;
 	struct btrfs_root_item *root_item = &root->root_item;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4135,6 +3806,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 	}
 	while (1) {
+		unsigned long update;
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
@@ -4147,12 +3819,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
-		if (trans->transaction->in_commit) {
+		if (trans->transaction->in_commit ||
+		    trans->transaction->delayed_refs.flushing) {
 			ret = -EAGAIN;
 			break;
 		}
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
+		for (update_count = 0; update_count < 16; update_count++) {
+			update = trans->delayed_ref_updates;
+			trans->delayed_ref_updates = 0;
+			if (update)
+				btrfs_run_delayed_refs(trans, root, update);
+			else
+				break;
+		}
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
@@ -4179,13 +3860,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	BUG_ON(!btrfs_tree_locked(parent));
+	btrfs_assert_tree_locked(parent);
 	parent_level = btrfs_header_level(parent);
 	extent_buffer_get(parent);
 	path->nodes[parent_level] = parent;
 	path->slots[parent_level] = btrfs_header_nritems(parent);
 
-	BUG_ON(!btrfs_tree_locked(node));
+	btrfs_assert_tree_locked(node);
 	level = btrfs_header_level(node);
 	extent_buffer_get(node);
 	path->nodes[level] = node;
@@ -5197,6 +4878,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					root->root_key.objectid,
 					trans->transid, key.objectid);
 		BUG_ON(ret);
+
 		ret = btrfs_free_extent(trans, root,
 					bytenr, num_bytes, leaf->start,
 					btrfs_header_owner(leaf),
@@ -5508,9 +5190,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
 				ref_path, NULL, NULL);
 	BUG_ON(ret);
 
-	if (root == root->fs_info->extent_root)
-		btrfs_extent_post_op(trans, root);
-
 	return 0;
 }
 
@@ -5641,7 +5320,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 			prev_block = block_start;
 		}
 
+		mutex_lock(&extent_root->fs_info->trans_mutex);
 		btrfs_record_root_in_trans(found_root);
+		mutex_unlock(&extent_root->fs_info->trans_mutex);
 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			/*
 			 * try to update data extent references while
@@ -5776,6 +5457,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
 	if (ret)
 		goto out;
@@ -5946,6 +5628,9 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
+	trans = btrfs_start_transaction(info->tree_root, 1);
+	btrfs_commit_transaction(trans, info->tree_root);
+
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -6032,7 +5717,7 @@ next:
 	WARN_ON(block_group->reserved > 0);
 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
 	spin_unlock(&block_group->lock);
-	put_block_group(block_group);
+	btrfs_put_block_group(block_group);
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -6079,6 +5764,7 @@ out:
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
 	struct rb_node *n;
 
 	spin_lock(&info->block_group_cache_lock);
@@ -6100,6 +5786,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		spin_lock(&info->block_group_cache_lock);
 	}
 	spin_unlock(&info->block_group_cache_lock);
+
+	/* now that all the block groups are freed, go through and
+	 * free all the space_info structs.  This is only called during
+	 * the final stages of unmount, and so we know nobody is
+	 * using them.  We call synchronize_rcu() once before we start,
+	 * just to be on the safe side.
+	 */
+	synchronize_rcu();
+
+	while(!list_empty(&info->space_info)) {
+		space_info = list_entry(info->space_info.next,
+					struct btrfs_space_info,
+					list);
+
+		list_del(&space_info->list);
+		kfree(space_info);
+	}
 	return 0;
 }
 
@@ -6141,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		atomic_set(&cache->count, 1);
 		spin_lock_init(&cache->lock);
-		mutex_init(&cache->alloc_mutex);
+		spin_lock_init(&cache->tree_lock);
 		mutex_init(&cache->cache_mutex);
 		INIT_LIST_HEAD(&cache->list);
+		INIT_LIST_HEAD(&cache->cluster_list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
@@ -6186,7 +5890,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	extent_root = root->fs_info->extent_root;
 
-	root->fs_info->last_trans_new_blockgroup = trans->transid;
+	root->fs_info->last_trans_log_full_commit = trans->transid;
 
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	if (!cache)
@@ -6197,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
-	mutex_init(&cache->alloc_mutex);
+	spin_lock_init(&cache->tree_lock);
 	mutex_init(&cache->cache_mutex);
 	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -6220,9 +5925,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 				sizeof(cache->item));
 	BUG_ON(ret);
 
-	finish_current_insert(trans, extent_root, 0);
-	ret = del_pending_extents(trans, extent_root, 0);
-	BUG_ON(ret);
 	set_avail_alloc_bits(extent_root->fs_info, type);
 
 	return 0;
@@ -6262,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&block_group->space_info->lock);
 	block_group->space_info->full = 0;
 
-	put_block_group(block_group);
-	put_block_group(block_group);
+	btrfs_put_block_group(block_group);
+	btrfs_put_block_group(block_group);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 37d43b516b79..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 
 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
@@ -2886,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		disko = 0;
 		flags = 0;
 
-		switch (em->block_start) {
-		case EXTENT_MAP_LAST_BYTE:
+		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
 			end = 1;
 			flags |= FIEMAP_EXTENT_LAST;
-			break;
-		case EXTENT_MAP_HOLE:
+		} else if (em->block_start == EXTENT_MAP_HOLE) {
 			flags |= FIEMAP_EXTENT_UNWRITTEN;
-			break;
-		case EXTENT_MAP_INLINE:
+		} else if (em->block_start == EXTENT_MAP_INLINE) {
 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
 				  FIEMAP_EXTENT_NOT_ALIGNED);
-			break;
-		case EXTENT_MAP_DELALLOC:
+		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
-			break;
-		default:
+		} else {
 			disko = em->block_start;
-			break;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
@@ -3126,20 +3118,15 @@ void free_extent_buffer(struct extent_buffer *eb)
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb)
 {
-	int set;
 	unsigned long i;
 	unsigned long num_pages;
 	struct page *page;
 
-	u64 start = eb->start;
-	u64 end = start + eb->len - 1;
-
-	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
 	num_pages = num_extent_pages(eb->start, eb->len);
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if (!set && !PageDirty(page))
+		if (!PageDirty(page))
 			continue;
 
 		lock_page(page);
@@ -3148,22 +3135,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 		else
 			set_page_private(page, EXTENT_PAGE_PRIVATE);
 
-		/*
-		 * if we're on the last page or the first page and the
-		 * block isn't aligned on a page boundary, do extra checks
-		 * to make sure we don't clean page that is partially dirty
-		 */
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			start = (u64)page->index << PAGE_CACHE_SHIFT;
-			end  = start + PAGE_CACHE_SIZE - 1;
-			if (test_range_bit(tree, start, end,
-					   EXTENT_DIRTY, 0)) {
-				unlock_page(page);
-				continue;
-			}
-		}
 		clear_page_dirty_for_io(page);
 		spin_lock_irq(&page->mapping->tree_lock);
 		if (!PageDirty(page)) {
@@ -3189,29 +3160,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 {
 	unsigned long i;
 	unsigned long num_pages;
+	int was_dirty = 0;
 
+	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = extent_buffer_page(eb, i);
-		/* writepage may need to do something special for the
-		 * first page, we have to make sure page->private is
-		 * properly set.  releasepage may drop page->private
-		 * on us if the page isn't already dirty.
-		 */
-		lock_page(page);
-		if (i == 0) {
-			set_page_extent_head(page, eb->len);
-		} else if (PagePrivate(page) &&
-			   page->private != EXTENT_PAGE_PRIVATE) {
-			set_page_extent_mapped(page);
-		}
+	for (i = 0; i < num_pages; i++)
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		set_extent_dirty(tree, page_offset(page),
-				 page_offset(page) + PAGE_CACHE_SIZE - 1,
-				 GFP_NOFS);
-		unlock_page(page);
-	}
-	return 0;
+	return was_dirty;
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3791,6 +3746,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 		ret = 0;
 		goto out;
 	}
+	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		ret = 0;
+		goto out;
+	}
 	/* at this point we can safely release the extent buffer */
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
 #define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
 
 /*
  * page->private values.  Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
 			     struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+			     struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
 		ret = -EEXIST;
-		free_extent_map(merge);
 		goto out;
 	}
 	atomic_inc(&em->refs);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	file_key.offset = pos;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(*item));
 	if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		key.offset = end_byte - 1;
 		key.type = BTRFS_EXTENT_CSUM_KEY;
 
+		path->leave_spinning = 1;
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret > 0) {
 			if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
 	} else {
 		ins_size = csum_size;
 	}
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      ins_size);
+	path->leave_spinning = 0;
 	if (ret < 0)
 		goto fail_unlock;
 	if (ret != 0) {
@@ -776,7 +780,6 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-	cond_resched();
 next_sector:
 
 	if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-	cond_resched();
 	if (total_bytes < sums->len) {
 		btrfs_release_path(root, path);
+		cond_resched();
 		goto again;
 	}
 out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e8023efaff7..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
 			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 
 			btrfs_release_path(root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
 			BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
 							ram_bytes);
 			btrfs_set_file_extent_type(leaf, extent, found_type);
 
+			btrfs_unlock_up_safe(path, 1);
 			btrfs_mark_buffer_dirty(path->nodes[0]);
+			btrfs_set_lock_blocking(path->nodes[0]);
 
 			if (disk_bytenr != 0) {
 				ret = btrfs_update_extent_ref(trans, root,
-						disk_bytenr, orig_parent,
+						disk_bytenr,
+						le64_to_cpu(old.disk_num_bytes),
+						orig_parent,
 						leaf->start,
 						root->root_key.objectid,
 						trans->transid, ins.objectid);
 
 				BUG_ON(ret);
 			}
+			path->leave_spinning = 0;
 			btrfs_release_path(root, path);
 			if (disk_bytenr != 0)
 				inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 
 	if (orig_parent != leaf->start) {
-		ret = btrfs_update_extent_ref(trans, root, bytenr,
+		ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
 					      orig_parent, leaf->start,
 					      root->root_key.objectid,
 					      trans->transid, inode->i_ino);
@@ -1091,19 +1097,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
 
-		ret = btrfs_check_free_space(root, write_bytes, 0);
+		ret = btrfs_check_data_free_space(root, inode, write_bytes);
 		if (ret)
 			goto out;
 
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
 				    write_bytes);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			btrfs_drop_pages(pages, num_pages);
 			goto out;
 		}
@@ -1111,8 +1122,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		ret = dirty_and_release_pages(NULL, root, file, pages,
 					      num_pages, pos, write_bytes);
 		btrfs_drop_pages(pages, num_pages);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		if (will_write) {
 			btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1136,6 +1150,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
+	if (ret)
+		err = ret;
 
 out_nolock:
 	kfree(pages);
@@ -1145,6 +1161,20 @@ out_nolock:
 		page_cache_release(pinned[1]);
 	*ppos = pos;
 
+	/*
+	 * we want to make sure fsync finds this change
+	 * but we haven't joined a transaction running right now.
+	 *
+	 * Later on, someone is sure to update the inode and get the
+	 * real transid recorded.
+	 *
+	 * We set last_trans now to the fs_info generation + 1,
+	 * this will either be one more than the running transaction
+	 * or the generation used for the next transaction if there isn't
+	 * one running right now.
+	 */
+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
 	if (num_written > 0 && will_write) {
 		struct btrfs_trans_handle *trans;
 
@@ -1157,8 +1187,11 @@ out_nolock:
 			ret = btrfs_log_dentry_safe(trans, root,
 						    file->f_dentry);
 			if (ret == 0) {
-				btrfs_sync_log(trans, root);
-				btrfs_end_transaction(trans, root);
+				ret = btrfs_sync_log(trans, root);
+				if (ret == 0)
+					btrfs_end_transaction(trans, root);
+				else
+					btrfs_commit_transaction(trans, root);
 			} else {
 				btrfs_commit_transaction(trans, root);
 			}
@@ -1175,6 +1208,18 @@ out_nolock:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+	/*
+	 * ordered_data_close is set by settattr when we are about to truncate
+	 * a file from a non-zero size to a zero size.  This tries to
+	 * flush down new bytes that may have been written if the
+	 * application were using truncate to replace a file in place.
+	 */
+	if (BTRFS_I(inode)->ordered_data_close) {
+		BTRFS_I(inode)->ordered_data_close = 0;
+		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+			filemap_flush(inode->i_mapping);
+	}
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
@@ -1222,7 +1267,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file->private_data)
+	if (file && file->private_data)
 		btrfs_ioctl_trans_end(file);
 
 	trans = btrfs_start_transaction(root, 1);
@@ -1231,7 +1276,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0)
 		goto out;
 
@@ -1245,15 +1290,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 * file again, but that will end up using the synchronization
 	 * inside btrfs_sync_log to keep things safe.
 	 */
-	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
 	} else {
-		btrfs_sync_log(trans, root);
-		ret = btrfs_end_transaction(trans, root);
+		ret = btrfs_sync_log(trans, root);
+		if (ret == 0)
+			ret = btrfs_end_transaction(trans, root);
+		else
+			ret = btrfs_commit_transaction(trans, root);
 	}
-	mutex_lock(&file->f_dentry->d_inode->i_mutex);
+	mutex_lock(&dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
 
 #include <linux/sched.h>
 #include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+
+struct btrfs_free_space {
+	struct rb_node bytes_index;
+	struct rb_node offset_index;
+	u64 offset;
+	u64 bytes;
+};
 
 static int tree_insert_offset(struct rb_root *root, u64 offset,
 			      struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
 }
 
 /*
- * searches the tree for the given offset.  If contains is set we will return
- * the free space that contains the given offset.  If contains is not set we
- * will return the free space that starts at or after the given offset and is
- * at least bytes long.
+ * searches the tree for the given offset.
+ *
+ * fuzzy == 1: this is used for allocations where we are given a hint of where
+ * to look for free space.  Because the hint may not be completely on an offset
+ * mark, or the hint may no longer point to free space we need to fudge our
+ * results a bit.  So we look for free space starting at or after offset with at
+ * least bytes size.  We prefer to find as close to the given offset as we can.
+ * Also if the offset is within a free space range, then we will return the free
+ * space that contains the given offset, which means we can return a free space
+ * chunk with an offset before the provided offset.
+ *
+ * fuzzy == 0: this is just a normal tree search.  Give us the free space that
+ * starts at the given offset which is at least bytes size, and if its not there
+ * return NULL.
  */
 static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
 						   u64 offset, u64 bytes,
-						   int contains)
+						   int fuzzy)
 {
 	struct rb_node *n = root->rb_node;
 	struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
 
 		if (offset < entry->offset) {
-			if (!contains &&
+			if (fuzzy &&
 			    (!ret || entry->offset < ret->offset) &&
 			    (bytes <= entry->bytes))
 				ret = entry;
 			n = n->rb_left;
 		} else if (offset > entry->offset) {
-			if ((entry->offset + entry->bytes - 1) >= offset &&
+			if (fuzzy &&
+			    (entry->offset + entry->bytes - 1) >= offset &&
 			    bytes <= entry->bytes) {
 				ret = entry;
 				break;
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 	int ret = 0;
 
 
+	BUG_ON(!info->bytes);
 	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
 				 &info->offset_index);
 	if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 	return ret;
 }
 
-static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-				  u64 offset, u64 bytes)
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *right_info;
 	struct btrfs_free_space *left_info;
 	struct btrfs_free_space *info = NULL;
-	struct btrfs_free_space *alloc_info;
 	int ret = 0;
 
-	alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
-	if (!alloc_info)
+	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+	if (!info)
 		return -ENOMEM;
 
+	info->offset = offset;
+	info->bytes = bytes;
+
+	spin_lock(&block_group->tree_lock);
+
 	/*
 	 * first we want to see if there is free space adjacent to the range we
 	 * are adding, if there is remove that struct and add a new one to
 	 * cover the entire range
 	 */
 	right_info = tree_search_offset(&block_group->free_space_offset,
-					offset+bytes, 0, 1);
+					offset+bytes, 0, 0);
 	left_info = tree_search_offset(&block_group->free_space_offset,
 				       offset-1, 0, 1);
 
-	if (right_info && right_info->offset == offset+bytes) {
+	if (right_info) {
 		unlink_free_space(block_group, right_info);
-		info = right_info;
-		info->offset = offset;
-		info->bytes += bytes;
-	} else if (right_info && right_info->offset != offset+bytes) {
-		printk(KERN_ERR "btrfs adding space in the middle of an "
-		       "existing free space area. existing: "
-		       "offset=%llu, bytes=%llu. new: offset=%llu, "
-		       "bytes=%llu\n", (unsigned long long)right_info->offset,
-		       (unsigned long long)right_info->bytes,
-		       (unsigned long long)offset,
-		       (unsigned long long)bytes);
-		BUG();
+		info->bytes += right_info->bytes;
+		kfree(right_info);
 	}
 
-	if (left_info) {
+	if (left_info && left_info->offset + left_info->bytes == offset) {
 		unlink_free_space(block_group, left_info);
-
-		if (unlikely((left_info->offset + left_info->bytes) !=
-			     offset)) {
-			printk(KERN_ERR "btrfs free space to the left "
-			       "of new free space isn't "
-			       "quite right. existing: offset=%llu, "
-			       "bytes=%llu. new: offset=%llu, bytes=%llu\n",
-			       (unsigned long long)left_info->offset,
-			       (unsigned long long)left_info->bytes,
-			       (unsigned long long)offset,
-			       (unsigned long long)bytes);
-			BUG();
-		}
-
-		if (info) {
-			info->offset = left_info->offset;
-			info->bytes += left_info->bytes;
-			kfree(left_info);
-		} else {
-			info = left_info;
-			info->bytes += bytes;
-		}
+		info->offset = left_info->offset;
+		info->bytes += left_info->bytes;
+		kfree(left_info);
 	}
 
-	if (info) {
-		ret = link_free_space(block_group, info);
-		if (!ret)
-			info = NULL;
-		goto out;
-	}
-
-	info = alloc_info;
-	alloc_info = NULL;
-	info->offset = offset;
-	info->bytes = bytes;
-
 	ret = link_free_space(block_group, info);
 	if (ret)
 		kfree(info);
-out:
+
+	spin_unlock(&block_group->tree_lock);
+
 	if (ret) {
 		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
-		if (ret == -EEXIST)
-			BUG();
+		BUG_ON(ret == -EEXIST);
 	}
 
-	kfree(alloc_info);
-
 	return ret;
 }
 
-static int
-__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			  u64 offset, u64 bytes)
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *info;
 	int ret = 0;
 
+	spin_lock(&block_group->tree_lock);
+
 	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
 				  1);
-
 	if (info && info->offset == offset) {
 		if (info->bytes < bytes) {
 			printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			       (unsigned long long)bytes);
 			WARN_ON(1);
 			ret = -EINVAL;
+			spin_unlock(&block_group->tree_lock);
 			goto out;
 		}
 		unlink_free_space(block_group, info);
 
 		if (info->bytes == bytes) {
 			kfree(info);
+			spin_unlock(&block_group->tree_lock);
 			goto out;
 		}
 
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 		info->bytes -= bytes;
 
 		ret = link_free_space(block_group, info);
+		spin_unlock(&block_group->tree_lock);
 		BUG_ON(ret);
 	} else if (info && info->offset < offset &&
 		   info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			 */
 			kfree(info);
 		}
-
+		spin_unlock(&block_group->tree_lock);
 		/* step two, insert a new info struct to cover anything
 		 * before the hole
 		 */
-		ret = __btrfs_add_free_space(block_group, old_start,
-					     offset - old_start);
+		ret = btrfs_add_free_space(block_group, old_start,
+					   offset - old_start);
 		BUG_ON(ret);
 	} else {
+		spin_unlock(&block_group->tree_lock);
+		if (!info) {
+			printk(KERN_ERR "couldn't find space %llu to free\n",
+			       (unsigned long long)offset);
+			printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
+			       block_group->cached, block_group->key.objectid,
+			       block_group->key.offset);
+			btrfs_dump_free_space(block_group, bytes);
+		} else if (info) {
+			printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
+			       "but wanted offset=%llu bytes=%llu\n",
+			       info->offset, info->bytes, offset, bytes);
+		}
 		WARN_ON(1);
 	}
 out:
 	return ret;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 offset, u64 bytes)
-{
-	int ret;
-	struct btrfs_free_space *sp;
-
-	mutex_lock(&block_group->alloc_mutex);
-	ret = __btrfs_add_free_space(block_group, offset, bytes);
-	sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
-	BUG_ON(!sp);
-	mutex_unlock(&block_group->alloc_mutex);
-
-	return ret;
-}
-
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-			      u64 offset, u64 bytes)
-{
-	int ret;
-	struct btrfs_free_space *sp;
-
-	ret = __btrfs_add_free_space(block_group, offset, bytes);
-	sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
-	BUG_ON(!sp);
-
-	return ret;
-}
-
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 offset, u64 bytes)
-{
-	int ret = 0;
-
-	mutex_lock(&block_group->alloc_mutex);
-	ret = __btrfs_remove_free_space(block_group, offset, bytes);
-	mutex_unlock(&block_group->alloc_mutex);
-
-	return ret;
-}
-
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-				 u64 offset, u64 bytes)
-{
-	int ret;
-
-	ret = __btrfs_remove_free_space(block_group, offset, bytes);
-
-	return ret;
-}
-
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes)
 {
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes)
 			count++;
+		printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
+		       info->bytes);
 	}
 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
 	       "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
 	return ret;
 }
 
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache.  If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already.  In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster)
+{
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+
+	spin_lock(&cluster->lock);
+	if (cluster->block_group != block_group)
+		goto out;
+
+	cluster->window_start = 0;
+	node = rb_first(&cluster->root);
+	while(node) {
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		node = rb_next(&entry->offset_index);
+		rb_erase(&entry->offset_index, &cluster->root);
+		link_free_space(block_group, entry);
+	}
+	list_del_init(&cluster->block_group_list);
+
+	btrfs_put_block_group(cluster->block_group);
+	cluster->block_group = NULL;
+	cluster->root.rb_node = NULL;
+out:
+	spin_unlock(&cluster->lock);
+	return 0;
+}
+
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 {
 	struct btrfs_free_space *info;
 	struct rb_node *node;
+	struct btrfs_free_cluster *cluster;
+	struct btrfs_free_cluster *safe;
+
+	spin_lock(&block_group->tree_lock);
+
+	list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
+				 block_group_list) {
+
+		WARN_ON(cluster->block_group != block_group);
+		__btrfs_return_cluster_to_free_space(block_group, cluster);
+	}
 
-	mutex_lock(&block_group->alloc_mutex);
 	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
 		info = rb_entry(node, struct btrfs_free_space, bytes_index);
 		unlink_free_space(block_group, info);
 		kfree(info);
 		if (need_resched()) {
-			mutex_unlock(&block_group->alloc_mutex);
+			spin_unlock(&block_group->tree_lock);
 			cond_resched();
-			mutex_lock(&block_group->alloc_mutex);
+			spin_lock(&block_group->tree_lock);
 		}
 	}
-	mutex_unlock(&block_group->alloc_mutex);
+	spin_unlock(&block_group->tree_lock);
 }
 
-#if 0
-static struct btrfs_free_space *btrfs_find_free_space_offset(struct
-						      btrfs_block_group_cache
-						      *block_group, u64 offset,
-						      u64 bytes)
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+			       u64 offset, u64 bytes, u64 empty_size)
 {
-	struct btrfs_free_space *ret;
+	struct btrfs_free_space *entry = NULL;
+	u64 ret = 0;
 
-	mutex_lock(&block_group->alloc_mutex);
-	ret = tree_search_offset(&block_group->free_space_offset, offset,
-				 bytes, 0);
-	mutex_unlock(&block_group->alloc_mutex);
+	spin_lock(&block_group->tree_lock);
+	entry = tree_search_offset(&block_group->free_space_offset, offset,
+				   bytes + empty_size, 1);
+	if (!entry)
+		entry = tree_search_bytes(&block_group->free_space_bytes,
+					  offset, bytes + empty_size);
+	if (entry) {
+		unlink_free_space(block_group, entry);
+		ret = entry->offset;
+		entry->offset += bytes;
+		entry->bytes -= bytes;
+
+		if (!entry->bytes)
+			kfree(entry);
+		else
+			link_free_space(block_group, entry);
+	}
+	spin_unlock(&block_group->tree_lock);
 
 	return ret;
 }
 
-static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
-						     btrfs_block_group_cache
-						     *block_group, u64 offset,
-						     u64 bytes)
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache.  If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+			       struct btrfs_block_group_cache *block_group,
+			       struct btrfs_free_cluster *cluster)
 {
-	struct btrfs_free_space *ret;
+	int ret;
 
-	mutex_lock(&block_group->alloc_mutex);
+	/* first, get a safe pointer to the block group */
+	spin_lock(&cluster->lock);
+	if (!block_group) {
+		block_group = cluster->block_group;
+		if (!block_group) {
+			spin_unlock(&cluster->lock);
+			return 0;
+		}
+	} else if (cluster->block_group != block_group) {
+		/* someone else has already freed it don't redo their work */
+		spin_unlock(&cluster->lock);
+		return 0;
+	}
+	atomic_inc(&block_group->count);
+	spin_unlock(&cluster->lock);
 
-	ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
-	mutex_unlock(&block_group->alloc_mutex);
+	/* now return any extents the cluster had on it */
+	spin_lock(&block_group->tree_lock);
+	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&block_group->tree_lock);
 
+	/* finally drop our ref */
+	btrfs_put_block_group(block_group);
 	return ret;
 }
-#endif
 
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-					       *block_group, u64 offset,
-					       u64 bytes)
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster, u64 bytes,
+			     u64 min_start)
+{
+	struct btrfs_free_space *entry = NULL;
+	struct rb_node *node;
+	u64 ret = 0;
+
+	spin_lock(&cluster->lock);
+	if (bytes > cluster->max_size)
+		goto out;
+
+	if (cluster->block_group != block_group)
+		goto out;
+
+	node = rb_first(&cluster->root);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+	while(1) {
+		if (entry->bytes < bytes || entry->offset < min_start) {
+			struct rb_node *node;
+
+			node = rb_next(&entry->offset_index);
+			if (!node)
+				break;
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+			continue;
+		}
+		ret = entry->offset;
+
+		entry->offset += bytes;
+		entry->bytes -= bytes;
+
+		if (entry->bytes == 0) {
+			rb_erase(&entry->offset_index, &cluster->root);
+			kfree(entry);
+		}
+		break;
+	}
+out:
+	spin_unlock(&cluster->lock);
+	return ret;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group.  The goal
+ * is to find at least bytes free and up to empty_size + bytes free.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster,
+			     u64 offset, u64 bytes, u64 empty_size)
 {
-	struct btrfs_free_space *ret = NULL;
+	struct btrfs_free_space *entry = NULL;
+	struct rb_node *node;
+	struct btrfs_free_space *next;
+	struct btrfs_free_space *last;
+	u64 min_bytes;
+	u64 window_start;
+	u64 window_free;
+	u64 max_extent = 0;
+	int total_retries = 0;
+	int ret;
+
+	/* for metadata, allow allocates with more holes */
+	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+		/*
+		 * we want to do larger allocations when we are
+		 * flushing out the delayed refs, it helps prevent
+		 * making more work as we go along.
+		 */
+		if (trans->transaction->delayed_refs.flushing)
+			min_bytes = max(bytes, (bytes + empty_size) >> 1);
+		else
+			min_bytes = max(bytes, (bytes + empty_size) >> 4);
+	} else
+		min_bytes = max(bytes, (bytes + empty_size) >> 2);
+
+	spin_lock(&block_group->tree_lock);
+	spin_lock(&cluster->lock);
+
+	/* someone already found a cluster, hooray */
+	if (cluster->block_group) {
+		ret = 0;
+		goto out;
+	}
+again:
+	min_bytes = min(min_bytes, bytes + empty_size);
+	entry = tree_search_bytes(&block_group->free_space_bytes,
+				  offset, min_bytes);
+	if (!entry) {
+		ret = -ENOSPC;
+		goto out;
+	}
+	window_start = entry->offset;
+	window_free = entry->bytes;
+	last = entry;
+	max_extent = entry->bytes;
+
+	while(1) {
+		/* out window is just right, lets fill it */
+		if (window_free >= bytes + empty_size)
+			break;
 
-	ret = tree_search_offset(&block_group->free_space_offset, offset,
-				 bytes, 0);
-	if (!ret)
-		ret = tree_search_bytes(&block_group->free_space_bytes,
-					offset, bytes);
+		node = rb_next(&last->offset_index);
+		if (!node) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		next = rb_entry(node, struct btrfs_free_space, offset_index);
+
+		/*
+		 * we haven't filled the empty size and the window is
+		 * very large.  reset and try again
+		 */
+		if (next->offset - window_start > (bytes + empty_size) * 2) {
+			entry = next;
+			window_start = entry->offset;
+			window_free = entry->bytes;
+			last = entry;
+			max_extent = 0;
+			total_retries++;
+			if (total_retries % 256 == 0) {
+				if (min_bytes >= (bytes + empty_size)) {
+					ret = -ENOSPC;
+					goto out;
+				}
+				/*
+				 * grow our allocation a bit, we're not having
+				 * much luck
+				 */
+				min_bytes *= 2;
+				goto again;
+			}
+		} else {
+			last = next;
+			window_free += next->bytes;
+			if (entry->bytes > max_extent)
+				max_extent = entry->bytes;
+		}
+	}
+
+	cluster->window_start = entry->offset;
+
+	/*
+	 * now we've found our entries, pull them out of the free space
+	 * cache and put them into the cluster rbtree
+	 *
+	 * The cluster includes an rbtree, but only uses the offset index
+	 * of each free space cache entry.
+	 */
+	while(1) {
+		node = rb_next(&entry->offset_index);
+		unlink_free_space(block_group, entry);
+		ret = tree_insert_offset(&cluster->root, entry->offset,
+					 &entry->offset_index);
+		BUG_ON(ret);
+
+		if (!node || entry == last)
+			break;
+
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+	}
+	ret = 0;
+	cluster->max_size = max_extent;
+	atomic_inc(&block_group->count);
+	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
+	cluster->block_group = block_group;
+out:
+	spin_unlock(&cluster->lock);
+	spin_unlock(&block_group->tree_lock);
 
 	return ret;
 }
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+	spin_lock_init(&cluster->lock);
+	spin_lock_init(&cluster->refill_lock);
+	cluster->root.rb_node = NULL;
+	cluster->max_size = 0;
+	INIT_LIST_HEAD(&cluster->block_group_list);
+	cluster->block_group = NULL;
+}
+
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_CACHE
+#define __BTRFS_FREE_SPACE_CACHE
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 bytenr, u64 size);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+				   *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+			       u64 offset, u64 bytes, u64 empty_size);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+			   u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster,
+			     u64 offset, u64 bytes, u64 empty_size);
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster, u64 bytes,
+			     u64 min_start);
+int btrfs_return_cluster_to_free_space(
+			       struct btrfs_block_group_cache *block_group,
+			       struct btrfs_free_cluster *cluster);
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
+
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      ins_len);
 	if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	search_key.type = 0;
 	search_key.offset = 0;
 
-	btrfs_init_path(path);
 	start_found = 0;
 	ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
 	if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f0706210a47..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -102,34 +102,6 @@ static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
 }
 
 /*
- * a very lame attempt at stopping writes when the FS is 85% full.  There
- * are countless ways this is incorrect, but it is better than nothing.
- */
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del)
-{
-	u64 total;
-	u64 used;
-	u64 thresh;
-	int ret = 0;
-
-	spin_lock(&root->fs_info->delalloc_lock);
-	total = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	used = btrfs_super_bytes_used(&root->fs_info->super_copy);
-	if (for_del)
-		thresh = total * 90;
-	else
-		thresh = total * 85;
-
-	do_div(thresh, 100);
-
-	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
-		ret = -ENOSPC;
-	spin_unlock(&root->fs_info->delalloc_lock);
-	return ret;
-}
-
-/*
  * this does all the hard work for inserting an inline extent into
  * the btree.  The caller should have done a btrfs_drop_extents so that
  * no overlapping inline items exist in the btree
@@ -162,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	btrfs_set_trans_block_group(trans, inode);
 
 	key.objectid = inode->i_ino;
@@ -195,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
 
-			kaddr = kmap(cpage);
+			kaddr = kmap_atomic(cpage, KM_USER0);
 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
-			kunmap(cpage);
+			kunmap_atomic(kaddr, KM_USER0);
 
 			i++;
 			ptr += cur_size;
@@ -232,7 +205,7 @@ fail:
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode, u64 start, u64 end,
 				 size_t compressed_size,
@@ -882,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	u64 cur_end;
 	int limit = 10 * 1024 * 1042;
 
-	if (!btrfs_test_opt(root, COMPRESS)) {
-		return cow_file_range(inode, locked_page, start, end,
-				      page_started, nr_written, 1);
-	}
-
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
 			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
 	while (start < end) {
@@ -963,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+				       struct page *locked_page,
 			      u64 start, u64 end, int *page_started, int force,
 			      unsigned long *nr_written)
 {
@@ -1161,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 			      unsigned long *nr_written)
 {
 	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 
 	if (btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1168,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
+	else if (!btrfs_test_opt(root, COMPRESS))
+		ret = cow_file_range(inode, locked_page, start, end,
+				      page_started, nr_written, 1);
 	else
 		ret = cow_file_range_async(inode, locked_page, start, end,
 					   page_started, nr_written);
-
 	return ret;
 }
 
@@ -1190,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 */
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1223,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			       (unsigned long long)end - start + 1,
 			       (unsigned long long)
 			       root->fs_info->delalloc_bytes);
+			btrfs_delalloc_free_space(root, inode, (u64)-1);
 			root->fs_info->delalloc_bytes = 0;
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
+			btrfs_delalloc_free_space(root, inode,
+						  end - start + 1);
 			root->fs_info->delalloc_bytes -= end - start + 1;
 			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
@@ -1477,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	path->leave_spinning = 1;
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
 				 file_pos + num_bytes, file_pos, &hint);
 	BUG_ON(ret);
@@ -1499,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_compression(leaf, fi, compression);
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+	btrfs_unlock_up_safe(path, 1);
+	btrfs_set_lock_blocking(leaf);
+
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_add_bytes(inode, num_bytes);
@@ -1511,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 					  root->root_key.objectid,
 					  trans->transid, inode->i_ino, &ins);
 	BUG_ON(ret);
-
 	btrfs_free_path(path);
+
 	return 0;
 }
 
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction.  It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+				struct btrfs_path *path,
+				struct btrfs_ordered_extent *ordered_extent)
+{
+	struct btrfs_ordered_sum *sum;
+	u64 bytenr;
+
+	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+			 list);
+	bytenr = sum->sums[0].bytenr;
+
+	/*
+	 * we don't care about the results, the point of this search is
+	 * just to get the btree leaves into ram
+	 */
+	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
+
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1524,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_ordered_extent *ordered_extent;
+	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_path *path;
 	int compressed = 0;
 	int ret;
 
@@ -1533,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (!ret)
 		return 0;
 
+	/*
+	 * before we join the transaction, try to do some of our IO.
+	 * This will limit the amount of IO that we have to do with
+	 * the transaction running.  We're unlikely to need to do any
+	 * IO if the file extents are new, the disk_i_size checks
+	 * covers the most common case.
+	 */
+	if (start < BTRFS_I(inode)->disk_i_size) {
+		path = btrfs_alloc_path();
+		if (path) {
+			ret = btrfs_lookup_file_extent(NULL, root, path,
+						       inode->i_ino,
+						       start, 0);
+			ordered_extent = btrfs_lookup_ordered_extent(inode,
+								     start);
+			if (!list_empty(&ordered_extent->list)) {
+				btrfs_release_path(root, path);
+				reada_csum(root, path, ordered_extent);
+			}
+			btrfs_free_path(path);
+		}
+	}
+
 	trans = btrfs_join_transaction(root, 1);
 
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	if (!ordered_extent)
+		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
 		goto nocow;
@@ -2125,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	path->leave_spinning = 1;
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
@@ -2171,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		goto err;
 	}
 
+	path->leave_spinning = 1;
 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -2214,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 					 inode, dir->i_ino);
 	BUG_ON(ret != 0 && ret != -ENOENT);
-	if (ret != -ENOENT)
-		BTRFS_I(dir)->log_dirty_trans = trans->transid;
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
@@ -2245,13 +2275,12 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	root = BTRFS_I(dir)->root;
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
+
+	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
 
@@ -2261,7 +2290,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 
 	btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2284,10 +2312,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -ENOTEMPTY;
 	}
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2304,7 +2328,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 fail_trans:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 
 	if (ret && !err)
@@ -2531,9 +2554,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
 
-	btrfs_init_path(path);
-
 search_again:
+	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
 		goto error;
@@ -2680,6 +2702,7 @@ delete:
 			break;
 		}
 		if (found_extent) {
+			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
 						leaf->start, root_owner,
@@ -2820,7 +2843,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	if (size <= hole_start)
 		return 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		return err;
 
@@ -2884,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		return err;
 
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
-		err = btrfs_cont_expand(inode, attr->ia_size);
-		if (err)
-			return err;
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		if (attr->ia_size > inode->i_size) {
+			err = btrfs_cont_expand(inode, attr->ia_size);
+			if (err)
+				return err;
+		} else if (inode->i_size > 0 &&
+			   attr->ia_size == 0) {
+
+			/* we're truncating a file that used to have good
+			 * data down to zero.  Make sure it gets into
+			 * the ordered flush list so that any new writes
+			 * get down to disk quickly.
+			 */
+			BTRFS_I(inode)->ordered_data_close = 1;
+		}
 	}
 
 	err = inode_setattr(inode, attr);
@@ -3016,16 +3049,18 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->last_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
+	bi->reserved_bytes = 0;
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
-	bi->log_dirty_trans = 0;
+	bi->last_unlink_trans = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3037,6 +3072,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	inode->i_ino = args->ino;
 	init_btrfs_i(inode);
 	BTRFS_I(inode)->root = args->root;
+	btrfs_set_inode_space_info(args->root, inode);
 	return 0;
 }
 
@@ -3445,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
 	if (dir) {
 		ret = btrfs_set_inode_index(dir, index);
-		if (ret)
+		if (ret) {
+			iput(inode);
 			return ERR_PTR(ret);
+		}
 	}
 	/*
 	 * index_cnt is ignored for everything but a dir,
@@ -3457,6 +3495,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
+	btrfs_set_inode_space_info(root, inode);
 
 	if (mode & S_IFDIR)
 		owner = 0;
@@ -3482,6 +3521,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	sizes[0] = sizeof(struct btrfs_inode_item);
 	sizes[1] = name_len + sizeof(*ref);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
 	if (ret != 0)
 		goto fail;
@@ -3527,6 +3567,7 @@ fail:
 	if (dir)
 		BTRFS_I(dir)->index_cnt--;
 	btrfs_free_path(path);
+	iput(inode);
 	return ERR_PTR(ret);
 }
 
@@ -3604,7 +3645,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 
@@ -3667,7 +3708,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	u64 objectid;
 	u64 index = 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	trans = btrfs_start_transaction(root, 1);
@@ -3735,7 +3776,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -ENOENT;
 
 	btrfs_inc_nlink(inode);
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	err = btrfs_set_inode_index(dir, &index);
@@ -3760,6 +3801,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		drop_inode = 1;
 
 	nr = trans->blocks_used;
+
+	btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
@@ -3781,7 +3824,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_unlock;
 
@@ -4263,7 +4306,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	if (PageWriteback(page) || PageDirty(page))
 		return 0;
-	return __btrfs_releasepage(page, gfp_flags);
+	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4325,8 +4368,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
  */
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4338,11 +4382,16 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
-	if (ret)
+	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	if (ret) {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
 		goto out;
+	}
 
-	ret = -EINVAL;
+	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
 	size = i_size_read(inode);
@@ -4351,6 +4400,7 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -4389,6 +4439,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
+
+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
@@ -4414,6 +4466,27 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 
 	trans = btrfs_start_transaction(root, 1);
+
+	/*
+	 * setattr is responsible for setting the ordered_data_close flag,
+	 * but that is only tested during the last file release.  That
+	 * could happen well after the next commit, leaving a great big
+	 * window where new writes may get lost if someone chooses to write
+	 * to this file after truncating to zero
+	 *
+	 * The inode doesn't have any dirty data here, and so if we commit
+	 * this is a noop.  If someone immediately starts writing to the inode
+	 * it is very likely we'll catch some of their writes in this
+	 * transaction, and the commit will find this file on the ordered
+	 * data list with good things to send down.
+	 *
+	 * This is a best effort solution, there is still a window where
+	 * using truncate to replace the contents of the file will
+	 * end up with a zero length file after a crash.
+	 */
+	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+		btrfs_add_ordered_operation(trans, root, inode);
+
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_i_size_write(inode, inode->i_size);
 
@@ -4490,12 +4563,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->ordered_operations);
 	return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
 	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
@@ -4506,13 +4582,24 @@ void btrfs_destroy_inode(struct inode *inode)
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
-	spin_lock(&BTRFS_I(inode)->root->list_lock);
+	/*
+	 * Make sure we're properly removed from the ordered operation
+	 * lists.
+	 */
+	smp_mb();
+	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+		spin_lock(&root->fs_info->ordered_extent_lock);
+		list_del_init(&BTRFS_I(inode)->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_lock(&root->list_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
 		       " list\n", inode->i_ino);
 		dump_stack();
 	}
-	spin_unlock(&BTRFS_I(inode)->root->list_lock);
+	spin_unlock(&root->list_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4633,12 +4720,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -EXDEV;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto out_unlock;
 
+	/*
+	 * we're using rename to replace one file with another.
+	 * and the replacement file is large.  Start IO on it now so
+	 * we don't add too much work to the end of the transaction
+	 */
+	if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+	    new_inode->i_size &&
+	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+		filemap_flush(old_inode->i_mapping);
+
 	trans = btrfs_start_transaction(root, 1);
 
+	/*
+	 * make sure the inode gets flushed if it is replacing
+	 * something.
+	 */
+	if (new_inode && new_inode->i_size &&
+	    old_inode && S_ISREG(old_inode->i_mode)) {
+		btrfs_add_ordered_operation(trans, root, old_inode);
+	}
+
+	/*
+	 * this is an ugly little race, but the rename is required to make
+	 * sure that if we crash, the inode is either at the old name
+	 * or the new one.  pinning the log transaction lets us make sure
+	 * we don't allow a log commit to come in after we unlink the
+	 * name but before we add the new name back in.
+	 */
+	btrfs_pin_log_trans(root);
+
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	btrfs_inc_nlink(old_dentry->d_inode);
@@ -4646,6 +4761,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
 
+	if (old_dentry->d_parent != new_dentry->d_parent)
+		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
 	ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
 				 old_dentry->d_name.name,
 				 old_dentry->d_name.len);
@@ -4677,7 +4795,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (ret)
 		goto out_fail;
 
+	btrfs_log_new_name(trans, old_inode, old_dir,
+				       new_dentry->d_parent);
 out_fail:
+
+	/* this btrfs_end_log_trans just allows the current
+	 * log-sub transaction to complete
+	 */
+	btrfs_end_log_trans(root);
 	btrfs_end_transaction_throttle(trans, root);
 out_unlock:
 	return ret;
@@ -4751,7 +4876,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_fail;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 988fdc8b49eb..7594bec1be10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_commit;
 
@@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_unlock;
 
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 		goto out_dput;
 
 	if (!IS_POSIXACL(parent->dentry->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 
 	error = mnt_want_write(parent->mnt);
 	if (error)
@@ -374,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
 	unsigned long i;
 	int ret;
 
-	ret = btrfs_check_free_space(root, inode->i_size, 0);
+	ret = btrfs_check_data_free_space(root, inode, inode->i_size);
 	if (ret)
 		return -ENOSPC;
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9ebe9385129b..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,21 +25,10 @@
 #include "extent_io.h"
 #include "locking.h"
 
-/*
- * btrfs_header_level() isn't free, so don't call it when lockdep isn't
- * on
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void spin_nested(struct extent_buffer *eb)
-{
-	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
-}
-#else
 static inline void spin_nested(struct extent_buffer *eb)
 {
 	spin_lock(&eb->lock);
 }
-#endif
 
 /*
  * Setting a lock to blocking will drop the spinlock and set the
@@ -71,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
 
 /*
  * unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for every long, and often they don't block
- * at all.  For a dbench 50 run, if we don't spin one the blocking bit
+ * don't end up blocking for very long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin on the blocking bit
  * at all, the context switch rate can jump up to 400,000/sec or more.
  *
  * So, we're still stuck with this crummy spin on the blocking bit,
@@ -82,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
 static int btrfs_spin_on_block(struct extent_buffer *eb)
 {
 	int i;
+
 	for (i = 0; i < 512; i++) {
-		cpu_relax();
 		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
 			return 1;
 		if (need_resched())
 			break;
+		cpu_relax();
 	}
 	return 0;
 }
@@ -106,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
 {
 	int i;
 
-	spin_nested(eb);
-	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-		return 1;
-	spin_unlock(&eb->lock);
-
+	if (btrfs_spin_on_block(eb)) {
+		spin_nested(eb);
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		spin_unlock(&eb->lock);
+	}
 	/* spin for a bit on the BLOCKING flag */
 	for (i = 0; i < 2; i++) {
+		cpu_relax();
 		if (!btrfs_spin_on_block(eb))
 			break;
 
@@ -159,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
 	DEFINE_WAIT(wait);
 	wait.func = btrfs_wake_function;
 
+	if (!btrfs_spin_on_block(eb))
+		goto sleep;
+
 	while(1) {
 		spin_nested(eb);
 
@@ -176,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
 		 * spin for a bit, and if the blocking flag goes away,
 		 * loop around
 		 */
+		cpu_relax();
 		if (btrfs_spin_on_block(eb))
 			continue;
-
+sleep:
 		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
 					  TASK_UNINTERRUPTIBLE);
 
@@ -231,8 +227,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
 	return 0;
 }
 
-int btrfs_tree_locked(struct extent_buffer *eb)
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
 {
-	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
-			spin_is_locked(&eb->lock);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		assert_spin_locked(&eb->lock);
 }
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6bb0afbff928..6c4ce457168c 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,11 +21,11 @@
 
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_tree_locked(struct extent_buffer *eb);
 
 int btrfs_try_tree_lock(struct extent_buffer *eb);
 int btrfs_try_spin_lock(struct extent_buffer *eb);
 
 void btrfs_set_lock_blocking(struct extent_buffer *eb);
 void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
 #endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
+
+	/*
+	 * we have no more ordered extents for this inode and
+	 * no dirty pages.  We can safely remove it from the
+	 * list of ordered extents
+	 */
+	if (RB_EMPTY_ROOT(&tree->tree) &&
+	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		list_del_init(&BTRFS_I(inode)->ordered_operations);
+	}
 	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 
 	mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 }
 
 /*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list.  These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io.  When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct inode *inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+				   ordered_operations);
+
+		inode = &btrfs_inode->vfs_inode;
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		/*
+		 * the inode may be getting freed (in sys_unlink path).
+		 */
+		inode = igrab(inode);
+
+		if (!wait && inode) {
+			list_add_tail(&BTRFS_I(inode)->ordered_operations,
+			      &root->fs_info->ordered_operations);
+		}
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+
+		if (inode) {
+			if (wait)
+				btrfs_wait_ordered_range(inode, 0, (u64)-1);
+			else
+				filemap_flush(inode->i_mapping);
+			iput(inode);
+		}
+
+		cond_resched();
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+	if (wait && !list_empty(&root->fs_info->ordered_operations))
+		goto again;
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+/*
  * Used to start IO or wait for a given ordered extent to finish.
  *
  * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 
 	return ret;
 }
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode)
+{
+	u64 last_mod;
+
+	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+	/*
+	 * if this file hasn't been changed since the last transaction
+	 * commit, we can safely return without doing anything
+	 */
+	if (last_mod < root->fs_info->last_trans_committed)
+		return 0;
+
+	/*
+	 * the transaction is already committing.  Just start the IO and
+	 * don't bother with all of this list nonsense
+	 */
+	if (trans && root->fs_info->running_transaction->blocked) {
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
+		return 0;
+	}
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+		list_add_tail(&BTRFS_I(inode)->ordered_operations,
+			      &root->fs_info->ordered_operations);
+	}
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2cbc38..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/init.h>
+#include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
+	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_notreelog,
+	Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
 	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
 	{Opt_noacl, "noacl"},
+	{Opt_notreelog, "notreelog"},
+	{Opt_flushoncommit, "flushoncommit"},
 	{Opt_err, NULL},
 };
 
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_noacl:
 			root->fs_info->sb->s_flags &= ~MS_POSIXACL;
 			break;
+		case Opt_notreelog:
+			printk(KERN_INFO "btrfs: disabling tree log\n");
+			btrfs_set_opt(info->mount_opt, NOTREELOG);
+			break;
+		case Opt_flushoncommit:
+			printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
+			btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
+			break;
 		default:
 			break;
 		}
@@ -363,9 +375,8 @@ fail_close:
 int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root;
+	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
-	root = btrfs_sb(sb);
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
@@ -379,13 +390,47 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_start_delalloc_inodes(root);
 	btrfs_wait_ordered_extents(root, 0);
 
-	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
 	return ret;
 }
 
+static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
+	struct btrfs_fs_info *info = root->fs_info;
+
+	if (btrfs_test_opt(root, DEGRADED))
+		seq_puts(seq, ",degraded");
+	if (btrfs_test_opt(root, NODATASUM))
+		seq_puts(seq, ",nodatasum");
+	if (btrfs_test_opt(root, NODATACOW))
+		seq_puts(seq, ",nodatacow");
+	if (btrfs_test_opt(root, NOBARRIER))
+		seq_puts(seq, ",nobarrier");
+	if (info->max_extent != (u64)-1)
+		seq_printf(seq, ",max_extent=%llu", info->max_extent);
+	if (info->max_inline != 8192 * 1024)
+		seq_printf(seq, ",max_inline=%llu", info->max_inline);
+	if (info->alloc_start != 0)
+		seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
+	if (info->thread_pool_size !=  min_t(unsigned long,
+					     num_online_cpus() + 2, 8))
+		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+	if (btrfs_test_opt(root, COMPRESS))
+		seq_puts(seq, ",compress");
+	if (btrfs_test_opt(root, SSD))
+		seq_puts(seq, ",ssd");
+	if (btrfs_test_opt(root, NOTREELOG))
+		seq_puts(seq, ",no-treelog");
+	if (btrfs_test_opt(root, FLUSHONCOMMIT))
+		seq_puts(seq, ",flush-on-commit");
+	if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+		seq_puts(seq, ",noacl");
+	return 0;
+}
+
 static void btrfs_write_super(struct super_block *sb)
 {
 	sb->s_dirt = 0;
@@ -511,6 +556,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
+	ret = btrfs_parse_options(root, data);
+	if (ret)
+		return -EINVAL;
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 
@@ -627,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
-	.show_options	= generic_show_options,
+	.show_options	= btrfs_show_options,
 	.write_inode	= btrfs_write_inode,
 	.dirty_inode	= btrfs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 919172de5c9a..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
 					     GFP_NOFS);
 		BUG_ON(!cur_trans);
 		root->fs_info->generation++;
-		root->fs_info->last_alloc = 0;
-		root->fs_info->last_data_alloc = 0;
 		cur_trans->num_writers = 1;
 		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
@@ -65,6 +63,15 @@ static noinline int join_transaction(struct btrfs_root *root)
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
+
+		cur_trans->delayed_refs.root.rb_node = NULL;
+		cur_trans->delayed_refs.num_entries = 0;
+		cur_trans->delayed_refs.num_heads_ready = 0;
+		cur_trans->delayed_refs.num_heads = 0;
+		cur_trans->delayed_refs.flushing = 0;
+		cur_trans->delayed_refs.run_delayed_start = 0;
+		spin_lock_init(&cur_trans->delayed_refs.lock);
+
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +189,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->block_group = 0;
 	h->alloc_exclude_nr = 0;
 	h->alloc_exclude_start = 0;
+	h->delayed_ref_updates = 0;
+
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return h;
@@ -271,7 +280,6 @@ void btrfs_throttle(struct btrfs_root *root)
 	if (!root->fs_info->open_ioctl_trans)
 		wait_current_trans(root);
 	mutex_unlock(&root->fs_info->trans_mutex);
-
 	throttle_on_drops(root);
 }
 
@@ -280,6 +288,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_fs_info *info = root->fs_info;
+	int count = 0;
+
+	while (count < 4) {
+		unsigned long cur = trans->delayed_ref_updates;
+		trans->delayed_ref_updates = 0;
+		if (cur &&
+		    trans->transaction->delayed_refs.num_heads_ready > 64) {
+			trans->delayed_ref_updates = 0;
+
+			/*
+			 * do a full flush if the transaction is trying
+			 * to close
+			 */
+			if (trans->transaction->delayed_refs.flushing)
+				cur = 0;
+			btrfs_run_delayed_refs(trans, root, cur);
+		} else {
+			break;
+		}
+		count++;
+	}
 
 	mutex_lock(&info->trans_mutex);
 	cur_trans = info->running_transaction;
@@ -424,9 +453,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	u64 old_root_bytenr;
 	struct btrfs_root *tree_root = root->fs_info->tree_root;
 
-	btrfs_extent_post_op(trans, root);
 	btrfs_write_dirty_block_groups(trans, root);
-	btrfs_extent_post_op(trans, root);
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
 
 	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +468,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 				     btrfs_header_level(root->node));
 		btrfs_set_root_generation(&root->root_item, trans->transid);
 
-		btrfs_extent_post_op(trans, root);
-
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
 		BUG_ON(ret);
 		btrfs_write_dirty_block_groups(trans, root);
-		btrfs_extent_post_op(trans, root);
+
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		BUG_ON(ret);
 	}
 	return 0;
 }
@@ -459,15 +489,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *next;
 	struct extent_buffer *eb;
+	int ret;
 
-	btrfs_extent_post_op(trans, fs_info->tree_root);
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
 
 	eb = btrfs_lock_root_node(fs_info->tree_root);
-	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 
-	btrfs_extent_post_op(trans, fs_info->tree_root);
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
 
 	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +508,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 		root = list_entry(next, struct btrfs_root, dirty_list);
 
 		update_cowonly_root(trans, root);
+
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		BUG_ON(ret);
 	}
 	return 0;
 }
@@ -635,6 +671,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 }
 
 /*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+	DEFINE_WAIT(wait);
+
+	mutex_lock(&info->trans_mutex);
+	while (info->running_transaction &&
+	       info->running_transaction->delayed_refs.flushing) {
+		prepare_to_wait(&info->transaction_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&info->trans_mutex);
+		schedule();
+		mutex_lock(&info->trans_mutex);
+		finish_wait(&info->transaction_wait, &wait);
+	}
+	mutex_unlock(&info->trans_mutex);
+	return 0;
+}
+
+/*
  * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
  * all of them
  */
@@ -661,7 +722,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		atomic_inc(&root->fs_info->throttles);
 
 		while (1) {
+			/*
+			 * we don't want to jump in and create a bunch of
+			 * delayed refs if the transaction is starting to close
+			 */
+			wait_transaction_pre_flush(tree_root->fs_info);
 			trans = btrfs_start_transaction(tree_root, 1);
+
+			/*
+			 * we've joined a transaction, make sure it isn't
+			 * closing right now
+			 */
+			if (trans->transaction->delayed_refs.flushing) {
+				btrfs_end_transaction(trans, tree_root);
+				continue;
+			}
+
 			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
 			if (ret != -EAGAIN)
@@ -688,7 +764,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
+			mutex_lock(&root->fs_info->trans_mutex);
 			btrfs_record_root_in_trans(root);
+			mutex_unlock(&root->fs_info->trans_mutex);
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
@@ -764,7 +842,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	old = btrfs_lock_root_node(root);
-	btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+	btrfs_cow_block(trans, root, old, NULL, 0, &old);
 
 	btrfs_copy_root(trans, root, old, &tmp, objectid);
 	btrfs_tree_unlock(old);
@@ -892,12 +970,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
+	int should_grow = 0;
+	unsigned long now = get_seconds();
+	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
+
+	btrfs_run_ordered_operations(root, 0);
+
+	/* make a pass through all the delayed refs we have so far
+	 * any runnings procs may add more while we are here
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	BUG_ON(ret);
+
+	cur_trans = trans->transaction;
+	/*
+	 * set the flushing flag so procs in this transaction have to
+	 * start sending their work down.
+	 */
+	cur_trans->delayed_refs.flushing = 1;
+
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	BUG_ON(ret);
 
-	INIT_LIST_HEAD(&dirty_fs_roots);
 	mutex_lock(&root->fs_info->trans_mutex);
-	if (trans->transaction->in_commit) {
-		cur_trans = trans->transaction;
-		trans->transaction->use_count++;
+	INIT_LIST_HEAD(&dirty_fs_roots);
+	if (cur_trans->in_commit) {
+		cur_trans->use_count++;
 		mutex_unlock(&root->fs_info->trans_mutex);
 		btrfs_end_transaction(trans, root);
 
@@ -920,7 +1018,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	trans->transaction->in_commit = 1;
 	trans->transaction->blocked = 1;
-	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
 					struct btrfs_transaction, list);
@@ -935,6 +1032,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+		should_grow = 1;
+
 	do {
 		int snap_pending = 0;
 		joined = cur_trans->num_joined;
@@ -947,26 +1047,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		if (cur_trans->num_writers > 1)
 			timeout = MAX_SCHEDULE_TIMEOUT;
-		else
+		else if (should_grow)
 			timeout = 1;
 
 		mutex_unlock(&root->fs_info->trans_mutex);
 
-		if (snap_pending) {
+		if (flush_on_commit || snap_pending) {
+			if (flush_on_commit)
+				btrfs_start_delalloc_inodes(root);
 			ret = btrfs_wait_ordered_extents(root, 1);
 			BUG_ON(ret);
 		}
 
-		schedule_timeout(timeout);
+		/*
+		 * rename don't use btrfs_join_transaction, so, once we
+		 * set the transaction to blocked above, we aren't going
+		 * to get any new ordered operations.  We can safely run
+		 * it here and no for sure that nothing new will be added
+		 * to the list
+		 */
+		btrfs_run_ordered_operations(root, 1);
+
+		smp_mb();
+		if (cur_trans->num_writers > 1 || should_grow)
+			schedule_timeout(timeout);
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
 	} while (cur_trans->num_writers > 1 ||
-		 (cur_trans->num_joined != joined));
+		 (should_grow && cur_trans->num_joined != joined));
 
 	ret = create_pending_snapshots(trans, root->fs_info);
 	BUG_ON(ret);
 
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	BUG_ON(ret);
+
 	WARN_ON(cur_trans != trans->transaction);
 
 	/* btrfs_commit_tree_roots is responsible for getting the
@@ -1030,6 +1146,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_copy_pinned(root, pinned_copy);
 
 	trans->transaction->blocked = 0;
+
 	wake_up(&root->fs_info->transaction_throttle);
 	wake_up(&root->fs_info->transaction_wait);
 
@@ -1056,6 +1173,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_lock(&root->fs_info->trans_mutex);
 
 	cur_trans->commit_done = 1;
+
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "delayed-ref.h"
 
 struct btrfs_transaction {
 	u64 transid;
+	/*
+	 * total writers in this transaction, it must be zero before the
+	 * transaction can end
+	 */
 	unsigned long num_writers;
+
 	unsigned long num_joined;
 	int in_commit;
 	int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
+	struct btrfs_delayed_ref_root delayed_refs;
 };
 
 struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
 	u64 block_group;
 	u64 alloc_exclude_start;
 	u64 alloc_exclude_nr;
+	unsigned long delayed_ref_updates;
 };
 
 struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_release_path(root, path);
-	if (is_extent)
-		btrfs_extent_post_op(trans, root);
 out:
 	if (path)
 		btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 20794290256b..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
 #define LOG_INODE_EXISTS 1
 
 /*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2).  After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ *  2a is actually the more important variant.  With the extra logging
+ *  a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir.  After a crash the rm -rf must
+ * be replayed.  This must be able to recurse down the entire
+ * directory tree.  The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
  * stages for the tree walking.  The first
  * stage (0) is to only pin down the blocks we find
  * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
 #define LOG_WALK_REPLAY_INODES 1
 #define LOG_WALK_REPLAY_ALL 2
 
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
 			     int inode_only);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid, int del_all);
 
 /*
  * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
 }
 
 /*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+	int ret = -ENOENT;
+
+	mutex_lock(&root->log_mutex);
+	atomic_inc(&root->log_writers);
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
+/*
  * indicate we're done making changes to the log tree
  * and wake up anyone waiting to do a sync
  */
-static int end_log_trans(struct btrfs_root *root)
+int btrfs_end_log_trans(struct btrfs_root *root)
 {
 	if (atomic_dec_and_test(&root->log_writers)) {
 		smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
 			      struct extent_buffer *eb,
 			      struct walk_control *wc, u64 gen)
 {
-	if (wc->pin) {
-		mutex_lock(&log->fs_info->pinned_mutex);
+	if (wc->pin)
 		btrfs_update_pinned_extents(log->fs_info->extent_root,
 					    eb->start, eb->len, 1);
-		mutex_unlock(&log->fs_info->pinned_mutex);
-	}
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
 		if (wc->write)
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 
 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
 	BUG_ON(ret);
+
 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 	BUG_ON(ret);
 	kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
 					    victim_name_len)) {
 				btrfs_inc_nlink(inode);
 				btrfs_release_path(root, path);
+
 				ret = btrfs_unlink_inode(trans, root, dir,
 							 inode, victim_name,
 							 victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		key.offset--;
 		btrfs_release_path(root, path);
 	}
-	btrfs_free_path(path);
+	btrfs_release_path(root, path);
 	if (nlink != inode->i_nlink) {
 		inode->i_nlink = nlink;
 		btrfs_update_inode(trans, root, inode);
 	}
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 
+	if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+		ret = replay_dir_deletes(trans, root, NULL, path,
+					 inode->i_ino, 1);
+		BUG_ON(ret);
+	}
+	btrfs_free_path(path);
+
 	return 0;
 }
 
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 
 		iput(inode);
 
-		if (key.offset == 0)
-			break;
-		key.offset--;
+		/*
+		 * fixup on a directory may create new entries,
+		 * make sure we always look for the highset possible
+		 * offset
+		 */
+		key.offset = (u64)-1;
 	}
 	btrfs_release_path(root, path);
 	return 0;
@@ -1150,8 +1222,7 @@ insert:
 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
 			      name, name_len, log_type, &log_key);
 
-	if (ret && ret != -ENOENT)
-		BUG();
+	BUG_ON(ret && ret != -ENOENT);
 	goto out;
 }
 
@@ -1313,11 +1384,11 @@ again:
 		read_extent_buffer(eb, name, (unsigned long)(di + 1),
 				  name_len);
 		log_di = NULL;
-		if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
 			log_di = btrfs_lookup_dir_item(trans, log, log_path,
 						       dir_key->objectid,
 						       name, name_len, 0);
-		} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
 			log_di = btrfs_lookup_dir_index_item(trans, log,
 						     log_path,
 						     dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct btrfs_root *log,
 				       struct btrfs_path *path,
-				       u64 dirid)
+				       u64 dirid, int del_all)
 {
 	u64 range_start;
 	u64 range_end;
@@ -1408,10 +1479,14 @@ again:
 	range_start = 0;
 	range_end = 0;
 	while (1) {
-		ret = find_dir_range(log, path, dirid, key_type,
-				     &range_start, &range_end);
-		if (ret != 0)
-			break;
+		if (del_all)
+			range_end = (u64)-1;
+		else {
+			ret = find_dir_range(log, path, dirid, key_type,
+					     &range_start, &range_end);
+			if (ret != 0)
+				break;
+		}
 
 		dir_key.offset = range_start;
 		while (1) {
@@ -1437,7 +1512,8 @@ again:
 				break;
 
 			ret = check_item_in_log(trans, root, log, path,
-						log_path, dir, &found_key);
+						log_path, dir,
+						&found_key);
 			BUG_ON(ret);
 			if (found_key.offset == (u64)-1)
 				break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			mode = btrfs_inode_mode(eb, inode_item);
 			if (S_ISDIR(mode)) {
 				ret = replay_dir_deletes(wc->trans,
-					 root, log, path, key.objectid);
+					 root, log, path, key.objectid, 0);
 				BUG_ON(ret);
 			}
 			ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 					root, inode, inode->i_size,
 					BTRFS_EXTENT_DATA_KEY);
 				BUG_ON(ret);
+
+				/* if the nlink count is zero here, the iput
+				 * will free the inode.  We bump it to make
+				 * sure it doesn't get freed until the link
+				 * count fixup is done
+				 */
+				if (inode->i_nlink == 0) {
+					btrfs_inc_nlink(inode);
+					btrfs_update_inode(wc->trans,
+							   root, inode);
+				}
 				iput(inode);
 			}
 			ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long transid)
 {
 	DEFINE_WAIT(wait);
 	int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
 		prepare_to_wait(&root->log_commit_wait[index],
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
-		if (root->log_transid < transid + 2 &&
+
+		if (root->fs_info->last_trans_log_full_commit !=
+		    trans->transid && root->log_transid < transid + 2 &&
 		    atomic_read(&root->log_commit[index]))
 			schedule();
+
 		finish_wait(&root->log_commit_wait[index], &wait);
 		mutex_lock(&root->log_mutex);
 	} while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
 	return 0;
 }
 
-static int wait_for_writer(struct btrfs_root *root)
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root)
 {
 	DEFINE_WAIT(wait);
 	while (atomic_read(&root->log_writers)) {
 		prepare_to_wait(&root->log_writer_wait,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
-		if (atomic_read(&root->log_writers))
+		if (root->fs_info->last_trans_log_full_commit !=
+		    trans->transid && atomic_read(&root->log_writers))
 			schedule();
 		mutex_lock(&root->log_mutex);
 		finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
 /*
  * btrfs_sync_log does sends a given tree log down to the disk and
  * updates the super blocks to record it.  When this call is done,
- * you know that any inodes previously logged are safely on disk
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
  */
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root)
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_lock(&root->log_mutex);
 	index1 = root->log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
-		wait_log_commit(root, root->log_transid);
+		wait_log_commit(trans, root, root->log_transid);
 		mutex_unlock(&root->log_mutex);
 		return 0;
 	}
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	/* wait for previous tree log sync to complete */
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-		wait_log_commit(root, root->log_transid - 1);
+		wait_log_commit(trans, root, root->log_transid - 1);
 
 	while (1) {
 		unsigned long batch = root->log_batch;
 		mutex_unlock(&root->log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&root->log_mutex);
-		wait_for_writer(root);
+
+		wait_for_writer(trans, root);
 		if (batch == root->log_batch)
 			break;
 	}
 
+	/* bail out if we need to do a full commit */
+	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		ret = -EAGAIN;
+		mutex_unlock(&root->log_mutex);
+		goto out;
+	}
+
 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
 
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
-		wait_log_commit(log_root_tree, log_root_tree->log_transid);
+		wait_log_commit(trans, log_root_tree,
+				log_root_tree->log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out;
 	}
 	atomic_set(&log_root_tree->log_commit[index2], 1);
 
-	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
-		wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+		wait_log_commit(trans, log_root_tree,
+				log_root_tree->log_transid - 1);
+	}
+
+	wait_for_writer(trans, log_root_tree);
 
-	wait_for_writer(log_root_tree);
+	/*
+	 * now that we've moved on to the tree of log tree roots,
+	 * check the full commit flag again
+	 */
+	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = -EAGAIN;
+		goto out_wake_log_root;
+	}
 
 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
 				&log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * in and cause problems either.
 	 */
 	write_ctree_super(trans, root->fs_info->tree_root, 2);
+	ret = 0;
 
+out_wake_log_root:
 	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
 	return 0;
 }
 
-/* * free all the extents used by the tree log.  This should be called
+/*
+ * free all the extents used by the tree log.  This should be called
  * at commit time of the full transaction
  */
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 
 	btrfs_free_path(path);
 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
-	end_log_trans(root);
+	btrfs_end_log_trans(root);
 
 	return 0;
 }
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
 				  dirid, &index);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
-	end_log_trans(root);
+	btrfs_end_log_trans(root);
 
 	return ret;
 }
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
  *
  * This handles both files and directories.
  */
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
 			     int inode_only)
 {
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	min_key.offset = 0;
 
 	max_key.objectid = inode->i_ino;
+
+	/* today the code can only do partial logging of directories */
+	if (!S_ISDIR(inode->i_mode))
+	    inode_only = LOG_INODE_ALL;
+
 	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
 	else
 		max_key.type = (u8)-1;
 	max_key.offset = (u64)-1;
 
-	/*
-	 * if this inode has already been logged and we're in inode_only
-	 * mode, we don't want to delete the things that have already
-	 * been written to the log.
-	 *
-	 * But, if the inode has been through an inode_only log,
-	 * the logged_trans field is not set.  This allows us to catch
-	 * any new names for this inode in the backrefs by logging it
-	 * again
-	 */
-	if (inode_only == LOG_INODE_EXISTS &&
-	    BTRFS_I(inode)->logged_trans == trans->transid) {
-		btrfs_free_path(path);
-		btrfs_free_path(dst_path);
-		goto out;
-	}
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
 	/*
@@ -2693,7 +2806,6 @@ next_slot:
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
-		BTRFS_I(inode)->log_dirty_trans = 0;
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		BUG_ON(ret);
 	}
@@ -2702,19 +2814,69 @@ next_slot:
 
 	btrfs_free_path(path);
 	btrfs_free_path(dst_path);
-out:
 	return 0;
 }
 
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    int inode_only)
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged.  Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+					       struct inode *inode,
+					       struct dentry *parent,
+					       struct super_block *sb,
+					       u64 last_committed)
 {
-	int ret;
+	int ret = 0;
+	struct btrfs_root *root;
 
-	start_log_trans(trans, root);
-	ret = __btrfs_log_inode(trans, root, inode, inode_only);
-	end_log_trans(root);
+	/*
+	 * for regular files, if its inode is already on disk, we don't
+	 * have to worry about the parents at all.  This is because
+	 * we can use the last_unlink_trans field to record renames
+	 * and other fun in this file.
+	 */
+	if (S_ISREG(inode->i_mode) &&
+	    BTRFS_I(inode)->generation <= last_committed &&
+	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
+			goto out;
+
+	if (!S_ISDIR(inode->i_mode)) {
+		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+			goto out;
+		inode = parent->d_inode;
+	}
+
+	while (1) {
+		BTRFS_I(inode)->logged_trans = trans->transid;
+		smp_mb();
+
+		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+			root = BTRFS_I(inode)->root;
+
+			/*
+			 * make sure any commits to the log are forced
+			 * to be full commits
+			 */
+			root->fs_info->last_trans_log_full_commit =
+				trans->transid;
+			ret = 1;
+			break;
+		}
+
+		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+			break;
+
+		if (parent == sb->s_root)
+			break;
+
+		parent = parent->d_parent;
+		inode = parent->d_inode;
+
+	}
+out:
 	return ret;
 }
 
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
  * only logging is done of any parent directories that are older than
  * the last committed transaction
  */
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct dentry *dentry)
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    struct dentry *parent, int exists_only)
 {
-	int inode_only = LOG_INODE_ALL;
+	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
-	int ret;
+	int ret = 0;
+	u64 last_committed = root->fs_info->last_trans_committed;
+
+	sb = inode->i_sb;
+
+	if (btrfs_test_opt(root, NOTREELOG)) {
+		ret = 1;
+		goto end_no_trans;
+	}
+
+	if (root->fs_info->last_trans_log_full_commit >
+	    root->fs_info->last_trans_committed) {
+		ret = 1;
+		goto end_no_trans;
+	}
+
+	ret = check_parent_dirs_for_sync(trans, inode, parent,
+					 sb, last_committed);
+	if (ret)
+		goto end_no_trans;
 
 	start_log_trans(trans, root);
-	sb = dentry->d_inode->i_sb;
-	while (1) {
-		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
-					inode_only);
-		BUG_ON(ret);
-		inode_only = LOG_INODE_EXISTS;
 
-		dentry = dentry->d_parent;
-		if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+	ret = btrfs_log_inode(trans, root, inode, inode_only);
+	BUG_ON(ret);
+
+	/*
+	 * for regular files, if its inode is already on disk, we don't
+	 * have to worry about the parents at all.  This is because
+	 * we can use the last_unlink_trans field to record renames
+	 * and other fun in this file.
+	 */
+	if (S_ISREG(inode->i_mode) &&
+	    BTRFS_I(inode)->generation <= last_committed &&
+	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
+			goto no_parent;
+
+	inode_only = LOG_INODE_EXISTS;
+	while (1) {
+		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
 
-		if (BTRFS_I(dentry->d_inode)->generation <=
-		    root->fs_info->last_trans_committed)
+		inode = parent->d_inode;
+		if (BTRFS_I(inode)->generation >
+		    root->fs_info->last_trans_committed) {
+			ret = btrfs_log_inode(trans, root, inode, inode_only);
+			BUG_ON(ret);
+		}
+		if (parent == sb->s_root)
 			break;
+
+		parent = parent->d_parent;
 	}
-	end_log_trans(root);
-	return 0;
+no_parent:
+	ret = 0;
+	btrfs_end_log_trans(root);
+end_no_trans:
+	return ret;
 }
 
 /*
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry)
 {
-	u64 gen;
-	gen = root->fs_info->last_trans_new_blockgroup;
-	if (gen > root->fs_info->last_trans_committed)
-		return 1;
-	else
-		return btrfs_log_dentry(trans, root, dentry);
+	return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+				      dentry->d_parent, 0);
 }
 
 /*
@@ -2832,7 +3029,9 @@ again:
 		BUG_ON(!wc.replay_dest);
 
 		wc.replay_dest->log_root = log;
+		mutex_lock(&fs_info->trans_mutex);
 		btrfs_record_root_in_trans(wc.replay_dest);
+		mutex_unlock(&fs_info->trans_mutex);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
 
@@ -2882,3 +3081,94 @@ again:
 	kfree(log_root_tree);
 	return 0;
 }
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+			     struct inode *dir, struct inode *inode,
+			     int for_rename)
+{
+	/*
+	 * when we're logging a file, if it hasn't been renamed
+	 * or unlinked, and its inode is fully committed on disk,
+	 * we don't have to worry about walking up the directory chain
+	 * to log its parents.
+	 *
+	 * So, we use the last_unlink_trans field to put this transid
+	 * into the file.  When the file is logged we check it and
+	 * don't log the parents if the file is fully on disk.
+	 */
+	if (S_ISREG(inode->i_mode))
+		BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+	/*
+	 * if this directory was already logged any new
+	 * names for this file/dir will get recorded
+	 */
+	smp_mb();
+	if (BTRFS_I(dir)->logged_trans == trans->transid)
+		return;
+
+	/*
+	 * if the inode we're about to unlink was logged,
+	 * the log will be properly updated for any new names
+	 */
+	if (BTRFS_I(inode)->logged_trans == trans->transid)
+		return;
+
+	/*
+	 * when renaming files across directories, if the directory
+	 * there we're unlinking from gets fsync'd later on, there's
+	 * no way to find the destination directory later and fsync it
+	 * properly.  So, we have to be conservative and force commits
+	 * so the new name gets discovered.
+	 */
+	if (for_rename)
+		goto record;
+
+	/* we can safely do the unlink without any special recording */
+	return;
+
+record:
+	BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+			struct inode *inode, struct inode *old_dir,
+			struct dentry *parent)
+{
+	struct btrfs_root * root = BTRFS_I(inode)->root;
+
+	/*
+	 * this will force the logging code to walk the dentry chain
+	 * up for the file
+	 */
+	if (S_ISREG(inode->i_mode))
+		BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+	/*
+	 * if this inode hasn't been logged and directory we're renaming it
+	 * from hasn't been logged, we don't need to log it
+	 */
+	if (BTRFS_I(inode)->logged_trans <=
+	    root->fs_info->last_trans_committed &&
+	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+		    root->fs_info->last_trans_committed))
+		return 0;
+
+	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct dentry *dentry);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry);
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    int inode_only);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       const char *name, int name_len,
 			       struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+			     struct inode *dir, struct inode *inode,
+			     int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+			struct inode *inode, struct inode *old_dir,
+			struct dentry *parent);
 #endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bcd14ebccae1..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/iocontext.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	int again = 0;
 	unsigned long num_run = 0;
 	unsigned long limit;
+	unsigned long last_waited = 0;
 
-	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+	bdi = blk_get_backing_dev_info(device->bdev);
 	fs_info = device->dev_root->fs_info;
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
 		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct bio *old_head;
+			struct io_context *ioc;
 
+			ioc = current->io_context;
+
+			/*
+			 * the main goal here is that we don't want to
+			 * block if we're going to be able to submit
+			 * more requests without blocking.
+			 *
+			 * This code does two great things, it pokes into
+			 * the elevator code from a filesystem _and_
+			 * it makes assumptions about how batching works.
+			 */
+			if (ioc && ioc->nr_batch_requests > 0 &&
+			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
+			    (last_waited == 0 ||
+			     ioc->last_waited == last_waited)) {
+				/*
+				 * we want to go through our batch of
+				 * requests and stop.  So, we copy out
+				 * the ioc->last_waited time and test
+				 * against it before looping
+				 */
+				last_waited = ioc->last_waited;
+				continue;
+			}
 			spin_lock(&device->io_lock);
 
 			old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
 	if (device->pending_bios)
 		goto loop_lock;
 	spin_unlock(&device->io_lock);
+
+	/*
+	 * IO has already been through a long path to get here.  Checksumming,
+	 * async helper threads, perhaps compression.  We've done a pretty
+	 * good job of collecting a batch of IO and should just unplug
+	 * the device right away.
+	 *
+	 * This will help anyone who is waiting on the IO, they might have
+	 * already unplugged, but managed to do so before the bio they
+	 * cared about found its way down here.
+	 */
+	blk_run_backing_dev(bdi, NULL);
 done:
 	return 0;
 }
@@ -1374,6 +1413,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		ret = btrfs_add_device(trans, root, device);
 	}
 
+	/*
+	 * we've got more storage, clear any full flags on the space
+	 * infos
+	 */
+	btrfs_clear_space_info_full(root->fs_info);
+
 	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 
@@ -1459,6 +1504,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 	device->fs_devices->total_rw_bytes += diff;
 
 	device->total_bytes = new_size;
+	btrfs_clear_space_info_full(device->dev_root->fs_info);
+
 	return btrfs_update_device(trans, device);
 }
 
@@ -2894,10 +2941,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		free_extent_map(em);
 	}
 
-	map = kzalloc(sizeof(*map), GFP_NOFS);
-	if (!map)
-		return -ENOMEM;
-
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
@@ -3106,6 +3149,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	if (!sb)
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
+	btrfs_set_buffer_lockdep_class(sb, 0);
+
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
 struct btrfs_fs_devices {
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
 
-	/* the device with this id has the most recent coyp of the super */
+	/* the device with this id has the most recent copy of the super */
 	u64 latest_devid;
 	u64 latest_trans;
 	u64 num_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index 665d446b25bc..6e35762b6169 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -166,151 +166,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * Write out and wait upon all the dirty data associated with a block
- * device via its mapping.  Does not take the superblock lock.
- */
-int sync_blockdev(struct block_device *bdev)
-{
-	int ret = 0;
-
-	if (bdev)
-		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	return ret;
-}
-EXPORT_SYMBOL(sync_blockdev);
-
-/*
- * Write out and wait upon all dirty data associated with this
- * device.   Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_bdev(struct block_device *bdev)
-{
-	struct super_block *sb = get_super(bdev);
-	if (sb) {
-		int res = fsync_super(sb);
-		drop_super(sb);
-		return res;
-	}
-	return sync_blockdev(bdev);
-}
-
-/**
- * freeze_bdev  --  lock a filesystem and force it into a consistent state
- * @bdev:	blockdevice to lock
- *
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
- */
-struct super_block *freeze_bdev(struct block_device *bdev)
-{
-	struct super_block *sb;
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		bdev->bd_fsfreeze_count++;
-		sb = get_super(bdev);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
-	bdev->bd_fsfreeze_count++;
-
-	down(&bdev->bd_mount_sem);
-	sb = get_super(bdev);
-	if (sb && !(sb->s_flags & MS_RDONLY)) {
-		sb->s_frozen = SB_FREEZE_WRITE;
-		smp_wmb();
-
-		__fsync_super(sb);
-
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
-
-		sync_blockdev(sb->s_bdev);
-
-		if (sb->s_op->freeze_fs) {
-			error = sb->s_op->freeze_fs(sb);
-			if (error) {
-				printk(KERN_ERR
-					"VFS:Filesystem freeze failed\n");
-				sb->s_frozen = SB_UNFROZEN;
-				drop_super(sb);
-				up(&bdev->bd_mount_sem);
-				bdev->bd_fsfreeze_count--;
-				mutex_unlock(&bdev->bd_fsfreeze_mutex);
-				return ERR_PTR(error);
-			}
-		}
-	}
-
-	sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
-	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
-}
-EXPORT_SYMBOL(freeze_bdev);
-
-/**
- * thaw_bdev  -- unlock filesystem
- * @bdev:	blockdevice to unlock
- * @sb:		associated superblock
- *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
- */
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
-{
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (!bdev->bd_fsfreeze_count) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return -EINVAL;
-	}
-
-	bdev->bd_fsfreeze_count--;
-	if (bdev->bd_fsfreeze_count > 0) {
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return 0;
-	}
-
-	if (sb) {
-		BUG_ON(sb->s_bdev != bdev);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			if (sb->s_op->unfreeze_fs) {
-				error = sb->s_op->unfreeze_fs(sb);
-				if (error) {
-					printk(KERN_ERR
-						"VFS:Filesystem thaw failed\n");
-					sb->s_frozen = SB_FREEZE_TRANS;
-					bdev->bd_fsfreeze_count++;
-					mutex_unlock(&bdev->bd_fsfreeze_mutex);
-					return error;
-				}
-			}
-			sb->s_frozen = SB_UNFROZEN;
-			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
-		}
-		drop_super(sb);
-	}
-
-	up(&bdev->bd_mount_sem);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return 0;
-}
-EXPORT_SYMBOL(thaw_bdev);
-
-/*
  * Various filesystems appear to want __find_get_block to be non-blocking.
  * But it's the page lock which protects the buffers.  To get around this,
  * we get exclusion from try_to_free_buffers with the blockdev mapping's
@@ -344,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	head = page_buffers(page);
 	bh = head;
 	do {
-		if (bh->b_blocknr == block) {
+		if (!buffer_mapped(bh))
+			all_mapped = 0;
+		else if (bh->b_blocknr == block) {
 			ret = bh;
 			get_bh(bh);
 			goto out_unlock;
 		}
-		if (!buffer_mapped(bh))
-			all_mapped = 0;
 		bh = bh->b_this_page;
 	} while (bh != head);
 
@@ -435,7 +290,7 @@ static void free_more_memory(void)
 						&zone);
 		if (zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS);
+						GFP_NOFS, NULL);
 	}
 }
 
@@ -692,6 +547,39 @@ repeat:
 	return err;
 }
 
+void do_thaw_all(unsigned long unused)
+{
+	struct super_block *sb;
+	char b[BDEVNAME_SIZE];
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+			printk(KERN_WARNING "Emergency Thaw on %s\n",
+			       bdevname(sb->s_bdev, b));
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+	pdflush_operation(do_thaw_all, 0);
+}
+
 /**
  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
  * @mapping: the mapping which wants those buffers written
@@ -760,32 +648,18 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
  */
-static int __set_page_dirty(struct page *page,
+static void __set_page_dirty(struct page *page,
 		struct address_space *mapping, int warn)
 {
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
-
-	if (TestSetPageDirty(page))
-		return 0;
-
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-
-		if (mapping_cap_account_dirty(mapping)) {
-			__inc_zone_page_state(page, NR_FILE_DIRTY);
-			__inc_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
-			task_io_account_write(PAGE_CACHE_SIZE);
-		}
+		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
 	spin_unlock_irq(&mapping->tree_lock);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-	return 1;
 }
 
 /*
@@ -815,6 +689,7 @@ static int __set_page_dirty(struct page *page,
  */
 int __set_page_dirty_buffers(struct page *page)
 {
+	int newly_dirty;
 	struct address_space *mapping = page_mapping(page);
 
 	if (unlikely(!mapping))
@@ -830,9 +705,12 @@ int __set_page_dirty_buffers(struct page *page)
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
+	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
-	return __set_page_dirty(page, mapping, 1);
+	if (newly_dirty)
+		__set_page_dirty(page, mapping, 1);
+	return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
 
@@ -859,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping;
+	struct address_space *mapping, *prev_mapping = NULL;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -884,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * contents - it is a noop if I/O is still in
 				 * flight on potentially older contents.
 				 */
-				ll_rw_block(SWRITE_SYNC, 1, &bh);
+				ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+
+				/*
+				 * Kick off IO for the previous mapping. Note
+				 * that we will not run the very last mapping,
+				 * wait_on_buffer() will do that for us
+				 * through sync_buffer().
+				 */
+				if (prev_mapping && prev_mapping != mapping)
+					blk_run_address_space(prev_mapping);
+				prev_mapping = mapping;
+
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -1261,8 +1150,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
 			return;
 	}
 
-	if (!test_set_buffer_dirty(bh))
-		__set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
+	if (!test_set_buffer_dirty(bh)) {
+		struct page *page = bh->b_page;
+		if (!TestSetPageDirty(page))
+			__set_page_dirty(page, page_mapping(page), 0);
+	}
 }
 
 /*
@@ -1714,6 +1606,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
@@ -1805,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(WRITE, bh);
+			submit_bh(write_op, bh);
 			nr_underway++;
 		}
 		bh = next;
@@ -1859,7 +1752,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(WRITE, bh);
+			submit_bh(write_op, bh);
 			nr_underway++;
 		}
 		bh = next;
@@ -2465,13 +2358,14 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
  * unlock the page.
  */
 int
-block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		   get_block_t get_block)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = -EINVAL;
+	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 
 	lock_page(page);
 	size = i_size_read(inode);
@@ -2491,6 +2385,13 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
+	if (unlikely(ret)) {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
+	}
+
 out_unlock:
 	unlock_page(page);
 	return ret;
@@ -3067,12 +2968,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (rw == SWRITE || rw == SWRITE_SYNC)
+		if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
 			lock_buffer(bh);
 		else if (!trylock_buffer(bh))
 			continue;
 
-		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
+		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
+		    rw == SWRITE_SYNC_PLUG) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
@@ -3426,7 +3328,6 @@ EXPORT_SYMBOL(cont_write_begin);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
-EXPORT_SYMBOL(fsync_bdev);
 EXPORT_SYMBOL(generic_block_bmap);
 EXPORT_SYMBOL(generic_cont_expand_simple);
 EXPORT_SYMBOL(init_buffer);
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 000000000000..80e9c6167f0b
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
+
+config CACHEFILES
+	tristate "Filesystem caching on files"
+	depends on FSCACHE && BLOCK
+	help
+	  This permits use of a mounted filesystem as a cache for other
+	  filesystems - primarily networking filesystems - thus allowing fast
+	  local disk to enhance the speed of slower devices.
+
+	  See Documentation/filesystems/caching/cachefiles.txt for more
+	  information.
+
+config CACHEFILES_DEBUG
+	bool "Debug CacheFiles"
+	depends on CACHEFILES
+	help
+	  This permits debugging to be dynamically enabled in the filesystem
+	  caching on files module.  If this is set, the debugging output may be
+	  enabled by setting bits in /sys/modules/cachefiles/parameter/debug or
+	  by including a debugging specifier in /etc/cachefilesd.conf.
+
+config CACHEFILES_HISTOGRAM
+	bool "Gather latency information on CacheFiles"
+	depends on CACHEFILES && PROC_FS
+	help
+
+	  This option causes latency information to be gathered on CacheFiles
+	  operation and exported through file:
+
+		/proc/fs/cachefiles/histogram
+
+	  The generation of this histogram adds a certain amount of overhead to
+	  execution as there are a number of points at which data is gathered,
+	  and on a multi-CPU system these may be on cachelines that keep
+	  bouncing between CPUs.  On the other hand, the histogram may be
+	  useful for debugging purposes.  Saying 'N' here is recommended.
+
+	  See Documentation/filesystems/caching/cachefiles.txt for more
+	  information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 000000000000..32cbab0ffce3
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
+#
+# Makefile for caching in a mounted filesystem
+#
+
+cachefiles-y := \
+	bind.o \
+	daemon.o \
+	interface.o \
+	key.o \
+	main.o \
+	namei.o \
+	rdwr.o \
+	security.o \
+	xattr.o
+
+cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
+
+obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 000000000000..3797e0077b35
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
+/* Bind and unbind a cache from the filesystem backing it
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include "internal.h"
+
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);
+
+/*
+ * bind a directory as a cache
+ */
+int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
+{
+	_enter("{%u,%u,%u,%u,%u,%u},%s",
+	       cache->frun_percent,
+	       cache->fcull_percent,
+	       cache->fstop_percent,
+	       cache->brun_percent,
+	       cache->bcull_percent,
+	       cache->bstop_percent,
+	       args);
+
+	/* start by checking things over */
+	ASSERT(cache->fstop_percent >= 0 &&
+	       cache->fstop_percent < cache->fcull_percent &&
+	       cache->fcull_percent < cache->frun_percent &&
+	       cache->frun_percent  < 100);
+
+	ASSERT(cache->bstop_percent >= 0 &&
+	       cache->bstop_percent < cache->bcull_percent &&
+	       cache->bcull_percent < cache->brun_percent &&
+	       cache->brun_percent  < 100);
+
+	if (*args) {
+		kerror("'bind' command doesn't take an argument");
+		return -EINVAL;
+	}
+
+	if (!cache->rootdirname) {
+		kerror("No cache directory specified");
+		return -EINVAL;
+	}
+
+	/* don't permit already bound caches to be re-bound */
+	if (test_bit(CACHEFILES_READY, &cache->flags)) {
+		kerror("Cache already bound");
+		return -EBUSY;
+	}
+
+	/* make sure we have copies of the tag and dirname strings */
+	if (!cache->tag) {
+		/* the tag string is released by the fops->release()
+		 * function, so we don't release it on error here */
+		cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
+		if (!cache->tag)
+			return -ENOMEM;
+	}
+
+	/* add the cache */
+	return cachefiles_daemon_add_cache(cache);
+}
+
+/*
+ * add a cache
+ */
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
+{
+	struct cachefiles_object *fsdef;
+	struct nameidata nd;
+	struct kstatfs stats;
+	struct dentry *graveyard, *cachedir, *root;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter("");
+
+	/* we want to work under the module's security ID */
+	ret = cachefiles_get_security_ID(cache);
+	if (ret < 0)
+		return ret;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+
+	/* allocate the root index object */
+	ret = -ENOMEM;
+
+	fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+	if (!fsdef)
+		goto error_root_object;
+
+	ASSERTCMP(fsdef->backer, ==, NULL);
+
+	atomic_set(&fsdef->usage, 1);
+	fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
+
+	_debug("- fsdef %p", fsdef);
+
+	/* look up the directory at the root of the cache */
+	memset(&nd, 0, sizeof(nd));
+
+	ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
+	if (ret < 0)
+		goto error_open_root;
+
+	cache->mnt = mntget(nd.path.mnt);
+	root = dget(nd.path.dentry);
+	path_put(&nd.path);
+
+	/* check parameters */
+	ret = -EOPNOTSUPP;
+	if (!root->d_inode ||
+	    !root->d_inode->i_op ||
+	    !root->d_inode->i_op->lookup ||
+	    !root->d_inode->i_op->mkdir ||
+	    !root->d_inode->i_op->setxattr ||
+	    !root->d_inode->i_op->getxattr ||
+	    !root->d_sb ||
+	    !root->d_sb->s_op ||
+	    !root->d_sb->s_op->statfs ||
+	    !root->d_sb->s_op->sync_fs)
+		goto error_unsupported;
+
+	ret = -EROFS;
+	if (root->d_sb->s_flags & MS_RDONLY)
+		goto error_unsupported;
+
+	/* determine the security of the on-disk cache as this governs
+	 * security ID of files we create */
+	ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
+	if (ret < 0)
+		goto error_unsupported;
+
+	/* get the cache size and blocksize */
+	ret = vfs_statfs(root, &stats);
+	if (ret < 0)
+		goto error_unsupported;
+
+	ret = -ERANGE;
+	if (stats.f_bsize <= 0)
+		goto error_unsupported;
+
+	ret = -EOPNOTSUPP;
+	if (stats.f_bsize > PAGE_SIZE)
+		goto error_unsupported;
+
+	cache->bsize = stats.f_bsize;
+	cache->bshift = 0;
+	if (stats.f_bsize < PAGE_SIZE)
+		cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
+
+	_debug("blksize %u (shift %u)",
+	       cache->bsize, cache->bshift);
+
+	_debug("size %llu, avail %llu",
+	       (unsigned long long) stats.f_blocks,
+	       (unsigned long long) stats.f_bavail);
+
+	/* set up caching limits */
+	do_div(stats.f_files, 100);
+	cache->fstop = stats.f_files * cache->fstop_percent;
+	cache->fcull = stats.f_files * cache->fcull_percent;
+	cache->frun  = stats.f_files * cache->frun_percent;
+
+	_debug("limits {%llu,%llu,%llu} files",
+	       (unsigned long long) cache->frun,
+	       (unsigned long long) cache->fcull,
+	       (unsigned long long) cache->fstop);
+
+	stats.f_blocks >>= cache->bshift;
+	do_div(stats.f_blocks, 100);
+	cache->bstop = stats.f_blocks * cache->bstop_percent;
+	cache->bcull = stats.f_blocks * cache->bcull_percent;
+	cache->brun  = stats.f_blocks * cache->brun_percent;
+
+	_debug("limits {%llu,%llu,%llu} blocks",
+	       (unsigned long long) cache->brun,
+	       (unsigned long long) cache->bcull,
+	       (unsigned long long) cache->bstop);
+
+	/* get the cache directory and check its type */
+	cachedir = cachefiles_get_directory(cache, root, "cache");
+	if (IS_ERR(cachedir)) {
+		ret = PTR_ERR(cachedir);
+		goto error_unsupported;
+	}
+
+	fsdef->dentry = cachedir;
+	fsdef->fscache.cookie = NULL;
+
+	ret = cachefiles_check_object_type(fsdef);
+	if (ret < 0)
+		goto error_unsupported;
+
+	/* get the graveyard directory */
+	graveyard = cachefiles_get_directory(cache, root, "graveyard");
+	if (IS_ERR(graveyard)) {
+		ret = PTR_ERR(graveyard);
+		goto error_unsupported;
+	}
+
+	cache->graveyard = graveyard;
+
+	/* publish the cache */
+	fscache_init_cache(&cache->cache,
+			   &cachefiles_cache_ops,
+			   "%s",
+			   fsdef->dentry->d_sb->s_id);
+
+	fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
+
+	ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
+	if (ret < 0)
+		goto error_add_cache;
+
+	/* done */
+	set_bit(CACHEFILES_READY, &cache->flags);
+	dput(root);
+
+	printk(KERN_INFO "CacheFiles:"
+	       " File cache on %s registered\n",
+	       cache->cache.identifier);
+
+	/* check how much space the cache has */
+	cachefiles_has_space(cache, 0, 0);
+	cachefiles_end_secure(cache, saved_cred);
+	return 0;
+
+error_add_cache:
+	dput(cache->graveyard);
+	cache->graveyard = NULL;
+error_unsupported:
+	mntput(cache->mnt);
+	cache->mnt = NULL;
+	dput(fsdef->dentry);
+	fsdef->dentry = NULL;
+	dput(root);
+error_open_root:
+	kmem_cache_free(cachefiles_object_jar, fsdef);
+error_root_object:
+	cachefiles_end_secure(cache, saved_cred);
+	kerror("Failed to register: %d", ret);
+	return ret;
+}
+
+/*
+ * unbind a cache on fd release
+ */
+void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
+{
+	_enter("");
+
+	if (test_bit(CACHEFILES_READY, &cache->flags)) {
+		printk(KERN_INFO "CacheFiles:"
+		       " File cache on %s unregistering\n",
+		       cache->cache.identifier);
+
+		fscache_withdraw_cache(&cache->cache);
+	}
+
+	dput(cache->graveyard);
+	mntput(cache->mnt);
+
+	kfree(cache->rootdirname);
+	kfree(cache->secctx);
+	kfree(cache->tag);
+
+	_leave("");
+}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 000000000000..4618516dd994
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
+/* Daemon interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include <linux/fs_struct.h>
+#include "internal.h"
+
+static int cachefiles_daemon_open(struct inode *, struct file *);
+static int cachefiles_daemon_release(struct inode *, struct file *);
+static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
+				      loff_t *);
+static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
+				       size_t, loff_t *);
+static unsigned int cachefiles_daemon_poll(struct file *,
+					   struct poll_table_struct *);
+static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
+
+static unsigned long cachefiles_open;
+
+const struct file_operations cachefiles_daemon_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cachefiles_daemon_open,
+	.release	= cachefiles_daemon_release,
+	.read		= cachefiles_daemon_read,
+	.write		= cachefiles_daemon_write,
+	.poll		= cachefiles_daemon_poll,
+};
+
+struct cachefiles_daemon_cmd {
+	char name[8];
+	int (*handler)(struct cachefiles_cache *cache, char *args);
+};
+
+static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
+	{ "bind",	cachefiles_daemon_bind		},
+	{ "brun",	cachefiles_daemon_brun		},
+	{ "bcull",	cachefiles_daemon_bcull		},
+	{ "bstop",	cachefiles_daemon_bstop		},
+	{ "cull",	cachefiles_daemon_cull		},
+	{ "debug",	cachefiles_daemon_debug		},
+	{ "dir",	cachefiles_daemon_dir		},
+	{ "frun",	cachefiles_daemon_frun		},
+	{ "fcull",	cachefiles_daemon_fcull		},
+	{ "fstop",	cachefiles_daemon_fstop		},
+	{ "inuse",	cachefiles_daemon_inuse		},
+	{ "secctx",	cachefiles_daemon_secctx	},
+	{ "tag",	cachefiles_daemon_tag		},
+	{ "",		NULL				}
+};
+
+
+/*
+ * do various checks
+ */
+static int cachefiles_daemon_open(struct inode *inode, struct file *file)
+{
+	struct cachefiles_cache *cache;
+
+	_enter("");
+
+	/* only the superuser may do this */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* the cachefiles device may only be open once at a time */
+	if (xchg(&cachefiles_open, 1) == 1)
+		return -EBUSY;
+
+	/* allocate a cache record */
+	cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
+	if (!cache) {
+		cachefiles_open = 0;
+		return -ENOMEM;
+	}
+
+	mutex_init(&cache->daemon_mutex);
+	cache->active_nodes = RB_ROOT;
+	rwlock_init(&cache->active_lock);
+	init_waitqueue_head(&cache->daemon_pollwq);
+
+	/* set default caching limits
+	 * - limit at 1% free space and/or free files
+	 * - cull below 5% free space and/or free files
+	 * - cease culling above 7% free space and/or free files
+	 */
+	cache->frun_percent = 7;
+	cache->fcull_percent = 5;
+	cache->fstop_percent = 1;
+	cache->brun_percent = 7;
+	cache->bcull_percent = 5;
+	cache->bstop_percent = 1;
+
+	file->private_data = cache;
+	cache->cachefilesd = file;
+	return 0;
+}
+
+/*
+ * release a cache
+ */
+static int cachefiles_daemon_release(struct inode *inode, struct file *file)
+{
+	struct cachefiles_cache *cache = file->private_data;
+
+	_enter("");
+
+	ASSERT(cache);
+
+	set_bit(CACHEFILES_DEAD, &cache->flags);
+
+	cachefiles_daemon_unbind(cache);
+
+	ASSERT(!cache->active_nodes.rb_node);
+
+	/* clean up the control file interface */
+	cache->cachefilesd = NULL;
+	file->private_data = NULL;
+	cachefiles_open = 0;
+
+	kfree(cache);
+
+	_leave("");
+	return 0;
+}
+
+/*
+ * read the cache state
+ */
+static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
+				      size_t buflen, loff_t *pos)
+{
+	struct cachefiles_cache *cache = file->private_data;
+	char buffer[256];
+	int n;
+
+	//_enter(",,%zu,", buflen);
+
+	if (!test_bit(CACHEFILES_READY, &cache->flags))
+		return 0;
+
+	/* check how much space the cache has */
+	cachefiles_has_space(cache, 0, 0);
+
+	/* summarise */
+	clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+
+	n = snprintf(buffer, sizeof(buffer),
+		     "cull=%c"
+		     " frun=%llx"
+		     " fcull=%llx"
+		     " fstop=%llx"
+		     " brun=%llx"
+		     " bcull=%llx"
+		     " bstop=%llx",
+		     test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
+		     (unsigned long long) cache->frun,
+		     (unsigned long long) cache->fcull,
+		     (unsigned long long) cache->fstop,
+		     (unsigned long long) cache->brun,
+		     (unsigned long long) cache->bcull,
+		     (unsigned long long) cache->bstop
+		     );
+
+	if (n > buflen)
+		return -EMSGSIZE;
+
+	if (copy_to_user(_buffer, buffer, n) != 0)
+		return -EFAULT;
+
+	return n;
+}
+
+/*
+ * command the cache
+ */
+static ssize_t cachefiles_daemon_write(struct file *file,
+				       const char __user *_data,
+				       size_t datalen,
+				       loff_t *pos)
+{
+	const struct cachefiles_daemon_cmd *cmd;
+	struct cachefiles_cache *cache = file->private_data;
+	ssize_t ret;
+	char *data, *args, *cp;
+
+	//_enter(",,%zu,", datalen);
+
+	ASSERT(cache);
+
+	if (test_bit(CACHEFILES_DEAD, &cache->flags))
+		return -EIO;
+
+	if (datalen < 0 || datalen > PAGE_SIZE - 1)
+		return -EOPNOTSUPP;
+
+	/* drag the command string into the kernel so we can parse it */
+	data = kmalloc(datalen + 1, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(data, _data, datalen) != 0)
+		goto error;
+
+	data[datalen] = '\0';
+
+	ret = -EINVAL;
+	if (memchr(data, '\0', datalen))
+		goto error;
+
+	/* strip any newline */
+	cp = memchr(data, '\n', datalen);
+	if (cp) {
+		if (cp == data)
+			goto error;
+
+		*cp = '\0';
+	}
+
+	/* parse the command */
+	ret = -EOPNOTSUPP;
+
+	for (args = data; *args; args++)
+		if (isspace(*args))
+			break;
+	if (*args) {
+		if (args == data)
+			goto error;
+		*args = '\0';
+		for (args++; isspace(*args); args++)
+			continue;
+	}
+
+	/* run the appropriate command handler */
+	for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
+		if (strcmp(cmd->name, data) == 0)
+			goto found_command;
+
+error:
+	kfree(data);
+	//_leave(" = %zd", ret);
+	return ret;
+
+found_command:
+	mutex_lock(&cache->daemon_mutex);
+
+	ret = -EIO;
+	if (!test_bit(CACHEFILES_DEAD, &cache->flags))
+		ret = cmd->handler(cache, args);
+
+	mutex_unlock(&cache->daemon_mutex);
+
+	if (ret == 0)
+		ret = datalen;
+	goto error;
+}
+
+/*
+ * poll for culling state
+ * - use POLLOUT to indicate culling state
+ */
+static unsigned int cachefiles_daemon_poll(struct file *file,
+					   struct poll_table_struct *poll)
+{
+	struct cachefiles_cache *cache = file->private_data;
+	unsigned int mask;
+
+	poll_wait(file, &cache->daemon_pollwq, poll);
+	mask = 0;
+
+	if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+		mask |= POLLIN;
+
+	if (test_bit(CACHEFILES_CULLING, &cache->flags))
+		mask |= POLLOUT;
+
+	return mask;
+}
+
+/*
+ * give a range error for cache space constraints
+ * - can be tail-called
+ */
+static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
+					 char *args)
+{
+	kerror("Free space limits must be in range"
+	       " 0%%<=stop<cull<run<100%%");
+
+	return -EINVAL;
+}
+
+/*
+ * set the percentage of files at which to stop culling
+ * - command: "frun <N>%"
+ */
+static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long frun;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	frun = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (frun <= cache->fcull_percent || frun >= 100)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->frun_percent = frun;
+	return 0;
+}
+
+/*
+ * set the percentage of files at which to start culling
+ * - command: "fcull <N>%"
+ */
+static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long fcull;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	fcull = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->fcull_percent = fcull;
+	return 0;
+}
+
+/*
+ * set the percentage of files at which to stop allocating
+ * - command: "fstop <N>%"
+ */
+static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long fstop;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	fstop = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (fstop < 0 || fstop >= cache->fcull_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->fstop_percent = fstop;
+	return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop culling
+ * - command: "brun <N>%"
+ */
+static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long brun;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	brun = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (brun <= cache->bcull_percent || brun >= 100)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->brun_percent = brun;
+	return 0;
+}
+
+/*
+ * set the percentage of blocks at which to start culling
+ * - command: "bcull <N>%"
+ */
+static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long bcull;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	bcull = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->bcull_percent = bcull;
+	return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop allocating
+ * - command: "bstop <N>%"
+ */
+static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long bstop;
+
+	_enter(",%s", args);
+
+	if (!*args)
+		return -EINVAL;
+
+	bstop = simple_strtoul(args, &args, 10);
+	if (args[0] != '%' || args[1] != '\0')
+		return -EINVAL;
+
+	if (bstop < 0 || bstop >= cache->bcull_percent)
+		return cachefiles_daemon_range_error(cache, args);
+
+	cache->bstop_percent = bstop;
+	return 0;
+}
+
+/*
+ * set the cache directory
+ * - command: "dir <name>"
+ */
+static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
+{
+	char *dir;
+
+	_enter(",%s", args);
+
+	if (!*args) {
+		kerror("Empty directory specified");
+		return -EINVAL;
+	}
+
+	if (cache->rootdirname) {
+		kerror("Second cache directory specified");
+		return -EEXIST;
+	}
+
+	dir = kstrdup(args, GFP_KERNEL);
+	if (!dir)
+		return -ENOMEM;
+
+	cache->rootdirname = dir;
+	return 0;
+}
+
+/*
+ * set the cache security context
+ * - command: "secctx <ctx>"
+ */
+static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
+{
+	char *secctx;
+
+	_enter(",%s", args);
+
+	if (!*args) {
+		kerror("Empty security context specified");
+		return -EINVAL;
+	}
+
+	if (cache->secctx) {
+		kerror("Second security context specified");
+		return -EINVAL;
+	}
+
+	secctx = kstrdup(args, GFP_KERNEL);
+	if (!secctx)
+		return -ENOMEM;
+
+	cache->secctx = secctx;
+	return 0;
+}
+
+/*
+ * set the cache tag
+ * - command: "tag <name>"
+ */
+static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
+{
+	char *tag;
+
+	_enter(",%s", args);
+
+	if (!*args) {
+		kerror("Empty tag specified");
+		return -EINVAL;
+	}
+
+	if (cache->tag)
+		return -EEXIST;
+
+	tag = kstrdup(args, GFP_KERNEL);
+	if (!tag)
+		return -ENOMEM;
+
+	cache->tag = tag;
+	return 0;
+}
+
+/*
+ * request a node in the cache be culled from the current working directory
+ * - command: "cull <name>"
+ */
+static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
+{
+	struct fs_struct *fs;
+	struct dentry *dir;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter(",%s", args);
+
+	if (strchr(args, '/'))
+		goto inval;
+
+	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+		kerror("cull applied to unready cache");
+		return -EIO;
+	}
+
+	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+		kerror("cull applied to dead cache");
+		return -EIO;
+	}
+
+	/* extract the directory dentry from the cwd */
+	fs = current->fs;
+	read_lock(&fs->lock);
+	dir = dget(fs->pwd.dentry);
+	read_unlock(&fs->lock);
+
+	if (!S_ISDIR(dir->d_inode->i_mode))
+		goto notdir;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = cachefiles_cull(cache, dir, args);
+	cachefiles_end_secure(cache, saved_cred);
+
+	dput(dir);
+	_leave(" = %d", ret);
+	return ret;
+
+notdir:
+	dput(dir);
+	kerror("cull command requires dirfd to be a directory");
+	return -ENOTDIR;
+
+inval:
+	kerror("cull command requires dirfd and filename");
+	return -EINVAL;
+}
+
+/*
+ * set debugging mode
+ * - command: "debug <mask>"
+ */
+static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
+{
+	unsigned long mask;
+
+	_enter(",%s", args);
+
+	mask = simple_strtoul(args, &args, 0);
+	if (args[0] != '\0')
+		goto inval;
+
+	cachefiles_debug = mask;
+	_leave(" = 0");
+	return 0;
+
+inval:
+	kerror("debug command requires mask");
+	return -EINVAL;
+}
+
+/*
+ * find out whether an object in the current working directory is in use or not
+ * - command: "inuse <name>"
+ */
+static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
+{
+	struct fs_struct *fs;
+	struct dentry *dir;
+	const struct cred *saved_cred;
+	int ret;
+
+	//_enter(",%s", args);
+
+	if (strchr(args, '/'))
+		goto inval;
+
+	if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+		kerror("inuse applied to unready cache");
+		return -EIO;
+	}
+
+	if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+		kerror("inuse applied to dead cache");
+		return -EIO;
+	}
+
+	/* extract the directory dentry from the cwd */
+	fs = current->fs;
+	read_lock(&fs->lock);
+	dir = dget(fs->pwd.dentry);
+	read_unlock(&fs->lock);
+
+	if (!S_ISDIR(dir->d_inode->i_mode))
+		goto notdir;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = cachefiles_check_in_use(cache, dir, args);
+	cachefiles_end_secure(cache, saved_cred);
+
+	dput(dir);
+	//_leave(" = %d", ret);
+	return ret;
+
+notdir:
+	dput(dir);
+	kerror("inuse command requires dirfd to be a directory");
+	return -ENOTDIR;
+
+inval:
+	kerror("inuse command requires dirfd and filename");
+	return -EINVAL;
+}
+
+/*
+ * see if we have space for a number of pages and/or a number of files in the
+ * cache
+ */
+int cachefiles_has_space(struct cachefiles_cache *cache,
+			 unsigned fnr, unsigned bnr)
+{
+	struct kstatfs stats;
+	int ret;
+
+	//_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
+	//       (unsigned long long) cache->frun,
+	//       (unsigned long long) cache->fcull,
+	//       (unsigned long long) cache->fstop,
+	//       (unsigned long long) cache->brun,
+	//       (unsigned long long) cache->bcull,
+	//       (unsigned long long) cache->bstop,
+	//       fnr, bnr);
+
+	/* find out how many pages of blockdev are available */
+	memset(&stats, 0, sizeof(stats));
+
+	ret = vfs_statfs(cache->mnt->mnt_root, &stats);
+	if (ret < 0) {
+		if (ret == -EIO)
+			cachefiles_io_error(cache, "statfs failed");
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	stats.f_bavail >>= cache->bshift;
+
+	//_debug("avail %llu,%llu",
+	//       (unsigned long long) stats.f_ffree,
+	//       (unsigned long long) stats.f_bavail);
+
+	/* see if there is sufficient space */
+	if (stats.f_ffree > fnr)
+		stats.f_ffree -= fnr;
+	else
+		stats.f_ffree = 0;
+
+	if (stats.f_bavail > bnr)
+		stats.f_bavail -= bnr;
+	else
+		stats.f_bavail = 0;
+
+	ret = -ENOBUFS;
+	if (stats.f_ffree < cache->fstop ||
+	    stats.f_bavail < cache->bstop)
+		goto begin_cull;
+
+	ret = 0;
+	if (stats.f_ffree < cache->fcull ||
+	    stats.f_bavail < cache->bcull)
+		goto begin_cull;
+
+	if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
+	    stats.f_ffree >= cache->frun &&
+	    stats.f_bavail >= cache->brun &&
+	    test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
+	    ) {
+		_debug("cease culling");
+		cachefiles_state_changed(cache);
+	}
+
+	//_leave(" = 0");
+	return 0;
+
+begin_cull:
+	if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
+		_debug("### CULL CACHE ###");
+		cachefiles_state_changed(cache);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 000000000000..1e962348d111
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
+/* FS-Cache interface to CacheFiles
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
+
+struct cachefiles_lookup_data {
+	struct cachefiles_xattr	*auxdata;	/* auxiliary data */
+	char			*key;		/* key path */
+};
+
+static int cachefiles_attr_changed(struct fscache_object *_object);
+
+/*
+ * allocate an object record for a cookie lookup and prepare the lookup data
+ */
+static struct fscache_object *cachefiles_alloc_object(
+	struct fscache_cache *_cache,
+	struct fscache_cookie *cookie)
+{
+	struct cachefiles_lookup_data *lookup_data;
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct cachefiles_xattr *auxdata;
+	unsigned keylen, auxlen;
+	void *buffer;
+	char *key;
+
+	cache = container_of(_cache, struct cachefiles_cache, cache);
+
+	_enter("{%s},%p,", cache->cache.identifier, cookie);
+
+	lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+	if (!lookup_data)
+		goto nomem_lookup_data;
+
+	/* create a new object record and a temporary leaf image */
+	object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+	if (!object)
+		goto nomem_object;
+
+	ASSERTCMP(object->backer, ==, NULL);
+
+	BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+	atomic_set(&object->usage, 1);
+
+	fscache_object_init(&object->fscache, cookie, &cache->cache);
+
+	object->type = cookie->def->type;
+
+	/* get hold of the raw key
+	 * - stick the length on the front and leave space on the back for the
+	 *   encoder
+	 */
+	buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+	if (!buffer)
+		goto nomem_buffer;
+
+	keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
+	ASSERTCMP(keylen, <, 512);
+
+	*(uint16_t *)buffer = keylen;
+	((char *)buffer)[keylen + 2] = 0;
+	((char *)buffer)[keylen + 3] = 0;
+	((char *)buffer)[keylen + 4] = 0;
+
+	/* turn the raw key into something that can work with as a filename */
+	key = cachefiles_cook_key(buffer, keylen + 2, object->type);
+	if (!key)
+		goto nomem_key;
+
+	/* get hold of the auxiliary data and prepend the object type */
+	auxdata = buffer;
+	auxlen = 0;
+	if (cookie->def->get_aux) {
+		auxlen = cookie->def->get_aux(cookie->netfs_data,
+					      auxdata->data, 511);
+		ASSERTCMP(auxlen, <, 511);
+	}
+
+	auxdata->len = auxlen + 1;
+	auxdata->type = cookie->def->type;
+
+	lookup_data->auxdata = auxdata;
+	lookup_data->key = key;
+	object->lookup_data = lookup_data;
+
+	_leave(" = %p [%p]", &object->fscache, lookup_data);
+	return &object->fscache;
+
+nomem_key:
+	kfree(buffer);
+nomem_buffer:
+	BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+	kmem_cache_free(cachefiles_object_jar, object);
+	fscache_object_destroyed(&cache->cache);
+nomem_object:
+	kfree(lookup_data);
+nomem_lookup_data:
+	_leave(" = -ENOMEM");
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * attempt to look up the nominated node in this cache
+ */
+static void cachefiles_lookup_object(struct fscache_object *_object)
+{
+	struct cachefiles_lookup_data *lookup_data;
+	struct cachefiles_object *parent, *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter("{OBJ%x}", _object->debug_id);
+
+	cache = container_of(_object->cache, struct cachefiles_cache, cache);
+	parent = container_of(_object->parent,
+			      struct cachefiles_object, fscache);
+	object = container_of(_object, struct cachefiles_object, fscache);
+	lookup_data = object->lookup_data;
+
+	ASSERTCMP(lookup_data, !=, NULL);
+
+	/* look up the key, creating any missing bits */
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = cachefiles_walk_to_object(parent, object,
+					lookup_data->key,
+					lookup_data->auxdata);
+	cachefiles_end_secure(cache, saved_cred);
+
+	/* polish off by setting the attributes of non-index files */
+	if (ret == 0 &&
+	    object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+		cachefiles_attr_changed(&object->fscache);
+
+	if (ret < 0) {
+		printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
+		       ret);
+		fscache_object_lookup_error(&object->fscache);
+	}
+
+	_leave(" [%d]", ret);
+}
+
+/*
+ * indication of lookup completion
+ */
+static void cachefiles_lookup_complete(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
+
+	if (object->lookup_data) {
+		kfree(object->lookup_data->key);
+		kfree(object->lookup_data->auxdata);
+		kfree(object->lookup_data);
+		object->lookup_data = NULL;
+	}
+}
+
+/*
+ * increment the usage count on an inode object (may fail if unmounting)
+ */
+static
+struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object =
+		container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
+	ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	atomic_inc(&object->usage);
+	return &object->fscache;
+}
+
+/*
+ * update the auxilliary data for an object object on disk
+ */
+static void cachefiles_update_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_xattr *auxdata;
+	struct cachefiles_cache *cache;
+	struct fscache_cookie *cookie;
+	const struct cred *saved_cred;
+	unsigned auxlen;
+
+	_enter("{OBJ%x}", _object->debug_id);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache, struct cachefiles_cache,
+			     cache);
+	cookie = object->fscache.cookie;
+
+	if (!cookie->def->get_aux) {
+		_leave(" [no aux]");
+		return;
+	}
+
+	auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+	if (!auxdata) {
+		_leave(" [nomem]");
+		return;
+	}
+
+	auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+	ASSERTCMP(auxlen, <, 511);
+
+	auxdata->len = auxlen + 1;
+	auxdata->type = cookie->def->type;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	cachefiles_update_object_xattr(object, auxdata);
+	cachefiles_end_secure(cache, saved_cred);
+	kfree(auxdata);
+	_leave("");
+}
+
+/*
+ * discard the resources pinned by an object and effect retirement if
+ * requested
+ */
+static void cachefiles_drop_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+
+	ASSERT(_object);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%d}",
+	       object->fscache.debug_id, atomic_read(&object->usage));
+
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+#ifdef CACHEFILES_DEBUG_SLAB
+	ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	/* delete retired objects */
+	if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+	    _object != cache->cache.fsdef
+	    ) {
+		_debug("- retire object OBJ%x", object->fscache.debug_id);
+		cachefiles_begin_secure(cache, &saved_cred);
+		cachefiles_delete_object(cache, object);
+		cachefiles_end_secure(cache, saved_cred);
+	}
+
+	/* close the filesystem stuff attached to the object */
+	if (object->backer != object->dentry)
+		dput(object->backer);
+	object->backer = NULL;
+
+	/* note that the object is now inactive */
+	if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+		write_lock(&cache->active_lock);
+		if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
+					&object->flags))
+			BUG();
+		rb_erase(&object->active_node, &cache->active_nodes);
+		wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+		write_unlock(&cache->active_lock);
+	}
+
+	dput(object->dentry);
+	object->dentry = NULL;
+
+	_leave("");
+}
+
+/*
+ * dispose of a reference to an object
+ */
+static void cachefiles_put_object(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct fscache_cache *cache;
+
+	ASSERT(_object);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+
+	_enter("{OBJ%x,%d}",
+	       object->fscache.debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
+	ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	ASSERTIFCMP(object->fscache.parent,
+		    object->fscache.parent->n_children, >, 0);
+
+	if (atomic_dec_and_test(&object->usage)) {
+		_debug("- kill object OBJ%x", object->fscache.debug_id);
+
+		ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+		ASSERTCMP(object->fscache.parent, ==, NULL);
+		ASSERTCMP(object->backer, ==, NULL);
+		ASSERTCMP(object->dentry, ==, NULL);
+		ASSERTCMP(object->fscache.n_ops, ==, 0);
+		ASSERTCMP(object->fscache.n_children, ==, 0);
+
+		if (object->lookup_data) {
+			kfree(object->lookup_data->key);
+			kfree(object->lookup_data->auxdata);
+			kfree(object->lookup_data);
+			object->lookup_data = NULL;
+		}
+
+		cache = object->fscache.cache;
+		kmem_cache_free(cachefiles_object_jar, object);
+		fscache_object_destroyed(cache);
+	}
+
+	_leave("");
+}
+
+/*
+ * sync a cache
+ */
+static void cachefiles_sync_cache(struct fscache_cache *_cache)
+{
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	int ret;
+
+	_enter("%p", _cache);
+
+	cache = container_of(_cache, struct cachefiles_cache, cache);
+
+	/* make sure all pages pinned by operations on behalf of the netfs are
+	 * written to disc */
+	cachefiles_begin_secure(cache, &saved_cred);
+	ret = fsync_super(cache->mnt->mnt_sb);
+	cachefiles_end_secure(cache, saved_cred);
+
+	if (ret == -EIO)
+		cachefiles_io_error(cache,
+				    "Attempt to sync backing fs superblock"
+				    " returned error %d",
+				    ret);
+}
+
+/*
+ * notification the attributes on an object have changed
+ * - called with reads/writes excluded by FS-Cache
+ */
+static int cachefiles_attr_changed(struct fscache_object *_object)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	struct iattr newattrs;
+	uint64_t ni_size;
+	loff_t oi_size;
+	int ret;
+
+	_object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
+
+	_enter("{OBJ%x},[%llu]",
+	       _object->debug_id, (unsigned long long) ni_size);
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	if (ni_size == object->i_size)
+		return 0;
+
+	if (!object->backer)
+		return -ENOBUFS;
+
+	ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+	fscache_set_store_limit(&object->fscache, ni_size);
+
+	oi_size = i_size_read(object->backer->d_inode);
+	if (oi_size == ni_size)
+		return 0;
+
+	newattrs.ia_size = ni_size;
+	newattrs.ia_valid = ATTR_SIZE;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+	mutex_lock(&object->backer->d_inode->i_mutex);
+	ret = notify_change(object->backer, &newattrs);
+	mutex_unlock(&object->backer->d_inode->i_mutex);
+	cachefiles_end_secure(cache, saved_cred);
+
+	if (ret == -EIO) {
+		fscache_set_store_limit(&object->fscache, 0);
+		cachefiles_io_error_obj(object, "Size set failed");
+		ret = -ENOBUFS;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * dissociate a cache from all the pages it was backing
+ */
+static void cachefiles_dissociate_pages(struct fscache_cache *cache)
+{
+	_enter("");
+}
+
+const struct fscache_cache_ops cachefiles_cache_ops = {
+	.name			= "cachefiles",
+	.alloc_object		= cachefiles_alloc_object,
+	.lookup_object		= cachefiles_lookup_object,
+	.lookup_complete	= cachefiles_lookup_complete,
+	.grab_object		= cachefiles_grab_object,
+	.update_object		= cachefiles_update_object,
+	.drop_object		= cachefiles_drop_object,
+	.put_object		= cachefiles_put_object,
+	.sync_cache		= cachefiles_sync_cache,
+	.attr_changed		= cachefiles_attr_changed,
+	.read_or_alloc_page	= cachefiles_read_or_alloc_page,
+	.read_or_alloc_pages	= cachefiles_read_or_alloc_pages,
+	.allocate_page		= cachefiles_allocate_page,
+	.allocate_pages		= cachefiles_allocate_pages,
+	.write_page		= cachefiles_write_page,
+	.uncache_page		= cachefiles_uncache_page,
+	.dissociate_pages	= cachefiles_dissociate_pages,
+};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 000000000000..19218e1463d6
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
+/* General netfs cache on cache files internal defs
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fscache-cache.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+
+struct cachefiles_cache;
+struct cachefiles_object;
+
+extern unsigned cachefiles_debug;
+#define CACHEFILES_DEBUG_KENTER	1
+#define CACHEFILES_DEBUG_KLEAVE	2
+#define CACHEFILES_DEBUG_KDEBUG	4
+
+/*
+ * node records
+ */
+struct cachefiles_object {
+	struct fscache_object		fscache;	/* fscache handle */
+	struct cachefiles_lookup_data	*lookup_data;	/* cached lookup data */
+	struct dentry			*dentry;	/* the file/dir representing this object */
+	struct dentry			*backer;	/* backing file */
+	loff_t				i_size;		/* object size */
+	unsigned long			flags;
+#define CACHEFILES_OBJECT_ACTIVE	0		/* T if marked active */
+	atomic_t			usage;		/* object usage count */
+	uint8_t				type;		/* object type */
+	uint8_t				new;		/* T if object new */
+	spinlock_t			work_lock;
+	struct rb_node			active_node;	/* link in active tree (dentry is key) */
+};
+
+extern struct kmem_cache *cachefiles_object_jar;
+
+/*
+ * Cache files cache definition
+ */
+struct cachefiles_cache {
+	struct fscache_cache		cache;		/* FS-Cache record */
+	struct vfsmount			*mnt;		/* mountpoint holding the cache */
+	struct dentry			*graveyard;	/* directory into which dead objects go */
+	struct file			*cachefilesd;	/* manager daemon handle */
+	const struct cred		*cache_cred;	/* security override for accessing cache */
+	struct mutex			daemon_mutex;	/* command serialisation mutex */
+	wait_queue_head_t		daemon_pollwq;	/* poll waitqueue for daemon */
+	struct rb_root			active_nodes;	/* active nodes (can't be culled) */
+	rwlock_t			active_lock;	/* lock for active_nodes */
+	atomic_t			gravecounter;	/* graveyard uniquifier */
+	unsigned			frun_percent;	/* when to stop culling (% files) */
+	unsigned			fcull_percent;	/* when to start culling (% files) */
+	unsigned			fstop_percent;	/* when to stop allocating (% files) */
+	unsigned			brun_percent;	/* when to stop culling (% blocks) */
+	unsigned			bcull_percent;	/* when to start culling (% blocks) */
+	unsigned			bstop_percent;	/* when to stop allocating (% blocks) */
+	unsigned			bsize;		/* cache's block size */
+	unsigned			bshift;		/* min(ilog2(PAGE_SIZE / bsize), 0) */
+	uint64_t			frun;		/* when to stop culling */
+	uint64_t			fcull;		/* when to start culling */
+	uint64_t			fstop;		/* when to stop allocating */
+	sector_t			brun;		/* when to stop culling */
+	sector_t			bcull;		/* when to start culling */
+	sector_t			bstop;		/* when to stop allocating */
+	unsigned long			flags;
+#define CACHEFILES_READY		0	/* T if cache prepared */
+#define CACHEFILES_DEAD			1	/* T if cache dead */
+#define CACHEFILES_CULLING		2	/* T if cull engaged */
+#define CACHEFILES_STATE_CHANGED	3	/* T if state changed (poll trigger) */
+	char				*rootdirname;	/* name of cache root directory */
+	char				*secctx;	/* LSM security context */
+	char				*tag;		/* cache binding tag */
+};
+
+/*
+ * backing file read tracking
+ */
+struct cachefiles_one_read {
+	wait_queue_t			monitor;	/* link into monitored waitqueue */
+	struct page			*back_page;	/* backing file page we're waiting for */
+	struct page			*netfs_page;	/* netfs page we're going to fill */
+	struct fscache_retrieval	*op;		/* retrieval op covering this */
+	struct list_head		op_link;	/* link in op's todo list */
+};
+
+/*
+ * backing file write tracking
+ */
+struct cachefiles_one_write {
+	struct page			*netfs_page;	/* netfs page to copy */
+	struct cachefiles_object	*object;
+	struct list_head		obj_link;	/* link in object's lists */
+	fscache_rw_complete_t		end_io_func;
+	void				*context;
+};
+
+/*
+ * auxiliary data xattr buffer
+ */
+struct cachefiles_xattr {
+	uint16_t			len;
+	uint8_t				type;
+	uint8_t				data[];
+};
+
+/*
+ * note change of state for daemon
+ */
+static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
+{
+	set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+	wake_up_all(&cache->daemon_pollwq);
+}
+
+/*
+ * cf-bind.c
+ */
+extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
+extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
+
+/*
+ * cf-daemon.c
+ */
+extern const struct file_operations cachefiles_daemon_fops;
+
+extern int cachefiles_has_space(struct cachefiles_cache *cache,
+				unsigned fnr, unsigned bnr);
+
+/*
+ * cf-interface.c
+ */
+extern const struct fscache_cache_ops cachefiles_cache_ops;
+
+/*
+ * cf-key.c
+ */
+extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
+
+/*
+ * cf-namei.c
+ */
+extern int cachefiles_delete_object(struct cachefiles_cache *cache,
+				    struct cachefiles_object *object);
+extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
+				     struct cachefiles_object *object,
+				     const char *key,
+				     struct cachefiles_xattr *auxdata);
+extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+					       struct dentry *dir,
+					       const char *name);
+
+extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+			   char *filename);
+
+extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
+				   struct dentry *dir, char *filename);
+
+/*
+ * cf-proc.c
+ */
+#ifdef CONFIG_CACHEFILES_HISTOGRAM
+extern atomic_t cachefiles_lookup_histogram[HZ];
+extern atomic_t cachefiles_mkdir_histogram[HZ];
+extern atomic_t cachefiles_create_histogram[HZ];
+
+extern int __init cachefiles_proc_init(void);
+extern void cachefiles_proc_cleanup(void);
+static inline
+void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
+{
+	unsigned long jif = jiffies - start_jif;
+	if (jif >= HZ)
+		jif = HZ - 1;
+	atomic_inc(&histogram[jif]);
+}
+
+#else
+#define cachefiles_proc_init()		(0)
+#define cachefiles_proc_cleanup()	do {} while (0)
+#define cachefiles_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * cf-rdwr.c
+ */
+extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
+					 struct page *, gfp_t);
+extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
+					  struct list_head *, unsigned *,
+					  gfp_t);
+extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
+				    gfp_t);
+extern int cachefiles_allocate_pages(struct fscache_retrieval *,
+				     struct list_head *, unsigned *, gfp_t);
+extern int cachefiles_write_page(struct fscache_storage *, struct page *);
+extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
+
+/*
+ * cf-security.c
+ */
+extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
+extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+					       struct dentry *root,
+					       const struct cred **_saved_cred);
+
+static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
+					   const struct cred **_saved_cred)
+{
+	*_saved_cred = override_creds(cache->cache_cred);
+}
+
+static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
+					 const struct cred *saved_cred)
+{
+	revert_creds(saved_cred);
+}
+
+/*
+ * cf-xattr.c
+ */
+extern int cachefiles_check_object_type(struct cachefiles_object *object);
+extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
+				       struct cachefiles_xattr *auxdata);
+extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
+					  struct cachefiles_xattr *auxdata);
+extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
+					 struct cachefiles_xattr *auxdata);
+extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+					  struct dentry *dentry);
+
+
+/*
+ * error handling
+ */
+#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
+
+#define cachefiles_io_error(___cache, FMT, ...)		\
+do {							\
+	kerror("I/O Error: " FMT, ##__VA_ARGS__);	\
+	fscache_io_error(&(___cache)->cache);		\
+	set_bit(CACHEFILES_DEAD, &(___cache)->flags);	\
+} while (0)
+
+#define cachefiles_io_error_obj(object, FMT, ...)			\
+do {									\
+	struct cachefiles_cache *___cache;				\
+									\
+	___cache = container_of((object)->fscache.cache,		\
+				struct cachefiles_cache, cache);	\
+	cachefiles_io_error(___cache, FMT, ##__VA_ARGS__);		\
+} while (0)
+
+
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline void _dbprintk(const char *fmt, ...)
+	__attribute__((format(printf, 1, 2)));
+static inline void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_CACHEFILES_DEBUG)
+#define _enter(FMT, ...)				\
+do {							\
+	if (cachefiles_debug & CACHEFILES_DEBUG_KENTER)	\
+		kenter(FMT, ##__VA_ARGS__);		\
+} while (0)
+
+#define _leave(FMT, ...)				\
+do {							\
+	if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE)	\
+		kleave(FMT, ##__VA_ARGS__);		\
+} while (0)
+
+#define _debug(FMT, ...)				\
+do {							\
+	if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG)	\
+		kdebug(FMT, ##__VA_ARGS__);		\
+} while (0)
+
+#else
+#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#endif
+
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X)							\
+do {									\
+	if (unlikely(!(X))) {						\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "CacheFiles: Assertion failed\n");	\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTCMP(X, OP, Y)						\
+do {									\
+	if (unlikely(!((X) OP (Y)))) {					\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "CacheFiles: Assertion failed\n");	\
+		printk(KERN_ERR "%lx " #OP " %lx is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIF(C, X)							\
+do {									\
+	if (unlikely((C) && !(X))) {					\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "CacheFiles: Assertion failed\n");	\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y)					\
+do {									\
+	if (unlikely((C) && !((X) OP (Y)))) {				\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "CacheFiles: Assertion failed\n");	\
+		printk(KERN_ERR "%lx " #OP " %lx is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#else
+
+#define ASSERT(X)			do {} while (0)
+#define ASSERTCMP(X, OP, Y)		do {} while (0)
+#define ASSERTIF(C, X)			do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y)	do {} while (0)
+
+#endif
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 000000000000..81b8b2b3a674
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
+/* Key to pathname encoder
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include "internal.h"
+
+static const char cachefiles_charmap[64] =
+	"0123456789"			/* 0 - 9 */
+	"abcdefghijklmnopqrstuvwxyz"	/* 10 - 35 */
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"	/* 36 - 61 */
+	"_-"				/* 62 - 63 */
+	;
+
+static const char cachefiles_filecharmap[256] = {
+	/* we skip space and tab and control chars */
+	[33 ... 46] = 1,		/* '!' -> '.' */
+	/* we skip '/' as it's significant to pathwalk */
+	[48 ... 127] = 1,		/* '0' -> '~' */
+};
+
+/*
+ * turn the raw key into something cooked
+ * - the raw key should include the length in the two bytes at the front
+ * - the key may be up to 514 bytes in length (including the length word)
+ *   - "base64" encode the strange keys, mapping 3 bytes of raw to four of
+ *     cooked
+ *   - need to cut the cooked key into 252 char lengths (189 raw bytes)
+ */
+char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
+{
+	unsigned char csum, ch;
+	unsigned int acc;
+	char *key;
+	int loop, len, max, seg, mark, print;
+
+	_enter(",%d", keylen);
+
+	BUG_ON(keylen < 2 || keylen > 514);
+
+	csum = raw[0] + raw[1];
+	print = 1;
+	for (loop = 2; loop < keylen; loop++) {
+		ch = raw[loop];
+		csum += ch;
+		print &= cachefiles_filecharmap[ch];
+	}
+
+	if (print) {
+		/* if the path is usable ASCII, then we render it directly */
+		max = keylen - 2;
+		max += 2;	/* two base64'd length chars on the front */
+		max += 5;	/* @checksum/M */
+		max += 3 * 2;	/* maximum number of segment dividers (".../M")
+				 * is ((514 + 251) / 252) = 3
+				 */
+		max += 1;	/* NUL on end */
+	} else {
+		/* calculate the maximum length of the cooked key */
+		keylen = (keylen + 2) / 3;
+
+		max = keylen * 4;
+		max += 5;	/* @checksum/M */
+		max += 3 * 2;	/* maximum number of segment dividers (".../M")
+				 * is ((514 + 188) / 189) = 3
+				 */
+		max += 1;	/* NUL on end */
+	}
+
+	max += 1;	/* 2nd NUL on end */
+
+	_debug("max: %d", max);
+
+	key = kmalloc(max, GFP_KERNEL);
+	if (!key)
+		return NULL;
+
+	len = 0;
+
+	/* build the cooked key */
+	sprintf(key, "@%02x%c+", (unsigned) csum, 0);
+	len = 5;
+	mark = len - 1;
+
+	if (print) {
+		acc = *(uint16_t *) raw;
+		raw += 2;
+
+		key[len + 1] = cachefiles_charmap[acc & 63];
+		acc >>= 6;
+		key[len] = cachefiles_charmap[acc & 63];
+		len += 2;
+
+		seg = 250;
+		for (loop = keylen; loop > 0; loop--) {
+			if (seg <= 0) {
+				key[len++] = '\0';
+				mark = len;
+				key[len++] = '+';
+				seg = 252;
+			}
+
+			key[len++] = *raw++;
+			ASSERT(len < max);
+		}
+
+		switch (type) {
+		case FSCACHE_COOKIE_TYPE_INDEX:		type = 'I';	break;
+		case FSCACHE_COOKIE_TYPE_DATAFILE:	type = 'D';	break;
+		default:				type = 'S';	break;
+		}
+	} else {
+		seg = 252;
+		for (loop = keylen; loop > 0; loop--) {
+			if (seg <= 0) {
+				key[len++] = '\0';
+				mark = len;
+				key[len++] = '+';
+				seg = 252;
+			}
+
+			acc = *raw++;
+			acc |= *raw++ << 8;
+			acc |= *raw++ << 16;
+
+			_debug("acc: %06x", acc);
+
+			key[len++] = cachefiles_charmap[acc & 63];
+			acc >>= 6;
+			key[len++] = cachefiles_charmap[acc & 63];
+			acc >>= 6;
+			key[len++] = cachefiles_charmap[acc & 63];
+			acc >>= 6;
+			key[len++] = cachefiles_charmap[acc & 63];
+
+			ASSERT(len < max);
+		}
+
+		switch (type) {
+		case FSCACHE_COOKIE_TYPE_INDEX:		type = 'J';	break;
+		case FSCACHE_COOKIE_TYPE_DATAFILE:	type = 'E';	break;
+		default:				type = 'T';	break;
+		}
+	}
+
+	key[mark] = type;
+	key[len++] = 0;
+	key[len] = 0;
+
+	_leave(" = %p %d", key, len);
+	return key;
+}
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 000000000000..4bfa8cf43bf5
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
+/* Network filesystem caching backend to use cache files on a premounted
+ * filesystem
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/sysctl.h>
+#include <linux/miscdevice.h>
+#include "internal.h"
+
+unsigned cachefiles_debug;
+module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask");
+
+MODULE_DESCRIPTION("Mounted-filesystem based cache");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+struct kmem_cache *cachefiles_object_jar;
+
+static struct miscdevice cachefiles_dev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "cachefiles",
+	.fops	= &cachefiles_daemon_fops,
+};
+
+static void cachefiles_object_init_once(void *_object)
+{
+	struct cachefiles_object *object = _object;
+
+	memset(object, 0, sizeof(*object));
+	spin_lock_init(&object->work_lock);
+}
+
+/*
+ * initialise the fs caching module
+ */
+static int __init cachefiles_init(void)
+{
+	int ret;
+
+	ret = misc_register(&cachefiles_dev);
+	if (ret < 0)
+		goto error_dev;
+
+	/* create an object jar */
+	ret = -ENOMEM;
+	cachefiles_object_jar =
+		kmem_cache_create("cachefiles_object_jar",
+				  sizeof(struct cachefiles_object),
+				  0,
+				  SLAB_HWCACHE_ALIGN,
+				  cachefiles_object_init_once);
+	if (!cachefiles_object_jar) {
+		printk(KERN_NOTICE
+		       "CacheFiles: Failed to allocate an object jar\n");
+		goto error_object_jar;
+	}
+
+	ret = cachefiles_proc_init();
+	if (ret < 0)
+		goto error_proc;
+
+	printk(KERN_INFO "CacheFiles: Loaded\n");
+	return 0;
+
+error_proc:
+	kmem_cache_destroy(cachefiles_object_jar);
+error_object_jar:
+	misc_deregister(&cachefiles_dev);
+error_dev:
+	kerror("failed to register: %d", ret);
+	return ret;
+}
+
+fs_initcall(cachefiles_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit cachefiles_exit(void)
+{
+	printk(KERN_INFO "CacheFiles: Unloading\n");
+
+	cachefiles_proc_cleanup();
+	kmem_cache_destroy(cachefiles_object_jar);
+	misc_deregister(&cachefiles_dev);
+}
+
+module_exit(cachefiles_exit);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 000000000000..4ce818ae39ea
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
+/* CacheFiles path walking and related routines
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include "internal.h"
+
+static int cachefiles_wait_bit(void *flags)
+{
+	schedule();
+	return 0;
+}
+
+/*
+ * record the fact that an object is now active
+ */
+static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
+					  struct cachefiles_object *object)
+{
+	struct cachefiles_object *xobject;
+	struct rb_node **_p, *_parent = NULL;
+	struct dentry *dentry;
+
+	_enter(",%p", object);
+
+try_again:
+	write_lock(&cache->active_lock);
+
+	if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+		BUG();
+
+	dentry = object->dentry;
+	_p = &cache->active_nodes.rb_node;
+	while (*_p) {
+		_parent = *_p;
+		xobject = rb_entry(_parent,
+				   struct cachefiles_object, active_node);
+
+		ASSERT(xobject != object);
+
+		if (xobject->dentry > dentry)
+			_p = &(*_p)->rb_left;
+		else if (xobject->dentry < dentry)
+			_p = &(*_p)->rb_right;
+		else
+			goto wait_for_old_object;
+	}
+
+	rb_link_node(&object->active_node, _parent, _p);
+	rb_insert_color(&object->active_node, &cache->active_nodes);
+
+	write_unlock(&cache->active_lock);
+	_leave("");
+	return;
+
+	/* an old object from a previous incarnation is hogging the slot - we
+	 * need to wait for it to be destroyed */
+wait_for_old_object:
+	if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+		printk(KERN_ERR "\n");
+		printk(KERN_ERR "CacheFiles: Error:"
+		       " Unexpected object collision\n");
+		printk(KERN_ERR "xobject: OBJ%x\n",
+		       xobject->fscache.debug_id);
+		printk(KERN_ERR "xobjstate=%s\n",
+		       fscache_object_states[xobject->fscache.state]);
+		printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
+		printk(KERN_ERR "xobjevent=%lx [%lx]\n",
+		       xobject->fscache.events, xobject->fscache.event_mask);
+		printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
+		       xobject->fscache.n_ops, xobject->fscache.n_in_progress,
+		       xobject->fscache.n_exclusive);
+		printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
+		       xobject->fscache.cookie,
+		       xobject->fscache.cookie->parent,
+		       xobject->fscache.cookie->netfs_data,
+		       xobject->fscache.cookie->flags);
+		printk(KERN_ERR "xparent=%p\n",
+		       xobject->fscache.parent);
+		printk(KERN_ERR "object: OBJ%x\n",
+		       object->fscache.debug_id);
+		printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
+		       object->fscache.cookie,
+		       object->fscache.cookie->parent,
+		       object->fscache.cookie->netfs_data,
+		       object->fscache.cookie->flags);
+		printk(KERN_ERR "parent=%p\n",
+		       object->fscache.parent);
+		BUG();
+	}
+	atomic_inc(&xobject->usage);
+	write_unlock(&cache->active_lock);
+
+	_debug(">>> wait");
+	wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
+		    cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
+	_debug("<<< waited");
+
+	cache->cache.ops->put_object(&xobject->fscache);
+	goto try_again;
+}
+
+/*
+ * delete an object representation from the cache
+ * - file backed objects are unlinked
+ * - directory backed objects are stuffed into the graveyard for userspace to
+ *   delete
+ * - unlocks the directory mutex
+ */
+static int cachefiles_bury_object(struct cachefiles_cache *cache,
+				  struct dentry *dir,
+				  struct dentry *rep)
+{
+	struct dentry *grave, *trap;
+	char nbuffer[8 + 8 + 1];
+	int ret;
+
+	_enter(",'%*.*s','%*.*s'",
+	       dir->d_name.len, dir->d_name.len, dir->d_name.name,
+	       rep->d_name.len, rep->d_name.len, rep->d_name.name);
+
+	/* non-directories can just be unlinked */
+	if (!S_ISDIR(rep->d_inode->i_mode)) {
+		_debug("unlink stale object");
+		ret = vfs_unlink(dir->d_inode, rep);
+
+		mutex_unlock(&dir->d_inode->i_mutex);
+
+		if (ret == -EIO)
+			cachefiles_io_error(cache, "Unlink failed");
+
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* directories have to be moved to the graveyard */
+	_debug("move stale object to graveyard");
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+try_again:
+	/* first step is to make up a grave dentry in the graveyard */
+	sprintf(nbuffer, "%08x%08x",
+		(uint32_t) get_seconds(),
+		(uint32_t) atomic_inc_return(&cache->gravecounter));
+
+	/* do the multiway lock magic */
+	trap = lock_rename(cache->graveyard, dir);
+
+	/* do some checks before getting the grave dentry */
+	if (rep->d_parent != dir) {
+		/* the entry was probably culled when we dropped the parent dir
+		 * lock */
+		unlock_rename(cache->graveyard, dir);
+		_leave(" = 0 [culled?]");
+		return 0;
+	}
+
+	if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
+		unlock_rename(cache->graveyard, dir);
+		cachefiles_io_error(cache, "Graveyard no longer a directory");
+		return -EIO;
+	}
+
+	if (trap == rep) {
+		unlock_rename(cache->graveyard, dir);
+		cachefiles_io_error(cache, "May not make directory loop");
+		return -EIO;
+	}
+
+	if (d_mountpoint(rep)) {
+		unlock_rename(cache->graveyard, dir);
+		cachefiles_io_error(cache, "Mountpoint in cache");
+		return -EIO;
+	}
+
+	grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
+	if (IS_ERR(grave)) {
+		unlock_rename(cache->graveyard, dir);
+
+		if (PTR_ERR(grave) == -ENOMEM) {
+			_leave(" = -ENOMEM");
+			return -ENOMEM;
+		}
+
+		cachefiles_io_error(cache, "Lookup error %ld",
+				    PTR_ERR(grave));
+		return -EIO;
+	}
+
+	if (grave->d_inode) {
+		unlock_rename(cache->graveyard, dir);
+		dput(grave);
+		grave = NULL;
+		cond_resched();
+		goto try_again;
+	}
+
+	if (d_mountpoint(grave)) {
+		unlock_rename(cache->graveyard, dir);
+		dput(grave);
+		cachefiles_io_error(cache, "Mountpoint in graveyard");
+		return -EIO;
+	}
+
+	/* target should not be an ancestor of source */
+	if (trap == grave) {
+		unlock_rename(cache->graveyard, dir);
+		dput(grave);
+		cachefiles_io_error(cache, "May not make directory loop");
+		return -EIO;
+	}
+
+	/* attempt the rename */
+	ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
+	if (ret != 0 && ret != -ENOMEM)
+		cachefiles_io_error(cache, "Rename failed with error %d", ret);
+
+	unlock_rename(cache->graveyard, dir);
+	dput(grave);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * delete an object representation from the cache
+ */
+int cachefiles_delete_object(struct cachefiles_cache *cache,
+			     struct cachefiles_object *object)
+{
+	struct dentry *dir;
+	int ret;
+
+	_enter(",{%p}", object->dentry);
+
+	ASSERT(object->dentry);
+	ASSERT(object->dentry->d_inode);
+	ASSERT(object->dentry->d_parent);
+
+	dir = dget_parent(object->dentry);
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	ret = cachefiles_bury_object(cache, dir, object->dentry);
+
+	dput(dir);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * walk from the parent object to the child object through the backing
+ * filesystem, creating directories as we go
+ */
+int cachefiles_walk_to_object(struct cachefiles_object *parent,
+			      struct cachefiles_object *object,
+			      const char *key,
+			      struct cachefiles_xattr *auxdata)
+{
+	struct cachefiles_cache *cache;
+	struct dentry *dir, *next = NULL;
+	unsigned long start;
+	const char *name;
+	int ret, nlen;
+
+	_enter("{%p},,%s,", parent->dentry, key);
+
+	cache = container_of(parent->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	ASSERT(parent->dentry);
+	ASSERT(parent->dentry->d_inode);
+
+	if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
+		// TODO: convert file to dir
+		_leave("looking up in none directory");
+		return -ENOBUFS;
+	}
+
+	dir = dget(parent->dentry);
+
+advance:
+	/* attempt to transit the first directory component */
+	name = key;
+	nlen = strlen(key);
+
+	/* key ends in a double NUL */
+	key = key + nlen + 1;
+	if (!*key)
+		key = NULL;
+
+lookup_again:
+	/* search the current directory for the element name */
+	_debug("lookup '%s'", name);
+
+	mutex_lock(&dir->d_inode->i_mutex);
+
+	start = jiffies;
+	next = lookup_one_len(name, dir, nlen);
+	cachefiles_hist(cachefiles_lookup_histogram, start);
+	if (IS_ERR(next))
+		goto lookup_error;
+
+	_debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
+
+	if (!key)
+		object->new = !next->d_inode;
+
+	/* if this element of the path doesn't exist, then the lookup phase
+	 * failed, and we can release any readers in the certain knowledge that
+	 * there's nothing for them to actually read */
+	if (!next->d_inode)
+		fscache_object_lookup_negative(&object->fscache);
+
+	/* we need to create the object if it's negative */
+	if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
+		/* index objects and intervening tree levels must be subdirs */
+		if (!next->d_inode) {
+			ret = cachefiles_has_space(cache, 1, 0);
+			if (ret < 0)
+				goto create_error;
+
+			start = jiffies;
+			ret = vfs_mkdir(dir->d_inode, next, 0);
+			cachefiles_hist(cachefiles_mkdir_histogram, start);
+			if (ret < 0)
+				goto create_error;
+
+			ASSERT(next->d_inode);
+
+			_debug("mkdir -> %p{%p{ino=%lu}}",
+			       next, next->d_inode, next->d_inode->i_ino);
+
+		} else if (!S_ISDIR(next->d_inode->i_mode)) {
+			kerror("inode %lu is not a directory",
+			       next->d_inode->i_ino);
+			ret = -ENOBUFS;
+			goto error;
+		}
+
+	} else {
+		/* non-index objects start out life as files */
+		if (!next->d_inode) {
+			ret = cachefiles_has_space(cache, 1, 0);
+			if (ret < 0)
+				goto create_error;
+
+			start = jiffies;
+			ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
+			cachefiles_hist(cachefiles_create_histogram, start);
+			if (ret < 0)
+				goto create_error;
+
+			ASSERT(next->d_inode);
+
+			_debug("create -> %p{%p{ino=%lu}}",
+			       next, next->d_inode, next->d_inode->i_ino);
+
+		} else if (!S_ISDIR(next->d_inode->i_mode) &&
+			   !S_ISREG(next->d_inode->i_mode)
+			   ) {
+			kerror("inode %lu is not a file or directory",
+			       next->d_inode->i_ino);
+			ret = -ENOBUFS;
+			goto error;
+		}
+	}
+
+	/* process the next component */
+	if (key) {
+		_debug("advance");
+		mutex_unlock(&dir->d_inode->i_mutex);
+		dput(dir);
+		dir = next;
+		next = NULL;
+		goto advance;
+	}
+
+	/* we've found the object we were looking for */
+	object->dentry = next;
+
+	/* if we've found that the terminal object exists, then we need to
+	 * check its attributes and delete it if it's out of date */
+	if (!object->new) {
+		_debug("validate '%*.*s'",
+		       next->d_name.len, next->d_name.len, next->d_name.name);
+
+		ret = cachefiles_check_object_xattr(object, auxdata);
+		if (ret == -ESTALE) {
+			/* delete the object (the deleter drops the directory
+			 * mutex) */
+			object->dentry = NULL;
+
+			ret = cachefiles_bury_object(cache, dir, next);
+			dput(next);
+			next = NULL;
+
+			if (ret < 0)
+				goto delete_error;
+
+			_debug("redo lookup");
+			goto lookup_again;
+		}
+	}
+
+	/* note that we're now using this object */
+	cachefiles_mark_object_active(cache, object);
+
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(dir);
+	dir = NULL;
+
+	_debug("=== OBTAINED_OBJECT ===");
+
+	if (object->new) {
+		/* attach data to a newly constructed terminal object */
+		ret = cachefiles_set_object_xattr(object, auxdata);
+		if (ret < 0)
+			goto check_error;
+	} else {
+		/* always update the atime on an object we've just looked up
+		 * (this is used to keep track of culling, and atimes are only
+		 * updated by read, write and readdir but not lookup or
+		 * open) */
+		touch_atime(cache->mnt, next);
+	}
+
+	/* open a file interface onto a data file */
+	if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		if (S_ISREG(object->dentry->d_inode->i_mode)) {
+			const struct address_space_operations *aops;
+
+			ret = -EPERM;
+			aops = object->dentry->d_inode->i_mapping->a_ops;
+			if (!aops->bmap)
+				goto check_error;
+
+			object->backer = object->dentry;
+		} else {
+			BUG(); // TODO: open file in data-class subdir
+		}
+	}
+
+	object->new = 0;
+	fscache_obtained_object(&object->fscache);
+
+	_leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
+	return 0;
+
+create_error:
+	_debug("create error %d", ret);
+	if (ret == -EIO)
+		cachefiles_io_error(cache, "Create/mkdir failed");
+	goto error;
+
+check_error:
+	_debug("check error %d", ret);
+	write_lock(&cache->active_lock);
+	rb_erase(&object->active_node, &cache->active_nodes);
+	clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+	wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+	write_unlock(&cache->active_lock);
+
+	dput(object->dentry);
+	object->dentry = NULL;
+	goto error_out;
+
+delete_error:
+	_debug("delete error %d", ret);
+	goto error_out2;
+
+lookup_error:
+	_debug("lookup error %ld", PTR_ERR(next));
+	ret = PTR_ERR(next);
+	if (ret == -EIO)
+		cachefiles_io_error(cache, "Lookup failed");
+	next = NULL;
+error:
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(next);
+error_out2:
+	dput(dir);
+error_out:
+	if (ret == -ENOSPC)
+		ret = -ENOBUFS;
+
+	_leave(" = error %d", -ret);
+	return ret;
+}
+
+/*
+ * get a subdirectory
+ */
+struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+					struct dentry *dir,
+					const char *dirname)
+{
+	struct dentry *subdir;
+	unsigned long start;
+	int ret;
+
+	_enter(",,%s", dirname);
+
+	/* search the current directory for the element name */
+	mutex_lock(&dir->d_inode->i_mutex);
+
+	start = jiffies;
+	subdir = lookup_one_len(dirname, dir, strlen(dirname));
+	cachefiles_hist(cachefiles_lookup_histogram, start);
+	if (IS_ERR(subdir)) {
+		if (PTR_ERR(subdir) == -ENOMEM)
+			goto nomem_d_alloc;
+		goto lookup_error;
+	}
+
+	_debug("subdir -> %p %s",
+	       subdir, subdir->d_inode ? "positive" : "negative");
+
+	/* we need to create the subdir if it doesn't exist yet */
+	if (!subdir->d_inode) {
+		ret = cachefiles_has_space(cache, 1, 0);
+		if (ret < 0)
+			goto mkdir_error;
+
+		_debug("attempt mkdir");
+
+		ret = vfs_mkdir(dir->d_inode, subdir, 0700);
+		if (ret < 0)
+			goto mkdir_error;
+
+		ASSERT(subdir->d_inode);
+
+		_debug("mkdir -> %p{%p{ino=%lu}}",
+		       subdir,
+		       subdir->d_inode,
+		       subdir->d_inode->i_ino);
+	}
+
+	mutex_unlock(&dir->d_inode->i_mutex);
+
+	/* we need to make sure the subdir is a directory */
+	ASSERT(subdir->d_inode);
+
+	if (!S_ISDIR(subdir->d_inode->i_mode)) {
+		kerror("%s is not a directory", dirname);
+		ret = -EIO;
+		goto check_error;
+	}
+
+	ret = -EPERM;
+	if (!subdir->d_inode->i_op ||
+	    !subdir->d_inode->i_op->setxattr ||
+	    !subdir->d_inode->i_op->getxattr ||
+	    !subdir->d_inode->i_op->lookup ||
+	    !subdir->d_inode->i_op->mkdir ||
+	    !subdir->d_inode->i_op->create ||
+	    !subdir->d_inode->i_op->rename ||
+	    !subdir->d_inode->i_op->rmdir ||
+	    !subdir->d_inode->i_op->unlink)
+		goto check_error;
+
+	_leave(" = [%lu]", subdir->d_inode->i_ino);
+	return subdir;
+
+check_error:
+	dput(subdir);
+	_leave(" = %d [check]", ret);
+	return ERR_PTR(ret);
+
+mkdir_error:
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(subdir);
+	kerror("mkdir %s failed with error %d", dirname, ret);
+	return ERR_PTR(ret);
+
+lookup_error:
+	mutex_unlock(&dir->d_inode->i_mutex);
+	ret = PTR_ERR(subdir);
+	kerror("Lookup %s failed with error %d", dirname, ret);
+	return ERR_PTR(ret);
+
+nomem_d_alloc:
+	mutex_unlock(&dir->d_inode->i_mutex);
+	_leave(" = -ENOMEM");
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * find out if an object is in use or not
+ * - if finds object and it's not in use:
+ *   - returns a pointer to the object and a reference on it
+ *   - returns with the directory locked
+ */
+static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
+					      struct dentry *dir,
+					      char *filename)
+{
+	struct cachefiles_object *object;
+	struct rb_node *_n;
+	struct dentry *victim;
+	unsigned long start;
+	int ret;
+
+	//_enter(",%*.*s/,%s",
+	//       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+	/* look up the victim */
+	mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+
+	start = jiffies;
+	victim = lookup_one_len(filename, dir, strlen(filename));
+	cachefiles_hist(cachefiles_lookup_histogram, start);
+	if (IS_ERR(victim))
+		goto lookup_error;
+
+	//_debug("victim -> %p %s",
+	//       victim, victim->d_inode ? "positive" : "negative");
+
+	/* if the object is no longer there then we probably retired the object
+	 * at the netfs's request whilst the cull was in progress
+	 */
+	if (!victim->d_inode) {
+		mutex_unlock(&dir->d_inode->i_mutex);
+		dput(victim);
+		_leave(" = -ENOENT [absent]");
+		return ERR_PTR(-ENOENT);
+	}
+
+	/* check to see if we're using this object */
+	read_lock(&cache->active_lock);
+
+	_n = cache->active_nodes.rb_node;
+
+	while (_n) {
+		object = rb_entry(_n, struct cachefiles_object, active_node);
+
+		if (object->dentry > victim)
+			_n = _n->rb_left;
+		else if (object->dentry < victim)
+			_n = _n->rb_right;
+		else
+			goto object_in_use;
+	}
+
+	read_unlock(&cache->active_lock);
+
+	//_leave(" = %p", victim);
+	return victim;
+
+object_in_use:
+	read_unlock(&cache->active_lock);
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(victim);
+	//_leave(" = -EBUSY [in use]");
+	return ERR_PTR(-EBUSY);
+
+lookup_error:
+	mutex_unlock(&dir->d_inode->i_mutex);
+	ret = PTR_ERR(victim);
+	if (ret == -ENOENT) {
+		/* file or dir now absent - probably retired by netfs */
+		_leave(" = -ESTALE [absent]");
+		return ERR_PTR(-ESTALE);
+	}
+
+	if (ret == -EIO) {
+		cachefiles_io_error(cache, "Lookup failed");
+	} else if (ret != -ENOMEM) {
+		kerror("Internal error: %d", ret);
+		ret = -EIO;
+	}
+
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * cull an object if it's not in use
+ * - called only by cache manager daemon
+ */
+int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+		    char *filename)
+{
+	struct dentry *victim;
+	int ret;
+
+	_enter(",%*.*s/,%s",
+	       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+	victim = cachefiles_check_active(cache, dir, filename);
+	if (IS_ERR(victim))
+		return PTR_ERR(victim);
+
+	_debug("victim -> %p %s",
+	       victim, victim->d_inode ? "positive" : "negative");
+
+	/* okay... the victim is not being used so we can cull it
+	 * - start by marking it as stale
+	 */
+	_debug("victim is cullable");
+
+	ret = cachefiles_remove_object_xattr(cache, victim);
+	if (ret < 0)
+		goto error_unlock;
+
+	/*  actually remove the victim (drops the dir mutex) */
+	_debug("bury");
+
+	ret = cachefiles_bury_object(cache, dir, victim);
+	if (ret < 0)
+		goto error;
+
+	dput(victim);
+	_leave(" = 0");
+	return 0;
+
+error_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
+error:
+	dput(victim);
+	if (ret == -ENOENT) {
+		/* file or dir now absent - probably retired by netfs */
+		_leave(" = -ESTALE [absent]");
+		return -ESTALE;
+	}
+
+	if (ret != -ENOMEM) {
+		kerror("Internal error: %d", ret);
+		ret = -EIO;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * find out if an object is in use or not
+ * - called only by cache manager daemon
+ * - returns -EBUSY or 0 to indicate whether an object is in use or not
+ */
+int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
+			    char *filename)
+{
+	struct dentry *victim;
+
+	//_enter(",%*.*s/,%s",
+	//       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+	victim = cachefiles_check_active(cache, dir, filename);
+	if (IS_ERR(victim))
+		return PTR_ERR(victim);
+
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(victim);
+	//_leave(" = 0");
+	return 0;
+}
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 000000000000..eccd33941199
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
+/* CacheFiles statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t cachefiles_lookup_histogram[HZ];
+atomic_t cachefiles_mkdir_histogram[HZ];
+atomic_t cachefiles_create_histogram[HZ];
+
+/*
+ * display the latency histogram
+ */
+static int cachefiles_histogram_show(struct seq_file *m, void *v)
+{
+	unsigned long index;
+	unsigned x, y, z, t;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "JIFS  SECS  LOOKUPS   MKDIRS    CREATES\n");
+		return 0;
+	case 2:
+		seq_puts(m, "===== ===== ========= ========= =========\n");
+		return 0;
+	default:
+		index = (unsigned long) v - 3;
+		x = atomic_read(&cachefiles_lookup_histogram[index]);
+		y = atomic_read(&cachefiles_mkdir_histogram[index]);
+		z = atomic_read(&cachefiles_create_histogram[index]);
+		if (x == 0 && y == 0 && z == 0)
+			return 0;
+
+		t = (index * 1000) / HZ;
+
+		seq_printf(m, "%4lu  0.%03u %9u %9u %9u\n", index, t, x, y, z);
+		return 0;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+	if ((unsigned long long)*_pos >= HZ + 2)
+		return NULL;
+	if (*_pos == 0)
+		*_pos = 1;
+	return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return (unsigned long long)*pos > HZ + 2 ?
+		NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void cachefiles_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations cachefiles_histogram_ops = {
+	.start		= cachefiles_histogram_start,
+	.stop		= cachefiles_histogram_stop,
+	.next		= cachefiles_histogram_next,
+	.show		= cachefiles_histogram_show,
+};
+
+/*
+ * open "/proc/fs/cachefiles/XXX" which provide statistics summaries
+ */
+static int cachefiles_histogram_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cachefiles_histogram_ops);
+}
+
+static const struct file_operations cachefiles_histogram_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cachefiles_histogram_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * initialise the /proc/fs/cachefiles/ directory
+ */
+int __init cachefiles_proc_init(void)
+{
+	_enter("");
+
+	if (!proc_mkdir("fs/cachefiles", NULL))
+		goto error_dir;
+
+	if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
+			 &cachefiles_histogram_fops))
+		goto error_histogram;
+
+	_leave(" = 0");
+	return 0;
+
+error_histogram:
+	remove_proc_entry("fs/cachefiles", NULL);
+error_dir:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/cachefiles/ directory
+ */
+void cachefiles_proc_cleanup(void)
+{
+	remove_proc_entry("fs/cachefiles/histogram", NULL);
+	remove_proc_entry("fs/cachefiles", NULL);
+}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000000..a69787e7dd96
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
+/* Storage object read/write
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/mount.h>
+#include <linux/file.h>
+#include "internal.h"
+
+/*
+ * detect wake up events generated by the unlocking of pages in which we're
+ * interested
+ * - we use this to detect read completion of backing pages
+ * - the caller holds the waitqueue lock
+ */
+static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+				  int sync, void *_key)
+{
+	struct cachefiles_one_read *monitor =
+		container_of(wait, struct cachefiles_one_read, monitor);
+	struct cachefiles_object *object;
+	struct wait_bit_key *key = _key;
+	struct page *page = wait->private;
+
+	ASSERT(key);
+
+	_enter("{%lu},%u,%d,{%p,%u}",
+	       monitor->netfs_page->index, mode, sync,
+	       key->flags, key->bit_nr);
+
+	if (key->flags != &page->flags ||
+	    key->bit_nr != PG_locked)
+		return 0;
+
+	_debug("--- monitor %p %lx ---", page, page->flags);
+
+	if (!PageUptodate(page) && !PageError(page))
+		dump_stack();
+
+	/* remove from the waitqueue */
+	list_del(&wait->task_list);
+
+	/* move onto the action list and queue for FS-Cache thread pool */
+	ASSERT(monitor->op);
+
+	object = container_of(monitor->op->op.object,
+			      struct cachefiles_object, fscache);
+
+	spin_lock(&object->work_lock);
+	list_add_tail(&monitor->op_link, &monitor->op->to_do);
+	spin_unlock(&object->work_lock);
+
+	fscache_enqueue_retrieval(monitor->op);
+	return 0;
+}
+
+/*
+ * copy data from backing pages to netfs pages to complete a read operation
+ * - driven by FS-Cache's thread pool
+ */
+static void cachefiles_read_copier(struct fscache_operation *_op)
+{
+	struct cachefiles_one_read *monitor;
+	struct cachefiles_object *object;
+	struct fscache_retrieval *op;
+	struct pagevec pagevec;
+	int error, max;
+
+	op = container_of(_op, struct fscache_retrieval, op);
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+
+	_enter("{ino=%lu}", object->backer->d_inode->i_ino);
+
+	pagevec_init(&pagevec, 0);
+
+	max = 8;
+	spin_lock_irq(&object->work_lock);
+
+	while (!list_empty(&op->to_do)) {
+		monitor = list_entry(op->to_do.next,
+				     struct cachefiles_one_read, op_link);
+		list_del(&monitor->op_link);
+
+		spin_unlock_irq(&object->work_lock);
+
+		_debug("- copy {%lu}", monitor->back_page->index);
+
+		error = -EIO;
+		if (PageUptodate(monitor->back_page)) {
+			copy_highpage(monitor->netfs_page, monitor->back_page);
+
+			pagevec_add(&pagevec, monitor->netfs_page);
+			fscache_mark_pages_cached(monitor->op, &pagevec);
+			error = 0;
+		}
+
+		if (error)
+			cachefiles_io_error_obj(
+				object,
+				"Readpage failed on backing file %lx",
+				(unsigned long) monitor->back_page->flags);
+
+		page_cache_release(monitor->back_page);
+
+		fscache_end_io(op, monitor->netfs_page, error);
+		page_cache_release(monitor->netfs_page);
+		fscache_put_retrieval(op);
+		kfree(monitor);
+
+		/* let the thread pool have some air occasionally */
+		max--;
+		if (max < 0 || need_resched()) {
+			if (!list_empty(&op->to_do))
+				fscache_enqueue_retrieval(op);
+			_leave(" [maxed out]");
+			return;
+		}
+
+		spin_lock_irq(&object->work_lock);
+	}
+
+	spin_unlock_irq(&object->work_lock);
+	_leave("");
+}
+
+/*
+ * read the corresponding page to the given set from the backing file
+ * - an uncertain page is simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
+					    struct fscache_retrieval *op,
+					    struct page *netpage,
+					    struct pagevec *pagevec)
+{
+	struct cachefiles_one_read *monitor;
+	struct address_space *bmapping;
+	struct page *newpage, *backpage;
+	int ret;
+
+	_enter("");
+
+	pagevec_reinit(pagevec);
+
+	_debug("read back %p{%lu,%d}",
+	       netpage, netpage->index, page_count(netpage));
+
+	monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+	if (!monitor)
+		goto nomem;
+
+	monitor->netfs_page = netpage;
+	monitor->op = fscache_get_retrieval(op);
+
+	init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
+
+	/* attempt to get hold of the backing page */
+	bmapping = object->backer->d_inode->i_mapping;
+	newpage = NULL;
+
+	for (;;) {
+		backpage = find_get_page(bmapping, netpage->index);
+		if (backpage)
+			goto backing_page_already_present;
+
+		if (!newpage) {
+			newpage = page_cache_alloc_cold(bmapping);
+			if (!newpage)
+				goto nomem_monitor;
+		}
+
+		ret = add_to_page_cache(newpage, bmapping,
+					netpage->index, GFP_KERNEL);
+		if (ret == 0)
+			goto installed_new_backing_page;
+		if (ret != -EEXIST)
+			goto nomem_page;
+	}
+
+	/* we've installed a new backing page, so now we need to add it
+	 * to the LRU list and start it reading */
+installed_new_backing_page:
+	_debug("- new %p", newpage);
+
+	backpage = newpage;
+	newpage = NULL;
+
+	page_cache_get(backpage);
+	pagevec_add(pagevec, backpage);
+	__pagevec_lru_add_file(pagevec);
+
+read_backing_page:
+	ret = bmapping->a_ops->readpage(NULL, backpage);
+	if (ret < 0)
+		goto read_error;
+
+	/* set the monitor to transfer the data across */
+monitor_backing_page:
+	_debug("- monitor add");
+
+	/* install the monitor */
+	page_cache_get(monitor->netfs_page);
+	page_cache_get(backpage);
+	monitor->back_page = backpage;
+	monitor->monitor.private = backpage;
+	add_page_wait_queue(backpage, &monitor->monitor);
+	monitor = NULL;
+
+	/* but the page may have been read before the monitor was installed, so
+	 * the monitor may miss the event - so we have to ensure that we do get
+	 * one in such a case */
+	if (trylock_page(backpage)) {
+		_debug("jumpstart %p {%lx}", backpage, backpage->flags);
+		unlock_page(backpage);
+	}
+	goto success;
+
+	/* if the backing page is already present, it can be in one of
+	 * three states: read in progress, read failed or read okay */
+backing_page_already_present:
+	_debug("- present");
+
+	if (newpage) {
+		page_cache_release(newpage);
+		newpage = NULL;
+	}
+
+	if (PageError(backpage))
+		goto io_error;
+
+	if (PageUptodate(backpage))
+		goto backing_page_already_uptodate;
+
+	if (!trylock_page(backpage))
+		goto monitor_backing_page;
+	_debug("read %p {%lx}", backpage, backpage->flags);
+	goto read_backing_page;
+
+	/* the backing page is already up to date, attach the netfs
+	 * page to the pagecache and LRU and copy the data across */
+backing_page_already_uptodate:
+	_debug("- uptodate");
+
+	pagevec_add(pagevec, netpage);
+	fscache_mark_pages_cached(op, pagevec);
+
+	copy_highpage(netpage, backpage);
+	fscache_end_io(op, netpage, 0);
+
+success:
+	_debug("success");
+	ret = 0;
+
+out:
+	if (backpage)
+		page_cache_release(backpage);
+	if (monitor) {
+		fscache_put_retrieval(monitor->op);
+		kfree(monitor);
+	}
+	_leave(" = %d", ret);
+	return ret;
+
+read_error:
+	_debug("read error %d", ret);
+	if (ret == -ENOMEM)
+		goto out;
+io_error:
+	cachefiles_io_error_obj(object, "Page read error on backing file");
+	ret = -ENOBUFS;
+	goto out;
+
+nomem_page:
+	page_cache_release(newpage);
+nomem_monitor:
+	fscache_put_retrieval(monitor->op);
+	kfree(monitor);
+nomem:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - if the page is backed by a block in the cache:
+ *   - a read will be started which will call the callback on completion
+ *   - 0 will be returned
+ * - else if the page is unbacked:
+ *   - the metadata will be retained
+ *   - -ENODATA will be returned
+ */
+int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
+				  struct page *page,
+				  gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct pagevec pagevec;
+	struct inode *inode;
+	sector_t block0, block;
+	unsigned shift;
+	int ret;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("{%p},{%lx},,,", object, page->index);
+
+	if (!object->backer)
+		return -ENOBUFS;
+
+	inode = object->backer->d_inode;
+	ASSERT(S_ISREG(inode->i_mode));
+	ASSERT(inode->i_mapping->a_ops->bmap);
+	ASSERT(inode->i_mapping->a_ops->readpages);
+
+	/* calculate the shift required to use bmap */
+	if (inode->i_sb->s_blocksize > PAGE_SIZE)
+		return -ENOBUFS;
+
+	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+	op->op.flags = FSCACHE_OP_FAST;
+	op->op.processor = cachefiles_read_copier;
+
+	pagevec_init(&pagevec, 0);
+
+	/* we assume the absence or presence of the first block is a good
+	 * enough indication for the page as a whole
+	 * - TODO: don't use bmap() for this as it is _not_ actually good
+	 *   enough for this as it doesn't indicate errors, but it's all we've
+	 *   got for the moment
+	 */
+	block0 = page->index;
+	block0 <<= shift;
+
+	block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
+	_debug("%llx -> %llx",
+	       (unsigned long long) block0,
+	       (unsigned long long) block);
+
+	if (block) {
+		/* submit the apparently valid page to the backing fs to be
+		 * read from disk */
+		ret = cachefiles_read_backing_file_one(object, op, page,
+						       &pagevec);
+	} else if (cachefiles_has_space(cache, 0, 1) == 0) {
+		/* there's space in the cache we can use */
+		pagevec_add(&pagevec, page);
+		fscache_mark_pages_cached(op, &pagevec);
+		ret = -ENODATA;
+	} else {
+		ret = -ENOBUFS;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * read the corresponding pages to the given set from the backing file
+ * - any uncertain pages are simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file(struct cachefiles_object *object,
+					struct fscache_retrieval *op,
+					struct list_head *list,
+					struct pagevec *mark_pvec)
+{
+	struct cachefiles_one_read *monitor = NULL;
+	struct address_space *bmapping = object->backer->d_inode->i_mapping;
+	struct pagevec lru_pvec;
+	struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
+	int ret = 0;
+
+	_enter("");
+
+	pagevec_init(&lru_pvec, 0);
+
+	list_for_each_entry_safe(netpage, _n, list, lru) {
+		list_del(&netpage->lru);
+
+		_debug("read back %p{%lu,%d}",
+		       netpage, netpage->index, page_count(netpage));
+
+		if (!monitor) {
+			monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+			if (!monitor)
+				goto nomem;
+
+			monitor->op = fscache_get_retrieval(op);
+			init_waitqueue_func_entry(&monitor->monitor,
+						  cachefiles_read_waiter);
+		}
+
+		for (;;) {
+			backpage = find_get_page(bmapping, netpage->index);
+			if (backpage)
+				goto backing_page_already_present;
+
+			if (!newpage) {
+				newpage = page_cache_alloc_cold(bmapping);
+				if (!newpage)
+					goto nomem;
+			}
+
+			ret = add_to_page_cache(newpage, bmapping,
+						netpage->index, GFP_KERNEL);
+			if (ret == 0)
+				goto installed_new_backing_page;
+			if (ret != -EEXIST)
+				goto nomem;
+		}
+
+		/* we've installed a new backing page, so now we need to add it
+		 * to the LRU list and start it reading */
+	installed_new_backing_page:
+		_debug("- new %p", newpage);
+
+		backpage = newpage;
+		newpage = NULL;
+
+		page_cache_get(backpage);
+		if (!pagevec_add(&lru_pvec, backpage))
+			__pagevec_lru_add_file(&lru_pvec);
+
+	reread_backing_page:
+		ret = bmapping->a_ops->readpage(NULL, backpage);
+		if (ret < 0)
+			goto read_error;
+
+		/* add the netfs page to the pagecache and LRU, and set the
+		 * monitor to transfer the data across */
+	monitor_backing_page:
+		_debug("- monitor add");
+
+		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
+					GFP_KERNEL);
+		if (ret < 0) {
+			if (ret == -EEXIST) {
+				page_cache_release(netpage);
+				continue;
+			}
+			goto nomem;
+		}
+
+		page_cache_get(netpage);
+		if (!pagevec_add(&lru_pvec, netpage))
+			__pagevec_lru_add_file(&lru_pvec);
+
+		/* install a monitor */
+		page_cache_get(netpage);
+		monitor->netfs_page = netpage;
+
+		page_cache_get(backpage);
+		monitor->back_page = backpage;
+		monitor->monitor.private = backpage;
+		add_page_wait_queue(backpage, &monitor->monitor);
+		monitor = NULL;
+
+		/* but the page may have been read before the monitor was
+		 * installed, so the monitor may miss the event - so we have to
+		 * ensure that we do get one in such a case */
+		if (trylock_page(backpage)) {
+			_debug("2unlock %p {%lx}", backpage, backpage->flags);
+			unlock_page(backpage);
+		}
+
+		page_cache_release(backpage);
+		backpage = NULL;
+
+		page_cache_release(netpage);
+		netpage = NULL;
+		continue;
+
+		/* if the backing page is already present, it can be in one of
+		 * three states: read in progress, read failed or read okay */
+	backing_page_already_present:
+		_debug("- present %p", backpage);
+
+		if (PageError(backpage))
+			goto io_error;
+
+		if (PageUptodate(backpage))
+			goto backing_page_already_uptodate;
+
+		_debug("- not ready %p{%lx}", backpage, backpage->flags);
+
+		if (!trylock_page(backpage))
+			goto monitor_backing_page;
+
+		if (PageError(backpage)) {
+			_debug("error %lx", backpage->flags);
+			unlock_page(backpage);
+			goto io_error;
+		}
+
+		if (PageUptodate(backpage))
+			goto backing_page_already_uptodate_unlock;
+
+		/* we've locked a page that's neither up to date nor erroneous,
+		 * so we need to attempt to read it again */
+		goto reread_backing_page;
+
+		/* the backing page is already up to date, attach the netfs
+		 * page to the pagecache and LRU and copy the data across */
+	backing_page_already_uptodate_unlock:
+		_debug("uptodate %lx", backpage->flags);
+		unlock_page(backpage);
+	backing_page_already_uptodate:
+		_debug("- uptodate");
+
+		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
+					GFP_KERNEL);
+		if (ret < 0) {
+			if (ret == -EEXIST) {
+				page_cache_release(netpage);
+				continue;
+			}
+			goto nomem;
+		}
+
+		copy_highpage(netpage, backpage);
+
+		page_cache_release(backpage);
+		backpage = NULL;
+
+		if (!pagevec_add(mark_pvec, netpage))
+			fscache_mark_pages_cached(op, mark_pvec);
+
+		page_cache_get(netpage);
+		if (!pagevec_add(&lru_pvec, netpage))
+			__pagevec_lru_add_file(&lru_pvec);
+
+		fscache_end_io(op, netpage, 0);
+		page_cache_release(netpage);
+		netpage = NULL;
+		continue;
+	}
+
+	netpage = NULL;
+
+	_debug("out");
+
+out:
+	/* tidy up */
+	pagevec_lru_add_file(&lru_pvec);
+
+	if (newpage)
+		page_cache_release(newpage);
+	if (netpage)
+		page_cache_release(netpage);
+	if (backpage)
+		page_cache_release(backpage);
+	if (monitor) {
+		fscache_put_retrieval(op);
+		kfree(monitor);
+	}
+
+	list_for_each_entry_safe(netpage, _n, list, lru) {
+		list_del(&netpage->lru);
+		page_cache_release(netpage);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+nomem:
+	_debug("nomem");
+	ret = -ENOMEM;
+	goto out;
+
+read_error:
+	_debug("read error %d", ret);
+	if (ret == -ENOMEM)
+		goto out;
+io_error:
+	cachefiles_io_error_obj(object, "Page read error on backing file");
+	ret = -ENOBUFS;
+	goto out;
+}
+
+/*
+ * read a list of pages from the cache or allocate blocks in which to store
+ * them
+ */
+int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
+				   struct list_head *pages,
+				   unsigned *nr_pages,
+				   gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct list_head backpages;
+	struct pagevec pagevec;
+	struct inode *inode;
+	struct page *page, *_n;
+	unsigned shift, nrbackpages;
+	int ret, ret2, space;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("{OBJ%x,%d},,%d,,",
+	       object->fscache.debug_id, atomic_read(&op->op.usage),
+	       *nr_pages);
+
+	if (!object->backer)
+		return -ENOBUFS;
+
+	space = 1;
+	if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
+		space = 0;
+
+	inode = object->backer->d_inode;
+	ASSERT(S_ISREG(inode->i_mode));
+	ASSERT(inode->i_mapping->a_ops->bmap);
+	ASSERT(inode->i_mapping->a_ops->readpages);
+
+	/* calculate the shift required to use bmap */
+	if (inode->i_sb->s_blocksize > PAGE_SIZE)
+		return -ENOBUFS;
+
+	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+	pagevec_init(&pagevec, 0);
+
+	op->op.flags = FSCACHE_OP_FAST;
+	op->op.processor = cachefiles_read_copier;
+
+	INIT_LIST_HEAD(&backpages);
+	nrbackpages = 0;
+
+	ret = space ? -ENODATA : -ENOBUFS;
+	list_for_each_entry_safe(page, _n, pages, lru) {
+		sector_t block0, block;
+
+		/* we assume the absence or presence of the first block is a
+		 * good enough indication for the page as a whole
+		 * - TODO: don't use bmap() for this as it is _not_ actually
+		 *   good enough for this as it doesn't indicate errors, but
+		 *   it's all we've got for the moment
+		 */
+		block0 = page->index;
+		block0 <<= shift;
+
+		block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
+						      block0);
+		_debug("%llx -> %llx",
+		       (unsigned long long) block0,
+		       (unsigned long long) block);
+
+		if (block) {
+			/* we have data - add it to the list to give to the
+			 * backing fs */
+			list_move(&page->lru, &backpages);
+			(*nr_pages)--;
+			nrbackpages++;
+		} else if (space && pagevec_add(&pagevec, page) == 0) {
+			fscache_mark_pages_cached(op, &pagevec);
+			ret = -ENODATA;
+		}
+	}
+
+	if (pagevec_count(&pagevec) > 0)
+		fscache_mark_pages_cached(op, &pagevec);
+
+	if (list_empty(pages))
+		ret = 0;
+
+	/* submit the apparently valid pages to the backing fs to be read from
+	 * disk */
+	if (nrbackpages > 0) {
+		ret2 = cachefiles_read_backing_file(object, op, &backpages,
+						    &pagevec);
+		if (ret2 == -ENOMEM || ret2 == -EINTR)
+			ret = ret2;
+	}
+
+	if (pagevec_count(&pagevec) > 0)
+		fscache_mark_pages_cached(op, &pagevec);
+
+	_leave(" = %d [nr=%u%s]",
+	       ret, *nr_pages, list_empty(pages) ? " empty" : "");
+	return ret;
+}
+
+/*
+ * allocate a block in the cache in which to store a page
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - otherwise:
+ *   - the metadata will be retained
+ *   - 0 will be returned
+ */
+int cachefiles_allocate_page(struct fscache_retrieval *op,
+			     struct page *page,
+			     gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct pagevec pagevec;
+	int ret;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("%p,{%lx},", object, page->index);
+
+	ret = cachefiles_has_space(cache, 0, 1);
+	if (ret == 0) {
+		pagevec_init(&pagevec, 0);
+		pagevec_add(&pagevec, page);
+		fscache_mark_pages_cached(op, &pagevec);
+	} else {
+		ret = -ENOBUFS;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * allocate blocks in the cache in which to store a set of pages
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if some buffers couldn't be made available
+ * - returns -ENOBUFS if some pages are beyond EOF
+ * - otherwise:
+ *   - -ENODATA will be returned
+ * - metadata will be retained for any page marked
+ */
+int cachefiles_allocate_pages(struct fscache_retrieval *op,
+			      struct list_head *pages,
+			      unsigned *nr_pages,
+			      gfp_t gfp)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct pagevec pagevec;
+	struct page *page;
+	int ret;
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("%p,,,%d,", object, *nr_pages);
+
+	ret = cachefiles_has_space(cache, 0, *nr_pages);
+	if (ret == 0) {
+		pagevec_init(&pagevec, 0);
+
+		list_for_each_entry(page, pages, lru) {
+			if (pagevec_add(&pagevec, page) == 0)
+				fscache_mark_pages_cached(op, &pagevec);
+		}
+
+		if (pagevec_count(&pagevec) > 0)
+			fscache_mark_pages_cached(op, &pagevec);
+		ret = -ENODATA;
+	} else {
+		ret = -ENOBUFS;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * request a page be stored in the cache
+ * - cache withdrawal is prevented by the caller
+ * - this request may be ignored if there's no cache block available, in which
+ *   case -ENOBUFS will be returned
+ * - if the op is in progress, 0 will be returned
+ */
+int cachefiles_write_page(struct fscache_storage *op, struct page *page)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	mm_segment_t old_fs;
+	struct file *file;
+	loff_t pos;
+	void *data;
+	int ret;
+
+	ASSERT(op != NULL);
+	ASSERT(page != NULL);
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+
+	_enter("%p,%p{%lx},,,", object, page, page->index);
+
+	if (!object->backer) {
+		_leave(" = -ENOBUFS");
+		return -ENOBUFS;
+	}
+
+	ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	/* write the page to the backing filesystem and let it store it in its
+	 * own time */
+	dget(object->backer);
+	mntget(cache->mnt);
+	file = dentry_open(object->backer, cache->mnt, O_RDWR,
+			   cache->cache_cred);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+	} else {
+		ret = -EIO;
+		if (file->f_op->write) {
+			pos = (loff_t) page->index << PAGE_SHIFT;
+			data = kmap(page);
+			old_fs = get_fs();
+			set_fs(KERNEL_DS);
+			ret = file->f_op->write(
+				file, (const void __user *) data, PAGE_SIZE,
+				&pos);
+			set_fs(old_fs);
+			kunmap(page);
+			if (ret != PAGE_SIZE)
+				ret = -EIO;
+		}
+		fput(file);
+	}
+
+	if (ret < 0) {
+		if (ret == -EIO)
+			cachefiles_io_error_obj(
+				object, "Write page to backing file failed");
+		ret = -ENOBUFS;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * detach a backing block from a page
+ * - cache withdrawal is prevented by the caller
+ */
+void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+
+	object = container_of(_object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	_enter("%p,{%lu}", object, page->index);
+
+	spin_unlock(&object->fscache.cookie->lock);
+}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 000000000000..b5808cdb2232
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
+/* CacheFiles security management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/cred.h>
+#include "internal.h"
+
+/*
+ * determine the security context within which we access the cache from within
+ * the kernel
+ */
+int cachefiles_get_security_ID(struct cachefiles_cache *cache)
+{
+	struct cred *new;
+	int ret;
+
+	_enter("{%s}", cache->secctx);
+
+	new = prepare_kernel_cred(current);
+	if (!new) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	if (cache->secctx) {
+		ret = set_security_override_from_ctx(new, cache->secctx);
+		if (ret < 0) {
+			put_cred(new);
+			printk(KERN_ERR "CacheFiles:"
+			       " Security denies permission to nominate"
+			       " security context: error %d\n",
+			       ret);
+			goto error;
+		}
+	}
+
+	cache->cache_cred = new;
+	ret = 0;
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * see if mkdir and create can be performed in the root directory
+ */
+static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
+				      struct dentry *root)
+{
+	int ret;
+
+	ret = security_inode_mkdir(root->d_inode, root, 0);
+	if (ret < 0) {
+		printk(KERN_ERR "CacheFiles:"
+		       " Security denies permission to make dirs: error %d",
+		       ret);
+		return ret;
+	}
+
+	ret = security_inode_create(root->d_inode, root, 0);
+	if (ret < 0)
+		printk(KERN_ERR "CacheFiles:"
+		       " Security denies permission to create files: error %d",
+		       ret);
+
+	return ret;
+}
+
+/*
+ * check the security details of the on-disk cache
+ * - must be called with security override in force
+ */
+int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+					struct dentry *root,
+					const struct cred **_saved_cred)
+{
+	struct cred *new;
+	int ret;
+
+	_enter("");
+
+	/* duplicate the cache creds for COW (the override is currently in
+	 * force, so we can use prepare_creds() to do this) */
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	cachefiles_end_secure(cache, *_saved_cred);
+
+	/* use the cache root dir's security context as the basis with
+	 * which create files */
+	ret = set_create_files_as(new, root->d_inode);
+	if (ret < 0) {
+		_leave(" = %d [cfa]", ret);
+		return ret;
+	}
+
+	put_cred(cache->cache_cred);
+	cache->cache_cred = new;
+
+	cachefiles_begin_secure(cache, _saved_cred);
+	ret = cachefiles_check_cache_dir(cache, root);
+
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 000000000000..f3e7a0bf068b
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
+/* CacheFiles extended attribute management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include "internal.h"
+
+static const char cachefiles_xattr_cache[] =
+	XATTR_USER_PREFIX "CacheFiles.cache";
+
+/*
+ * check the type label on an object
+ * - done using xattrs
+ */
+int cachefiles_check_object_type(struct cachefiles_object *object)
+{
+	struct dentry *dentry = object->dentry;
+	char type[3], xtype[3];
+	int ret;
+
+	ASSERT(dentry);
+	ASSERT(dentry->d_inode);
+
+	if (!object->fscache.cookie)
+		strcpy(type, "C3");
+	else
+		snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
+
+	_enter("%p{%s}", object, type);
+
+	/* attempt to install a type label directly */
+	ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
+			   XATTR_CREATE);
+	if (ret == 0) {
+		_debug("SET"); /* we succeeded */
+		goto error;
+	}
+
+	if (ret != -EEXIST) {
+		kerror("Can't set xattr on %*.*s [%lu] (err %d)",
+		       dentry->d_name.len, dentry->d_name.len,
+		       dentry->d_name.name, dentry->d_inode->i_ino,
+		       -ret);
+		goto error;
+	}
+
+	/* read the current type label */
+	ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
+	if (ret < 0) {
+		if (ret == -ERANGE)
+			goto bad_type_length;
+
+		kerror("Can't read xattr on %*.*s [%lu] (err %d)",
+		       dentry->d_name.len, dentry->d_name.len,
+		       dentry->d_name.name, dentry->d_inode->i_ino,
+		       -ret);
+		goto error;
+	}
+
+	/* check the type is what we're expecting */
+	if (ret != 2)
+		goto bad_type_length;
+
+	if (xtype[0] != type[0] || xtype[1] != type[1])
+		goto bad_type;
+
+	ret = 0;
+
+error:
+	_leave(" = %d", ret);
+	return ret;
+
+bad_type_length:
+	kerror("Cache object %lu type xattr length incorrect",
+	       dentry->d_inode->i_ino);
+	ret = -EIO;
+	goto error;
+
+bad_type:
+	xtype[2] = 0;
+	kerror("Cache object %*.*s [%lu] type %s not %s",
+	       dentry->d_name.len, dentry->d_name.len,
+	       dentry->d_name.name, dentry->d_inode->i_ino,
+	       xtype, type);
+	ret = -EIO;
+	goto error;
+}
+
+/*
+ * set the state xattr on a cache file
+ */
+int cachefiles_set_object_xattr(struct cachefiles_object *object,
+				struct cachefiles_xattr *auxdata)
+{
+	struct dentry *dentry = object->dentry;
+	int ret;
+
+	ASSERT(object->fscache.cookie);
+	ASSERT(dentry);
+
+	_enter("%p,#%d", object, auxdata->len);
+
+	/* attempt to install the cache metadata directly */
+	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+
+	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len,
+			   XATTR_CREATE);
+	if (ret < 0 && ret != -ENOMEM)
+		cachefiles_io_error_obj(
+			object,
+			"Failed to set xattr with error %d", ret);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * update the state xattr on a cache file
+ */
+int cachefiles_update_object_xattr(struct cachefiles_object *object,
+				   struct cachefiles_xattr *auxdata)
+{
+	struct dentry *dentry = object->dentry;
+	int ret;
+
+	ASSERT(object->fscache.cookie);
+	ASSERT(dentry);
+
+	_enter("%p,#%d", object, auxdata->len);
+
+	/* attempt to install the cache metadata directly */
+	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+
+	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len,
+			   XATTR_REPLACE);
+	if (ret < 0 && ret != -ENOMEM)
+		cachefiles_io_error_obj(
+			object,
+			"Failed to update xattr with error %d", ret);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * check the state xattr on a cache file
+ * - return -ESTALE if the object should be deleted
+ */
+int cachefiles_check_object_xattr(struct cachefiles_object *object,
+				  struct cachefiles_xattr *auxdata)
+{
+	struct cachefiles_xattr *auxbuf;
+	struct dentry *dentry = object->dentry;
+	int ret;
+
+	_enter("%p,#%d", object, auxdata->len);
+
+	ASSERT(dentry);
+	ASSERT(dentry->d_inode);
+
+	auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+	if (!auxbuf) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	/* read the current type label */
+	ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
+			   &auxbuf->type, 512 + 1);
+	if (ret < 0) {
+		if (ret == -ENODATA)
+			goto stale; /* no attribute - power went off
+				     * mid-cull? */
+
+		if (ret == -ERANGE)
+			goto bad_type_length;
+
+		cachefiles_io_error_obj(object,
+					"Can't read xattr on %lu (err %d)",
+					dentry->d_inode->i_ino, -ret);
+		goto error;
+	}
+
+	/* check the on-disk object */
+	if (ret < 1)
+		goto bad_type_length;
+
+	if (auxbuf->type != auxdata->type)
+		goto stale;
+
+	auxbuf->len = ret;
+
+	/* consult the netfs */
+	if (object->fscache.cookie->def->check_aux) {
+		enum fscache_checkaux result;
+		unsigned int dlen;
+
+		dlen = auxbuf->len - 1;
+
+		_debug("checkaux %s #%u",
+		       object->fscache.cookie->def->name, dlen);
+
+		result = fscache_check_aux(&object->fscache,
+					   &auxbuf->data, dlen);
+
+		switch (result) {
+			/* entry okay as is */
+		case FSCACHE_CHECKAUX_OKAY:
+			goto okay;
+
+			/* entry requires update */
+		case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+			break;
+
+			/* entry requires deletion */
+		case FSCACHE_CHECKAUX_OBSOLETE:
+			goto stale;
+
+		default:
+			BUG();
+		}
+
+		/* update the current label */
+		ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+				   &auxdata->type, auxdata->len,
+				   XATTR_REPLACE);
+		if (ret < 0) {
+			cachefiles_io_error_obj(object,
+						"Can't update xattr on %lu"
+						" (error %d)",
+						dentry->d_inode->i_ino, -ret);
+			goto error;
+		}
+	}
+
+okay:
+	ret = 0;
+
+error:
+	kfree(auxbuf);
+	_leave(" = %d", ret);
+	return ret;
+
+bad_type_length:
+	kerror("Cache object %lu xattr length incorrect",
+	       dentry->d_inode->i_ino);
+	ret = -EIO;
+	goto error;
+
+stale:
+	ret = -ESTALE;
+	goto error;
+}
+
+/*
+ * remove the object's xattr to mark it stale
+ */
+int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+				   struct dentry *dentry)
+{
+	int ret;
+
+	ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
+	if (ret < 0) {
+		if (ret == -ENOENT || ret == -ENODATA)
+			ret = 0;
+		else if (ret != -ENOMEM)
+			cachefiles_io_error(cache,
+					    "Can't remove xattr from %lu"
+					    " (error %d)",
+					    dentry->d_inode->i_ino, -ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 73ac7ebd1dfc..65984006192c 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,22 @@
+Version 1.57
+------------
+Improve support for multiple security contexts to the same server. We
+used to use the same "vcnumber" for all connections which could cause
+the server to treat subsequent connections, especially those that
+are authenticated as guest, as reconnections, invalidating the earlier
+user's smb session.  This fix allows cifs to mount multiple times to the
+same server with different userids without risking invalidating earlier
+established security contexts.  fsync now sends SMB Flush operation
+to better ensure that we wait for server to write all of the data to
+server disk (not just write it over the network).  Add new mount
+parameter to allow user to disable sending the (slow) SMB flush on
+fsync if desired (fsync still flushes all cached write data to the server).
+Posix file open support added (turned off after one attempt if server
+fails to support it properly, as with Samba server versions prior to 3.3.2)
+Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
+little memory for the "nativeFileSystem" field returned by the server
+during mount). 
+
 Version 1.56
 ------------
 Add "forcemandatorylock" mount option to allow user to use mandatory
@@ -7,7 +26,10 @@ specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
 Samba servers (worked to Windows).  Fix rmdir so that pending search
 (readdir) requests do not get invalid results which include the now
-removed directory.
+removed directory.  Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
+when using DFS.  Add better file create support to servers which support
+the CIFS POSIX protocol extensions (this adds support for new flags
+on create, and improves semantics for write of locked ranges).
 
 Version 1.55
 ------------
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 341a98965bd0..6994a0f54f02 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -118,6 +118,18 @@ config CIFS_DEBUG2
 	   option can be turned off unless you are debugging
 	   cifs problems.  If unsure, say N.
 
+config CIFS_DFS_UPCALL
+	  bool "DFS feature support"
+	  depends on CIFS && KEYS
+	  help
+	    Distributed File System (DFS) support is used to access shares
+	    transparently in an enterprise name space, even if the share
+	    moves to a different server.  This feature also enables
+	    an upcall mechanism for CIFS which contacts userspace helper
+	    utilities to provide server name resolution (host names to
+	    IP addresses) which is needed for implicit mounts of DFS junction
+	    points. If unsure, say N.
+
 config CIFS_EXPERIMENTAL
 	  bool "CIFS Experimental Features (EXPERIMENTAL)"
 	  depends on CIFS && EXPERIMENTAL
@@ -131,12 +143,3 @@ config CIFS_EXPERIMENTAL
 	    (which is disabled by default). See the file fs/cifs/README
 	    for more details.  If unsure, say N.
 
-config CIFS_DFS_UPCALL
-	  bool "DFS feature support (EXPERIMENTAL)"
-	  depends on CIFS_EXPERIMENTAL
-	  depends on KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which contacts userspace
-	    helper utilities to provide server name resolution (host names to
-	    IP addresses) which is needed for implicit mounts of DFS junction
-	    points. If unsure, say N.
diff --git a/fs/cifs/README b/fs/cifs/README
index da4515e3be20..07434181623b 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -472,6 +472,19 @@ A partial list of the supported mount options follows:
 		even if the cifs server would support posix advisory locks.
 		"forcemand" is accepted as a shorter form of this mount
 		option.
+ nostrictsync   If this mount option is set, when an application does an
+		fsync call then the cifs client does not send an SMB Flush
+		to the server (to force the server to write all dirty data
+		for this file immediately to disk), although cifs still sends
+		all dirty (cached) file data to the server and waits for the
+		server to respond to the write.  Since SMB Flush can be
+		very slow, and some servers may be reliable enough (to risk
+		delaying slightly flushing the data to disk on the server),
+		turning on this option may be useful to improve performance for
+		applications that fsync too much, at a small risk of server
+		crash.  If this mount option is not set, by default cifs will
+		send an SMB flush request (and wait for a response) on every
+		fsync call.
  nodfs          Disable DFS (global name space support) even if the
 		server claims to support it.  This can help work around
 		a problem with parsing of DFS paths with Samba server
@@ -692,13 +705,14 @@ require this helper. Note that NTLMv2 security (which does not require the
 cifs.upcall helper program), instead of using Kerberos, is sufficient for
 some use cases.
 
-Enabling DFS support (used to access shares transparently in an MS-DFS
-global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled.  In
-addition, DFS support for target shares which are specified as UNC
+DFS support allows transparent redirection to shares in an MS-DFS name space.
+In addition, DFS support for target shares which are specified as UNC
 names which begin with host names (rather than IP addresses) requires
 a user space helper (such as cifs.upcall) to be present in order to
 translate host names to ip address, and the user space helper must also
-be configured in the file /etc/request-key.conf
+be configured in the file /etc/request-key.conf.  Samba, Windows servers and
+many NAS appliances support DFS as a way of constructing a global name
+space to ease network configuration and improve reliability.
 
 To use cifs Kerberos and DFS support, the Linux keyutils package should be
 installed and something like the following lines should be added to the
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 490e34bbf27a..7f19fefd3d45 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -340,6 +340,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 				seq_printf(m, "\nWrites: %d Bytes: %lld",
 					atomic_read(&tcon->num_writes),
 					(long long)(tcon->bytes_written));
+				seq_printf(m, "\nFlushes: %d",
+					atomic_read(&tcon->num_flushes));
 				seq_printf(m, "\nLocks: %d HardLinks: %d "
 					      "Symlinks: %d",
 					atomic_read(&tcon->num_locks),
@@ -402,7 +404,6 @@ cifs_proc_init(void)
 	if (proc_fs_cifs == NULL)
 		return;
 
-	proc_fs_cifs->owner = THIS_MODULE;
 	proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
 
 #ifdef CONFIG_CIFS_STATS
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 85c0a74d034d..5fdbf8a14472 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -104,9 +104,9 @@ static char *cifs_get_share_name(const char *node_name)
 
 
 /**
- * compose_mount_options	-	creates mount options for refferral
+ * cifs_compose_mount_options	-	creates mount options for refferral
  * @sb_mountdata:	parent/root DFS mount options (template)
- * @dentry:		point where we are going to mount
+ * @fullpath:		full path in UNC format
  * @ref:		server's referral
  * @devname:		pointer for saving device name
  *
@@ -116,8 +116,8 @@ static char *cifs_get_share_name(const char *node_name)
  * Returns: pointer to new mount options or ERR_PTR.
  * Caller is responcible for freeing retunrned value if it is not error.
  */
-static char *compose_mount_options(const char *sb_mountdata,
-				   struct dentry *dentry,
+char *cifs_compose_mount_options(const char *sb_mountdata,
+				   const char *fullpath,
 				   const struct dfs_info3_param *ref,
 				   char **devname)
 {
@@ -128,7 +128,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 	char *srvIP = NULL;
 	char sep = ',';
 	int off, noff;
-	char *fullpath;
 
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
@@ -202,17 +201,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 		goto compose_mount_options_err;
 	}
 
-	/*
-	 * this function gives us a path with a double backslash prefix. We
-	 * require a single backslash for DFS. Temporarily increment fullpath
-	 * to put it in the proper form and decrement before freeing it.
-	 */
-	fullpath = build_path_from_dentry(dentry);
-	if (!fullpath) {
-		rc = -ENOMEM;
-		goto compose_mount_options_err;
-	}
-	++fullpath;
 	tkn_e = strchr(tkn_e + 1, '\\');
 	if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
 		strncat(mountdata, &sep, 1);
@@ -221,8 +209,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 			strcat(mountdata, tkn_e + 1);
 		strcat(mountdata, fullpath + ref->path_consumed);
 	}
-	--fullpath;
-	kfree(fullpath);
 
 	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
 	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -245,10 +231,20 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
 	struct vfsmount *mnt;
 	char *mountdata;
 	char *devname = NULL;
+	char *fullpath;
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-	mountdata = compose_mount_options(cifs_sb->mountdata,
-						dentry, ref, &devname);
+	/*
+	 * this function gives us a path with a double backslash prefix. We
+	 * require a single backslash for DFS.
+	 */
+	fullpath = build_path_from_dentry(dentry);
+	if (!fullpath)
+		return ERR_PTR(-ENOMEM);
+
+	mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
+			fullpath + 1, ref, &devname);
+	kfree(fullpath);
 
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c4c306f7b06f..4797787c6a44 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -32,6 +32,7 @@
 #define CIFS_MOUNT_OVERR_GID    0x800 /* override gid returned from server    */
 #define CIFS_MOUNT_DYNPERM      0x1000 /* allow in-memory only mode setting   */
 #define CIFS_MOUNT_NOPOSIXBRL   0x2000 /* mandatory not posix byte range lock */
+#define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
 
 struct cifs_sb_info {
 	struct cifsTconInfo *tcon;	/* primary mount */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 13ea53251dcf..38491fd3871d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -606,7 +606,8 @@ cifs_get_sb(struct file_system_type *fs_type,
 		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 }
 
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7ac481841f87..77e190dc2883 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -78,8 +78,8 @@ extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
 
 /* Functions related to dir entries */
-extern struct dentry_operations cifs_dentry_ops;
-extern struct dentry_operations cifs_ci_dentry_ops;
+extern const struct dentry_operations cifs_dentry_ops;
+extern const struct dentry_operations cifs_ci_dentry_ops;
 
 /* Functions related to symlinks */
 extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.56"
+#define CIFS_VERSION   "1.57"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94c1ca0ec953..9fbf4dff5da6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -164,9 +164,12 @@ struct TCP_Server_Info {
 	/* multiplexed reads or writes */
 	unsigned int maxBuf;	/* maxBuf specifies the maximum */
 	/* message size the server can send or receive for non-raw SMBs */
-	unsigned int maxRw;	/* maxRw specifies the maximum */
+	unsigned int max_rw;	/* maxRw specifies the maximum */
 	/* message size the server can send or receive for */
 	/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
+	unsigned int max_vcs;	/* maximum number of smb sessions, at least
+				   those that can be specified uniquely with
+				   vcnumbers */
 	char sessid[4];		/* unique token id for this session */
 	/* (returned on Negotiate */
 	int capabilities; /* allow selective disabling of caps by smb sess */
@@ -210,6 +213,7 @@ struct cifsSesInfo {
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
 	__u16 flags;
+	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
@@ -250,6 +254,7 @@ struct cifsTconInfo {
 	atomic_t num_smbs_sent;
 	atomic_t num_writes;
 	atomic_t num_reads;
+	atomic_t num_flushes;
 	atomic_t num_oplock_brks;
 	atomic_t num_opens;
 	atomic_t num_closes;
@@ -294,6 +299,7 @@ struct cifsTconInfo {
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
 	bool local_lease:1; /* check leases (only) on local system not remote */
+	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
 	/* BB add field for back pointer to sb struct(s)? */
 };
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b4e2e9f0ee3d..b370489c8da5 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifspdu.h
  *
- *   Copyright (c) International Business Machines  Corp., 2002,2008
+ *   Copyright (c) International Business Machines  Corp., 2002,2009
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -23,6 +23,7 @@
 #define _CIFSPDU_H
 
 #include <net/sock.h>
+#include "smbfsctl.h"
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 #define LANMAN_PROT 0
@@ -34,15 +35,15 @@
 #define POSIX_PROT  (CIFS_PROT+1)
 #define BAD_PROT 0xFFFF
 
-/* SMB command codes */
-/*
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+/* SMB command codes:
+ * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses
  * (ie which include no useful data other than the SMB error code itself).
- * Knowing this helps avoid response buffer allocations and copy in some cases
+ * This can allow us to avoid response buffer allocations and copy in some cases
  */
 #define SMB_COM_CREATE_DIRECTORY      0x00 /* trivial response */
 #define SMB_COM_DELETE_DIRECTORY      0x01 /* trivial response */
 #define SMB_COM_CLOSE                 0x04 /* triv req/rsp, timestamp ignored */
+#define SMB_COM_FLUSH                 0x05 /* triv req/rsp */
 #define SMB_COM_DELETE                0x06 /* trivial response */
 #define SMB_COM_RENAME                0x07 /* trivial response */
 #define SMB_COM_QUERY_INFORMATION     0x08 /* aka getattr */
@@ -790,6 +791,12 @@ typedef struct smb_com_close_rsp {
 	__u16 ByteCount;	/* bct = 0 */
 } __attribute__((packed)) CLOSE_RSP;
 
+typedef struct smb_com_flush_req {
+	struct smb_hdr hdr;	/* wct = 1 */
+	__u16 FileID;
+	__u16 ByteCount;	/* 0 */
+} __attribute__((packed)) FLUSH_REQ;
+
 typedef struct smb_com_findclose_req {
 	struct smb_hdr hdr; /* wct = 1 */
 	__u16 FileID;
@@ -1924,19 +1931,19 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
 #define DFS_TYPE_ROOT 0x0001
 
 /* Referral Entry Flags */
-#define DFS_NAME_LIST_REF 0x0200
+#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */
+#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */
 
-typedef struct dfs_referral_level_3 {
-	__le16 VersionNumber;
+typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */
+	__le16 VersionNumber;  /* must be 3 or 4 */
 	__le16 Size;
 	__le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */
-	__le16 ReferralEntryFlags; /* 0x0200 bit set only for domain
-				      or DC referral responce */
+	__le16 ReferralEntryFlags;
 	__le32 TimeToLive;
 	__le16 DfsPathOffset;
 	__le16 DfsAlternatePathOffset;
 	__le16 NetworkAddressOffset; /* offset of the link target */
-	__le16 ServiceSiteGuid;
+	__u8   ServiceSiteGuid[16];  /* MBZ, ignored */
 } __attribute__((packed)) REFERRAL3;
 
 typedef struct smb_com_transaction_get_dfs_refer_rsp {
@@ -1946,48 +1953,15 @@ typedef struct smb_com_transaction_get_dfs_refer_rsp {
 	__u8 Pad;
 	__le16 PathConsumed;
 	__le16 NumberOfReferrals;
-	__le16 DFSFlags;
-	__u16 Pad2;
+	__le32 DFSFlags;
 	REFERRAL3 referrals[1];	/* array of level 3 dfs_referral structures */
 	/* followed by the strings pointed to by the referral structures */
 } __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP;
 
 /* DFS Flags */
-#define DFSREF_REFERRAL_SERVER  0x0001
-#define DFSREF_STORAGE_SERVER   0x0002
-
-/* IOCTL information */
-/*
- * List of ioctl function codes that look to be of interest to remote clients
- * like this one.  Need to do some experimentation to make sure they all work
- * remotely.  Some of the following, such as the encryption/compression ones
- * would be invoked from tools via a specialized hook into the VFS rather
- * than via the standard vfs entry points
- */
-#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
-#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
-#define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
-#define FSCTL_LOCK_VOLUME            0x00090018
-#define FSCTL_UNLOCK_VOLUME          0x0009001C
-#define FSCTL_GET_COMPRESSION        0x0009003C
-#define FSCTL_SET_COMPRESSION        0x0009C040
-#define FSCTL_REQUEST_FILTER_OPLOCK  0x0009008C
-#define FSCTL_FILESYS_GET_STATISTICS 0x00090090
-#define FSCTL_SET_REPARSE_POINT      0x000900A4
-#define FSCTL_GET_REPARSE_POINT      0x000900A8
-#define FSCTL_DELETE_REPARSE_POINT   0x000900AC
-#define FSCTL_SET_SPARSE             0x000900C4
-#define FSCTL_SET_ZERO_DATA          0x000900C8
-#define FSCTL_SET_ENCRYPTION         0x000900D7
-#define FSCTL_ENCRYPTION_FSCTL_IO    0x000900DB
-#define FSCTL_WRITE_RAW_ENCRYPTED    0x000900DF
-#define FSCTL_READ_RAW_ENCRYPTED     0x000900E3
-#define FSCTL_SIS_COPYFILE           0x00090100
-#define FSCTL_SIS_LINK_FILES         0x0009C104
-
-#define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
-#define IO_REPARSE_TAG_HSM           0xC0000004
-#define IO_REPARSE_TAG_SIS           0x80000007
+#define DFSREF_REFERRAL_SERVER  0x00000001 /* all targets are DFS roots */
+#define DFSREF_STORAGE_SERVER   0x00000002 /* no further ref requests needed */
+#define DFSREF_TARGET_FAILBACK  0x00000004 /* only for DFS referral version 4 */
 
 /*
  ************************************************************************
@@ -2508,8 +2482,6 @@ struct data_blob {
 	6) Use nanosecond timestamps throughout all time fields if
 	   corresponding attribute flag is set
 	7) sendfile - handle based copy
-	8) Direct i/o
-	9) Misc fcntls?
 
 	what about fixing 64 bit alignment
 
@@ -2628,7 +2600,5 @@ typedef struct file_chattr_info {
 	__le64	mode; /* list of actual attribute bits on this inode */
 } __attribute__((packed)) FILE_CHATTR_INFO;  /* ext attributes
 						(chattr, chflags) level 0x206 */
-
-#endif
-
+#endif 				/* POSIX */
 #endif				/* _CIFSPDU_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 382ba6298809..4167716d32f2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -42,7 +42,11 @@ extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
 #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
 extern char *build_path_from_dentry(struct dentry *);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
+extern char *cifs_compose_mount_options(const char *sb_mountdata,
+		const char *fullpath, const struct dfs_info3_param *ref,
+		char **devname);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct smb_hdr * /* input */ ,
@@ -91,6 +95,12 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
 
+extern int cifs_posix_open(char *full_path, struct inode **pinode,
+			   struct super_block *sb, int mode, int oflags,
+			   int *poplock, __u16 *pnetfid, int xid);
+extern void posix_fill_in_inode(struct inode *tmp_inode,
+				FILE_UNIX_BASIC_INFO *pData, int isNewInode);
+extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
@@ -277,6 +287,9 @@ extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
 			const int smb_file_id);
 
+extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon,
+			const int smb_file_id);
+
 extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
 			const int netfid, unsigned int count,
 			const __u64 lseek, unsigned int *nbytes, char **buf,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 552642a507c4..bc09c998631f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
 		server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
 				(__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 		GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
 		/* even though we do not use raw we might as well set this
 		accurately, in case we ever find a need for it */
 		if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-			server->maxRw = 0xFF00;
+			server->max_rw = 0xFF00;
 			server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
 		} else {
-			server->maxRw = 0;/* we do not need to use raw anyway */
+			server->max_rw = 0;/* do not need to use raw anyway */
 			server->capabilities = CAP_MPX_MODE;
 		}
 		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
@@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
-	server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
+	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
 	cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -1933,6 +1934,27 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 }
 
 int
+CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+{
+	int rc = 0;
+	FLUSH_REQ *pSMB = NULL;
+	cFYI(1, ("In CIFSSMBFlush"));
+
+	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->FileID = (__u16) smb_file_id;
+	pSMB->ByteCount = 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+	cifs_stats_inc(&tcon->num_flushes);
+	if (rc)
+		cERROR(1, ("Send error in Flush = %d", rc));
+
+	return rc;
+}
+
+int
 CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 	      const char *fromName, const char *toName,
 	      const struct nls_table *nls_codepage, int remap)
@@ -2355,8 +2377,10 @@ winCreateHardLinkRetry:
 				     PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-		pSMB->OldFileName[name_len] = 0;	/* pad */
-		pSMB->OldFileName[name_len + 1] = 0x04;
+
+		/* protocol specifies ASCII buffer format (0x04) for unicode */
+		pSMB->OldFileName[name_len] = 0x04;
+		pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
 		name_len2 =
 		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
 				     toName, PATH_MAX, nls_codepage, remap);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2209be943051..0de3b5615a22 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,7 +23,6 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
-#include <linux/ipv6.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
@@ -35,6 +34,7 @@
 #include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -95,6 +95,7 @@ struct smb_vol {
 	bool local_lease:1; /* check leases only on local system, not remote */
 	bool noblocksnd:1;
 	bool noautotune:1;
+	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -1274,6 +1275,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->intr = 0;
 		} else if (strnicmp(data, "intr", 4) == 0) {
 			vol->intr = 1;
+		} else if (strnicmp(data, "nostrictsync", 12) == 0) {
+			vol->nostrictsync = 1;
+		} else if (strnicmp(data, "strictsync", 10) == 0) {
+			vol->nostrictsync = 0;
 		} else if (strnicmp(data, "serverino", 7) == 0) {
 			vol->server_ino = 1;
 		} else if (strnicmp(data, "noserverino", 9) == 0) {
@@ -1379,8 +1384,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		     server->addr.sockAddr.sin_addr.s_addr))
 			continue;
 		else if (addr->ss_family == AF_INET6 &&
-			 memcmp(&server->addr.sockAddr6.sin6_addr,
-				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
+			 !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
+					  &addr6->sin6_addr))
 			continue;
 
 		++server->srv_count;
@@ -2160,6 +2165,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
 	if (pvolume_info->nobrl)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+	if (pvolume_info->nostrictsync)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
 	if (pvolume_info->mand_lock)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
 	if (pvolume_info->cifs_acl)
@@ -2180,6 +2187,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			   "mount option supported"));
 }
 
+static int
+is_path_accessible(int xid, struct cifsTconInfo *tcon,
+		   struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+	int rc;
+	__u64 inode_num;
+	FILE_ALL_INFO *pfile_info;
+
+	rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
+				   cifs_sb->local_nls,
+				   cifs_sb->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != -EOPNOTSUPP)
+		return rc;
+
+	pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (pfile_info == NULL)
+		return -ENOMEM;
+
+	rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info,
+			      0 /* not legacy */, cifs_sb->local_nls,
+			      cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	kfree(pfile_info);
+	return rc;
+}
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	   char *mount_data, const char *devname)
@@ -2190,6 +2224,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct cifsSesInfo *pSesInfo = NULL;
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
+	char   *full_path;
 
 	xid = GetXid();
 
@@ -2426,6 +2461,23 @@ mount_fail_check:
 		cifs_sb->rsize = min(cifs_sb->rsize,
 			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
+	if (!rc && cifs_sb->prepathlen) {
+		/* build_path_to_root works only when we have a valid tcon */
+		full_path = cifs_build_path_to_root(cifs_sb);
+		if (full_path == NULL) {
+			rc = -ENOMEM;
+			goto mount_fail_check;
+		}
+		rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
+		if (rc) {
+			cERROR(1, ("Path %s in not accessible: %d",
+						full_path, rc));
+			kfree(full_path);
+			goto mount_fail_check;
+		}
+		kfree(full_path);
+	}
+
 	/* volume_info->password is freed above when existing session found
 	(in which case it is not needed anymore) but when new sesion is created
 	the password ptr is put in the new session structure (in which case the
@@ -3622,7 +3674,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			    BCC(smb_buffer_response)) {
 				kfree(tcon->nativeFileSystem);
 				tcon->nativeFileSystem =
-				    kzalloc(length + 2, GFP_KERNEL);
+				    kzalloc(2*(length + 1), GFP_KERNEL);
 				if (tcon->nativeFileSystem)
 					cifs_strfromUCS_le(
 						tcon->nativeFileSystem,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 964aad03c5ad..54dce78fbb73 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
  *
  *   vfs operations that deal with dentries
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Copyright (C) International Business Machines  Corp., 2002,2009
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -129,6 +129,80 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+int cifs_posix_open(char *full_path, struct inode **pinode,
+		    struct super_block *sb, int mode, int oflags,
+		    int *poplock, __u16 *pnetfid, int xid)
+{
+	int rc;
+	__u32 oplock;
+	FILE_UNIX_BASIC_INFO *presp_data;
+	__u32 posix_flags = 0;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	cFYI(1, ("posix open %s", full_path));
+
+	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+	if (presp_data == NULL)
+		return -ENOMEM;
+
+/* So far cifs posix extensions can only map the following flags.
+   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
+   so far we do not seem to need them, and we can treat them as local only */
+	if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
+		(FMODE_READ | FMODE_WRITE))
+		posix_flags = SMB_O_RDWR;
+	else if (oflags & FMODE_READ)
+		posix_flags = SMB_O_RDONLY;
+	else if (oflags & FMODE_WRITE)
+		posix_flags = SMB_O_WRONLY;
+	if (oflags & O_CREAT)
+		posix_flags |= SMB_O_CREAT;
+	if (oflags & O_EXCL)
+		posix_flags |= SMB_O_EXCL;
+	if (oflags & O_TRUNC)
+		posix_flags |= SMB_O_TRUNC;
+	if (oflags & O_APPEND)
+		posix_flags |= SMB_O_APPEND;
+	if (oflags & O_SYNC)
+		posix_flags |= SMB_O_SYNC;
+	if (oflags & O_DIRECTORY)
+		posix_flags |= SMB_O_DIRECTORY;
+	if (oflags & O_NOFOLLOW)
+		posix_flags |= SMB_O_NOFOLLOW;
+	if (oflags & O_DIRECT)
+		posix_flags |= SMB_O_DIRECT;
+
+
+	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
+			pnetfid, presp_data, &oplock, full_path,
+			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc)
+		goto posix_open_ret;
+
+	if (presp_data->Type == cpu_to_le32(-1))
+		goto posix_open_ret; /* open ok, caller does qpathinfo */
+
+	/* get new inode and set it up */
+	if (!pinode)
+		goto posix_open_ret; /* caller does not need info */
+
+	if (*pinode == NULL)
+		*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+	/* else an inode was passed in. Update its info, don't create one */
+
+	/* We do not need to close the file if new_inode fails since
+	   the caller will retry qpathinfo as long as inode is null */
+	if (*pinode == NULL)
+		goto posix_open_ret;
+
+	posix_fill_in_inode(*pinode, presp_data, 1);
+
+posix_open_ret:
+	kfree(presp_data);
+	return rc;
+}
+
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
 			      struct dentry *direntry,
 			      struct inode *newinode)
@@ -150,7 +224,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int xid;
 	int create_options = CREATE_NOT_DIR;
 	int oplock = 0;
-	/* BB below access is too much for the mknod to request */
+	int oflags;
+	/*
+	 * BB below access is probably too much for mknod to request
+	 *    but we have to do query and setpathinfo so requesting
+	 *    less could fail (unless we want to request getatr and setatr
+	 *    permissions (only).  At least for POSIX we do not have to
+	 *    request so much.
+	 */
 	int desiredAccess = GENERIC_READ | GENERIC_WRITE;
 	__u16 fileHandle;
 	struct cifs_sb_info *cifs_sb;
@@ -173,14 +254,44 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current->fs->umask;
+	mode &= ~current_umask();
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
 
-	if (nd && (nd->flags & LOOKUP_OPEN)) {
-		int oflags = nd->intent.open.flags;
+	if (nd && (nd->flags & LOOKUP_OPEN))
+		oflags = nd->intent.open.flags;
+	else
+		oflags = FMODE_READ;
+
+	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+				     mode, oflags, &oplock, &fileHandle, xid);
+		/* EIO could indicate that (posix open) operation is not
+		   supported, despite what server claimed in capability
+		   negotation.  EREMOTE indicates DFS junction, which is not
+		   handled in posix open */
+
+		if ((rc == 0) && (newinode == NULL))
+			goto cifs_create_get_file_info; /* query inode info */
+		else if (rc == 0) /* success, no need to query */
+			goto cifs_create_set_dentry;
+		else if ((rc != -EIO) && (rc != -EREMOTE) &&
+			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			goto cifs_create_out;
+		/* else fallthrough to retry, using older open call, this is
+		   case where server does not support this SMB level, and
+		   falsely claims capability (also get here for DFS case
+		   which should be rare for path not covered on files) */
+	}
 
+	if (nd && (nd->flags & LOOKUP_OPEN)) {
+		/* if the file is going to stay open, then we
+		   need to set the desired access properly */
 		desiredAccess = 0;
 		if (oflags & FMODE_READ)
-			desiredAccess |= GENERIC_READ;
+			desiredAccess |= GENERIC_READ; /* is this too little? */
 		if (oflags & FMODE_WRITE) {
 			desiredAccess |= GENERIC_WRITE;
 			if (!(oflags & FMODE_READ))
@@ -199,8 +310,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	/* BB add processing to set equivalent of mode - e.g. via CreateX with
 	   ACLs */
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
 
 	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (buf == NULL) {
@@ -233,116 +342,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	}
 	if (rc) {
 		cFYI(1, ("cifs_create returned 0x%x", rc));
-	} else {
-		/* If Open reported that we actually created a file
-		then we now have to set the mode if possible */
-		if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
-			struct cifs_unix_set_info_args args = {
+		goto cifs_create_out;
+	}
+
+	/* If Open reported that we actually created a file
+	   then we now have to set the mode if possible */
+	if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+		struct cifs_unix_set_info_args args = {
 				.mode	= mode,
 				.ctime	= NO_CHANGE_64,
 				.atime	= NO_CHANGE_64,
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
-			};
+		};
 
-			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-				args.uid = (__u64) current_fsuid();
-				if (inode->i_mode & S_ISGID)
-					args.gid = (__u64) inode->i_gid;
-				else
-					args.gid = (__u64) current_fsgid();
-			} else {
-				args.uid = NO_CHANGE_64;
-				args.gid = NO_CHANGE_64;
-			}
-			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
-				cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			args.uid = (__u64) current_fsuid();
+			if (inode->i_mode & S_ISGID)
+				args.gid = (__u64) inode->i_gid;
+			else
+				args.gid = (__u64) current_fsgid();
 		} else {
-			/* BB implement mode setting via Windows security
-			   descriptors e.g. */
-			/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
-
-			/* Could set r/o dos attribute if mode & 0222 == 0 */
+			args.uid = NO_CHANGE_64;
+			args.gid = NO_CHANGE_64;
 		}
+		CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+			cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	} else {
+		/* BB implement mode setting via Windows security
+		   descriptors e.g. */
+		/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
 
-		/* server might mask mode so we have to query for it */
-		if (tcon->unix_ext)
-			rc = cifs_get_inode_info_unix(&newinode, full_path,
-						 inode->i_sb, xid);
-		else {
-			rc = cifs_get_inode_info(&newinode, full_path,
-						 buf, inode->i_sb, xid,
-						 &fileHandle);
-			if (newinode) {
-				if (cifs_sb->mnt_cifs_flags &
-				    CIFS_MOUNT_DYNPERM)
-					newinode->i_mode = mode;
-				if ((oplock & CIFS_CREATE_ACTION) &&
-				    (cifs_sb->mnt_cifs_flags &
-				     CIFS_MOUNT_SET_UID)) {
-					newinode->i_uid = current_fsuid();
-					if (inode->i_mode & S_ISGID)
-						newinode->i_gid =
-							inode->i_gid;
-					else
-						newinode->i_gid =
-							current_fsgid();
-				}
+		/* Could set r/o dos attribute if mode & 0222 == 0 */
+	}
+
+cifs_create_get_file_info:
+	/* server might mask mode so we have to query for it */
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&newinode, full_path,
+					      inode->i_sb, xid);
+	else {
+		rc = cifs_get_inode_info(&newinode, full_path, buf,
+					 inode->i_sb, xid, &fileHandle);
+		if (newinode) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+				newinode->i_mode = mode;
+			if ((oplock & CIFS_CREATE_ACTION) &&
+			    (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+				newinode->i_uid = current_fsuid();
+				if (inode->i_mode & S_ISGID)
+					newinode->i_gid = inode->i_gid;
+				else
+					newinode->i_gid = current_fsgid();
 			}
 		}
+	}
 
-		if (rc != 0) {
-			cFYI(1, ("Create worked, get_inode_info failed rc = %d",
-				 rc));
-		} else
-			setup_cifs_dentry(tcon, direntry, newinode);
-
-		if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
-			(!(nd->flags & LOOKUP_OPEN))) {
-			/* mknod case - do not leave file open */
-			CIFSSMBClose(xid, tcon, fileHandle);
-		} else if (newinode) {
-			struct cifsFileInfo *pCifsFile =
-			   kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-
-			if (pCifsFile == NULL)
-				goto cifs_create_out;
-			pCifsFile->netfid = fileHandle;
-			pCifsFile->pid = current->tgid;
-			pCifsFile->pInode = newinode;
-			pCifsFile->invalidHandle = false;
-			pCifsFile->closePend     = false;
-			init_MUTEX(&pCifsFile->fh_sem);
-			mutex_init(&pCifsFile->lock_mutex);
-			INIT_LIST_HEAD(&pCifsFile->llist);
-			atomic_set(&pCifsFile->wrtPending, 0);
-
-			/* set the following in open now
+cifs_create_set_dentry:
+	if (rc == 0)
+		setup_cifs_dentry(tcon, direntry, newinode);
+	else
+		cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
+
+	/* nfsd case - nfs srv does not set nd */
+	if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
+		/* mknod case - do not leave file open */
+		CIFSSMBClose(xid, tcon, fileHandle);
+	} else if (newinode) {
+		struct cifsFileInfo *pCifsFile =
+			kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+		if (pCifsFile == NULL)
+			goto cifs_create_out;
+		pCifsFile->netfid = fileHandle;
+		pCifsFile->pid = current->tgid;
+		pCifsFile->pInode = newinode;
+		pCifsFile->invalidHandle = false;
+		pCifsFile->closePend     = false;
+		init_MUTEX(&pCifsFile->fh_sem);
+		mutex_init(&pCifsFile->lock_mutex);
+		INIT_LIST_HEAD(&pCifsFile->llist);
+		atomic_set(&pCifsFile->wrtPending, 0);
+
+		/* set the following in open now
 				pCifsFile->pfile = file; */
-			write_lock(&GlobalSMBSeslock);
-			list_add(&pCifsFile->tlist, &tcon->openFileList);
-			pCifsInode = CIFS_I(newinode);
-			if (pCifsInode) {
-				/* if readable file instance put first in list*/
-				if (write_only) {
-					list_add_tail(&pCifsFile->flist,
-						&pCifsInode->openFileList);
-				} else {
-					list_add(&pCifsFile->flist,
-						&pCifsInode->openFileList);
-				}
-				if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-					pCifsInode->clientCanCacheAll = true;
-					pCifsInode->clientCanCacheRead = true;
-					cFYI(1, ("Exclusive Oplock inode %p",
-						newinode));
-				} else if ((oplock & 0xF) == OPLOCK_READ)
-					pCifsInode->clientCanCacheRead = true;
+		write_lock(&GlobalSMBSeslock);
+		list_add(&pCifsFile->tlist, &tcon->openFileList);
+		pCifsInode = CIFS_I(newinode);
+		if (pCifsInode) {
+			/* if readable file instance put first in list*/
+			if (write_only) {
+				list_add_tail(&pCifsFile->flist,
+					      &pCifsInode->openFileList);
+			} else {
+				list_add(&pCifsFile->flist,
+					 &pCifsInode->openFileList);
 			}
-			write_unlock(&GlobalSMBSeslock);
+			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+				pCifsInode->clientCanCacheAll = true;
+				pCifsInode->clientCanCacheRead = true;
+				cFYI(1, ("Exclusive Oplock inode %p",
+					newinode));
+			} else if ((oplock & 0xF) == OPLOCK_READ)
+				pCifsInode->clientCanCacheRead = true;
 		}
+		write_unlock(&GlobalSMBSeslock);
 	}
 cifs_create_out:
 	kfree(buf);
@@ -374,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 		rc = -ENOMEM;
 	else if (pTcon->unix_ext) {
 		struct cifs_unix_set_info_args args = {
-			.mode	= mode & ~current->fs->umask,
+			.mode	= mode & ~current_umask(),
 			.ctime	= NO_CHANGE_64,
 			.atime	= NO_CHANGE_64,
 			.mtime	= NO_CHANGE_64,
@@ -596,7 +701,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 	return rc;
 }     */
 
-struct dentry_operations cifs_dentry_ops = {
+const struct dentry_operations cifs_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
@@ -634,7 +739,7 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
 	return 1;
 }
 
-struct dentry_operations cifs_ci_dentry_ops = {
+const struct dentry_operations cifs_ci_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 	.d_hash = cifs_ci_hash,
 	.d_compare = cifs_ci_compare,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 12bb656fbe75..81747acca4c4 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -78,8 +78,36 @@ static inline int cifs_convert_flags(unsigned int flags)
 	return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES |
 		FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA |
 		FILE_READ_DATA);
+}
 
+static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
+{
+	fmode_t posix_flags = 0;
 
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		posix_flags = FMODE_READ;
+	else if ((flags & O_ACCMODE) == O_WRONLY)
+		posix_flags = FMODE_WRITE;
+	else if ((flags & O_ACCMODE) == O_RDWR) {
+		/* GENERIC_ALL is too much permission to request
+		   can cause unnecessary access denied on create */
+		/* return GENERIC_ALL; */
+		posix_flags = FMODE_READ | FMODE_WRITE;
+	}
+	/* can not map O_CREAT or O_EXCL or O_TRUNC flags when
+	   reopening a file.  They had their effect on the original open */
+	if (flags & O_APPEND)
+		posix_flags |= (fmode_t)O_APPEND;
+	if (flags & O_SYNC)
+		posix_flags |= (fmode_t)O_SYNC;
+	if (flags & O_DIRECTORY)
+		posix_flags |= (fmode_t)O_DIRECTORY;
+	if (flags & O_NOFOLLOW)
+		posix_flags |= (fmode_t)O_NOFOLLOW;
+	if (flags & O_DIRECT)
+		posix_flags |= (fmode_t)O_DIRECT;
+
+	return posix_flags;
 }
 
 static inline int cifs_get_disposition(unsigned int flags)
@@ -97,6 +125,80 @@ static inline int cifs_get_disposition(unsigned int flags)
 }
 
 /* all arguments to this function must be checked for validity in caller */
+static inline int cifs_posix_open_inode_helper(struct inode *inode,
+			struct file *file, struct cifsInodeInfo *pCifsInode,
+			struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+/*	struct timespec temp; */   /* BB REMOVEME BB */
+
+	file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (file->private_data == NULL)
+		return -ENOMEM;
+	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
+	write_lock(&GlobalSMBSeslock);
+	list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
+
+	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
+	if (pCifsInode == NULL) {
+		write_unlock(&GlobalSMBSeslock);
+		return -EINVAL;
+	}
+
+	/* want handles we can use to read with first
+	   in the list so we do not have to walk the
+	   list to search for one in write_begin */
+	if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
+		list_add_tail(&pCifsFile->flist,
+			      &pCifsInode->openFileList);
+	} else {
+		list_add(&pCifsFile->flist,
+			 &pCifsInode->openFileList);
+	}
+
+	if (pCifsInode->clientCanCacheRead) {
+		/* we have the inode open somewhere else
+		   no need to discard cache data */
+		goto psx_client_can_cache;
+	}
+
+	/* BB FIXME need to fix this check to move it earlier into posix_open
+	   BB  fIX following section BB FIXME */
+
+	/* if not oplocked, invalidate inode pages if mtime or file
+	   size changed */
+/*	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
+			   (file->f_path.dentry->d_inode->i_size ==
+			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
+		cFYI(1, ("inode unchanged on server"));
+	} else {
+		if (file->f_path.dentry->d_inode->i_mapping) {
+			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+			if (rc != 0)
+				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
+		}
+		cFYI(1, ("invalidating remote inode since open detected it "
+			 "changed"));
+		invalidate_remote_inode(file->f_path.dentry->d_inode);
+	} */
+
+psx_client_can_cache:
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, ("Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode));
+	} else if ((oplock & 0xF) == OPLOCK_READ)
+		pCifsInode->clientCanCacheRead = true;
+
+	/* will have to change the unlock if we reenable the
+	   filemap_fdatawrite (which does not seem necessary */
+	write_unlock(&GlobalSMBSeslock);
+	return 0;
+}
+
+/* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
 	struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
@@ -167,7 +269,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	int rc = -EACCES;
 	int xid, oplock;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct list_head *tmp;
@@ -180,7 +282,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 	if (file->f_flags & O_CREAT) {
 		/* search inode for this file and fill in file->private_data */
@@ -220,6 +322,45 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
 		 inode, file->f_flags, full_path));
+
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+
+	if (!tcon->broken_posix_open && tcon->unix_ext &&
+	    (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		/* can not refresh inode info since size could be stale */
+		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+				     cifs_sb->mnt_file_mode /* ignored */,
+				     oflags, &oplock, &netfid, xid);
+		if (rc == 0) {
+			cFYI(1, ("posix open succeeded"));
+			/* no need for special case handling of setting mode
+			   on read only files needed here */
+
+			cifs_posix_open_inode_helper(inode, file, pCifsInode,
+						     pCifsFile, oplock, netfid);
+			goto out;
+		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+			if (tcon->ses->serverNOS)
+				cERROR(1, ("server %s of type %s returned"
+					   " unexpected error on SMB posix open"
+					   ", disabling posix open support."
+					   " Check if server update available.",
+					   tcon->ses->serverName,
+					   tcon->ses->serverNOS));
+			tcon->broken_posix_open = true;
+		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
+			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			goto out;
+		/* else fallthrough to retry open the old way on network i/o
+		   or DFS errors */
+	}
+
 	desiredAccess = cifs_convert_flags(file->f_flags);
 
 /*********************************************************************
@@ -248,11 +389,6 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	disposition = cifs_get_disposition(file->f_flags);
 
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
-	else
-		oplock = 0;
-
 	/* BB pass O_SYNC flag through on file attributes .. BB */
 
 	/* Also refresh inode by passing in file_info buf returned by SMBOpen
@@ -269,7 +405,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 
 	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
-		rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
 				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -278,7 +414,7 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	if (rc == -EIO) {
 		/* Old server, try legacy style OpenX */
-		rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
 			desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
 				& CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -295,12 +431,12 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
 	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &pTcon->openFileList);
+	list_add(&pCifsFile->tlist, &tcon->openFileList);
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 	if (pCifsInode) {
 		rc = cifs_open_inode_helper(inode, file, pCifsInode,
-					    pCifsFile, pTcon,
+					    pCifsFile, tcon,
 					    &oplock, buf, full_path, xid);
 	} else {
 		write_unlock(&GlobalSMBSeslock);
@@ -309,7 +445,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	if (oplock & CIFS_CREATE_ACTION) {
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
-		if (pTcon->unix_ext) {
+		if (tcon->unix_ext) {
 			struct cifs_unix_set_info_args args = {
 				.mode	= inode->i_mode,
 				.uid	= NO_CHANGE_64,
@@ -319,7 +455,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
 			};
-			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -349,7 +485,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	int rc = -EACCES;
 	int xid, oplock;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct inode *inode;
@@ -387,7 +523,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
@@ -404,20 +540,37 @@ reopen_error_exit:
 
 	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
 		 inode, file->f_flags, full_path));
-	desiredAccess = cifs_convert_flags(file->f_flags);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 	else
 		oplock = 0;
 
+	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		/* can not refresh inode info since size could be stale */
+		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+				     cifs_sb->mnt_file_mode /* ignored */,
+				     oflags, &oplock, &netfid, xid);
+		if (rc == 0) {
+			cFYI(1, ("posix reopen succeeded"));
+			goto reopen_success;
+		}
+		/* fallthrough to retry open the old way on errors, especially
+		   in the reconnect path it is important to retry hard */
+	}
+
+	desiredAccess = cifs_convert_flags(file->f_flags);
+
 	/* Can not refresh inode by passing in file_info buf to be returned
 	   by SMBOpen and then calling get_inode_info with returned buf
 	   since file might have write behind data that needs to be flushed
 	   and server version of file size can be stale. If we knew for sure
 	   that inode was not dirty locally we could do this */
 
-	rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
+	rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -426,6 +579,7 @@ reopen_error_exit:
 		cFYI(1, ("cifs_open returned 0x%x", rc));
 		cFYI(1, ("oplock: %d", oplock));
 	} else {
+reopen_success:
 		pCifsFile->netfid = netfid;
 		pCifsFile->invalidHandle = false;
 		up(&pCifsFile->fh_sem);
@@ -439,7 +593,7 @@ reopen_error_exit:
 			   go to server to get inode info */
 				pCifsInode->clientCanCacheAll = false;
 				pCifsInode->clientCanCacheRead = false;
-				if (pTcon->unix_ext)
+				if (tcon->unix_ext)
 					rc = cifs_get_inode_info_unix(&inode,
 						full_path, inode->i_sb, xid);
 				else
@@ -467,7 +621,6 @@ reopen_error_exit:
 			cifs_relock_file(pCifsFile);
 		}
 	}
-
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1523,6 +1676,9 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
 	int xid;
 	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile =
+		(struct cifsFileInfo *)file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	xid = GetXid();
@@ -1534,7 +1690,12 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	if (rc == 0) {
 		rc = CIFS_I(inode)->write_behind_rc;
 		CIFS_I(inode)->write_behind_rc = 0;
+		tcon = CIFS_SB(inode->i_sb)->tcon;
+		if (!rc && tcon && smbfile &&
+		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 	}
+
 	FreeXid(xid);
 	return rc;
 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bcf7b5184664..f121a80fdd6f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
 	pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
 }
 
+/**
+ * cifs_new inode - create new inode, initialize, and hash it
+ * @sb - pointer to superblock
+ * @inum - if valid pointer and serverino is enabled, replace i_ino with val
+ *
+ * Create a new inode, initialize it for CIFS and hash it. Returns the new
+ * inode or NULL if one couldn't be allocated.
+ *
+ * If the share isn't mounted with "serverino" or inum is a NULL pointer then
+ * we'll just use the inode number assigned by new_inode(). Note that this can
+ * mean i_ino collisions since the i_ino assigned by new_inode is not
+ * guaranteed to be unique.
+ */
+struct inode *
+cifs_new_inode(struct super_block *sb, __u64 *inum)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode == NULL)
+		return NULL;
+
+	/*
+	 * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
+	 *     stop passing inum as ptr. Are there sanity checks we can use to
+	 *     ensure that the server is really filling in that field? Also,
+	 *     if serverino is disabled, perhaps we should be using iunique()?
+	 */
+	if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
+		inode->i_ino = (unsigned long) *inum;
+
+	/*
+	 * must set this here instead of cifs_alloc_inode since VFS will
+	 * clobber i_flags
+	 */
+	if (sb->s_flags & MS_NOATIME)
+		inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+	insert_inode_hash(inode);
+
+	return inode;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 	const unsigned char *full_path, struct super_block *sb, int xid)
 {
@@ -233,22 +276,11 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* get new inode */
 	if (*pinode == NULL) {
-		*pinode = new_inode(sb);
+		*pinode = cifs_new_inode(sb, &find_data.UniqueId);
 		if (*pinode == NULL) {
 			rc = -ENOMEM;
 			goto cgiiu_exit;
 		}
-		/* Is an i_ino of zero legal? */
-		/* note ino incremented to unique num in new_inode */
-		/* Are there sanity checks we can use to ensure that
-		   the server is really filling in that field? */
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-			(*pinode)->i_ino = (unsigned long)find_data.UniqueId;
-
-		if (sb->s_flags & MS_NOATIME)
-			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-
-		insert_inode_hash(*pinode);
 	}
 
 	inode = *pinode;
@@ -465,11 +497,9 @@ int cifs_get_inode_info(struct inode **pinode,
 
 	/* get new inode */
 	if (*pinode == NULL) {
-		*pinode = new_inode(sb);
-		if (*pinode == NULL) {
-			rc = -ENOMEM;
-			goto cgii_exit;
-		}
+		__u64 inode_num;
+		__u64 *pinum = &inode_num;
+
 		/* Is an i_ino of zero legal? Can we use that to check
 		   if the server supports returning inode numbers?  Are
 		   there other sanity checks we can use to ensure that
@@ -486,22 +516,26 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 			int rc1 = 0;
-			__u64 inode_num;
 
 			rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
-					full_path, &inode_num,
+					full_path, pinum,
 					cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (rc1) {
 				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+				pinum = NULL;
 				/* BB EOPNOSUPP disable SERVER_INUM? */
-			} else /* do we need cast or hash to ino? */
-				(*pinode)->i_ino = inode_num;
-		} /* else ino incremented to unique num in new_inode*/
-		if (sb->s_flags & MS_NOATIME)
-			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-		insert_inode_hash(*pinode);
+			}
+		} else {
+			pinum = NULL;
+		}
+
+		*pinode = cifs_new_inode(sb, pinum);
+		if (*pinode == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 	}
 	inode = *pinode;
 	cifsInfo = CIFS_I(inode);
@@ -621,7 +655,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
 	.lookup = cifs_lookup,
 };
 
-static char *build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 {
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
@@ -678,7 +712,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	full_path = build_path_to_root(cifs_sb);
+	full_path = cifs_build_path_to_root(cifs_sb);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -729,6 +763,9 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsTconInfo *pTcon = cifs_sb->tcon;
 	FILE_BASIC_INFO	info_buf;
 
+	if (attrs == NULL)
+		return -EINVAL;
+
 	if (attrs->ia_valid & ATTR_ATIME) {
 		set_time = true;
 		info_buf.LastAccessTime =
@@ -1017,7 +1054,7 @@ out_reval:
 	return rc;
 }
 
-static void posix_fill_in_inode(struct inode *tmp_inode,
+void posix_fill_in_inode(struct inode *tmp_inode,
 	FILE_UNIX_BASIC_INFO *pData, int isNewInode)
 {
 	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
@@ -1088,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			goto mkdir_out;
 		}
 
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
 				mode, NULL /* netfid */, pInfo, &oplock,
 				full_path, cifs_sb->local_nls,
@@ -1114,24 +1151,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			else
 				direntry->d_op = &cifs_dentry_ops;
 
-			newinode = new_inode(inode->i_sb);
+			newinode = cifs_new_inode(inode->i_sb,
+						  &pInfo->UniqueId);
 			if (newinode == NULL) {
 				kfree(pInfo);
 				goto mkdir_get_info;
 			}
 
-			/* Is an i_ino of zero legal? */
-			/* Are there sanity checks we can use to ensure that
-			   the server is really filling in that field? */
-			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-				newinode->i_ino =
-					(unsigned long)pInfo->UniqueId;
-			} /* note ino incremented to unique num in new_inode */
-			if (inode->i_sb->s_flags & MS_NOATIME)
-				newinode->i_flags |= S_NOATIME | S_NOCMTIME;
 			newinode->i_nlink = 2;
-
-			insert_inode_hash(newinode);
 			d_instantiate(direntry, newinode);
 
 			/* we already checked in POSIXCreate whether
@@ -1177,7 +1204,7 @@ mkdir_get_info:
 		if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
 				direntry->d_inode->i_nlink = 2;
 
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		/* must turn on setgid bit if parent dir has it */
 		if (inode->i_mode & S_ISGID)
 			mode |= S_ISGID;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9f51f9bf0292..c2c01ff4c32c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 }
 #endif /* DEBUG2 */
 
-/* Returns one if new inode created (which therefore needs to be hashed) */
+/* Returns 1 if new inode created, 2 if both dentry and inode were */
 /* Might check in the future if inode number changed so we can rehash inode */
-static int construct_dentry(struct qstr *qstring, struct file *file,
-	struct inode **ptmp_inode, struct dentry **pnew_dentry)
+static int
+construct_dentry(struct qstr *qstring, struct file *file,
+		 struct inode **ptmp_inode, struct dentry **pnew_dentry,
+		 __u64 *inum)
 {
-	struct dentry *tmp_dentry;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct dentry *tmp_dentry = NULL;
+	struct super_block *sb = file->f_path.dentry->d_sb;
 	int rc = 0;
 
 	cFYI(1, ("For %s", qstring->name));
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	qstring->hash = full_name_hash(qstring->name, qstring->len);
 	tmp_dentry = d_lookup(file->f_path.dentry, qstring);
 	if (tmp_dentry) {
+		/* BB: overwrite old name? i.e. tmp_dentry->d_name and
+		 * tmp_dentry->d_name.len??
+		 */
 		cFYI(0, ("existing dentry with inode 0x%p",
 			 tmp_dentry->d_inode));
 		*ptmp_inode = tmp_dentry->d_inode;
-/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
 		if (*ptmp_inode == NULL) {
-			*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
+			*ptmp_inode = cifs_new_inode(sb, inum);
 			if (*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
 		}
-		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 	} else {
 		tmp_dentry = d_alloc(file->f_path.dentry, qstring);
 		if (tmp_dentry == NULL) {
@@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			return rc;
 		}
 
-		*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
-		if (pTcon->nocase)
+		if (CIFS_SB(sb)->tcon->nocase)
 			tmp_dentry->d_op = &cifs_ci_dentry_ops;
 		else
 			tmp_dentry->d_op = &cifs_dentry_ops;
+
+		*ptmp_inode = cifs_new_inode(sb, inum);
 		if (*ptmp_inode == NULL)
 			return rc;
-		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 		rc = 2;
 	}
 
@@ -822,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 /* inode num, inode type and filename returned */
 static int cifs_get_name_from_search_buf(struct qstr *pqst,
 	char *current_entry, __u16 level, unsigned int unicode,
-	struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum)
+	struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum)
 {
 	int rc = 0;
 	unsigned int len = 0;
@@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 			len = strnlen(filename, PATH_MAX);
 		}
 
-		/* BB fixme - hash low and high 32 bits if not 64 bit arch BB */
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-			*pinum = pFindData->UniqueId;
+		*pinum = pFindData->UniqueId;
 	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO *pFindData =
 			(FILE_DIRECTORY_INFO *)current_entry;
@@ -907,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	struct qstr qstring;
 	struct cifsFileInfo *pCifsF;
 	unsigned int obj_type;
-	ino_t  inum;
+	__u64  inum;
 	struct cifs_sb_info *cifs_sb;
 	struct inode *tmp_inode;
 	struct dentry *tmp_dentry;
@@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if (rc)
 		return rc;
 
-	rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry);
+	/* only these two infolevels return valid inode numbers */
+	if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
+	    pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+		rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+					&inum);
+	else
+		rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+					NULL);
+
 	if ((tmp_inode == NULL) || (tmp_dentry == NULL))
 		return -ENOMEM;
 
-	if (rc) {
-		/* inode created, we need to hash it with right inode number */
-		if (inum != 0) {
-			/* BB fixme - hash the 2 32 quantities bits together if
-			 *  necessary BB */
-			tmp_inode->i_ino = inum;
-		}
-		insert_inode_hash(tmp_inode);
-	}
-
 	/* we pass in rc below, indicating whether it is a new inode,
 	   so we can figure out whether to invalidate the inode cached
 	   data if the file has changed */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f22de7b79a9..5c68b4282be9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -34,15 +34,99 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
+/* Checks if this is the first smb session to be reconnected after
+   the socket has been reestablished (so we know whether to use vc 0).
+   Called while holding the cifs_tcp_ses_lock, so do not block */
+static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
+{
+	struct list_head *tmp;
+	struct cifsSesInfo *tmp_ses;
+
+	list_for_each(tmp, &ses->server->smb_ses_list) {
+		tmp_ses = list_entry(tmp, struct cifsSesInfo,
+				     smb_ses_list);
+		if (tmp_ses->need_reconnect == false)
+			return false;
+	}
+	/* could not find a session that was already connected,
+	   this must be the first one we are reconnecting */
+	return true;
+}
+
+/*
+ *	vc number 0 is treated specially by some servers, and should be the
+ *      first one we request.  After that we can use vcnumbers up to maxvcs,
+ *	one for each smb session (some Windows versions set maxvcs incorrectly
+ *	so maxvc=1 can be ignored).  If we have too many vcs, we can reuse
+ *	any vc but zero (some servers reset the connection on vcnum zero)
+ *
+ */
+static __le16 get_next_vcnum(struct cifsSesInfo *ses)
+{
+	__u16 vcnum = 0;
+	struct list_head *tmp;
+	struct cifsSesInfo *tmp_ses;
+	__u16 max_vcs = ses->server->max_vcs;
+	__u16 i;
+	int free_vc_found = 0;
+
+	/* Quoting the MS-SMB specification: "Windows-based SMB servers set this
+	field to one but do not enforce this limit, which allows an SMB client
+	to establish more virtual circuits than allowed by this value ... but
+	other server implementations can enforce this limit." */
+	if (max_vcs < 2)
+		max_vcs = 0xFFFF;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
+			goto get_vc_num_exit;  /* vcnum will be zero */
+	for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
+		if (i == 0) /* this is the only connection, use vc 0 */
+			break;
+
+		free_vc_found = 1;
+
+		list_for_each(tmp, &ses->server->smb_ses_list) {
+			tmp_ses = list_entry(tmp, struct cifsSesInfo,
+					     smb_ses_list);
+			if (tmp_ses->vcnum == i) {
+				free_vc_found = 0;
+				break; /* found duplicate, try next vcnum */
+			}
+		}
+		if (free_vc_found)
+			break; /* we found a vcnumber that will work - use it */
+	}
+
+	if (i == 0)
+		vcnum = 0; /* for most common case, ie if one smb session, use
+			      vc zero.  Also for case when no free vcnum, zero
+			      is safest to send (some clients only send zero) */
+	else if (free_vc_found == 0)
+		vcnum = 1;  /* we can not reuse vc=0 safely, since some servers
+				reset all uids on that, but 1 is ok. */
+	else
+		vcnum = i;
+	ses->vcnum = vcnum;
+get_vc_num_exit:
+	write_unlock(&cifs_tcp_ses_lock);
+
+	return le16_to_cpu(vcnum);
+}
+
 static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 {
 	__u32 capabilities = 0;
 
 	/* init fields common to all four types of SessSetup */
-	/* note that header is initialized to zero in header_assemble */
+	/* Note that offsets for first seven fields in req struct are same  */
+	/*	in CIFS Specs so does not matter which of 3 forms of struct */
+	/*	that we use in next few lines                               */
+	/* Note that header is initialized to zero in header_assemble */
 	pSMB->req.AndXCommand = 0xFF;
 	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
 	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+	pSMB->req.VcNumber = get_next_vcnum(ses);
 
 	/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
 
@@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 	if (ses->capabilities & CAP_UNIX)
 		capabilities |= CAP_UNIX;
 
-	/* BB check whether to init vcnum BB */
 	return capabilities;
 }
 
@@ -228,7 +311,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
 
 	kfree(ses->serverOS);
 	/* UTF-8 string will not grow more than four times as big as UCS-16 */
-	ses->serverOS = kzalloc(4 * len, GFP_KERNEL);
+	ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
 	if (ses->serverOS != NULL)
 		cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
 	data += 2 * (len + 1);
@@ -241,7 +324,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
 		return rc;
 
 	kfree(ses->serverNOS);
-	ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */
+	ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
 	if (ses->serverNOS != NULL) {
 		cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
 				   nls_cp);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
new file mode 100644
index 000000000000..7056b891e087
--- /dev/null
+++ b/fs/cifs/smbfsctl.h
@@ -0,0 +1,84 @@
+/*
+ *   fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2009
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* IOCTL information */
+/*
+ * List of ioctl/fsctl function codes that are or could be useful in the
+ * future to remote clients like cifs or SMB2 client.  There is probably
+ * a slightly larger set of fsctls that NTFS local filesystem could handle,
+ * including the seven below that we do not have struct definitions for.
+ * Even with protocol definitions for most of these now available, we still
+ * need to do some experimentation to identify which are practical to do
+ * remotely.  Some of the following, such as the encryption/compression ones
+ * could be invoked from tools via a specialized hook into the VFS rather
+ * than via the standard vfs entry points
+ */
+#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
+#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
+#define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
+#define FSCTL_LOCK_VOLUME            0x00090018
+#define FSCTL_UNLOCK_VOLUME          0x0009001C
+#define FSCTL_IS_PATHNAME_VALID      0x0009002C /* BB add struct */
+#define FSCTL_GET_COMPRESSION        0x0009003C /* BB add struct */
+#define FSCTL_SET_COMPRESSION        0x0009C040 /* BB add struct */
+#define FSCTL_QUERY_FAT_BPB          0x00090058 /* BB add struct */
+/* Verify the next FSCTL number, we had it as 0x00090090 before */
+#define FSCTL_FILESYSTEM_GET_STATS   0x00090060 /* BB add struct */
+#define FSCTL_GET_NTFS_VOLUME_DATA   0x00090064 /* BB add struct */
+#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
+#define FSCTL_IS_VOLUME_DIRTY        0x00090078 /* BB add struct */
+#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
+#define FSCTL_REQUEST_FILTER_OPLOCK  0x0009008C
+#define FSCTL_FIND_FILES_BY_SID      0x0009008F /* BB add struct */
+#define FSCTL_SET_OBJECT_ID          0x00090098 /* BB add struct */
+#define FSCTL_GET_OBJECT_ID          0x0009009C /* BB add struct */
+#define FSCTL_DELETE_OBJECT_ID       0x000900A0 /* BB add struct */
+#define FSCTL_SET_REPARSE_POINT      0x000900A4 /* BB add struct */
+#define FSCTL_GET_REPARSE_POINT      0x000900A8 /* BB add struct */
+#define FSCTL_DELETE_REPARSE_POINT   0x000900AC /* BB add struct */
+#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
+#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
+#define FSCTL_SET_SPARSE             0x000900C4 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA          0x000900C8 /* BB add struct */
+#define FSCTL_SET_ENCRYPTION         0x000900D7 /* BB add struct */
+#define FSCTL_ENCRYPTION_FSCTL_IO    0x000900DB /* BB add struct */
+#define FSCTL_WRITE_RAW_ENCRYPTED    0x000900DF /* BB add struct */
+#define FSCTL_READ_RAW_ENCRYPTED     0x000900E3 /* BB add struct */
+#define FSCTL_READ_FILE_USN_DATA     0x000900EB /* BB add struct */
+#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_SIS_COPYFILE           0x00090100 /* BB add struct */
+#define FSCTL_RECALL_FILE            0x00090117 /* BB add struct */
+#define FSCTL_QUERY_SPARING_INFO     0x00090138 /* BB add struct */
+#define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
+#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
+#define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
+#define FSCTL_SIS_LINK_FILES         0x0009C104
+#define FSCTL_PIPE_PEEK              0x0011400C /* BB add struct */
+#define FSCTL_PIPE_TRANSCEIVE        0x0011C017 /* BB add struct */
+/* strange that the number for this op is not sequential with previous op */
+#define FSCTL_PIPE_WAIT              0x00110018 /* BB add struct */
+#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
+#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+
+#define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
+#define IO_REPARSE_TAG_HSM           0xC0000004
+#define IO_REPARSE_TAG_SIS           0x80000007
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 75b1fa90b2cb..4bb9d0a5decc 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -59,7 +59,7 @@ static int coda_return_EIO(void)
 }
 #define CODA_EIO_ERROR ((void *) (coda_return_EIO))
 
-static struct dentry_operations coda_dentry_operations =
+static const struct dentry_operations coda_dentry_operations =
 {
 	.d_revalidate	= coda_dentry_revalidate,
 	.d_delete	= coda_dentry_delete,
diff --git a/fs/compat.c b/fs/compat.c
index d0145ca27572..3f84d5f15889 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
 #include <linux/poll.h>
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
+#include <linux/fs_struct.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -378,6 +379,34 @@ out:
 	return error;
 }
 
+/*
+ * This is a copy of sys_ustat, just dealing with a structure layout.
+ * Given how simple this syscall is that apporach is more maintainable
+ * than the various conversion hacks.
+ */
+asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+{
+	struct super_block *sb;
+	struct compat_ustat tmp;
+	struct kstatfs sbuf;
+	int err;
+
+	sb = user_get_super(new_decode_dev(dev));
+	if (!sb)
+		return -EINVAL;
+	err = vfs_statfs(sb->s_root, &sbuf);
+	drop_super(sb);
+	if (err)
+		return err;
+
+	memset(&tmp, 0, sizeof(struct compat_ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+	if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
+		return -EFAULT;
+	return 0;
+}
+
 static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
 {
 	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
@@ -1167,16 +1196,12 @@ out:
 	return ret;
 }
 
-asmlinkage ssize_t
-compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen)
+static size_t compat_readv(struct file *file,
+			   const struct compat_iovec __user *vec,
+			   unsigned long vlen, loff_t *pos)
 {
-	struct file *file;
 	ssize_t ret = -EBADF;
 
-	file = fget(fd);
-	if (!file)
-		return -EBADF;
-
 	if (!(file->f_mode & FMODE_READ))
 		goto out;
 
@@ -1184,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
 	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 		goto out;
 
-	ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
+	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
 
 out:
 	if (ret > 0)
 		add_rchar(current, ret);
 	inc_syscr(current);
-	fput(file);
 	return ret;
 }
 
 asmlinkage ssize_t
-compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen)
+compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
+		 unsigned long vlen)
 {
 	struct file *file;
-	ssize_t ret = -EBADF;
+	int fput_needed;
+	ssize_t ret;
 
-	file = fget(fd);
+	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
+	ret = compat_readv(file, vec, vlen, &file->f_pos);
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+asmlinkage ssize_t
+compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
+		  unsigned long vlen, u32 pos_low, u32 pos_high)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+	struct file *file;
+	int fput_needed;
+	ssize_t ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+	ret = compat_readv(file, vec, vlen, &pos);
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+static size_t compat_writev(struct file *file,
+			    const struct compat_iovec __user *vec,
+			    unsigned long vlen, loff_t *pos)
+{
+	ssize_t ret = -EBADF;
+
 	if (!(file->f_mode & FMODE_WRITE))
 		goto out;
 
@@ -1210,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
 	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 		goto out;
 
-	ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
+	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
 
 out:
 	if (ret > 0)
 		add_wchar(current, ret);
 	inc_syscw(current);
-	fput(file);
+	return ret;
+}
+
+asmlinkage ssize_t
+compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
+		  unsigned long vlen)
+{
+	struct file *file;
+	int fput_needed;
+	ssize_t ret;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+	ret = compat_writev(file, vec, vlen, &file->f_pos);
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+asmlinkage ssize_t
+compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
+		   unsigned long vlen, u32 pos_low, u32 pos_high)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+	struct file *file;
+	int fput_needed;
+	ssize_t ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+	ret = compat_writev(file, vec, vlen, &pos);
+	fput_light(file, fput_needed);
 	return ret;
 }
 
@@ -1392,27 +1482,36 @@ int compat_do_execve(char * filename,
 {
 	struct linux_binprm *bprm;
 	struct file *file;
+	struct files_struct *displaced;
 	int retval;
 
+	retval = unshare_files(&displaced);
+	if (retval)
+		goto out_ret;
+
 	retval = -ENOMEM;
 	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
 	if (!bprm)
-		goto out_ret;
+		goto out_files;
 
 	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
 	if (retval < 0)
 		goto out_free;
+	current->in_execve = 1;
 
 	retval = -ENOMEM;
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm, current->files);
+
+	retval = check_unsafe_exec(bprm);
+	if (retval)
+		goto out_unlock;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_unlock;
+		goto out_unmark;
 
 	sched_exec();
 
@@ -1454,9 +1553,15 @@ int compat_do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
+	if (displaced)
+		put_files_struct(displaced);
 	return retval;
 
 out:
@@ -1469,12 +1574,21 @@ out_file:
 		fput(bprm->file);
 	}
 
+out_unmark:
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
+
 out_unlock:
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 
 out_free:
 	free_bprm(bprm);
 
+out_files:
+	if (displaced)
+		reset_files_struct(displaced);
 out_ret:
 	return retval;
 }
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9c6d815dd191..3e87ce443ea2 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
 #include <linux/if.h>
 #include <linux/if_bridge.h>
 #include <linux/slab.h>
-#include <linux/raid/md.h>
+#include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
 #include <linux/in6.h>
@@ -522,6 +522,11 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
 		if (err)
 			return -EFAULT;
 		break;
+	case SIOCSHWTSTAMP:
+		if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
+			return -EFAULT;
+		ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
+		break;
 	default:
 		if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
 			return -EFAULT;
@@ -1913,6 +1918,9 @@ COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
+/* 'X' - originally XFS but some now in the VFS */
+COMPATIBLE_IOCTL(FIFREEZE)
+COMPATIBLE_IOCTL(FITHAW)
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
 COMPATIBLE_IOCTL(GET_ARRAY_INFO)
@@ -1938,6 +1946,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
 /* Big K */
 COMPATIBLE_IOCTL(PIO_FONT)
 COMPATIBLE_IOCTL(GIO_FONT)
+COMPATIBLE_IOCTL(PIO_CMAP)
+COMPATIBLE_IOCTL(GIO_CMAP)
 ULONG_IOCTL(KDSIGACCEPT)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
@@ -1988,6 +1998,8 @@ COMPATIBLE_IOCTL(TUNSETGROUP)
 COMPATIBLE_IOCTL(TUNGETFEATURES)
 COMPATIBLE_IOCTL(TUNSETOFFLOAD)
 COMPATIBLE_IOCTL(TUNSETTXFILTER)
+COMPATIBLE_IOCTL(TUNGETSNDBUF)
+COMPATIBLE_IOCTL(TUNSETSNDBUF)
 /* Big V */
 COMPATIBLE_IOCTL(VT_SETMODE)
 COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2561,6 +2573,7 @@ HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
 HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
+HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
 
 /* ioctls used by appletalk ddp.c */
 HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e93341f3e82..05373db21a4e 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static int configfs_d_delete(struct dentry *dentry)
 	return 1;
 }
 
-static struct dentry_operations configfs_dentry_ops = {
+static const struct dentry_operations configfs_dentry_ops = {
 	.d_iput		= configfs_d_iput,
 	/* simple_delete_dentry() isn't exported */
 	.d_delete	= configfs_d_delete,
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d140..dd3634e4c967 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
 static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = CRAMFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = 0;
 	buf->f_files = CRAMFS_SB(sb)->files;
 	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = CRAMFS_MAXPATHLEN;
 	return 0;
 }
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
 static int cramfs_readpage(struct file *file, struct page * page)
 {
 	struct inode *inode = page->mapping->host;
-	u32 maxblock, bytes_filled;
+	u32 maxblock;
+	int bytes_filled;
 	void *pgdata;
 
 	maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	bytes_filled = 0;
+	pgdata = kmap(page);
+
 	if (page->index < maxblock) {
 		struct super_block *sb = inode->i_sb;
 		u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
 		start_offset = OFFSET(inode) + maxblock*4;
 		mutex_lock(&read_mutex);
 		if (page->index)
-			start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4);
-		compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset);
+			start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
+				4);
+		compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
+			start_offset);
 		mutex_unlock(&read_mutex);
-		pgdata = kmap(page);
+
 		if (compr_len == 0)
 			; /* hole */
-		else if (compr_len > (PAGE_CACHE_SIZE << 1))
-			printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len);
-		else {
+		else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
+			pr_err("cramfs: bad compressed blocksize %u\n",
+				compr_len);
+			goto err;
+		} else {
 			mutex_lock(&read_mutex);
 			bytes_filled = cramfs_uncompress_block(pgdata,
 				 PAGE_CACHE_SIZE,
 				 cramfs_read(sb, start_offset, compr_len),
 				 compr_len);
 			mutex_unlock(&read_mutex);
+			if (unlikely(bytes_filled < 0))
+				goto err;
 		}
-	} else
-		pgdata = kmap(page);
+	}
+
 	memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
-	kunmap(page);
 	flush_dcache_page(page);
+	kunmap(page);
 	SetPageUptodate(page);
 	unlock_page(page);
 	return 0;
+
+err:
+	kunmap(page);
+	ClearPageUptodate(page);
+	SetPageError(page);
+	unlock_page(page);
+	return 0;
 }
 
 static const struct address_space_operations cramfs_aops = {
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626f..023329800d2e 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
 err:
 	printk("Error %d while decompressing!\n", err);
 	printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
-	return 0;
+	return -EIO;
 }
 
 int cramfs_uncompress_init(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index 937df0fb0da5..761d30be2683 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
 #include <linux/syscalls.h>
 #include <linux/string.h>
 #include <linux/mm.h>
-#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/fsnotify.h>
 #include <linux/slab.h>
@@ -32,6 +31,7 @@
 #include <linux/seqlock.h>
 #include <linux/swap.h>
 #include <linux/bootmem.h>
+#include <linux/fs_struct.h>
 #include "internal.h"
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
@@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	iput(inode);
 	return res;
 }
-EXPORT_SYMBOL_GPL(d_obtain_alias);
+EXPORT_SYMBOL(d_obtain_alias);
 
 /**
  * d_splice_alias - splice a disconnected dentry into the tree if one exists
@@ -1247,15 +1247,18 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	struct dentry *found;
 	struct dentry *new;
 
-	/* Does a dentry matching the name exist already? */
+	/*
+	 * First check if a dentry matching the name already exists,
+	 * if not go ahead and create it now.
+	 */
 	found = d_hash_and_lookup(dentry->d_parent, name);
-	/* If not, create it now and return */
 	if (!found) {
 		new = d_alloc(dentry->d_parent, name);
 		if (!new) {
 			error = -ENOMEM;
 			goto err_out;
 		}
+
 		found = d_splice_alias(inode, new);
 		if (found) {
 			dput(new);
@@ -1263,61 +1266,46 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 		}
 		return new;
 	}
-	/* Matching dentry exists, check if it is negative. */
+
+	/*
+	 * If a matching dentry exists, and it's not negative use it.
+	 *
+	 * Decrement the reference count to balance the iget() done
+	 * earlier on.
+	 */
 	if (found->d_inode) {
 		if (unlikely(found->d_inode != inode)) {
 			/* This can't happen because bad inodes are unhashed. */
 			BUG_ON(!is_bad_inode(inode));
 			BUG_ON(!is_bad_inode(found->d_inode));
 		}
-		/*
-		 * Already have the inode and the dentry attached, decrement
-		 * the reference count to balance the iget() done
-		 * earlier on.  We found the dentry using d_lookup() so it
-		 * cannot be disconnected and thus we do not need to worry
-		 * about any NFS/disconnectedness issues here.
-		 */
 		iput(inode);
 		return found;
 	}
+
 	/*
 	 * Negative dentry: instantiate it unless the inode is a directory and
-	 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
-	 * in which case d_move() that in place of the found dentry.
+	 * already has a dentry.
 	 */
-	if (!S_ISDIR(inode->i_mode)) {
-		/* Not a directory; everything is easy. */
-		d_instantiate(found, inode);
-		return found;
-	}
 	spin_lock(&dcache_lock);
-	if (list_empty(&inode->i_dentry)) {
-		/*
-		 * Directory without a 'disconnected' dentry; we need to do
-		 * d_instantiate() by hand because it takes dcache_lock which
-		 * we already hold.
-		 */
+	if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
 		__d_instantiate(found, inode);
 		spin_unlock(&dcache_lock);
 		security_d_instantiate(found, inode);
 		return found;
 	}
+
 	/*
-	 * Directory with a 'disconnected' dentry; get a reference to the
-	 * 'disconnected' dentry.
+	 * In case a directory already has a (disconnected) entry grab a
+	 * reference to it, move it in place and use it.
 	 */
 	new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
 	dget_locked(new);
 	spin_unlock(&dcache_lock);
-	/* Do security vodoo. */
 	security_d_instantiate(found, inode);
-	/* Move new in place of found. */
 	d_move(new, found);
-	/* Balance the iget() we did above. */
 	iput(inode);
-	/* Throw away found. */
 	dput(found);
-	/* Use new as the actual dentry. */
 	return new;
 
 err_out:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 81ae9ea3c6e1..0662ba6de85a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,6 +30,7 @@
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
+static bool debugfs_registered;
 
 static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 {
@@ -496,6 +497,16 @@ exit:
 }
 EXPORT_SYMBOL_GPL(debugfs_rename);
 
+/**
+ * debugfs_initialized - Tells whether debugfs has been registered
+ */
+bool debugfs_initialized(void)
+{
+	return debugfs_registered;
+}
+EXPORT_SYMBOL_GPL(debugfs_initialized);
+
+
 static struct kobject *debug_kobj;
 
 static int __init debugfs_init(void)
@@ -509,11 +520,16 @@ static int __init debugfs_init(void)
 	retval = register_filesystem(&debug_fs_type);
 	if (retval)
 		kobject_put(debug_kobj);
+	else
+		debugfs_registered = true;
+
 	return retval;
 }
 
 static void __exit debugfs_exit(void)
 {
+	debugfs_registered = false;
+
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	unregister_filesystem(&debug_fs_type);
 	kobject_put(debug_kobj);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5f3231b9633f..63a4a59e4148 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb)
 
 	fsi->ptmx_dentry = dentry;
 	rc = 0;
-
-	printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
-			inode->i_ino);
 out:
 	mutex_unlock(&root->d_inode->i_mutex);
 	return rc;
@@ -325,179 +322,81 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 }
 
 /*
- * Safely parse the mount options in @data and update @opts.
+ * devpts_get_sb()
  *
- * devpts ends up parsing options two times during mount, due to the
- * two modes of operation it supports. The first parse occurs in
- * devpts_get_sb() when determining the mode (single-instance or
- * multi-instance mode). The second parse happens in devpts_remount()
- * or new_pts_mount() depending on the mode.
+ *     If the '-o newinstance' mount option was specified, mount a new
+ *     (private) instance of devpts.  PTYs created in this instance are
+ *     independent of the PTYs in other devpts instances.
  *
- * Parsing of options modifies the @data making subsequent parsing
- * incorrect. So make a local copy of @data and parse it.
+ *     If the '-o newinstance' option was not specified, mount/remount the
+ *     initial kernel mount of devpts.  This type of mount gives the
+ *     legacy, single-instance semantics.
  *
- * Return: 0 On success, -errno on error
- */
-static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
-{
-	int rc;
-	void *datacp;
-
-	if (!data)
-		return 0;
-
-	/* Use kstrdup() ?  */
-	datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!datacp)
-		return -ENOMEM;
-
-	memcpy(datacp, data, PAGE_SIZE);
-	rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
-	kfree(datacp);
-
-	return rc;
-}
-
-/*
- * Mount a new (private) instance of devpts.  PTYs created in this
- * instance are independent of the PTYs in other devpts instances.
- */
-static int new_pts_mount(struct file_system_type *fs_type, int flags,
-		void *data, struct vfsmount *mnt)
-{
-	int err;
-	struct pts_fs_info *fsi;
-	struct pts_mount_opts *opts;
-
-	printk(KERN_NOTICE "devpts: newinstance mount\n");
-
-	err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
-	if (err)
-		return err;
-
-	fsi = DEVPTS_SB(mnt->mnt_sb);
-	opts = &fsi->mount_opts;
-
-	err = parse_mount_options(data, PARSE_MOUNT, opts);
-	if (err)
-		goto fail;
-
-	err = mknod_ptmx(mnt->mnt_sb);
-	if (err)
-		goto fail;
-
-	return 0;
-
-fail:
-	dput(mnt->mnt_sb->s_root);
-	deactivate_super(mnt->mnt_sb);
-	return err;
-}
-
-/*
- * Check if 'newinstance' mount option was specified in @data.
+ *     The 'newinstance' option is needed to support multiple namespace
+ *     semantics in devpts while preserving backward compatibility of the
+ *     current 'single-namespace' semantics. i.e all mounts of devpts
+ *     without the 'newinstance' mount option should bind to the initial
+ *     kernel mount, like get_sb_single().
  *
- * Return: -errno  	on error (eg: invalid mount options specified)
- * 	 : 1 		if 'newinstance' mount option was specified
- * 	 : 0 		if 'newinstance' mount option was NOT specified
- */
-static int is_new_instance_mount(void *data)
-{
-	int rc;
-	struct pts_mount_opts opts;
-
-	if (!data)
-		return 0;
-
-	rc = safe_parse_mount_options(data, &opts);
-	if (!rc)
-		rc = opts.newinstance;
-
-	return rc;
-}
-
-/*
- * get_init_pts_sb()
+ *     Mounts with 'newinstance' option create a new, private namespace.
  *
- *     This interface is needed to support multiple namespace semantics in
- *     devpts while preserving backward compatibility of the current 'single-
- *     namespace' semantics. i.e all mounts of devpts without the 'newinstance'
- *     mount option should bind to the initial kernel mount, like
- *     get_sb_single().
+ *     NOTE:
  *
- *     Mounts with 'newinstance' option create a new private namespace.
- *
- *     But for single-mount semantics, devpts cannot use get_sb_single(),
+ *     For single-mount semantics, devpts cannot use get_sb_single(),
  *     because get_sb_single()/sget() find and use the super-block from
  *     the most recent mount of devpts. But that recent mount may be a
  *     'newinstance' mount and get_sb_single() would pick the newinstance
  *     super-block instead of the initial super-block.
- *
- *     This interface is identical to get_sb_single() except that it
- *     consistently selects the 'single-namespace' superblock even in the
- *     presence of the private namespace (i.e 'newinstance') super-blocks.
  */
-static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
-		void *data, struct vfsmount *mnt)
+static int devpts_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	struct super_block *s;
 	int error;
+	struct pts_mount_opts opts;
+	struct super_block *s;
+
+	memset(&opts, 0, sizeof(opts));
+	if (data) {
+		error = parse_mount_options(data, PARSE_MOUNT, &opts);
+		if (error)
+			return error;
+	}
+
+	if (opts.newinstance)
+		s = sget(fs_type, NULL, set_anon_super, NULL);
+	else
+		s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
 
-	s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
 	if (IS_ERR(s))
 		return PTR_ERR(s);
 
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			up_write(&s->s_umount);
-			deactivate_super(s);
-			return error;
-		}
+		if (error)
+			goto out_undo_sget;
 		s->s_flags |= MS_ACTIVE;
 	}
-	do_remount_sb(s, flags, data, 0);
-	return simple_set_mnt(mnt, s);
-}
 
-/*
- * Mount or remount the initial kernel mount of devpts. This type of
- * mount maintains the legacy, single-instance semantics, while the
- * kernel still allows multiple-instances.
- */
-static int init_pts_mount(struct file_system_type *fs_type, int flags,
-		void *data, struct vfsmount *mnt)
-{
-	int err;
+	simple_set_mnt(mnt, s);
 
-	err = get_init_pts_sb(fs_type, flags, data, mnt);
-	if (err)
-		return err;
+	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
 
-	err = mknod_ptmx(mnt->mnt_sb);
-	if (err) {
-		dput(mnt->mnt_sb->s_root);
-		deactivate_super(mnt->mnt_sb);
-	}
+	error = mknod_ptmx(s);
+	if (error)
+		goto out_dput;
 
-	return err;
-}
-
-static int devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	int new;
-
-	new = is_new_instance_mount(data);
-	if (new < 0)
-		return new;
+	return 0;
 
-	if (new)
-		return new_pts_mount(fs_type, flags, data, mnt);
+out_dput:
+	dput(s->s_root);
 
-	return init_pts_mount(fs_type, flags, data, mnt);
+out_undo_sget:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	return error;
 }
+
 #else
 /*
  * This supports only the legacy single-instance semantics (no
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..da258e7249cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_SYNC;
+		rw = WRITE_ODIRECT;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 92969f879a17..858fba14aaa6 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -156,7 +156,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
 
 	bucket = dir_hash(ls, name, namelen);
 
-	write_lock(&ls->ls_dirtbl[bucket].lock);
+	spin_lock(&ls->ls_dirtbl[bucket].lock);
 
 	de = search_bucket(ls, name, namelen, bucket);
 
@@ -173,7 +173,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
 	list_del(&de->list);
 	kfree(de);
  out:
-	write_unlock(&ls->ls_dirtbl[bucket].lock);
+	spin_unlock(&ls->ls_dirtbl[bucket].lock);
 }
 
 void dlm_dir_clear(struct dlm_ls *ls)
@@ -185,14 +185,14 @@ void dlm_dir_clear(struct dlm_ls *ls)
 	DLM_ASSERT(list_empty(&ls->ls_recover_list), );
 
 	for (i = 0; i < ls->ls_dirtbl_size; i++) {
-		write_lock(&ls->ls_dirtbl[i].lock);
+		spin_lock(&ls->ls_dirtbl[i].lock);
 		head = &ls->ls_dirtbl[i].list;
 		while (!list_empty(head)) {
 			de = list_entry(head->next, struct dlm_direntry, list);
 			list_del(&de->list);
 			put_free_de(ls, de);
 		}
-		write_unlock(&ls->ls_dirtbl[i].lock);
+		spin_unlock(&ls->ls_dirtbl[i].lock);
 	}
 }
 
@@ -307,17 +307,17 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 
 	bucket = dir_hash(ls, name, namelen);
 
-	write_lock(&ls->ls_dirtbl[bucket].lock);
+	spin_lock(&ls->ls_dirtbl[bucket].lock);
 	de = search_bucket(ls, name, namelen, bucket);
 	if (de) {
 		*r_nodeid = de->master_nodeid;
-		write_unlock(&ls->ls_dirtbl[bucket].lock);
+		spin_unlock(&ls->ls_dirtbl[bucket].lock);
 		if (*r_nodeid == nodeid)
 			return -EEXIST;
 		return 0;
 	}
 
-	write_unlock(&ls->ls_dirtbl[bucket].lock);
+	spin_unlock(&ls->ls_dirtbl[bucket].lock);
 
 	if (namelen > DLM_RESNAME_MAXLEN)
 		return -EINVAL;
@@ -330,7 +330,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 	de->length = namelen;
 	memcpy(de->name, name, namelen);
 
-	write_lock(&ls->ls_dirtbl[bucket].lock);
+	spin_lock(&ls->ls_dirtbl[bucket].lock);
 	tmp = search_bucket(ls, name, namelen, bucket);
 	if (tmp) {
 		kfree(de);
@@ -339,7 +339,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 		list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
 	}
 	*r_nodeid = de->master_nodeid;
-	write_unlock(&ls->ls_dirtbl[bucket].lock);
+	spin_unlock(&ls->ls_dirtbl[bucket].lock);
 	return 0;
 }
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 076e86f38bc8..d01ca0a711db 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -99,7 +99,7 @@ struct dlm_direntry {
 
 struct dlm_dirtable {
 	struct list_head	list;
-	rwlock_t		lock;
+	spinlock_t		lock;
 };
 
 struct dlm_rsbtable {
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 01e7d39c5fba..205ec95b347e 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -835,7 +835,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 		lkb->lkb_wait_count++;
 		hold_lkb(lkb);
 
-		log_debug(ls, "add overlap %x cur %d new %d count %d flags %x",
+		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
 			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
 			  lkb->lkb_wait_count, lkb->lkb_flags);
 		goto out;
@@ -851,7 +851,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
  out:
 	if (error)
-		log_error(ls, "add_to_waiters %x error %d flags %x %d %d %s",
+		log_error(ls, "addwait error %x %d flags %x %d %d %s",
 			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
 			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
 	mutex_unlock(&ls->ls_waiters_mutex);
@@ -863,23 +863,55 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
    request reply on the requestqueue) between dlm_recover_waiters_pre() which
    set RESEND and dlm_recover_waiters_post() */
 
-static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
+static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
+				struct dlm_message *ms)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int overlap_done = 0;
 
 	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
+		log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
 		overlap_done = 1;
 		goto out_del;
 	}
 
 	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
+		log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
 		overlap_done = 1;
 		goto out_del;
 	}
 
+	/* Cancel state was preemptively cleared by a successful convert,
+	   see next comment, nothing to do. */
+
+	if ((mstype == DLM_MSG_CANCEL_REPLY) &&
+	    (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
+		log_debug(ls, "remwait %x cancel_reply wait_type %d",
+			  lkb->lkb_id, lkb->lkb_wait_type);
+		return -1;
+	}
+
+	/* Remove for the convert reply, and premptively remove for the
+	   cancel reply.  A convert has been granted while there's still
+	   an outstanding cancel on it (the cancel is moot and the result
+	   in the cancel reply should be 0).  We preempt the cancel reply
+	   because the app gets the convert result and then can follow up
+	   with another op, like convert.  This subsequent op would see the
+	   lingering state of the cancel and fail with -EBUSY. */
+
+	if ((mstype == DLM_MSG_CONVERT_REPLY) &&
+	    (lkb->lkb_wait_type == DLM_MSG_CONVERT) &&
+	    is_overlap_cancel(lkb) && ms && !ms->m_result) {
+		log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
+			  lkb->lkb_id);
+		lkb->lkb_wait_type = 0;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_wait_count--;
+		goto out_del;
+	}
+
 	/* N.B. type of reply may not always correspond to type of original
 	   msg due to lookup->request optimization, verify others? */
 
@@ -888,8 +920,8 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 		goto out_del;
 	}
 
-	log_error(ls, "remove_from_waiters lkid %x flags %x types %d %d",
-		  lkb->lkb_id, lkb->lkb_flags, mstype, lkb->lkb_wait_type);
+	log_error(ls, "remwait error %x reply %d flags %x no wait_type",
+		  lkb->lkb_id, mstype, lkb->lkb_flags);
 	return -1;
 
  out_del:
@@ -899,7 +931,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 	   this would happen */
 
 	if (overlap_done && lkb->lkb_wait_type) {
-		log_error(ls, "remove_from_waiters %x reply %d give up on %d",
+		log_error(ls, "remwait error %x reply %d wait_type %d overlap",
 			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
 		lkb->lkb_wait_count--;
 		lkb->lkb_wait_type = 0;
@@ -921,7 +953,7 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 	int error;
 
 	mutex_lock(&ls->ls_waiters_mutex);
-	error = _remove_from_waiters(lkb, mstype);
+	error = _remove_from_waiters(lkb, mstype, NULL);
 	mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
 }
@@ -936,7 +968,7 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
 
 	if (ms != &ls->ls_stub_ms)
 		mutex_lock(&ls->ls_waiters_mutex);
-	error = _remove_from_waiters(lkb, ms->m_type);
+	error = _remove_from_waiters(lkb, ms->m_type, ms);
 	if (ms != &ls->ls_stub_ms)
 		mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
@@ -2083,6 +2115,11 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_timeout_cs = args->timeout;
 	rv = 0;
  out:
+	if (rv)
+		log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
+			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
+			  lkb->lkb_status, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
 	return rv;
 }
 
@@ -2149,6 +2186,13 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 			goto out;
 		}
 
+		/* there's nothing to cancel */
+		if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
+		    !lkb->lkb_wait_type) {
+			rv = -EBUSY;
+			goto out;
+		}
+
 		switch (lkb->lkb_wait_type) {
 		case DLM_MSG_LOOKUP:
 		case DLM_MSG_REQUEST:
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index aa32e5f02493..cd8e2df3c295 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -487,7 +487,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 		goto out_lkbfree;
 	for (i = 0; i < size; i++) {
 		INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
-		rwlock_init(&ls->ls_dirtbl[i].lock);
+		spin_lock_init(&ls->ls_dirtbl[i].lock);
 	}
 
 	INIT_LIST_HEAD(&ls->ls_waiters);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 103a5ebd1371..609108a83267 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -21,7 +21,7 @@
  *
  * Cluster nodes are referred to by their nodeids. nodeids are
  * simply 32 bit numbers to the locking module - if they need to
- * be expanded for the cluster infrastructure then that is it's
+ * be expanded for the cluster infrastructure then that is its
  * responsibility. It is this layer's
  * responsibility to resolve these into IP address or
  * whatever it needs for inter-node communication.
@@ -36,9 +36,9 @@
  * of high load. Also, this way, the sending thread can collect together
  * messages bound for one node and send them in one block.
  *
- * lowcomms will choose to use wither TCP or SCTP as its transport layer
+ * lowcomms will choose to use either TCP or SCTP as its transport layer
  * depending on the configuration variable 'protocol'. This should be set
- * to 0 (default) for TCP or 1 for SCTP. It shouldbe configured using a
+ * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
  * cluster-wide mechanism as it must be the same on all nodes of the cluster
  * for the DLM to function.
  *
@@ -48,11 +48,11 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/pagemap.h>
-#include <linux/idr.h>
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
 #include <net/sctp/user.h>
+#include <net/ipv6.h>
 
 #include "dlm_internal.h"
 #include "lowcomms.h"
@@ -60,6 +60,7 @@
 #include "config.h"
 
 #define NEEDED_RMEM (4*1024*1024)
+#define CONN_HASH_SIZE 32
 
 struct cbuf {
 	unsigned int base;
@@ -114,6 +115,7 @@ struct connection {
 	int retries;
 #define MAX_CONNECT_RETRIES 3
 	int sctp_assoc;
+	struct hlist_node list;
 	struct connection *othercon;
 	struct work_struct rwork; /* Receive workqueue */
 	struct work_struct swork; /* Send workqueue */
@@ -138,14 +140,37 @@ static int dlm_local_count;
 static struct workqueue_struct *recv_workqueue;
 static struct workqueue_struct *send_workqueue;
 
-static DEFINE_IDR(connections_idr);
+static struct hlist_head connection_hash[CONN_HASH_SIZE];
 static DEFINE_MUTEX(connections_lock);
-static int max_nodeid;
 static struct kmem_cache *con_cache;
 
 static void process_recv_sockets(struct work_struct *work);
 static void process_send_sockets(struct work_struct *work);
 
+
+/* This is deliberately very simple because most clusters have simple
+   sequential nodeids, so we should be able to go straight to a connection
+   struct in the array */
+static inline int nodeid_hash(int nodeid)
+{
+	return nodeid & (CONN_HASH_SIZE-1);
+}
+
+static struct connection *__find_con(int nodeid)
+{
+	int r;
+	struct hlist_node *h;
+	struct connection *con;
+
+	r = nodeid_hash(nodeid);
+
+	hlist_for_each_entry(con, h, &connection_hash[r], list) {
+		if (con->nodeid == nodeid)
+			return con;
+	}
+	return NULL;
+}
+
 /*
  * If 'allocation' is zero then we don't attempt to create a new
  * connection structure for this node.
@@ -154,31 +179,17 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
 {
 	struct connection *con = NULL;
 	int r;
-	int n;
 
-	con = idr_find(&connections_idr, nodeid);
+	con = __find_con(nodeid);
 	if (con || !alloc)
 		return con;
 
-	r = idr_pre_get(&connections_idr, alloc);
-	if (!r)
-		return NULL;
-
 	con = kmem_cache_zalloc(con_cache, alloc);
 	if (!con)
 		return NULL;
 
-	r = idr_get_new_above(&connections_idr, con, nodeid, &n);
-	if (r) {
-		kmem_cache_free(con_cache, con);
-		return NULL;
-	}
-
-	if (n != nodeid) {
-		idr_remove(&connections_idr, n);
-		kmem_cache_free(con_cache, con);
-		return NULL;
-	}
+	r = nodeid_hash(nodeid);
+	hlist_add_head(&con->list, &connection_hash[r]);
 
 	con->nodeid = nodeid;
 	mutex_init(&con->sock_mutex);
@@ -189,19 +200,30 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
 
 	/* Setup action pointers for child sockets */
 	if (con->nodeid) {
-		struct connection *zerocon = idr_find(&connections_idr, 0);
+		struct connection *zerocon = __find_con(0);
 
 		con->connect_action = zerocon->connect_action;
 		if (!con->rx_action)
 			con->rx_action = zerocon->rx_action;
 	}
 
-	if (nodeid > max_nodeid)
-		max_nodeid = nodeid;
-
 	return con;
 }
 
+/* Loop round all connections */
+static void foreach_conn(void (*conn_func)(struct connection *c))
+{
+	int i;
+	struct hlist_node *h, *n;
+	struct connection *con;
+
+	for (i = 0; i < CONN_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){
+			conn_func(con);
+		}
+	}
+}
+
 static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 {
 	struct connection *con;
@@ -217,14 +239,17 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 static struct connection *assoc2con(int assoc_id)
 {
 	int i;
+	struct hlist_node *h;
 	struct connection *con;
 
 	mutex_lock(&connections_lock);
-	for (i=0; i<=max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (con && con->sctp_assoc == assoc_id) {
-			mutex_unlock(&connections_lock);
-			return con;
+
+	for (i = 0 ; i < CONN_HASH_SIZE; i++) {
+		hlist_for_each_entry(con, h, &connection_hash[i], list) {
+			if (con && con->sctp_assoc == assoc_id) {
+				mutex_unlock(&connections_lock);
+				return con;
+			}
 		}
 	}
 	mutex_unlock(&connections_lock);
@@ -250,8 +275,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
 	} else {
 		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
 		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
-		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
-		       sizeof(in6->sin6_addr));
+		ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr);
 	}
 
 	return 0;
@@ -376,25 +400,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
 		log_print("send EOF to node failed: %d", ret);
 }
 
+static void sctp_init_failed_foreach(struct connection *con)
+{
+	con->sctp_assoc = 0;
+	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+			queue_work(send_workqueue, &con->swork);
+	}
+}
+
 /* INIT failed but we don't know which node...
    restart INIT on all pending nodes */
 static void sctp_init_failed(void)
 {
-	int i;
-	struct connection *con;
-
 	mutex_lock(&connections_lock);
-	for (i=1; i<=max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (!con)
-			continue;
-		con->sctp_assoc = 0;
-		if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
-			if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
-				queue_work(send_workqueue, &con->swork);
-			}
-		}
-	}
+
+	foreach_conn(sctp_init_failed_foreach);
+
 	mutex_unlock(&connections_lock);
 }
 
@@ -1313,13 +1335,10 @@ out_connect:
 
 static void clean_one_writequeue(struct connection *con)
 {
-	struct list_head *list;
-	struct list_head *temp;
+	struct writequeue_entry *e, *safe;
 
 	spin_lock(&con->writequeue_lock);
-	list_for_each_safe(list, temp, &con->writequeue) {
-		struct writequeue_entry *e =
-			list_entry(list, struct writequeue_entry, list);
+	list_for_each_entry_safe(e, safe, &con->writequeue, list) {
 		list_del(&e->list);
 		free_entry(e);
 	}
@@ -1369,14 +1388,7 @@ static void process_send_sockets(struct work_struct *work)
 /* Discard all entries on the write queues */
 static void clean_writequeues(void)
 {
-	int nodeid;
-
-	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
-		struct connection *con = __nodeid2con(nodeid, 0);
-
-		if (con)
-			clean_one_writequeue(con);
-	}
+	foreach_conn(clean_one_writequeue);
 }
 
 static void work_stop(void)
@@ -1406,23 +1418,29 @@ static int work_start(void)
 	return 0;
 }
 
-void dlm_lowcomms_stop(void)
+static void stop_conn(struct connection *con)
 {
-	int i;
-	struct connection *con;
+	con->flags |= 0x0F;
+	if (con->sock)
+		con->sock->sk->sk_user_data = NULL;
+}
 
+static void free_conn(struct connection *con)
+{
+	close_connection(con, true);
+	if (con->othercon)
+		kmem_cache_free(con_cache, con->othercon);
+	hlist_del(&con->list);
+	kmem_cache_free(con_cache, con);
+}
+
+void dlm_lowcomms_stop(void)
+{
 	/* Set all the flags to prevent any
 	   socket activity.
 	*/
 	mutex_lock(&connections_lock);
-	for (i = 0; i <= max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (con) {
-			con->flags |= 0x0F;
-			if (con->sock)
-				con->sock->sk->sk_user_data = NULL;
-		}
-	}
+	foreach_conn(stop_conn);
 	mutex_unlock(&connections_lock);
 
 	work_stop();
@@ -1430,25 +1448,20 @@ void dlm_lowcomms_stop(void)
 	mutex_lock(&connections_lock);
 	clean_writequeues();
 
-	for (i = 0; i <= max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (con) {
-			close_connection(con, true);
-			if (con->othercon)
-				kmem_cache_free(con_cache, con->othercon);
-			kmem_cache_free(con_cache, con);
-		}
-	}
-	max_nodeid = 0;
+	foreach_conn(free_conn);
+
 	mutex_unlock(&connections_lock);
 	kmem_cache_destroy(con_cache);
-	idr_init(&connections_idr);
 }
 
 int dlm_lowcomms_start(void)
 {
 	int error = -EINVAL;
 	struct connection *con;
+	int i;
+
+	for (i = 0; i < CONN_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&connection_hash[i]);
 
 	init_local();
 	if (!dlm_local_count) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 065149e84f42..ebce994ab0b7 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2009 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -84,7 +84,7 @@ struct dlm_lock_result32 {
 
 static void compat_input(struct dlm_write_request *kb,
 			 struct dlm_write_request32 *kb32,
-			 size_t count)
+			 int namelen)
 {
 	kb->version[0] = kb32->version[0];
 	kb->version[1] = kb32->version[1];
@@ -96,8 +96,7 @@ static void compat_input(struct dlm_write_request *kb,
 	    kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
 		kb->i.lspace.flags = kb32->i.lspace.flags;
 		kb->i.lspace.minor = kb32->i.lspace.minor;
-		memcpy(kb->i.lspace.name, kb32->i.lspace.name, count -
-			offsetof(struct dlm_write_request32, i.lspace.name));
+		memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen);
 	} else if (kb->cmd == DLM_USER_PURGE) {
 		kb->i.purge.nodeid = kb32->i.purge.nodeid;
 		kb->i.purge.pid = kb32->i.purge.pid;
@@ -115,8 +114,7 @@ static void compat_input(struct dlm_write_request *kb,
 		kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
 		kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
 		memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
-		memcpy(kb->i.lock.name, kb32->i.lock.name, count -
-			offsetof(struct dlm_write_request32, i.lock.name));
+		memcpy(kb->i.lock.name, kb32->i.lock.name, namelen);
 	}
 }
 
@@ -539,9 +537,16 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 #ifdef CONFIG_COMPAT
 	if (!kbuf->is64bit) {
 		struct dlm_write_request32 *k32buf;
+		int namelen = 0;
+
+		if (count > sizeof(struct dlm_write_request32))
+			namelen = count - sizeof(struct dlm_write_request32);
+
 		k32buf = (struct dlm_write_request32 *)kbuf;
-		kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) -
-			       sizeof(struct dlm_write_request32)), GFP_KERNEL);
+
+		/* add 1 after namelen so that the name string is terminated */
+		kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
+			       GFP_KERNEL);
 		if (!kbuf) {
 			kfree(k32buf);
 			return -ENOMEM;
@@ -549,7 +554,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
 		if (proc)
 			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
-		compat_input(kbuf, k32buf, count + 1);
+
+		compat_input(kbuf, k32buf, namelen);
 		kfree(k32buf);
 	}
 #endif
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 3e5637fc3779..b6a719a909f8 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE))
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
 		if (inode->i_mapping->nrpages == 0)
 			continue;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f6caeb1d1106..8b65f289ee00 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
 	list_for_each_entry(global_auth_tok,
 			    &mount_crypt_stat->global_auth_tok_list,
 			    mount_crypt_stat_list) {
+		if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK)
+			continue;
 		rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
 		if (rc) {
 			printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
@@ -1322,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
 }
 
 static int
-ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat,
-				    struct dentry *ecryptfs_dentry,
-				    char *virt)
+ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
+				    char *virt, size_t virt_len)
 {
 	int rc;
 
 	rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
-				  0, crypt_stat->num_header_bytes_at_front);
+				  0, virt_len);
 	if (rc)
 		printk(KERN_ERR "%s: Error attempting to write header "
 		       "information to lower file; rc = [%d]\n", __func__,
@@ -1339,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat,
 
 static int
 ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
-				 struct ecryptfs_crypt_stat *crypt_stat,
 				 char *page_virt, size_t size)
 {
 	int rc;
@@ -1349,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
 	return rc;
 }
 
+static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
+					       unsigned int order)
+{
+	struct page *page;
+
+	page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+	if (page)
+		return (unsigned long) page_address(page);
+	return 0;
+}
+
 /**
  * ecryptfs_write_metadata
  * @ecryptfs_dentry: The eCryptfs dentry
@@ -1365,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 {
 	struct ecryptfs_crypt_stat *crypt_stat =
 		&ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+	unsigned int order;
 	char *virt;
+	size_t virt_len;
 	size_t size = 0;
 	int rc = 0;
 
@@ -1381,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		rc = -EINVAL;
 		goto out;
 	}
+	virt_len = crypt_stat->num_header_bytes_at_front;
+	order = get_order(virt_len);
 	/* Released in this function */
-	virt = (char *)get_zeroed_page(GFP_KERNEL);
+	virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
 	if (!virt) {
 		printk(KERN_ERR "%s: Out of memory\n", __func__);
 		rc = -ENOMEM;
 		goto out;
 	}
-	rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size,
-					 crypt_stat, ecryptfs_dentry);
+	rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat,
+					 ecryptfs_dentry);
 	if (unlikely(rc)) {
 		printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
 		       __func__, rc);
 		goto out_free;
 	}
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry,
-						      crypt_stat, virt, size);
+		rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
+						      size);
 	else
-		rc = ecryptfs_write_metadata_to_contents(crypt_stat,
-							 ecryptfs_dentry, virt);
+		rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt,
+							 virt_len);
 	if (rc) {
 		printk(KERN_ERR "%s: Error writing metadata out to lower file; "
 		       "rc = [%d]\n", __func__, rc);
 		goto out_free;
 	}
 out_free:
-	free_page((unsigned long)virt);
+	free_pages((unsigned long)virt, order);
 out:
 	return rc;
 }
@@ -2206,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 					 struct dentry *ecryptfs_dir_dentry,
 					 const char *name, size_t name_size)
 {
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(
+			ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
 	char *decoded_name;
 	size_t decoded_name_size;
 	size_t packet_size;
 	int rc = 0;
 
-	if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
+	    && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
 	    && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
 			ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
-		struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-			&ecryptfs_superblock_to_private(
-				ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
 		const char *orig_name = name;
 		size_t orig_name_size = name_size;
 
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 5e596583946c..2dda5ade75bc 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -89,7 +89,7 @@ static void ecryptfs_d_release(struct dentry *dentry)
 	return;
 }
 
-struct dentry_operations ecryptfs_dops = {
+const struct dentry_operations ecryptfs_dops = {
 	.d_revalidate = ecryptfs_d_revalidate,
 	.d_release = ecryptfs_d_release,
 };
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c11fc95714ab..064c5820e4e5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -328,6 +328,7 @@ struct ecryptfs_dentry_info {
  */
 struct ecryptfs_global_auth_tok {
 #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001
+#define ECRYPTFS_AUTH_TOK_FNEK    0x00000002
 	u32 flags;
 	struct list_head mount_crypt_stat_list;
 	struct key *global_auth_tok_key;
@@ -579,7 +580,7 @@ extern const struct inode_operations ecryptfs_main_iops;
 extern const struct inode_operations ecryptfs_dir_iops;
 extern const struct inode_operations ecryptfs_symlink_iops;
 extern const struct super_operations ecryptfs_sops;
-extern struct dentry_operations ecryptfs_dops;
+extern const struct dentry_operations ecryptfs_dops;
 extern struct address_space_operations ecryptfs_aops;
 extern int ecryptfs_verbosity;
 extern unsigned int ecryptfs_message_buf_len;
@@ -619,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
 		       u32 flags);
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct ecryptfs_crypt_stat *crypt_stat,
 					struct inode *ecryptfs_dir_inode,
 					struct nameidata *ecryptfs_nd);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
@@ -696,7 +696,7 @@ ecryptfs_write_header_metadata(char *virt,
 int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig);
 int
 ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
-			   char *sig);
+			   char *sig, u32 global_auth_tok_flags);
 int ecryptfs_get_global_auth_tok_for_sig(
 	struct ecryptfs_global_auth_tok **global_auth_tok,
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5697899a168d..55b3145b8072 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -246,7 +246,6 @@ out:
  */
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct ecryptfs_crypt_stat *crypt_stat,
 					struct inode *ecryptfs_dir_inode,
 					struct nameidata *ecryptfs_nd)
 {
@@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 	struct vfsmount *lower_mnt;
 	struct inode *lower_inode;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct ecryptfs_crypt_stat *crypt_stat;
 	char *page_virt = NULL;
 	u64 file_size;
 	int rc = 0;
@@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 			goto out_free_kmem;
 		}
 	}
+	crypt_stat = &ecryptfs_inode_to_private(
+					ecryptfs_dentry->d_inode)->crypt_stat;
+	/* TODO: lock for crypt_stat comparison */
+	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+			ecryptfs_set_default_sizes(crypt_stat);
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
 						      ecryptfs_dentry->d_inode);
 	if (rc) {
@@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 {
 	char *encrypted_and_encoded_name = NULL;
 	size_t encrypted_and_encoded_name_size;
-	struct ecryptfs_crypt_stat *crypt_stat = NULL;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
-	struct ecryptfs_inode_info *inode_info;
 	struct dentry *lower_dir_dentry, *lower_dentry;
 	int rc = 0;
 
@@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	}
 	if (lower_dentry->d_inode)
 		goto lookup_and_interpose;
-	inode_info =  ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
-	if (inode_info) {
-		crypt_stat = &inode_info->crypt_stat;
-		/* TODO: lock for crypt_stat comparison */
-		if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
-			ecryptfs_set_default_sizes(crypt_stat);
-	}
-	if (crypt_stat)
-		mount_crypt_stat = crypt_stat->mount_crypt_stat;
-	else
-		mount_crypt_stat = &ecryptfs_superblock_to_private(
-			ecryptfs_dentry->d_sb)->mount_crypt_stat;
-	if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
-	    && !(mount_crypt_stat && (mount_crypt_stat->flags
-				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+				ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	if (!(mount_crypt_stat
+	    && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
 		goto lookup_and_interpose;
 	dput(lower_dentry);
 	rc = ecryptfs_encrypt_and_encode_filename(
 		&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
-		crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+		NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
 		ecryptfs_dentry->d_name.len);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to encrypt and encode "
@@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	}
 lookup_and_interpose:
 	rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
-						 crypt_stat, ecryptfs_dir_inode,
+						 ecryptfs_dir_inode,
 						 ecryptfs_nd);
 	goto out;
 out_d_drop:
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ff539420cc6f..af737bb56cb7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 out_release_free_unlock:
 	crypto_free_hash(s->hash_desc.tfm);
 out_free_unlock:
-	memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
-	kfree(s->block_aligned_filename);
+	kzfree(s->block_aligned_filename);
 out_unlock:
 	mutex_unlock(s->tfm_mutex);
 out:
@@ -2375,7 +2374,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache;
 
 int
 ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
-			     char *sig)
+			     char *sig, u32 global_auth_tok_flags)
 {
 	struct ecryptfs_global_auth_tok *new_auth_tok;
 	int rc = 0;
@@ -2389,6 +2388,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
 		goto out;
 	}
 	memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
+	new_auth_tok->flags = global_auth_tok_flags;
 	new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
 	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
 	list_add(&new_auth_tok->mount_crypt_stat_list,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 789cf2e1be1e..aed56c25539b 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 		case ecryptfs_opt_ecryptfs_sig:
 			sig_src = args[0].from;
 			rc = ecryptfs_add_global_auth_tok(mount_crypt_stat,
-							  sig_src);
+							  sig_src, 0);
 			if (rc) {
 				printk(KERN_ERR "Error attempting to register "
 				       "global sig; rc = [%d]\n", rc);
@@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 				ECRYPTFS_SIG_SIZE_HEX] = '\0';
 			rc = ecryptfs_add_global_auth_tok(
 				mount_crypt_stat,
-				mount_crypt_stat->global_default_fnek_sig);
+				mount_crypt_stat->global_default_fnek_sig,
+				ECRYPTFS_AUTH_TOK_FNEK);
 			if (rc) {
 				printk(KERN_ERR "Error attempting to register "
 				       "global fnek sig [%s]; rc = [%d]\n",
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e01..295e7fa56755 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
 	if (daemon->user_ns)
 		put_user_ns(daemon->user_ns);
 	mutex_unlock(&daemon->mux);
-	memset(daemon, 0, sizeof(*daemon));
-	kfree(daemon);
+	kzfree(daemon);
 out:
 	return rc;
 }
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91fc..f04942810818 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
 }
 
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
-	struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct efs_sb_info *sbi = SUPER_INFO(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type    = EFS_SUPER_MAGIC;	/* efs magic number */
 	buf->f_bsize   = EFS_BLOCKSIZE;		/* blocksize */
-	buf->f_blocks  = sb->total_groups *	/* total data blocks */
-			(sb->group_size - sb->inode_blocks);
-	buf->f_bfree   = sb->data_free;		/* free data blocks */
-	buf->f_bavail  = sb->data_free;		/* free blocks for non-root */
-	buf->f_files   = sb->total_groups *	/* total inodes */
-			sb->inode_blocks *
+	buf->f_blocks  = sbi->total_groups *	/* total data blocks */
+			(sbi->group_size - sbi->inode_blocks);
+	buf->f_bfree   = sbi->data_free;	/* free data blocks */
+	buf->f_bavail  = sbi->data_free;	/* free blocks for non-root */
+	buf->f_files   = sbi->total_groups *	/* total inodes */
+			sbi->inode_blocks *
 			(EFS_BLOCKSIZE / sizeof(struct efs_dinode));
-	buf->f_ffree   = sb->inode_free;	/* free inodes */
+	buf->f_ffree   = sbi->inode_free;	/* free inodes */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = EFS_MAXNAMELEN;	/* max filename length */
 
 	return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa2..2a701d593d35 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
 	 * issue a wakeup.
 	 */
 	__u64 count;
+	unsigned int flags;
 };
 
 /*
@@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
 		n = (int) (ULLONG_MAX - ctx->count);
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
-		wake_up_locked(&ctx->wqh);
+		wake_up_locked_poll(&ctx->wqh, POLLIN);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	ssize_t res;
-	__u64 ucnt;
+	__u64 ucnt = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
 	if (count < sizeof(ucnt))
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
 	res = -EAGAIN;
-	ucnt = ctx->count;
-	if (ucnt > 0)
+	if (ctx->count > 0)
 		res = sizeof(ucnt);
 	else if (!(file->f_flags & O_NONBLOCK)) {
 		__add_wait_queue(&ctx->wqh, &wait);
 		for (res = 0;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (ctx->count > 0) {
-				ucnt = ctx->count;
 				res = sizeof(ucnt);
 				break;
 			}
@@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (res > 0) {
-		ctx->count = 0;
+	if (likely(res > 0)) {
+		ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+		ctx->count -= ucnt;
 		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked(&ctx->wqh);
+			wake_up_locked_poll(&ctx->wqh, POLLOUT);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 	if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (res > 0) {
+	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked(&ctx->wqh);
+			wake_up_locked_poll(&ctx->wqh, POLLIN);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 
-	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK))
+	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
+	ctx->flags = flags;
 
 	/*
 	 * When we call this, the initialization must be complete, since
 	 * anon_inode_getfd() will install the fd.
 	 */
 	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
-			      flags & (O_CLOEXEC | O_NONBLOCK));
+			      flags & EFD_SHARED_FCNTL_FLAGS);
 	if (fd < 0)
 		kfree(ctx);
 	return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
 {
 	return sys_eventfd2(count, 0);
 }
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 011b9b8c90c6..a89f370fadb5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
- *  fs/eventpoll.c (Efficent event polling implementation)
- *  Copyright (C) 2001,...,2007	 Davide Libenzi
+ *  fs/eventpoll.c (Efficient event retrieval implementation)
+ *  Copyright (C) 2001,...,2009	 Davide Libenzi
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
  * a better scalability.
  */
 
-#define DEBUG_EPOLL 0
-
-#if DEBUG_EPOLL > 0
-#define DPRINTK(x) printk x
-#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
-#else /* #if DEBUG_EPOLL > 0 */
-#define DPRINTK(x) (void) 0
-#define DNPRINTK(n, x) (void) 0
-#endif /* #if DEBUG_EPOLL > 0 */
-
-#define DEBUG_EPI 0
-
-#if DEBUG_EPI != 0
-#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
-#else /* #if DEBUG_EPI != 0 */
-#define EPI_SLAB_DEBUG 0
-#endif /* #if DEBUG_EPI != 0 */
-
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
 
-/* Maximum number of poll wake up nests we are allowing */
-#define EP_MAX_POLLWAKE_NESTS 4
+/* Maximum number of nesting allowed inside epoll sets */
+#define EP_MAX_NESTS 4
 
 /* Maximum msec timeout value storeable in a long int */
 #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
 };
 
 /*
- * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
- * It is used to keep track on all tasks that are currently inside the wake_up() code
- * to 1) short-circuit the one coming from the same task and same wait queue head
- * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
- * 3) let go the ones coming from other tasks.
+ * Structure used to track possible nested calls, for too deep recursions
+ * and loop cycles.
  */
-struct wake_task_node {
+struct nested_call_node {
 	struct list_head llink;
-	struct task_struct *task;
-	wait_queue_head_t *wq;
+	void *cookie;
+	int cpu;
 };
 
 /*
- * This is used to implement the safe poll wake up avoiding to reenter
- * the poll callback from inside wake_up().
+ * This structure is used as collector for nested calls, to check for
+ * maximum recursion dept and loop cycles.
  */
-struct poll_safewake {
-	struct list_head wake_task_list;
+struct nested_calls {
+	struct list_head tasks_call_list;
 	spinlock_t lock;
 };
 
@@ -213,7 +192,7 @@ struct eppoll_entry {
 	struct list_head llink;
 
 	/* The "base" pointer is set to the container "struct epitem" */
-	void *base;
+	struct epitem *base;
 
 	/*
 	 * Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
 	struct epitem *epi;
 };
 
+/* Used by the ep_send_events() function as callback private data */
+struct ep_send_events_data {
+	int maxevents;
+	struct epoll_event __user *events;
+};
+
 /*
  * Configuration options available inside /proc/sys/fs/epoll/
  */
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
  */
 static DEFINE_MUTEX(epmutex);
 
-/* Safe wake up implementation */
-static struct poll_safewake psw;
+/* Used for safe wake up implementation */
+static struct nested_calls poll_safewake_ncalls;
+
+/* Used to call file's f_op->poll() under the nested calls boundaries */
+static struct nested_calls poll_readywalk_ncalls;
 
 /* Slab cache used to allocate "struct epitem" */
 static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
 }
 
 /* Initialize the poll safe wake up structure */
-static void ep_poll_safewake_init(struct poll_safewake *psw)
+static void ep_nested_calls_init(struct nested_calls *ncalls)
 {
-
-	INIT_LIST_HEAD(&psw->wake_task_list);
-	spin_lock_init(&psw->lock);
+	INIT_LIST_HEAD(&ncalls->tasks_call_list);
+	spin_lock_init(&ncalls->lock);
 }
 
-/*
- * Perform a safe wake up of the poll wait list. The problem is that
- * with the new callback'd wake up system, it is possible that the
- * poll callback is reentered from inside the call to wake_up() done
- * on the poll wait queue head. The rule is that we cannot reenter the
- * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
- * and we cannot reenter the same wait queue head at all. This will
- * enable to have a hierarchy of epoll file descriptor of no more than
- * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
- * because this one gets called by the poll callback, that in turn is called
- * from inside a wake_up(), that might be called from irq context.
+/**
+ * ep_call_nested - Perform a bound (possibly) nested call, by checking
+ *                  that the recursion limit is not exceeded, and that
+ *                  the same nested call (by the meaning of same cookie) is
+ *                  no re-entered.
+ *
+ * @ncalls: Pointer to the nested_calls structure to be used for this call.
+ * @max_nests: Maximum number of allowed nesting calls.
+ * @nproc: Nested call core function pointer.
+ * @priv: Opaque data to be passed to the @nproc callback.
+ * @cookie: Cookie to be used to identify this nested call.
+ *
+ * Returns: Returns the code returned by the @nproc callback, or -1 if
+ *          the maximum recursion limit has been exceeded.
  */
-static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
+static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+			  int (*nproc)(void *, void *, int), void *priv,
+			  void *cookie)
 {
-	int wake_nests = 0;
+	int error, call_nests = 0;
 	unsigned long flags;
-	struct task_struct *this_task = current;
-	struct list_head *lsthead = &psw->wake_task_list;
-	struct wake_task_node *tncur;
-	struct wake_task_node tnode;
+	int this_cpu = get_cpu();
+	struct list_head *lsthead = &ncalls->tasks_call_list;
+	struct nested_call_node *tncur;
+	struct nested_call_node tnode;
 
-	spin_lock_irqsave(&psw->lock, flags);
+	spin_lock_irqsave(&ncalls->lock, flags);
 
-	/* Try to see if the current task is already inside this wakeup call */
+	/*
+	 * Try to see if the current task is already inside this wakeup call.
+	 * We use a list here, since the population inside this set is always
+	 * very much limited.
+	 */
 	list_for_each_entry(tncur, lsthead, llink) {
-
-		if (tncur->wq == wq ||
-		    (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
+		if (tncur->cpu == this_cpu &&
+		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
 			 * We abort this wake by breaking the cycle itself.
 			 */
-			spin_unlock_irqrestore(&psw->lock, flags);
-			return;
+			error = -1;
+			goto out_unlock;
 		}
 	}
 
-	/* Add the current task to the list */
-	tnode.task = this_task;
-	tnode.wq = wq;
+	/* Add the current task and cookie to the list */
+	tnode.cpu = this_cpu;
+	tnode.cookie = cookie;
 	list_add(&tnode.llink, lsthead);
 
-	spin_unlock_irqrestore(&psw->lock, flags);
+	spin_unlock_irqrestore(&ncalls->lock, flags);
 
-	/* Do really wake up now */
-	wake_up_nested(wq, 1 + wake_nests);
+	/* Call the nested function */
+	error = (*nproc)(priv, cookie, call_nests);
 
 	/* Remove the current task from the list */
-	spin_lock_irqsave(&psw->lock, flags);
+	spin_lock_irqsave(&ncalls->lock, flags);
 	list_del(&tnode.llink);
-	spin_unlock_irqrestore(&psw->lock, flags);
+ out_unlock:
+	spin_unlock_irqrestore(&ncalls->lock, flags);
+
+	put_cpu();
+	return error;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+				     unsigned long events, int subclass)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
+	wake_up_locked_poll(wqueue, events);
+	spin_unlock_irqrestore(&wqueue->lock, flags);
+}
+#else
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+				     unsigned long events, int subclass)
+{
+	wake_up_poll(wqueue, events);
+}
+#endif
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
+{
+	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
+			  1 + call_nests);
+	return 0;
+}
+
+/*
+ * Perform a safe wake up of the poll wait list. The problem is that
+ * with the new callback'd wake up system, it is possible that the
+ * poll callback is reentered from inside the call to wake_up() done
+ * on the poll wait queue head. The rule is that we cannot reenter the
+ * wake up code from the same task more than EP_MAX_NESTS times,
+ * and we cannot reenter the same wait queue head at all. This will
+ * enable to have a hierarchy of epoll file descriptor of no more than
+ * EP_MAX_NESTS deep.
+ */
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+		       ep_poll_wakeup_proc, NULL, wq);
 }
 
 /*
- * This function unregister poll callbacks from the associated file descriptor.
- * Since this must be called without holding "ep->lock" the atomic exchange trick
- * will protect us from multiple unregister.
+ * This function unregisters poll callbacks from the associated file
+ * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
+ * ep_free).
  */
 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
-	int nwait;
 	struct list_head *lsthead = &epi->pwqlist;
 	struct eppoll_entry *pwq;
 
-	/* This is called without locks, so we need the atomic exchange */
-	nwait = xchg(&epi->nwait, 0);
+	while (!list_empty(lsthead)) {
+		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
 
-	if (nwait) {
-		while (!list_empty(lsthead)) {
-			pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
+		list_del(&pwq->llink);
+		remove_wait_queue(pwq->whead, &pwq->wait);
+		kmem_cache_free(pwq_cache, pwq);
+	}
+}
 
-			list_del_init(&pwq->llink);
-			remove_wait_queue(pwq->whead, &pwq->wait);
-			kmem_cache_free(pwq_cache, pwq);
-		}
+/**
+ * ep_scan_ready_list - Scans the ready list in a way that makes possible for
+ *                      the scan code, to call f_op->poll(). Also allows for
+ *                      O(NumReady) performance.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @sproc: Pointer to the scan callback.
+ * @priv: Private opaque data passed to the @sproc callback.
+ *
+ * Returns: The same integer error code returned by the @sproc callback.
+ */
+static int ep_scan_ready_list(struct eventpoll *ep,
+			      int (*sproc)(struct eventpoll *,
+					   struct list_head *, void *),
+			      void *priv)
+{
+	int error, pwake = 0;
+	unsigned long flags;
+	struct epitem *epi, *nepi;
+	LIST_HEAD(txlist);
+
+	/*
+	 * We need to lock this because we could be hit by
+	 * eventpoll_release_file() and epoll_ctl().
+	 */
+	mutex_lock(&ep->mtx);
+
+	/*
+	 * Steal the ready list, and re-init the original one to the
+	 * empty list. Also, set ep->ovflist to NULL so that events
+	 * happening while looping w/out locks, are not lost. We cannot
+	 * have the poll callback to queue directly on ep->rdllist,
+	 * because we want the "sproc" callback to be able to do it
+	 * in a lockless way.
+	 */
+	spin_lock_irqsave(&ep->lock, flags);
+	list_splice_init(&ep->rdllist, &txlist);
+	ep->ovflist = NULL;
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	/*
+	 * Now call the callback function.
+	 */
+	error = (*sproc)(ep, &txlist, priv);
+
+	spin_lock_irqsave(&ep->lock, flags);
+	/*
+	 * During the time we spent inside the "sproc" callback, some
+	 * other events might have been queued by the poll callback.
+	 * We re-insert them inside the main ready-list here.
+	 */
+	for (nepi = ep->ovflist; (epi = nepi) != NULL;
+	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+		/*
+		 * We need to check if the item is already in the list.
+		 * During the "sproc" callback execution time, items are
+		 * queued into ->ovflist but the "txlist" might already
+		 * contain them, and the list_splice() below takes care of them.
+		 */
+		if (!ep_is_linked(&epi->rdllink))
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+	}
+	/*
+	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+	 * releasing the lock, events will be queued in the normal way inside
+	 * ep->rdllist.
+	 */
+	ep->ovflist = EP_UNACTIVE_PTR;
+
+	/*
+	 * Quickly re-inject items left on "txlist".
+	 */
+	list_splice(&txlist, &ep->rdllist);
+
+	if (!list_empty(&ep->rdllist)) {
+		/*
+		 * Wake up (if active) both the eventpoll wait list and
+		 * the ->poll() wait list (delayed after we release the lock).
+		 */
+		if (waitqueue_active(&ep->wq))
+			wake_up_locked(&ep->wq);
+		if (waitqueue_active(&ep->poll_wait))
+			pwake++;
 	}
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	mutex_unlock(&ep->mtx);
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&ep->poll_wait);
+
+	return error;
 }
 
 /*
@@ -417,10 +546,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	ep_unregister_pollwait(ep, epi);
 
 	/* Remove the current item from the list of epoll hooks */
-	spin_lock(&file->f_ep_lock);
+	spin_lock(&file->f_lock);
 	if (ep_is_linked(&epi->fllink))
 		list_del_init(&epi->fllink);
-	spin_unlock(&file->f_ep_lock);
+	spin_unlock(&file->f_lock);
 
 	rb_erase(&epi->rbn, &ep->rbr);
 
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	atomic_dec(&ep->user->epoll_watches);
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
-		     current, ep, file));
-
 	return 0;
 }
 
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
-		ep_poll_safewake(&psw, &ep->poll_wait);
+		ep_poll_safewake(&ep->poll_wait);
 
 	/*
 	 * We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	if (ep)
 		ep_free(ep);
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
 	return 0;
 }
 
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv)
+{
+	struct epitem *epi, *tmp;
+
+	list_for_each_entry_safe(epi, tmp, head, rdllink) {
+		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		    epi->event.events)
+			return POLLIN | POLLRDNORM;
+		else {
+			/*
+			 * Item has been dropped into the ready list by the poll
+			 * callback, but it's not actually ready, as far as
+			 * caller requested events goes. We can remove it here.
+			 */
+			list_del_init(&epi->rdllink);
+		}
+	}
+
+	return 0;
+}
+
+static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
+{
+	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+}
+
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
-	unsigned int pollflags = 0;
-	unsigned long flags;
+	int pollflags;
 	struct eventpoll *ep = file->private_data;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
 
-	/* Check our condition */
-	spin_lock_irqsave(&ep->lock, flags);
-	if (!list_empty(&ep->rdllist))
-		pollflags = POLLIN | POLLRDNORM;
-	spin_unlock_irqrestore(&ep->lock, flags);
+	/*
+	 * Proceed to find out if wanted events are really available inside
+	 * the ready list. This need to be done under ep_call_nested()
+	 * supervision, since the call to f_op->poll() done on listed files
+	 * could re-enter here.
+	 */
+	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+				   ep_poll_readyevents_proc, ep, ep);
 
-	return pollflags;
+	return pollflags != -1 ? pollflags : 0;
 }
 
 /* File callbacks that implement the eventpoll file behaviour */
@@ -538,15 +692,17 @@ void eventpoll_release_file(struct file *file)
 	struct epitem *epi;
 
 	/*
-	 * We don't want to get "file->f_ep_lock" because it is not
+	 * We don't want to get "file->f_lock" because it is not
 	 * necessary. It is not necessary because we're in the "struct file"
 	 * cleanup path, and this means that noone is using this file anymore.
-	 * So, for example, epoll_ctl() cannot hit here sicne if we reach this
+	 * So, for example, epoll_ctl() cannot hit here since if we reach this
 	 * point, the file counter already went to zero and fget() would fail.
 	 * The only hit might come from ep_free() but by holding the mutex
 	 * will correctly serialize the operation. We do need to acquire
 	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
 	 * from anywhere but ep_free().
+	 *
+	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
 
@@ -586,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
 
 	*pep = ep;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
-		     current, ep));
 	return 0;
 
 free_uid:
@@ -621,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 		}
 	}
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
-		     current, file, epir));
-
 	return epir;
 }
 
@@ -639,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
-		     current, epi->ffd.file, epi, ep));
-
 	spin_lock_irqsave(&ep->lock, flags);
 
 	/*
@@ -654,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 		goto out_unlock;
 
 	/*
+	 * Check the events coming with the callback. At this stage, not
+	 * every device reports the events in the "key" parameter of the
+	 * callback. We need to be able to handle both cases here, hence the
+	 * test for "key" != NULL before the event match test.
+	 */
+	if (key && !((unsigned long) key & epi->event.events))
+		goto out_unlock;
+
+	/*
 	 * If we are trasfering events to userspace, we can hold no locks
 	 * (because we're accessing user memory, and because of linux f_op->poll()
 	 * semantics). All the events that happens during that period of time are
@@ -668,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	}
 
 	/* If this file is already in the ready list we exit soon */
-	if (ep_is_linked(&epi->rdllink))
-		goto is_linked;
-
-	list_add_tail(&epi->rdllink, &ep->rdllist);
+	if (!ep_is_linked(&epi->rdllink))
+		list_add_tail(&epi->rdllink, &ep->rdllist);
 
-is_linked:
 	/*
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 	 * wait list.
@@ -688,7 +842,7 @@ out_unlock:
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
+		ep_poll_safewake(&ep->poll_wait);
 
 	return 1;
 }
@@ -785,9 +939,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		goto error_unregister;
 
 	/* Add the current item to the list of active epoll hook for this file */
-	spin_lock(&tfile->f_ep_lock);
+	spin_lock(&tfile->f_lock);
 	list_add_tail(&epi->fllink, &tfile->f_ep_links);
-	spin_unlock(&tfile->f_ep_lock);
+	spin_unlock(&tfile->f_lock);
 
 	/*
 	 * Add the current item to the RB tree. All RB tree operations are
@@ -815,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
-		     current, ep, tfile, fd));
+		ep_poll_safewake(&ep->poll_wait);
 
 	return 0;
 
@@ -849,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 {
 	int pwake = 0;
 	unsigned int revents;
-	unsigned long flags;
 
 	/*
-	 * Set the new event interest mask before calling f_op->poll(), otherwise
-	 * a potential race might occur. In fact if we do this operation inside
-	 * the lock, an event might happen between the f_op->poll() call and the
-	 * new event set registering.
+	 * Set the new event interest mask before calling f_op->poll();
+	 * otherwise we might miss an event that happens between the
+	 * f_op->poll() call and the new event set registering.
 	 */
 	epi->event.events = event->events;
+	epi->event.data = event->data; /* protected by mtx */
 
 	/*
 	 * Get current event bits. We can safely use the file* here because
@@ -865,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 */
 	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 
-	spin_lock_irqsave(&ep->lock, flags);
-
-	/* Copy the data member from inside the lock */
-	epi->event.data = event->data;
-
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
 	 * list, push it inside.
 	 */
 	if (revents & event->events) {
+		spin_lock_irq(&ep->lock);
 		if (!ep_is_linked(&epi->rdllink)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 
@@ -884,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 			if (waitqueue_active(&ep->poll_wait))
 				pwake++;
 		}
+		spin_unlock_irq(&ep->lock);
 	}
-	spin_unlock_irqrestore(&ep->lock, flags);
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
+		ep_poll_safewake(&ep->poll_wait);
 
 	return 0;
 }
 
-static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
-			  int maxevents)
+static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv)
 {
-	int eventcnt, error = -EFAULT, pwake = 0;
+	struct ep_send_events_data *esed = priv;
+	int eventcnt;
 	unsigned int revents;
-	unsigned long flags;
-	struct epitem *epi, *nepi;
-	struct list_head txlist;
-
-	INIT_LIST_HEAD(&txlist);
-
-	/*
-	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
-	 */
-	mutex_lock(&ep->mtx);
-
-	/*
-	 * Steal the ready list, and re-init the original one to the
-	 * empty list. Also, set ep->ovflist to NULL so that events
-	 * happening while looping w/out locks, are not lost. We cannot
-	 * have the poll callback to queue directly on ep->rdllist,
-	 * because we are doing it in the loop below, in a lockless way.
-	 */
-	spin_lock_irqsave(&ep->lock, flags);
-	list_splice(&ep->rdllist, &txlist);
-	INIT_LIST_HEAD(&ep->rdllist);
-	ep->ovflist = NULL;
-	spin_unlock_irqrestore(&ep->lock, flags);
+	struct epitem *epi;
+	struct epoll_event __user *uevent;
 
 	/*
-	 * We can loop without lock because this is a task private list.
-	 * We just splice'd out the ep->rdllist in ep_collect_ready_items().
-	 * Items cannot vanish during the loop because we are holding "mtx".
+	 * We can loop without lock because we are passed a task private list.
+	 * Items cannot vanish during the loop because ep_scan_ready_list() is
+	 * holding "mtx" during this call.
 	 */
-	for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
-		epi = list_first_entry(&txlist, struct epitem, rdllink);
+	for (eventcnt = 0, uevent = esed->events;
+	     !list_empty(head) && eventcnt < esed->maxevents;) {
+		epi = list_first_entry(head, struct epitem, rdllink);
 
 		list_del_init(&epi->rdllink);
 
-		/*
-		 * Get the ready file event set. We can safely use the file
-		 * because we are holding the "mtx" and this will guarantee
-		 * that both the file and the item will not vanish.
-		 */
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
-		revents &= epi->event.events;
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+			epi->event.events;
 
 		/*
-		 * Is the event mask intersect the caller-requested one,
-		 * deliver the event to userspace. Again, we are holding
-		 * "mtx", so no operations coming from userspace can change
-		 * the item.
+		 * If the event mask intersect the caller-requested one,
+		 * deliver the event to userspace. Again, ep_scan_ready_list()
+		 * is holding "mtx", so no operations coming from userspace
+		 * can change the item.
 		 */
 		if (revents) {
-			if (__put_user(revents,
-				       &events[eventcnt].events) ||
-			    __put_user(epi->event.data,
-				       &events[eventcnt].data))
-				goto errxit;
+			if (__put_user(revents, &uevent->events) ||
+			    __put_user(epi->event.data, &uevent->data)) {
+				list_add(&epi->rdllink, head);
+				return eventcnt ? eventcnt : -EFAULT;
+			}
+			eventcnt++;
+			uevent++;
 			if (epi->event.events & EPOLLONESHOT)
 				epi->event.events &= EP_PRIVATE_BITS;
-			eventcnt++;
+			else if (!(epi->event.events & EPOLLET)) {
+				/*
+				 * If this file has been added with Level
+				 * Trigger mode, we need to insert back inside
+				 * the ready list, so that the next call to
+				 * epoll_wait() will check again the events
+				 * availability. At this point, noone can insert
+				 * into ep->rdllist besides us. The epoll_ctl()
+				 * callers are locked out by
+				 * ep_scan_ready_list() holding "mtx" and the
+				 * poll callback will queue them in ep->ovflist.
+				 */
+				list_add_tail(&epi->rdllink, &ep->rdllist);
+			}
 		}
-		/*
-		 * At this point, noone can insert into ep->rdllist besides
-		 * us. The epoll_ctl() callers are locked out by us holding
-		 * "mtx" and the poll callback will queue them in ep->ovflist.
-		 */
-		if (!(epi->event.events & EPOLLET) &&
-		    (revents & epi->event.events))
-			list_add_tail(&epi->rdllink, &ep->rdllist);
-	}
-	error = 0;
-
-errxit:
-
-	spin_lock_irqsave(&ep->lock, flags);
-	/*
-	 * During the time we spent in the loop above, some other events
-	 * might have been queued by the poll callback. We re-insert them
-	 * inside the main ready-list here.
-	 */
-	for (nepi = ep->ovflist; (epi = nepi) != NULL;
-	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
-		/*
-		 * If the above loop quit with errors, the epoll item might still
-		 * be linked to "txlist", and the list_splice() done below will
-		 * take care of those cases.
-		 */
-		if (!ep_is_linked(&epi->rdllink))
-			list_add_tail(&epi->rdllink, &ep->rdllist);
 	}
-	/*
-	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
-	 * releasing the lock, events will be queued in the normal way inside
-	 * ep->rdllist.
-	 */
-	ep->ovflist = EP_UNACTIVE_PTR;
 
-	/*
-	 * In case of error in the event-send loop, or in case the number of
-	 * ready events exceeds the userspace limit, we need to splice the
-	 * "txlist" back inside ep->rdllist.
-	 */
-	list_splice(&txlist, &ep->rdllist);
-
-	if (!list_empty(&ep->rdllist)) {
-		/*
-		 * Wake up (if active) both the eventpoll wait list and the ->poll()
-		 * wait list (delayed after we release the lock).
-		 */
-		if (waitqueue_active(&ep->wq))
-			wake_up_locked(&ep->wq);
-		if (waitqueue_active(&ep->poll_wait))
-			pwake++;
-	}
-	spin_unlock_irqrestore(&ep->lock, flags);
+	return eventcnt;
+}
 
-	mutex_unlock(&ep->mtx);
+static int ep_send_events(struct eventpoll *ep,
+			  struct epoll_event __user *events, int maxevents)
+{
+	struct ep_send_events_data esed;
 
-	/* We have to call this outside the lock */
-	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
+	esed.maxevents = maxevents;
+	esed.events = events;
 
-	return eventcnt == 0 ? error: eventcnt;
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1031,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	wait_queue_t wait;
 
 	/*
-	 * Calculate the timeout by checking for the "infinite" value ( -1 )
+	 * Calculate the timeout by checking for the "infinite" value (-1)
 	 * and the overflow condition. The passed timeout is in milliseconds,
 	 * that why (t * HZ) / 1000.
 	 */
@@ -1074,9 +1162,8 @@ retry:
 
 		set_current_state(TASK_RUNNING);
 	}
-
 	/* Is it worth to try to dig for events ? */
-	eavail = !list_empty(&ep->rdllist);
+	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
@@ -1097,41 +1184,30 @@ retry:
  */
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
-	int error, fd = -1;
-	struct eventpoll *ep;
+	int error;
+	struct eventpoll *ep = NULL;
 
 	/* Check the EPOLL_* constant for consistency.  */
 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
 
 	if (flags & ~EPOLL_CLOEXEC)
 		return -EINVAL;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
-		     current, flags));
-
 	/*
-	 * Create the internal data structure ( "struct eventpoll" ).
+	 * Create the internal data structure ("struct eventpoll").
 	 */
 	error = ep_alloc(&ep);
-	if (error < 0) {
-		fd = error;
-		goto error_return;
-	}
-
+	if (error < 0)
+		return error;
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure and a free file descriptor.
 	 */
-	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
-			      flags & O_CLOEXEC);
-	if (fd < 0)
+	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+				 flags & O_CLOEXEC);
+	if (error < 0)
 		ep_free(ep);
 
-error_return:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, flags, fd));
-
-	return fd;
+	return error;
 }
 
 SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1156,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	struct epitem *epi;
 	struct epoll_event epds;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
-		     current, epfd, op, fd, event));
-
 	error = -EFAULT;
 	if (ep_op_has_event(op) &&
 	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1209,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds.events |= POLLERR | POLLHUP;
-
 			error = ep_insert(ep, &epds, tfile, fd);
 		} else
 			error = -EEXIST;
@@ -1235,8 +1307,6 @@ error_tgt_fput:
 error_fput:
 	fput(file);
 error_return:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
-		     current, epfd, op, fd, event, error));
 
 	return error;
 }
@@ -1252,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 	struct file *file;
 	struct eventpoll *ep;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
-		     current, epfd, events, maxevents, timeout));
-
 	/* The maximum number of event must be greater than zero */
 	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
 		return -EINVAL;
@@ -1291,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 error_fput:
 	fput(file);
 error_return:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
-		     current, epfd, events, maxevents, timeout, error));
 
 	return error;
 }
@@ -1357,17 +1422,18 @@ static int __init eventpoll_init(void)
 		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
-	ep_poll_safewake_init(&psw);
+	ep_nested_calls_init(&poll_safewake_ncalls);
+
+	/* Initialize the structure used to perform file's f_op->poll() calls */
+	ep_nested_calls_init(&poll_readywalk_ncalls);
 
 	/* Allocates slab cache used to allocate "struct epitem" items */
 	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
-			0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
-			NULL);
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
 	/* Allocates slab cache used to allocate "struct eppoll_entry" */
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
-			sizeof(struct eppoll_entry), 0,
-			EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
+			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
 
 	return 0;
 }
diff --git a/fs/exec.c b/fs/exec.c
index 929b58004b7e..052a961e41aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -45,6 +45,7 @@
 #include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
@@ -52,6 +53,7 @@
 #include <linux/tracehook.h>
 #include <linux/kmod.h>
 #include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -127,6 +129,9 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 				 MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
+	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
+	if (error)
+		goto exit;
 
 	file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
 	error = PTR_ERR(file);
@@ -674,6 +679,9 @@ struct file *open_exec(const char *name)
 	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
 	if (err)
 		goto out_path_put;
+	err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
+	if (err)
+		goto out_path_put;
 
 	file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
 	if (IS_ERR(file))
@@ -1049,32 +1057,35 @@ EXPORT_SYMBOL(install_exec_creds);
  * - the caller must hold current->cred_exec_mutex to protect against
  *   PTRACE_ATTACH
  */
-void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
+int check_unsafe_exec(struct linux_binprm *bprm)
 {
 	struct task_struct *p = current, *t;
 	unsigned long flags;
-	unsigned n_fs, n_files, n_sighand;
+	unsigned n_fs;
+	int res = 0;
 
 	bprm->unsafe = tracehook_unsafe_exec(p);
 
 	n_fs = 1;
-	n_files = 1;
-	n_sighand = 1;
+	write_lock(&p->fs->lock);
 	lock_task_sighand(p, &flags);
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
 		if (t->fs == p->fs)
 			n_fs++;
-		if (t->files == files)
-			n_files++;
-		n_sighand++;
 	}
 
-	if (atomic_read(&p->fs->count) > n_fs ||
-	    atomic_read(&p->files->count) > n_files ||
-	    atomic_read(&p->sighand->count) > n_sighand)
+	if (p->fs->users > n_fs) {
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
+	} else {
+		if (p->fs->in_exec)
+			res = -EAGAIN;
+		p->fs->in_exec = 1;
+	}
 
 	unlock_task_sighand(p, &flags);
+	write_unlock(&p->fs->lock);
+
+	return res;
 }
 
 /* 
@@ -1184,6 +1195,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = security_bprm_check(bprm);
 	if (retval)
 		return retval;
+	retval = ima_bprm_check(bprm);
+	if (retval)
+		return retval;
 
 	/* kernel module loader fixup */
 	/* so we don't try to load run modprobe in kernel space. */
@@ -1284,17 +1298,21 @@ int do_execve(char * filename,
 	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
 	if (retval < 0)
 		goto out_free;
+	current->in_execve = 1;
 
 	retval = -ENOMEM;
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm, displaced);
+
+	retval = check_unsafe_exec(bprm);
+	if (retval)
+		goto out_unlock;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_unlock;
+		goto out_unmark;
 
 	sched_exec();
 
@@ -1337,6 +1355,10 @@ int do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
@@ -1354,7 +1376,13 @@ out_file:
 		fput(bprm->file);
 	}
 
+out_unmark:
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
+
 out_unlock:
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 
 out_free:
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
+- Out-of-space may cause a severe problem if the object (and directory entry)
+  were written, but the inode attributes failed. Then if the filesystem was
+  unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..cc2d22db119c
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
+#
+# Kbuild for the EXOFS module
+#
+# Copyright (C) 2008 Panasas Inc.  All rights reserved.
+#
+# Authors:
+#   Boaz Harrosh <bharrosh@panasas.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2
+#
+# Kbuild - Gets included from the Kernels Makefile and build system
+#
+
+exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
+config EXOFS_FS
+	tristate "exofs: OSD based file system support"
+	depends on SCSI_OSD_ULD
+	help
+	  EXOFS is a file system that uses an OSD storage device,
+	  as its backing storage.
+
+# Debugging-related stuff
+config EXOFS_DEBUG
+	bool "Enable debugging"
+	depends on EXOFS_FS
+	help
+	  This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
+/*
+ * common.h - Common definitions for both Kernel and user-mode utilities
+ *
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include <linux/types.h>
+
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID   0x10000	/* Smallest partition ID */
+#define EXOFS_OBJ_OFF	0x10000	/* offset for objects */
+#define EXOFS_SUPER_ID	0x10000	/* object ID for on-disk superblock */
+#define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+# define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA	1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number.  This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+	EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+					(1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+	EXOFS_MAX_ID	 = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT	12
+#define EXOFS_BLKSIZE	(1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC	0x5DF5
+
+/*
+ * The file system control block - stored in an object's data (mainly, the one
+ * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
+ * on disk.  Right now it just has a magic value, which is basically a sanity
+ * check on our ability to communicate with the object store.
+ */
+struct exofs_fscb {
+	__le64  s_nextid;	/* Highest object ID used */
+	__le32  s_numfiles;	/* Number of files on fs */
+	__le16  s_magic;	/* Magic signature */
+	__le16  s_newfs;	/* Non-zero if this is a new fs */
+};
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA		5
+
+/*
+ * The file control block - stored in an object's attributes.  This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+	__le64  i_size;			/* Size of the file */
+	__le16  i_mode;         	/* File mode */
+	__le16  i_links_count;  	/* Links count */
+	__le32  i_uid;          	/* Owner Uid */
+	__le32  i_gid;          	/* Group Id */
+	__le32  i_atime;        	/* Access time */
+	__le32  i_ctime;        	/* Creation time */
+	__le32  i_mtime;        	/* Modification time */
+	__le32  i_flags;        	/* File flags (unused for now)*/
+	__le32  i_generation;   	/* File version (for NFS) */
+	__le32  i_data[EXOFS_IDATA];	/* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE	sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DATA,
+	EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN	255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+	__le64		inode_no;		/* inode number           */
+	__le16		rec_len;		/* directory entry length */
+	u8		name_len;		/* name length            */
+	u8		file_type;		/* umm...file type        */
+	char		name[EXOFS_NAME_LEN];	/* file name              */
+};
+
+enum {
+	EXOFS_FT_UNKNOWN,
+	EXOFS_FT_REG_FILE,
+	EXOFS_FT_DIR,
+	EXOFS_FT_CHRDEV,
+	EXOFS_FT_BLKDEV,
+	EXOFS_FT_FIFO,
+	EXOFS_FT_SOCK,
+	EXOFS_FT_SYMLINK,
+	EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD			4
+#define EXOFS_DIR_ROUND			(EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
+	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*************************
+ * function declarations *
+ *************************/
+/* osd.c                 */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+			   const struct osd_obj_id *obj);
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
+static inline int exofs_check_ok(struct osd_request *or)
+{
+	return exofs_check_ok_resid(or, NULL, NULL);
+}
+int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
+int exofs_async_op(struct osd_request *or,
+	osd_req_done_fn *async_done, void *caller_context, u8 *cred);
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
+
+int osd_req_read_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+int osd_req_write_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..65b0c8c776a1
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline unsigned exofs_chunk_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static inline void exofs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+/* Accesses dir's inode->i_size must be called under inode lock */
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	loff_t last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
+}
+
+static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+
+	dir->i_version++;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+	set_page_dirty(page);
+
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+
+	return err;
+}
+
+static void exofs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct exofs_dir_entry *p;
+	char *error;
+
+	/* if the page is the last one in the directory */
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (chunk_size - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct exofs_dir_entry *)(kaddr + offs);
+		rec_len = le16_to_cpu(p->rec_len);
+
+		if (rec_len < EXOFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+			goto Espan;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+Ebadsize:
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"size of directory #%lu is not a multiple of chunk size",
+		dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+bad_entry:
+	EXOFS_ERR(
+		"ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
+		"offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+		dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)),
+		rec_len, p->name_len);
+	goto fail;
+Eend:
+	p = (struct exofs_dir_entry *)(kaddr + offs);
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"entry in directory #%lu spans the page boundary"
+		"offset=%lu, inode=%llu",
+		dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)));
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+static struct page *exofs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			exofs_check_page(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	exofs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+static inline int exofs_match(int len, const unsigned char *name,
+					struct exofs_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode_no)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+static inline
+struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
+{
+	return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+}
+
+static inline unsigned
+exofs_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
+	struct exofs_dir_entry *p =
+			(struct exofs_dir_entry *)(base + (offset&mask));
+	while ((char *)p < (char *)de) {
+		if (p->rec_len == 0)
+			break;
+		p = exofs_next_entry(p);
+	}
+	return (char *)p - base;
+}
+
+static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
+	[EXOFS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[EXOFS_FT_REG_FILE]	= DT_REG,
+	[EXOFS_FT_DIR]		= DT_DIR,
+	[EXOFS_FT_CHRDEV]	= DT_CHR,
+	[EXOFS_FT_BLKDEV]	= DT_BLK,
+	[EXOFS_FT_FIFO]		= DT_FIFO,
+	[EXOFS_FT_SOCK]		= DT_SOCK,
+	[EXOFS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXOFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXOFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXOFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXOFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXOFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXOFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXOFS_FT_SYMLINK,
+};
+
+static inline
+void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+	de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static int
+exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	loff_t pos = filp->f_pos;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = dir_pages(inode);
+	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
+	unsigned char *types = NULL;
+	int need_revalidate = (filp->f_version != inode->i_version);
+
+	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
+		return 0;
+
+	types = exofs_filetype_table;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct exofs_dir_entry *de;
+		struct page *page = exofs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			EXOFS_ERR("ERROR: "
+				   "bad page in #%lu",
+				   inode->i_ino);
+			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			return PTR_ERR(page);
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = exofs_validate_entry(kaddr, offset,
+								chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct exofs_dir_entry *)(kaddr + offset);
+		limit = kaddr + exofs_last_byte(inode, n) -
+							EXOFS_DIR_REC_LEN(1);
+		for (; (char *)de <= limit; de = exofs_next_entry(de)) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: "
+					"zero-length directory entry");
+				exofs_put_page(page);
+				return -EIO;
+			}
+			if (de->inode_no) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (types && de->file_type < EXOFS_FT_MAX)
+					d_type = types[de->file_type];
+
+				offset = (char *)de - kaddr;
+				over = filldir(dirent, de->name, de->name_len,
+						(n<<PAGE_CACHE_SHIFT) | offset,
+						le64_to_cpu(de->inode_no),
+						d_type);
+				if (over) {
+					exofs_put_page(page);
+					return 0;
+				}
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		exofs_put_page(page);
+	}
+
+	return 0;
+}
+
+struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
+			struct dentry *dentry, struct page **res_page)
+{
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct exofs_i_info *oi = exofs_i(dir);
+	struct exofs_dir_entry *de;
+
+	if (npages == 0)
+		goto out;
+
+	*res_page = NULL;
+
+	start = oi->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = exofs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct exofs_dir_entry *) kaddr;
+			kaddr += exofs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->rec_len == 0) {
+					EXOFS_ERR(
+						"ERROR: exofs_find_entry: "
+						"zero-length directory entry");
+					exofs_put_page(page);
+					goto out;
+				}
+				if (exofs_match(namelen, name, de))
+					goto found;
+				de = exofs_next_entry(de);
+			}
+			exofs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	oi->i_dir_start_lookup = n;
+	return de;
+}
+
+struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
+{
+	struct page *page = exofs_get_page(dir, 0);
+	struct exofs_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = exofs_next_entry(
+				(struct exofs_dir_entry *)page_address(page));
+		*p = page;
+	}
+	return de;
+}
+
+ino_t exofs_parent_ino(struct dentry *child)
+{
+	struct page *page;
+	struct exofs_dir_entry *de;
+	ino_t ino;
+
+	de = exofs_dotdot(child->d_inode, &page);
+	if (!de)
+		return 0;
+
+	ino = le64_to_cpu(de->inode_no);
+	exofs_put_page(page);
+	return ino;
+}
+
+ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct exofs_dir_entry *de;
+	struct page *page;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (de) {
+		res = le64_to_cpu(de->inode_no);
+		exofs_put_page(page);
+	}
+	return res;
+}
+
+int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
+			struct page *page, struct inode *inode)
+{
+	loff_t pos = page_offset(page) +
+			(char *) de - (char *) page_address(page);
+	unsigned len = le16_to_cpu(de->rec_len);
+	int err;
+
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	if (err)
+		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+			  err);
+
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	if (likely(!err))
+		err = exofs_commit_chunk(page, pos, len);
+	exofs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	return err;
+}
+
+int exofs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_dir_entry *de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	loff_t pos;
+	int err;
+
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = exofs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + exofs_last_byte(dir, n);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				name_len = 0;
+				rec_len = chunk_size;
+				de->rec_len = cpu_to_le16(chunk_size);
+				de->inode_no = 0;
+				goto got_it;
+			}
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_add_link: "
+					"zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (exofs_match(namelen, name, de))
+				goto out_unlock;
+			name_len = EXOFS_DIR_REC_LEN(de->name_len);
+			rec_len = le16_to_cpu(de->rec_len);
+			if (!de->inode_no && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct exofs_dir_entry *) ((char *) de + rec_len);
+		}
+		unlock_page(page);
+		exofs_put_page(page);
+	}
+
+	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) +
+		(char *)de - (char *)page_address(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
+							&page, NULL);
+	if (err)
+		goto out_unlock;
+	if (de->inode_no) {
+		struct exofs_dir_entry *de1 =
+			(struct exofs_dir_entry *)((char *)de + name_len);
+		de1->rec_len = cpu_to_le16(rec_len - name_len);
+		de->rec_len = cpu_to_le16(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	err = exofs_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	sbi->s_numfiles++;
+
+out_put:
+	exofs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	char *kaddr = page_address(page);
+	unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
+	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	loff_t pos;
+	struct exofs_dir_entry *pde = NULL;
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
+	int err;
+
+	while (de < dir) {
+		if (de->rec_len == 0) {
+			EXOFS_ERR("ERROR: exofs_delete_entry:"
+				"zero-length directory entry");
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = exofs_next_entry(de);
+	}
+	if (pde)
+		from = (char *)pde - (char *)page_address(page);
+	pos = page_offset(page) + from;
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
+							&page, NULL);
+	if (err)
+		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n",
+			  err);
+	if (pde)
+		pde->rec_len = cpu_to_le16(to - from);
+	dir->inode_no = 0;
+	if (likely(!err))
+		err = exofs_commit_chunk(page, pos, to - from);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	sbi->s_numfiles--;
+out:
+	exofs_put_page(page);
+	return err;
+}
+
+/* kept aligned on 4 bytes */
+#define THIS_DIR ".\0\0"
+#define PARENT_DIR "..\0"
+
+int exofs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
+	unsigned chunk_size = exofs_chunk_size(inode);
+	struct exofs_dir_entry *de;
+	int err;
+	void *kaddr;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
+							&page, NULL);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	de = (struct exofs_dir_entry *)kaddr;
+	de->name_len = 1;
+	de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
+	memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+
+	de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
+	de->inode_no = cpu_to_le64(parent->i_ino);
+	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
+	exofs_set_de_type(de, inode);
+	kunmap_atomic(page, KM_USER0);
+	err = exofs_commit_chunk(page, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+int exofs_empty_dir(struct inode *inode)
+{
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct exofs_dir_entry *de;
+		page = exofs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_empty_dir: "
+					  "zero-length directory entry"
+					  "kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
+			}
+			if (de->inode_no != 0) {
+				/* check for . and .. */
+				if (de->name[0] != '.')
+					goto not_empty;
+				if (de->name_len > 2)
+					goto not_empty;
+				if (de->name_len < 2) {
+					if (le64_to_cpu(de->inode_no) !=
+					    inode->i_ino)
+						goto not_empty;
+				} else if (de->name[1] != '.')
+					goto not_empty;
+			}
+			de = exofs_next_entry(de);
+		}
+		exofs_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	exofs_put_page(page);
+	return 0;
+}
+
+const struct file_operations exofs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= exofs_readdir,
+};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..0fd4c7859679
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include "common.h"
+
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
+
+#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define EXOFS_DBGMSG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+/*
+ * our extension to the in-memory superblock
+ */
+struct exofs_sb_info {
+	struct osd_dev	*s_dev;			/* returned by get_osd_dev    */
+	osd_id		s_pid;			/* partition ID of file system*/
+	int		s_timeout;		/* timeout for OSD operations */
+	uint64_t	s_nextid;		/* highest object ID used     */
+	uint32_t	s_numfiles;		/* number of files on fs      */
+	spinlock_t	s_next_gen_lock;	/* spinlock for gen # update  */
+	u32		s_next_generation;	/* next gen # to use          */
+	atomic_t	s_curr_pending;		/* number of pending commands */
+	uint8_t		s_cred[OSD_CAP_LEN];	/* all-powerful credential    */
+};
+
+/*
+ * our extension to the in-memory inode
+ */
+struct exofs_i_info {
+	unsigned long  i_flags;            /* various atomic flags            */
+	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
+	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
+	wait_queue_head_t i_wq;            /* wait queue for inode            */
+	uint64_t       i_commit_size;      /* the object's written length     */
+	uint8_t        i_cred[OSD_CAP_LEN];/* all-powerful credential         */
+	struct inode   vfs_inode;          /* normal in-memory inode          */
+};
+
+/*
+ * our inode flags
+ */
+#define OBJ_2BCREATED	0	/* object will be created soon*/
+#define OBJ_CREATED	1	/* object has been created on the osd*/
+
+static inline int obj_2bcreated(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline void set_obj_2bcreated(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline int obj_created(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+static inline void set_obj_created(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi);
+static inline int wait_obj_created(struct exofs_i_info *oi)
+{
+	if (likely(obj_created(oi)))
+		return 0;
+
+	return __exofs_wait_obj_created(oi);
+}
+
+/*
+ * get to our inode from the vfs inode
+ */
+static inline struct exofs_i_info *exofs_i(struct inode *inode)
+{
+	return container_of(inode, struct exofs_i_info, vfs_inode);
+}
+
+/*
+ * Maximum count of links to a file
+ */
+#define EXOFS_LINK_MAX           32000
+
+/*************************
+ * function declarations *
+ *************************/
+/* inode.c               */
+void exofs_truncate(struct inode *inode);
+int exofs_setattr(struct dentry *, struct iattr *);
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata);
+extern struct inode *exofs_iget(struct super_block *, unsigned long);
+struct inode *exofs_new_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, int);
+extern void exofs_delete_inode(struct inode *);
+
+/* dir.c:                */
+int exofs_add_link(struct dentry *, struct inode *);
+ino_t exofs_inode_by_name(struct inode *, struct dentry *);
+int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
+int exofs_make_empty(struct inode *, struct inode *);
+struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
+					 struct page **);
+int exofs_empty_dir(struct inode *);
+struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
+ino_t exofs_parent_ino(struct dentry *child);
+int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
+		    struct inode *);
+
+/*********************
+ * operation vectors *
+ *********************/
+/* dir.c:            */
+extern const struct file_operations exofs_dir_operations;
+
+/* file.c            */
+extern const struct inode_operations exofs_file_inode_operations;
+extern const struct file_operations exofs_file_operations;
+
+/* inode.c           */
+extern const struct address_space_operations exofs_aops;
+
+/* namei.c           */
+extern const struct inode_operations exofs_dir_inode_operations;
+extern const struct inode_operations exofs_special_inode_operations;
+
+/* symlink.c         */
+extern const struct inode_operations exofs_symlink_inode_operations;
+extern const struct inode_operations exofs_fast_symlink_inode_operations;
+
+#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/buffer_head.h>
+
+#include "exofs.h"
+
+static int exofs_release_file(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
+			    int datasync)
+{
+	int ret;
+	struct address_space *mapping = filp->f_mapping;
+
+	ret = filemap_write_and_wait(mapping);
+	if (ret)
+		return ret;
+
+	/*Note: file_fsync below also calles sync_blockdev, which is a no-op
+	 *      for exofs, but other then that it does sync_inode and
+	 *      sync_superblock which is what we need here.
+	 */
+	return file_fsync(filp, dentry, datasync);
+}
+
+static int exofs_flush(struct file *file, fl_owner_t id)
+{
+	exofs_file_fsync(file, file->f_path.dentry, 1);
+	/* TODO: Flush the OSD target */
+	return 0;
+}
+
+const struct file_operations exofs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.mmap		= generic_file_mmap,
+	.open		= generic_file_open,
+	.release	= exofs_release_file,
+	.fsync		= exofs_file_fsync,
+	.flush		= exofs_flush,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+};
+
+const struct inode_operations exofs_file_inode_operations = {
+	.truncate	= exofs_truncate,
+	.setattr	= exofs_setattr,
+};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..ba8d9fab4693
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <scsi/scsi_device.h>
+
+#include "exofs.h"
+
+#ifdef CONFIG_EXOFS_DEBUG
+#  define EXOFS_DEBUG_OBJ_ISIZE 1
+#endif
+
+struct page_collect {
+	struct exofs_sb_info *sbi;
+	struct request_queue *req_q;
+	struct inode *inode;
+	unsigned expected_pages;
+
+	struct bio *bio;
+	unsigned nr_pages;
+	unsigned long length;
+	loff_t pg_first; /* keep 64bit also in 32-arches */
+};
+
+static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
+		struct inode *inode)
+{
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
+
+	pcol->sbi = sbi;
+	pcol->req_q = req_q;
+	pcol->inode = inode;
+	pcol->expected_pages = expected_pages;
+
+	pcol->bio = NULL;
+	pcol->nr_pages = 0;
+	pcol->length = 0;
+	pcol->pg_first = -1;
+
+	EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
+		     expected_pages);
+}
+
+static void _pcol_reset(struct page_collect *pcol)
+{
+	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
+
+	pcol->bio = NULL;
+	pcol->nr_pages = 0;
+	pcol->length = 0;
+	pcol->pg_first = -1;
+	EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
+		     pcol->inode->i_ino, pcol->expected_pages);
+
+	/* this is probably the end of the loop but in writes
+	 * it might not end here. don't be left with nothing
+	 */
+	if (!pcol->expected_pages)
+		pcol->expected_pages = 128;
+}
+
+static int pcol_try_alloc(struct page_collect *pcol)
+{
+	int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+
+	for (; pages; pages >>= 1) {
+		pcol->bio = bio_alloc(GFP_KERNEL, pages);
+		if (likely(pcol->bio))
+			return 0;
+	}
+
+	EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
+		  pcol->expected_pages);
+	return -ENOMEM;
+}
+
+static void pcol_free(struct page_collect *pcol)
+{
+	bio_put(pcol->bio);
+	pcol->bio = NULL;
+}
+
+static int pcol_add_page(struct page_collect *pcol, struct page *page,
+			 unsigned len)
+{
+	int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
+	if (unlikely(len != added_len))
+		return -ENOMEM;
+
+	++pcol->nr_pages;
+	pcol->length += len;
+	return 0;
+}
+
+static int update_read_page(struct page *page, int ret)
+{
+	if (ret == 0) {
+		/* Everything is OK */
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+	} else if (ret == -EFAULT) {
+		/* In this case we were trying to read something that wasn't on
+		 * disk yet - return a page full of zeroes.  This should be OK,
+		 * because the object should be empty (if there was a write
+		 * before this read, the read would be waiting with the page
+		 * locked */
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		ret = 0; /* recovered error */
+		EXOFS_DBGMSG("recovered read error\n");
+	} else /* Error */
+		SetPageError(page);
+
+	return ret;
+}
+
+static void update_write_page(struct page *page, int ret)
+{
+	if (ret) {
+		mapping_set_error(page->mapping, ret);
+		SetPageError(page);
+	}
+	end_page_writeback(page);
+}
+
+/* Called at the end of reads, to optionally unlock pages and update their
+ * status.
+ */
+static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
+			    bool do_unlock)
+{
+	struct bio_vec *bvec;
+	int i;
+	u64 resid;
+	u64 good_bytes;
+	u64 length = 0;
+	int ret = exofs_check_ok_resid(or, &resid, NULL);
+
+	osd_end_request(or);
+
+	if (likely(!ret))
+		good_bytes = pcol->length;
+	else if (!resid)
+		good_bytes = 0;
+	else
+		good_bytes = pcol->length - resid;
+
+	EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+		     " length=0x%lx nr_pages=%u\n",
+		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+		     pcol->nr_pages);
+
+	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
+		struct page *page = bvec->bv_page;
+		struct inode *inode = page->mapping->host;
+		int page_stat;
+
+		if (inode != pcol->inode)
+			continue; /* osd might add more pages at end */
+
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
+
+		EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+			  inode->i_ino, page->index,
+			  page_stat ? "bad_bytes" : "good_bytes");
+
+		ret = update_read_page(page, page_stat);
+		if (do_unlock)
+			unlock_page(page);
+		length += bvec->bv_len;
+	}
+
+	pcol_free(pcol);
+	EXOFS_DBGMSG("readpages_done END\n");
+	return ret;
+}
+
+/* callback of async reads */
+static void readpages_done(struct osd_request *or, void *p)
+{
+	struct page_collect *pcol = p;
+
+	__readpages_done(or, pcol, true);
+	atomic_dec(&pcol->sbi->s_curr_pending);
+	kfree(p);
+}
+
+static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
+		struct page *page = bvec->bv_page;
+
+		if (rw == READ)
+			update_read_page(page, ret);
+		else
+			update_write_page(page, ret);
+
+		unlock_page(page);
+	}
+	pcol_free(pcol);
+}
+
+static int read_exec(struct page_collect *pcol, bool is_sync)
+{
+	struct exofs_i_info *oi = exofs_i(pcol->inode);
+	struct osd_obj_id obj = {pcol->sbi->s_pid,
+					pcol->inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or = NULL;
+	struct page_collect *pcol_copy = NULL;
+	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+	int ret;
+
+	if (!pcol->bio)
+		return 0;
+
+	/* see comment in _readpage() about sync reads */
+	WARN_ON(is_sync && (pcol->nr_pages != 1));
+
+	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	osd_req_read(or, &obj, pcol->bio, i_start);
+
+	if (is_sync) {
+		exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
+		return __readpages_done(or, pcol, false);
+	}
+
+	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+	if (!pcol_copy) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	*pcol_copy = *pcol;
+	ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+	if (unlikely(ret))
+		goto err;
+
+	atomic_inc(&pcol->sbi->s_curr_pending);
+
+	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+		  obj.id, _LLU(i_start), pcol->length);
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+	return 0;
+
+err:
+	if (!is_sync)
+		_unlock_pcol_pages(pcol, ret, READ);
+	kfree(pcol_copy);
+	if (or)
+		osd_end_request(or);
+	return ret;
+}
+
+/* readpage_strip is called either directly from readpage() or by the VFS from
+ * within read_cache_pages(), to add one more page to be read. It will try to
+ * collect as many contiguous pages as posible. If a discontinuity is
+ * encountered, or it runs out of resources, it will submit the previous segment
+ * and will start a new collection. Eventually caller must submit the last
+ * segment if present.
+ */
+static int readpage_strip(void *data, struct page *page)
+{
+	struct page_collect *pcol = data;
+	struct inode *inode = pcol->inode;
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	size_t len;
+	int ret;
+
+	/* FIXME: Just for debugging, will be removed */
+	if (PageUptodate(page))
+		EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
+			  page->index);
+
+	if (page->index < end_index)
+		len = PAGE_CACHE_SIZE;
+	else if (page->index == end_index)
+		len = i_size & ~PAGE_CACHE_MASK;
+	else
+		len = 0;
+
+	if (!len || !obj_created(oi)) {
+		/* this will be out of bounds, or doesn't exist yet.
+		 * Current page is cleared and the request is split
+		 */
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+
+		unlock_page(page);
+		EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
+			     " splitting\n", inode->i_ino, page->index);
+
+		return read_exec(pcol, false);
+	}
+
+try_again:
+
+	if (unlikely(pcol->pg_first == -1)) {
+		pcol->pg_first = page->index;
+	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+		   page->index)) {
+		/* Discontinuity detected, split the request */
+		ret = read_exec(pcol, false);
+		if (unlikely(ret))
+			goto fail;
+		goto try_again;
+	}
+
+	if (!pcol->bio) {
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			goto fail;
+	}
+
+	if (len != PAGE_CACHE_SIZE)
+		zero_user(page, len, PAGE_CACHE_SIZE - len);
+
+	EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+		     inode->i_ino, page->index, len);
+
+	ret = pcol_add_page(pcol, page, len);
+	if (ret) {
+		EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+			  "this_len=0x%zx nr_pages=%u length=0x%lx\n",
+			  page, len, pcol->nr_pages, pcol->length);
+
+		/* split the request, and start again with current page */
+		ret = read_exec(pcol, false);
+		if (unlikely(ret))
+			goto fail;
+
+		goto try_again;
+	}
+
+	return 0;
+
+fail:
+	/* SetPageError(page); ??? */
+	unlock_page(page);
+	return ret;
+}
+
+static int exofs_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, nr_pages, mapping->host);
+
+	ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
+	if (ret) {
+		EXOFS_ERR("read_cache_pages => %d\n", ret);
+		return ret;
+	}
+
+	return read_exec(&pcol, false);
+}
+
+static int _readpage(struct page *page, bool is_sync)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, 1, page->mapping->host);
+
+	/* readpage_strip might call read_exec(,async) inside at several places
+	 * but this is safe for is_async=0 since read_exec will not do anything
+	 * when we have a single page.
+	 */
+	ret = readpage_strip(&pcol, page);
+	if (ret) {
+		EXOFS_ERR("_readpage => %d\n", ret);
+		return ret;
+	}
+
+	return read_exec(&pcol, is_sync);
+}
+
+/*
+ * We don't need the file
+ */
+static int exofs_readpage(struct file *file, struct page *page)
+{
+	return _readpage(page, false);
+}
+
+/* Callback for osd_write. All writes are asynchronouse */
+static void writepages_done(struct osd_request *or, void *p)
+{
+	struct page_collect *pcol = p;
+	struct bio_vec *bvec;
+	int i;
+	u64 resid;
+	u64  good_bytes;
+	u64  length = 0;
+
+	int ret = exofs_check_ok_resid(or, NULL, &resid);
+
+	osd_end_request(or);
+	atomic_dec(&pcol->sbi->s_curr_pending);
+
+	if (likely(!ret))
+		good_bytes = pcol->length;
+	else if (!resid)
+		good_bytes = 0;
+	else
+		good_bytes = pcol->length - resid;
+
+	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+		     " length=0x%lx nr_pages=%u\n",
+		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+		     pcol->nr_pages);
+
+	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
+		struct page *page = bvec->bv_page;
+		struct inode *inode = page->mapping->host;
+		int page_stat;
+
+		if (inode != pcol->inode)
+			continue; /* osd might add more pages to a bio */
+
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
+
+		update_write_page(page, page_stat);
+		unlock_page(page);
+		EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+			     inode->i_ino, page->index, page_stat);
+
+		length += bvec->bv_len;
+	}
+
+	pcol_free(pcol);
+	kfree(pcol);
+	EXOFS_DBGMSG("writepages_done END\n");
+}
+
+static int write_exec(struct page_collect *pcol)
+{
+	struct exofs_i_info *oi = exofs_i(pcol->inode);
+	struct osd_obj_id obj = {pcol->sbi->s_pid,
+					pcol->inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or = NULL;
+	struct page_collect *pcol_copy = NULL;
+	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+	int ret;
+
+	if (!pcol->bio)
+		return 0;
+
+	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+	if (!pcol_copy) {
+		EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	*pcol_copy = *pcol;
+
+	osd_req_write(or, &obj, pcol_copy->bio, i_start);
+	ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+	if (unlikely(ret)) {
+		EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
+		goto err;
+	}
+
+	atomic_inc(&pcol->sbi->s_curr_pending);
+	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+		  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+		  pcol->length);
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+	return 0;
+
+err:
+	_unlock_pcol_pages(pcol, ret, WRITE);
+	kfree(pcol_copy);
+	if (or)
+		osd_end_request(or);
+	return ret;
+}
+
+/* writepage_strip is called either directly from writepage() or by the VFS from
+ * within write_cache_pages(), to add one more page to be written to storage.
+ * It will try to collect as many contiguous pages as possible. If a
+ * discontinuity is encountered or it runs out of resources it will submit the
+ * previous segment and will start a new collection.
+ * Eventually caller must submit the last segment if present.
+ */
+static int writepage_strip(struct page *page,
+			   struct writeback_control *wbc_unused, void *data)
+{
+	struct page_collect *pcol = data;
+	struct inode *inode = pcol->inode;
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	size_t len;
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+
+	ret = wait_obj_created(oi);
+	if (unlikely(ret))
+		goto fail;
+
+	if (page->index < end_index)
+		/* in this case, the page is within the limits of the file */
+		len = PAGE_CACHE_SIZE;
+	else {
+		len = i_size & ~PAGE_CACHE_MASK;
+
+		if (page->index > end_index || !len) {
+			/* in this case, the page is outside the limits
+			 * (truncate in progress)
+			 */
+			ret = write_exec(pcol);
+			if (unlikely(ret))
+				goto fail;
+			if (PageError(page))
+				ClearPageError(page);
+			unlock_page(page);
+			return 0;
+		}
+	}
+
+try_again:
+
+	if (unlikely(pcol->pg_first == -1)) {
+		pcol->pg_first = page->index;
+	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+		   page->index)) {
+		/* Discontinuity detected, split the request */
+		ret = write_exec(pcol);
+		if (unlikely(ret))
+			goto fail;
+		goto try_again;
+	}
+
+	if (!pcol->bio) {
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			goto fail;
+	}
+
+	EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+		     inode->i_ino, page->index, len);
+
+	ret = pcol_add_page(pcol, page, len);
+	if (unlikely(ret)) {
+		EXOFS_DBGMSG("Failed pcol_add_page "
+			     "nr_pages=%u total_length=0x%lx\n",
+			     pcol->nr_pages, pcol->length);
+
+		/* split the request, next loop will start again */
+		ret = write_exec(pcol);
+		if (unlikely(ret)) {
+			EXOFS_DBGMSG("write_exec faild => %d", ret);
+			goto fail;
+		}
+
+		goto try_again;
+	}
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	return 0;
+
+fail:
+	set_bit(AS_EIO, &page->mapping->flags);
+	unlock_page(page);
+	return ret;
+}
+
+static int exofs_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc)
+{
+	struct page_collect pcol;
+	long start, end, expected_pages;
+	int ret;
+
+	start = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end = (wbc->range_end == LLONG_MAX) ?
+			start + mapping->nrpages :
+			wbc->range_end >> PAGE_CACHE_SHIFT;
+
+	if (start || end)
+		expected_pages = min(end - start + 1, 32L);
+	else
+		expected_pages = mapping->nrpages;
+
+	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
+		     " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
+		     mapping->nrpages, start, end);
+
+	_pcol_init(&pcol, expected_pages, mapping->host);
+
+	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
+	if (ret) {
+		EXOFS_ERR("write_cache_pages => %d\n", ret);
+		return ret;
+	}
+
+	return write_exec(&pcol);
+}
+
+static int exofs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, 1, page->mapping->host);
+
+	ret = writepage_strip(page, NULL, &pcol);
+	if (ret) {
+		EXOFS_ERR("exofs_writepage => %d\n", ret);
+		return ret;
+	}
+
+	return write_exec(&pcol);
+}
+
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	int ret = 0;
+	struct page *page;
+
+	page = *pagep;
+	if (page == NULL) {
+		ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
+					 fsdata);
+		if (ret) {
+			EXOFS_DBGMSG("simple_write_begin faild\n");
+			return ret;
+		}
+
+		page = *pagep;
+	}
+
+	 /* read modify write */
+	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+		ret = _readpage(page, true);
+		if (ret) {
+			/*SetPageError was done by _readpage. Is it ok?*/
+			unlock_page(page);
+			EXOFS_DBGMSG("__readpage_filler faild\n");
+		}
+	}
+
+	return ret;
+}
+
+static int exofs_write_begin_export(struct file *file,
+		struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	*pagep = NULL;
+
+	return exofs_write_begin(file, mapping, pos, len, flags, pagep,
+					fsdata);
+}
+
+const struct address_space_operations exofs_aops = {
+	.readpage	= exofs_readpage,
+	.readpages	= exofs_readpages,
+	.writepage	= exofs_writepage,
+	.writepages	= exofs_writepages,
+	.write_begin	= exofs_write_begin_export,
+	.write_end	= simple_write_end,
+};
+
+/******************************************************************************
+ * INODE OPERATIONS
+ *****************************************************************************/
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int exofs_inode_is_fast_symlink(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+
+	return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
+}
+
+/*
+ * get_block_t - Fill in a buffer_head
+ * An OSD takes care of block allocation so we just fake an allocation by
+ * putting in the inode's sector_t in the buffer_head.
+ * TODO: What about the case of create==0 and @iblock does not exist in the
+ * object?
+ */
+static int exofs_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
+{
+	map_bh(bh_result, inode->i_sb, iblock);
+	return 0;
+}
+
+const struct osd_attr g_attr_logical_length = ATTR_DEF(
+	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+/*
+ * Truncate a file to the specified size - all we have to do is set the size
+ * attribute.  We make sure the object exists first.
+ */
+void exofs_truncate(struct inode *inode)
+{
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or;
+	struct osd_attr attr;
+	loff_t isize = i_size_read(inode);
+	__be64 newsize;
+	int ret;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+	     || S_ISLNK(inode->i_mode)))
+		return;
+	if (exofs_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
+		goto fail;
+	}
+
+	osd_req_set_attributes(or, &obj);
+
+	newsize = cpu_to_be64((u64)isize);
+	attr = g_attr_logical_length;
+	attr.val_ptr = &newsize;
+	osd_req_add_set_attr_list(or, &attr, 1);
+
+	/* if we are about to truncate an object, and it hasn't been
+	 * created yet, wait
+	 */
+	if (unlikely(wait_obj_created(oi)))
+		goto fail;
+
+	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+	osd_end_request(or);
+	if (ret)
+		goto fail;
+
+out:
+	mark_inode_dirty(inode);
+	return;
+fail:
+	make_bad_inode(inode);
+	goto out;
+}
+
+/*
+ * Set inode attributes - just call generic functions.
+ */
+int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	error = inode_setattr(inode, iattr);
+	return error;
+}
+
+/*
+ * Read an inode from the OSD, and return it as is.  We also return the size
+ * attribute in the 'sanity' argument if we got compiled with debugging turned
+ * on.
+ */
+static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
+		    struct exofs_fcb *inode, uint64_t *sanity)
+{
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_request *or;
+	struct osd_attr attr;
+	struct osd_obj_id obj = {sbi->s_pid,
+				 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
+	int ret;
+
+	exofs_make_credential(oi->i_cred, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
+		return -ENOMEM;
+	}
+	osd_req_get_attributes(or, &obj);
+
+	/* we need the inode attribute */
+	osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+	/* we get the size attributes to do a sanity check */
+	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+#endif
+
+	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+	if (ret)
+		goto out;
+
+	attr = g_attr_inode_data;
+	ret = extract_attr_from_req(or, &attr);
+	if (ret) {
+		EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+		goto out;
+	}
+
+	WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
+	memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+	attr = g_attr_logical_length;
+	ret = extract_attr_from_req(or, &attr);
+	if (ret) {
+		EXOFS_ERR("ERROR: extract attr from or failed\n");
+		goto out;
+	}
+	*sanity = get_unaligned_be64(attr.val_ptr);
+#endif
+
+out:
+	osd_end_request(or);
+	return ret;
+}
+
+/*
+ * Fill in an inode read from the OSD and set it up for use
+ */
+struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct exofs_i_info *oi;
+	struct exofs_fcb fcb;
+	struct inode *inode;
+	uint64_t uninitialized_var(sanity);
+	int ret;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	oi = exofs_i(inode);
+
+	/* read the inode from the osd */
+	ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+	if (ret)
+		goto bad_inode;
+
+	init_waitqueue_head(&oi->i_wq);
+	set_obj_created(oi);
+
+	/* copy stuff from on-disk struct to in-memory struct */
+	inode->i_mode = le16_to_cpu(fcb.i_mode);
+	inode->i_uid = le32_to_cpu(fcb.i_uid);
+	inode->i_gid = le32_to_cpu(fcb.i_gid);
+	inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
+	inode->i_ctime.tv_nsec =
+		inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+	oi->i_commit_size = le64_to_cpu(fcb.i_size);
+	i_size_write(inode, oi->i_commit_size);
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_generation = le32_to_cpu(fcb.i_generation);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+	if ((inode->i_size != sanity) &&
+		(!exofs_inode_is_fast_symlink(inode))) {
+		EXOFS_ERR("WARNING: Size of object from inode and "
+			  "attributes differ (%lld != %llu)\n",
+			  inode->i_size, _LLU(sanity));
+	}
+#endif
+
+	oi->i_dir_start_lookup = 0;
+
+	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (fcb.i_data[0])
+			inode->i_rdev =
+				old_decode_dev(le32_to_cpu(fcb.i_data[0]));
+		else
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(fcb.i_data[1]));
+	} else {
+		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &exofs_dir_inode_operations;
+		inode->i_fop = &exofs_dir_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (exofs_inode_is_fast_symlink(inode))
+			inode->i_op = &exofs_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &exofs_symlink_inode_operations;
+			inode->i_mapping->a_ops = &exofs_aops;
+		}
+	} else {
+		inode->i_op = &exofs_special_inode_operations;
+		if (fcb.i_data[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(fcb.i_data[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(fcb.i_data[1])));
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi)
+{
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+	}
+	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
+}
+/*
+ * Callback function from exofs_new_inode().  The important thing is that we
+ * set the obj_created flag so that other methods know that the object exists on
+ * the OSD.
+ */
+static void create_done(struct osd_request *or, void *p)
+{
+	struct inode *inode = p;
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	int ret;
+
+	ret = exofs_check_ok(or);
+	osd_end_request(or);
+	atomic_dec(&sbi->s_curr_pending);
+
+	if (unlikely(ret)) {
+		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
+			  _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
+		make_bad_inode(inode);
+	} else
+		set_obj_created(oi);
+
+	atomic_dec(&inode->i_count);
+	wake_up(&oi->i_wq);
+}
+
+/*
+ * Set up a new inode and create an object for it on the OSD
+ */
+struct inode *exofs_new_inode(struct inode *dir, int mode)
+{
+	struct super_block *sb;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+	struct exofs_sb_info *sbi;
+	struct osd_request *or;
+	struct osd_obj_id obj;
+	int ret;
+
+	sb = dir->i_sb;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	oi = exofs_i(inode);
+
+	init_waitqueue_head(&oi->i_wq);
+	set_obj_2bcreated(oi);
+
+	sbi = sb->s_fs_info;
+
+	sb->s_dirt = 1;
+	inode->i_uid = current->cred->fsuid;
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else {
+		inode->i_gid = current->cred->fsgid;
+	}
+	inode->i_mode = mode;
+
+	inode->i_ino = sbi->s_nextid++;
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	oi->i_commit_size = inode->i_size = 0;
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+	insert_inode_hash(inode);
+
+	mark_inode_dirty(inode);
+
+	obj.partition = sbi->s_pid;
+	obj.id = inode->i_ino + EXOFS_OBJ_OFF;
+	exofs_make_credential(oi->i_cred, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	osd_req_create_object(or, &obj);
+
+	/* increment the refcount so that the inode will still be around when we
+	 * reach the callback
+	 */
+	atomic_inc(&inode->i_count);
+
+	ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+	if (ret) {
+		atomic_dec(&inode->i_count);
+		osd_end_request(or);
+		return ERR_PTR(-EIO);
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return inode;
+}
+
+/*
+ * struct to pass two arguments to update_inode's callback
+ */
+struct updatei_args {
+	struct exofs_sb_info	*sbi;
+	struct exofs_fcb	fcb;
+};
+
+/*
+ * Callback function from exofs_update_inode().
+ */
+static void updatei_done(struct osd_request *or, void *p)
+{
+	struct updatei_args *args = p;
+
+	osd_end_request(or);
+
+	atomic_dec(&args->sbi->s_curr_pending);
+
+	kfree(args);
+}
+
+/*
+ * Write the inode to the OSD.  Just fill up the struct, and set the attribute
+ * synchronously or asynchronously depending on the do_sync flag.
+ */
+static int exofs_update_inode(struct inode *inode, int do_sync)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or;
+	struct osd_attr attr;
+	struct exofs_fcb *fcb;
+	struct updatei_args *args;
+	int ret;
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	fcb = &args->fcb;
+
+	fcb->i_mode = cpu_to_le16(inode->i_mode);
+	fcb->i_uid = cpu_to_le32(inode->i_uid);
+	fcb->i_gid = cpu_to_le32(inode->i_gid);
+	fcb->i_links_count = cpu_to_le16(inode->i_nlink);
+	fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	oi->i_commit_size = i_size_read(inode);
+	fcb->i_size = cpu_to_le64(oi->i_commit_size);
+	fcb->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			fcb->i_data[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			fcb->i_data[1] = 0;
+		} else {
+			fcb->i_data[0] = 0;
+			fcb->i_data[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			fcb->i_data[2] = 0;
+		}
+	} else
+		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
+		ret = -ENOMEM;
+		goto free_args;
+	}
+
+	osd_req_set_attributes(or, &obj);
+
+	attr = g_attr_inode_data;
+	attr.val_ptr = fcb;
+	osd_req_add_set_attr_list(or, &attr, 1);
+
+	if (!obj_created(oi)) {
+		EXOFS_DBGMSG("!obj_created\n");
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+		EXOFS_DBGMSG("wait_event done\n");
+	}
+
+	if (do_sync) {
+		ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+		osd_end_request(or);
+		goto free_args;
+	} else {
+		args->sbi = sbi;
+
+		ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
+		if (ret) {
+			osd_end_request(or);
+			goto free_args;
+		}
+		atomic_inc(&sbi->s_curr_pending);
+		goto out; /* deallocation in updatei_done */
+	}
+
+free_args:
+	kfree(args);
+out:
+	EXOFS_DBGMSG("ret=>%d\n", ret);
+	return ret;
+}
+
+int exofs_write_inode(struct inode *inode, int wait)
+{
+	return exofs_update_inode(inode, wait);
+}
+
+/*
+ * Callback function from exofs_delete_inode() - don't have much cleaning up to
+ * do.
+ */
+static void delete_done(struct osd_request *or, void *p)
+{
+	struct exofs_sb_info *sbi;
+	osd_end_request(or);
+	sbi = p;
+	atomic_dec(&sbi->s_curr_pending);
+}
+
+/*
+ * Called when the refcount of an inode reaches zero.  We remove the object
+ * from the OSD here.  We make sure the object was created before we try and
+ * delete it.
+ */
+void exofs_delete_inode(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or;
+	int ret;
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (is_bad_inode(inode))
+		goto no_delete;
+
+	mark_inode_dirty(inode);
+	exofs_update_inode(inode, inode_needs_sync(inode));
+
+	inode->i_size = 0;
+	if (inode->i_blocks)
+		exofs_truncate(inode);
+
+	clear_inode(inode);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+		return;
+	}
+
+	osd_req_remove_object(or, &obj);
+
+	/* if we are deleting an obj that hasn't been created yet, wait */
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+	}
+
+	ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+	if (ret) {
+		EXOFS_ERR(
+		       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
+		osd_end_request(or);
+		return;
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return;
+
+no_delete:
+	clear_inode(inode);
+}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = exofs_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode;
+	ino_t ino;
+
+	if (dentry->d_name.len > EXOFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = exofs_inode_by_name(dir, dentry);
+	inode = NULL;
+	if (ino) {
+		inode = exofs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
+			 struct nameidata *nd)
+{
+	struct inode *inode = exofs_new_inode(dir, mode);
+	int err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		       dev_t rdev)
+{
+	struct inode *inode;
+	int err;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = exofs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_symlink(struct inode *dir, struct dentry *dentry,
+			  const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned l = strlen(symname)+1;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+
+	if (l > sb->s_blocksize)
+		goto out;
+
+	inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	oi = exofs_i(inode);
+	if (l > sizeof(oi->i_data)) {
+		/* slow symlink */
+		inode->i_op = &exofs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		memset(oi->i_data, 0, sizeof(oi->i_data));
+
+		err = page_symlink(inode, symname, l);
+		if (err)
+			goto out_fail;
+	} else {
+		/* fast symlink */
+		inode->i_op = &exofs_fast_symlink_inode_operations;
+		memcpy(oi->i_data, symname, l);
+		inode->i_size = l-1;
+	}
+	mark_inode_dirty(inode);
+
+	err = exofs_add_nondir(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	iput(inode);
+	goto out;
+}
+
+static int exofs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	if (inode->i_nlink >= EXOFS_LINK_MAX)
+		return -EMLINK;
+
+	inode->i_ctime = CURRENT_TIME;
+	inode_inc_link_count(inode);
+	atomic_inc(&inode->i_count);
+
+	return exofs_add_nondir(dentry, inode);
+}
+
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	int err = -EMLINK;
+
+	if (dir->i_nlink >= EXOFS_LINK_MAX)
+		goto out;
+
+	inode_inc_link_count(dir);
+
+	inode = exofs_new_inode(dir, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &exofs_dir_inode_operations;
+	inode->i_fop = &exofs_dir_operations;
+	inode->i_mapping->a_ops = &exofs_aops;
+
+	inode_inc_link_count(inode);
+
+	err = exofs_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = exofs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);
+	goto out;
+}
+
+static int exofs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct exofs_dir_entry *de;
+	struct page *page;
+	int err = -ENOENT;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (!de)
+		goto out;
+
+	err = exofs_delete_entry(de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+	err = 0;
+out:
+	return err;
+}
+
+static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int err = -ENOTEMPTY;
+
+	if (exofs_empty_dir(inode)) {
+		err = exofs_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	return err;
+}
+
+static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct page *dir_page = NULL;
+	struct exofs_dir_entry *dir_de = NULL;
+	struct page *old_page;
+	struct exofs_dir_entry *old_de;
+	int err = -ENOENT;
+
+	old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = exofs_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page *new_page;
+		struct exofs_dir_entry *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !exofs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
+		if (!new_de)
+			goto out_dir;
+		inode_inc_link_count(old_inode);
+		err = exofs_set_link(new_dir, new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+		if (err)
+			goto out_dir;
+	} else {
+		if (dir_de) {
+			err = -EMLINK;
+			if (new_dir->i_nlink >= EXOFS_LINK_MAX)
+				goto out_dir;
+		}
+		inode_inc_link_count(old_inode);
+		err = exofs_add_link(new_dentry, old_inode);
+		if (err) {
+			inode_dec_link_count(old_inode);
+			goto out_dir;
+		}
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+
+	exofs_delete_entry(old_de, old_page);
+	inode_dec_link_count(old_inode);
+
+	if (dir_de) {
+		err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+		if (err)
+			goto out_dir;
+	}
+	return 0;
+
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+const struct inode_operations exofs_dir_inode_operations = {
+	.create 	= exofs_create,
+	.lookup 	= exofs_lookup,
+	.link   	= exofs_link,
+	.unlink 	= exofs_unlink,
+	.symlink	= exofs_symlink,
+	.mkdir  	= exofs_mkdir,
+	.rmdir  	= exofs_rmdir,
+	.mknod  	= exofs_mknod,
+	.rename 	= exofs_rename,
+	.setattr	= exofs_setattr,
+};
+
+const struct inode_operations exofs_special_inode_operations = {
+	.setattr	= exofs_setattr,
+};
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <scsi/scsi_device.h>
+#include <scsi/osd_sense.h>
+
+#include "exofs.h"
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
+{
+	struct osd_sense_info osi;
+	int ret = osd_req_decode_sense(or, &osi);
+
+	if (ret) { /* translate to Linux codes */
+		if (osi.additional_code == scsi_invalid_field_in_cdb) {
+			if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
+				ret = -EFAULT;
+			if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
+				ret = -ENOENT;
+			else
+				ret = -EINVAL;
+		} else if (osi.additional_code == osd_quota_error)
+			ret = -ENOSPC;
+		else
+			ret = -EIO;
+	}
+
+	/* FIXME: should be include in osd_sense_info */
+	if (in_resid)
+		*in_resid = or->in.req ? or->in.req->data_len : 0;
+
+	if (out_resid)
+		*out_resid = or->out.req ? or->out.req->data_len : 0;
+
+	return ret;
+}
+
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.
+ */
+int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+	int ret;
+
+	or->timeout = timeout;
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		return ret;
+	}
+
+	ret = osd_execute_request(or);
+
+	if (ret)
+		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+	/* osd_req_decode_sense(or, ret); */
+	return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.
+ */
+int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, u8 *cred)
+{
+	int ret;
+
+	ret = osd_finalize_request(or, 0, cred, NULL);
+	if (ret) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		return ret;
+	}
+
+	ret = osd_execute_request_async(or, async_done, caller_context);
+
+	if (ret)
+		EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
+	return ret;
+}
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+		if ((cur_attr.attr_page == attr->attr_page) &&
+		    (cur_attr.attr_id == attr->attr_id)) {
+			attr->len = cur_attr.len;
+			attr->val_ptr = cur_attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+
+int osd_req_read_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
+{
+	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+	if (!bio)
+		return -ENOMEM;
+
+	osd_req_read(or, obj, bio, offset);
+	return 0;
+}
+
+int osd_req_write_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
+{
+	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+	if (!bio)
+		return -ENOMEM;
+
+	osd_req_write(or, obj, bio, offset);
+	return 0;
+}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..9f1985e857e2
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+	const char *dev_name;
+	uint64_t pid;
+	int timeout;
+};
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
+
+/*
+ * Our mount-time options.  These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that.  32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+	{Opt_pid, "pid=%u"},
+	{Opt_to, "to=%u"},
+	{Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method.  Also makes sure that all of the mandatory
+ * mount options were set.
+ */
+static int parse_options(char *options, struct exofs_mountopt *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	bool s_pid = false;
+
+	EXOFS_DBGMSG("parse_options %s\n", options);
+	/* defaults */
+	memset(opts, 0, sizeof(*opts));
+	opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		char str[32];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_pid:
+			if (0 == match_strlcpy(str, &args[0], sizeof(str)))
+				return -EINVAL;
+			opts->pid = simple_strtoull(str, NULL, 0);
+			if (opts->pid < EXOFS_MIN_PID) {
+				EXOFS_ERR("Partition ID must be >= %u",
+					  EXOFS_MIN_PID);
+				return -EINVAL;
+			}
+			s_pid = 1;
+			break;
+		case Opt_to:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (option <= 0) {
+				EXOFS_ERR("Timout must be > 0");
+				return -EINVAL;
+			}
+			opts->timeout = option * HZ;
+			break;
+		}
+	}
+
+	if (!s_pid) {
+		EXOFS_ERR("Need to specify the following options:\n");
+		EXOFS_ERR("    -o pid=pid_no_to_use\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/******************************************************************************
+ * INODE CACHE
+ *****************************************************************************/
+
+/*
+ * Our inode cache.  Isn't it pretty?
+ */
+static struct kmem_cache *exofs_inode_cachep;
+
+/*
+ * Allocate an inode in the cache
+ */
+static struct inode *exofs_alloc_inode(struct super_block *sb)
+{
+	struct exofs_i_info *oi;
+
+	oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
+	if (!oi)
+		return NULL;
+
+	oi->vfs_inode.i_version = 1;
+	return &oi->vfs_inode;
+}
+
+/*
+ * Remove an inode from the cache
+ */
+static void exofs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
+
+/*
+ * Initialize the inode
+ */
+static void exofs_init_once(void *foo)
+{
+	struct exofs_i_info *oi = foo;
+
+	inode_init_once(&oi->vfs_inode);
+}
+
+/*
+ * Create and initialize the inode cache
+ */
+static int init_inodecache(void)
+{
+	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
+				sizeof(struct exofs_i_info), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				exofs_init_once);
+	if (exofs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Destroy the inode cache
+ */
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(exofs_inode_cachep);
+}
+
+/******************************************************************************
+ * SUPERBLOCK FUNCTIONS
+ *****************************************************************************/
+static const struct super_operations exofs_sops;
+static const struct export_operations exofs_export_ops;
+
+/*
+ * Write the superblock to the OSD
+ */
+static void exofs_write_super(struct super_block *sb)
+{
+	struct exofs_sb_info *sbi;
+	struct exofs_fscb *fscb;
+	struct osd_request *or;
+	struct osd_obj_id obj;
+	int ret;
+
+	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
+	if (!fscb) {
+		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
+		return;
+	}
+
+	lock_kernel();
+	sbi = sb->s_fs_info;
+	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
+	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+	fscb->s_magic = cpu_to_le16(sb->s_magic);
+	fscb->s_newfs = 0;
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
+		goto out;
+	}
+
+	obj.partition = sbi->s_pid;
+	obj.id = EXOFS_SUPER_ID;
+	ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
+		goto out;
+	}
+
+	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
+		goto out;
+	}
+	sb->s_dirt = 0;
+
+out:
+	if (or)
+		osd_end_request(or);
+	unlock_kernel();
+	kfree(fscb);
+}
+
+/*
+ * This function is called when the vfs is freeing the superblock.  We just
+ * need to free our own part.
+ */
+static void exofs_put_super(struct super_block *sb)
+{
+	int num_pend;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+
+	/* make sure there are no pending commands */
+	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
+	     num_pend = atomic_read(&sbi->s_curr_pending)) {
+		wait_queue_head_t wq;
+		init_waitqueue_head(&wq);
+		wait_event_timeout(wq,
+				  (atomic_read(&sbi->s_curr_pending) == 0),
+				  msecs_to_jiffies(100));
+	}
+
+	osduld_put_device(sbi->s_dev);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+}
+
+/*
+ * Read the superblock from the OSD and fill in the fields
+ */
+static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root;
+	struct exofs_mountopt *opts = data;
+	struct exofs_sb_info *sbi;	/*extended info                  */
+	struct exofs_fscb fscb;		/*on-disk superblock info        */
+	struct osd_request *or = NULL;
+	struct osd_obj_id obj;
+	int ret;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	/* use mount options to fill superblock */
+	sbi->s_dev = osduld_path_lookup(opts->dev_name);
+	if (IS_ERR(sbi->s_dev)) {
+		ret = PTR_ERR(sbi->s_dev);
+		sbi->s_dev = NULL;
+		goto free_sbi;
+	}
+
+	sbi->s_pid = opts->pid;
+	sbi->s_timeout = opts->timeout;
+
+	/* fill in some other data by hand */
+	memset(sb->s_id, 0, sizeof(sb->s_id));
+	strcpy(sb->s_id, "exofs");
+	sb->s_blocksize = EXOFS_BLKSIZE;
+	sb->s_blocksize_bits = EXOFS_BLKSHIFT;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	atomic_set(&sbi->s_curr_pending, 0);
+	sb->s_bdev = NULL;
+	sb->s_dev = 0;
+
+	/* read data from on-disk superblock object */
+	obj.partition = sbi->s_pid;
+	obj.id = EXOFS_SUPER_ID;
+	exofs_make_credential(sbi->s_cred, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		if (!silent)
+			EXOFS_ERR(
+			       "exofs_fill_super: osd_start_request failed.\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+	ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
+	if (unlikely(ret)) {
+		if (!silent)
+			EXOFS_ERR(
+			       "exofs_fill_super: osd_req_read_kern failed.\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+
+	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+	if (unlikely(ret)) {
+		if (!silent)
+			EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
+		ret = -EIO;
+		goto free_sbi;
+	}
+
+	sb->s_magic = le16_to_cpu(fscb.s_magic);
+	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
+	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
+
+	/* make sure what we read from the object store is correct */
+	if (sb->s_magic != EXOFS_SUPER_MAGIC) {
+		if (!silent)
+			EXOFS_ERR("ERROR: Bad magic value\n");
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	/* start generation numbers from a random point */
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	/* set up operation vectors */
+	sb->s_op = &exofs_sops;
+	sb->s_export_op = &exofs_export_ops;
+	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+	if (IS_ERR(root)) {
+		EXOFS_ERR("ERROR: exofs_iget failed\n");
+		ret = PTR_ERR(root);
+		goto free_sbi;
+	}
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		iput(root);
+		EXOFS_ERR("ERROR: get root inode failed\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+
+	if (!S_ISDIR(root->i_mode)) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+		EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
+		       root->i_mode);
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	ret = 0;
+out:
+	if (or)
+		osd_end_request(or);
+	return ret;
+
+free_sbi:
+	osduld_put_device(sbi->s_dev); /* NULL safe */
+	kfree(sbi);
+	goto out;
+}
+
+/*
+ * Set up the superblock (calls exofs_fill_super eventually)
+ */
+static int exofs_get_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data, struct vfsmount *mnt)
+{
+	struct exofs_mountopt opts;
+	int ret;
+
+	ret = parse_options(data, &opts);
+	if (ret)
+		return ret;
+
+	opts.dev_name = dev_name;
+	return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+}
+
+/*
+ * Return information about the file system state in the buffer.  This is used
+ * by the 'df' command, for example.
+ */
+static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_obj_id obj = {sbi->s_pid, 0};
+	struct osd_attr attrs[] = {
+		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
+			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
+		ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
+			OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
+	};
+	uint64_t capacity = ULLONG_MAX;
+	uint64_t used = ULLONG_MAX;
+	struct osd_request *or;
+	uint8_t cred_a[OSD_CAP_LEN];
+	int ret;
+
+	/* get used/capacity attributes */
+	exofs_make_credential(cred_a, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
+		return -ENOMEM;
+	}
+
+	osd_req_get_attributes(or, &obj);
+	osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
+	ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+	if (unlikely(ret))
+		goto out;
+
+	ret = extract_attr_from_req(or, &attrs[0]);
+	if (likely(!ret))
+		capacity = get_unaligned_be64(attrs[0].val_ptr);
+	else
+		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
+
+	ret = extract_attr_from_req(or, &attrs[1]);
+	if (likely(!ret))
+		used = get_unaligned_be64(attrs[1].val_ptr);
+	else
+		EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
+
+	/* fill in the stats buffer */
+	buf->f_type = EXOFS_SUPER_MAGIC;
+	buf->f_bsize = EXOFS_BLKSIZE;
+	buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
+	buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = sbi->s_numfiles;
+	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
+	buf->f_namelen = EXOFS_NAME_LEN;
+
+out:
+	osd_end_request(or);
+	return ret;
+}
+
+static const struct super_operations exofs_sops = {
+	.alloc_inode    = exofs_alloc_inode,
+	.destroy_inode  = exofs_destroy_inode,
+	.write_inode    = exofs_write_inode,
+	.delete_inode   = exofs_delete_inode,
+	.put_super      = exofs_put_super,
+	.write_super    = exofs_write_super,
+	.statfs         = exofs_statfs,
+};
+
+/******************************************************************************
+ * EXPORT OPERATIONS
+ *****************************************************************************/
+
+struct dentry *exofs_get_parent(struct dentry *child)
+{
+	unsigned long ino = exofs_parent_ino(child);
+
+	if (!ino)
+		return NULL;
+
+	return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
+}
+
+static struct inode *exofs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	inode = exofs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    exofs_nfs_get_inode);
+}
+
+static struct dentry *exofs_fh_to_parent(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    exofs_nfs_get_inode);
+}
+
+static const struct export_operations exofs_export_ops = {
+	.fh_to_dentry = exofs_fh_to_dentry,
+	.fh_to_parent = exofs_fh_to_parent,
+	.get_parent = exofs_get_parent,
+};
+
+/******************************************************************************
+ * INSMOD/RMMOD
+ *****************************************************************************/
+
+/*
+ * struct that describes this file system
+ */
+static struct file_system_type exofs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "exofs",
+	.get_sb         = exofs_get_sb,
+	.kill_sb        = generic_shutdown_super,
+};
+
+static int __init init_exofs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&exofs_type);
+	if (err)
+		goto out_d;
+
+	return 0;
+out_d:
+	destroy_inodecache();
+out:
+	return err;
+}
+
+static void __exit exit_exofs(void)
+{
+	unregister_filesystem(&exofs_type);
+	destroy_inodecache();
+}
+
+MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
+MODULE_DESCRIPTION("exofs");
+MODULE_LICENSE("GPL");
+
+module_init(init_exofs)
+module_exit(exit_exofs)
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exofs_i_info *oi = exofs_i(dentry->d_inode);
+
+	nd_set_link(nd, (char *)oi->i_data);
+	return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= exofs_follow_link,
+};
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b27..d46e38cb85c5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
                struct posix_acl *clone;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 4a29d6376081..7f8d2e5a7ea6 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -570,7 +570,7 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	release_blocks(sb, freed);
-	DQUOT_FREE_BLOCK(inode, freed);
+	vfs_dq_free_block(inode, freed);
 }
 
 /**
@@ -1247,7 +1247,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+	if (vfs_dq_alloc_block(inode, num)) {
 		*errp = -EDQUOT;
 		return 0;
 	}
@@ -1409,7 +1409,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
+	vfs_dq_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1420,7 +1420,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
+		vfs_dq_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 66321a877e74..15387c9c17d8 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
 	if (!is_bad_inode(inode)) {
 		/* Quota is already initialized in iput() */
 		ext2_xattr_delete_inode(inode);
-	    	DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
+		vfs_dq_free_inode(inode);
+		vfs_dq_drop(inode);
 	}
 
 	es = EXT2_SB(sb)->s_es;
@@ -586,7 +586,7 @@ got:
 		goto fail_drop;
 	}
 
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -605,10 +605,10 @@ got:
 	return inode;
 
 fail_free_drop:
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 23fff2f87783..b43b95563663 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1444,7 +1444,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		return error;
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		error = DQUOT_TRANSFER(inode, iattr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0;
 		if (error)
 			return error;
 	}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7c6e3606f0ec..f983225266dc 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1331,6 +1331,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
 				sb->s_blocksize - offset : toread;
 
 		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
 		err = ext2_get_block(inode, blk, &tmp_bh, 0);
 		if (err < 0)
 			return err;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 987a5261cc2e..7913531ec6d5 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -642,7 +642,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 				ea_bdebug(new_bh, "reusing block");
 
 				error = -EDQUOT;
-				if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+				if (vfs_dq_alloc_block(inode, 1)) {
 					unlock_buffer(new_bh);
 					goto cleanup;
 				}
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		 * as if nothing happened and cleanup the unused block */
 		if (error && error != -ENOSPC) {
 			if (new_bh && new_bh != old_bh)
-				DQUOT_FREE_BLOCK(inode, 1);
+				vfs_dq_free_block(inode, 1);
 			goto cleanup;
 		}
 	} else
@@ -731,7 +731,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
 			if (ce)
 				mb_cache_entry_release(ce);
-			DQUOT_FREE_BLOCK(inode, 1);
+			vfs_dq_free_block(inode, 1);
 			mark_buffer_dirty(old_bh);
 			ea_bdebug(old_bh, "refcount now=%d",
 				le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -794,7 +794,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;
 
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem does not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 used to
+	  historically default to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880c..d81ef2fdb08e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 0dbf1c048475..225202db8974 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
 	}
 	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
-		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+		vfs_dq_free_block(inode, dquot_freed_blocks);
 	return;
 }
 
@@ -1502,7 +1502,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+	if (vfs_dq_alloc_block(inode, num)) {
 		*errp = -EDQUOT;
 		return 0;
 	}
@@ -1714,7 +1714,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
+	vfs_dq_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1729,7 +1729,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
+		vfs_dq_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af4..3d724a95882f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
-	.ioctl		= ext3_ioctl,		/* BKL held */
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9a..5b49704b231b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -33,6 +33,10 @@
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
+	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+		filemap_flush(inode->i_mapping);
+		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
@@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.ioctl		= ext3_ioctl,
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 8de6c720e510..dd13d60d524b 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	DQUOT_INIT(inode);
+	vfs_dq_init(inode);
 	ext3_xattr_delete_inode(handle, inode);
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
@@ -589,7 +589,7 @@ got:
 		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
 
 	ret = inode;
-	if(DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -620,10 +620,10 @@ really_out:
 	return ret;
 
 fail_free_drop:
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5fa453b49a64..466a332e0bd1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 				struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
+	/* Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason */
+	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
 
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
 	}
 write_begin_failed:
 	if (ret) {
-		ext3_journal_stop(handle);
-		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before truncate
+		 * finishes.
 		 */
 		if (pos + len > inode->i_size)
+			ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
 			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	return err;
 }
 
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
  */
-static int ext3_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
 {
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+	/* What matters to us is i_disksize. We don't write i_size anywhere */
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	if (pos + copied > EXT3_I(inode)->i_disksize) {
+		EXT3_I(inode)->i_disksize = pos + copied;
 		mark_inode_dirty(inode);
 	}
-
-	return copied;
 }
 
 /*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
 	unsigned from, to;
 	int ret = 0, ret2;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + copied;
 	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext3_journal_dirty_data);
+		from, to, NULL, journal_dirty_data_fn);
 
-	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT3_I(inode)->i_disksize)
-			EXT3_I(inode)->i_disksize = new_i_size;
-		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (ret2 < 0)
-			ret = ret2;
-	}
+	if (ret == 0)
+		update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
 {
 	handle_t *handle = ext3_journal_current_handle();
 	struct inode *inode = file->f_mapping->host;
-	int ret = 0, ret2;
-	loff_t new_i_size;
-
-	new_i_size = pos + copied;
-	if (new_i_size > EXT3_I(inode)->i_disksize)
-		EXT3_I(inode)->i_disksize = new_i_size;
-
-	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
-	if (ret2 < 0)
-		ret = ret2;
+	int ret;
 
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
+	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
 	if (copied < len) {
 		if (!PageUptodate(page))
 			copied = 0;
-		page_zero_new_buffers(page, from+copied, to);
+		page_zero_new_buffers(page, from + copied, to);
+		to = from + copied;
 	}
 
 	ret = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
-		i_size_write(inode, pos+copied);
+
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1428,11 +1443,9 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
 {
-	if (buffer_mapped(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
+	return !buffer_mapped(bh);
 }
 
 /*
@@ -1505,6 +1518,15 @@ static int ext3_ordered_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				(1 << BH_Dirty)|(1 << BH_Uptodate));
+	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
+		return block_write_full_page(page, NULL, wbc);
+	}
+	page_bufs = page_buffers(page);
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1512,11 +1534,6 @@ static int ext3_ordered_writepage(struct page *page,
 		goto out_fail;
 	}
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	page_bufs = page_buffers(page);
 	walk_page_buffers(handle, page_bufs, 0,
 			PAGE_CACHE_SIZE, NULL, bget_one);
 
@@ -2346,6 +2363,9 @@ void ext3_truncate(struct inode *inode)
 	if (!ext3_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
@@ -3055,7 +3075,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 		if (error) {
 			ext3_journal_stop(handle);
 			return error;
@@ -3146,7 +3166,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
 		ret = 2 * (bpp + indirects) + 2;
 
 #ifdef CONFIG_QUOTA
-	/* We know that structure was already allocated during DQUOT_INIT so
+	/* We know that structure was already allocated during vfs_dq_init so
 	 * we will be updating only the data blocks + inodes */
 	ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
@@ -3237,7 +3257,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, vfs_dq_alloc_space() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e0..88974814783a 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-		unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto flags_out;
-		}
-
-		if (get_user(flags, (int __user *) arg)) {
-			err = -EFAULT;
-			goto flags_out;
-		}
-
 		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
+
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
-			err = -EPERM;
+		err = -EPERM;
+		if (IS_NOQUOTA(inode))
 			goto flags_out;
-		}
+
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_LINUX_IMMUTABLE))
 				goto flags_out;
-			}
 		}
 
 		/*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
-			}
 		}
 
-
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
-			mutex_unlock(&inode->i_mutex);
 			err = PTR_ERR(handle);
 			goto flags_out;
 		}
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err) {
-			mutex_unlock(&inode->i_mutex);
-			return err;
-		}
+		if (err)
+			goto flags_out;
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
-		mutex_unlock(&inode->i_mutex);
 flags_out:
+		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
@@ -140,6 +125,7 @@ flags_out:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
@@ -147,6 +133,7 @@ flags_out:
 			err = -EFAULT;
 			goto setversion_out;
 		}
+
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
 #ifdef CONFIG_COMPAT
 long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	/* These are just misnamed, they actually get/put from/to user an int */
 	switch (cmd) {
 	case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	default:
 		return -ENOIOCTLCMD;
 	}
-	lock_kernel();
-	ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
+	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 4db4ffa1edad..6ff7b9730234 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
 				 struct dx_frame *frame,
 				 int *err);
 static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
-			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
+		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
+	while ((char *) de < base + blocksize)
 	{
 		if (de->name_len && de->inode) {
 			ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 			return ERR_PTR(-EIO);
 		}
 		inode = ext3_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext3_error(dir->i_sb, __func__,
+						"deleted inode referenced: %lu",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
 {
-	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+	struct ext3_dir_entry_2 *next, *to, *prev;
+	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
+	while ((char *)de < base + blocksize) {
 		next = ext3_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@ -2049,7 +2058,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2108,7 +2117,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2265,14 +2274,14 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	struct inode * old_inode, * new_inode;
 	struct buffer_head * old_bh, * new_bh, * dir_bh;
 	struct ext3_dir_entry_2 * old_de, * new_de;
-	int retval;
+	int retval, flush_file = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	if (new_dentry->d_inode)
-		DQUOT_INIT(new_dentry->d_inode);
+		vfs_dq_init(new_dentry->d_inode);
 	handle = ext3_journal_start(old_dir, 2 *
 					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2401,6 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		ext3_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext3_orphan_add(handle, new_inode);
+		if (ext3_should_writeback_data(new_inode))
+			flush_file = 1;
 	}
 	retval = 0;
 
@@ -2409,6 +2420,8 @@ end_rename:
 	brelse (old_bh);
 	brelse (new_bh);
 	ext3_journal_stop(handle);
+	if (retval == 0 && flush_file)
+		filemap_flush(old_inode->i_mapping);
 	return retval;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4a970411a458..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -707,8 +713,6 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
-static int ext3_dquot_initialize(struct inode *inode, int type);
-static int ext3_dquot_drop(struct inode *inode);
 static int ext3_write_dquot(struct dquot *dquot);
 static int ext3_acquire_dquot(struct dquot *dquot);
 static int ext3_release_dquot(struct dquot *dquot);
@@ -723,8 +727,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
 static struct dquot_operations ext3_quota_operations = {
-	.initialize	= ext3_dquot_initialize,
-	.drop		= ext3_dquot_drop,
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
@@ -1438,7 +1442,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		}
 
 		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
 				"%s: truncating inode %lu to %Ld bytes\n",
@@ -1921,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                    cope, else JOURNAL_DATA */
 		if (journal_check_available_features
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-			set_opt(sbi->s_mount_opt, ORDERED_DATA);
+			set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 		else
 			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 		break;
@@ -2702,7 +2706,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
  * Process 1                         Process 2
  * ext3_create()                     quota_sync()
  *   journal_start()                   write_dquot()
- *   DQUOT_INIT()                        down(dqio_mutex)
+ *   vfs_dq_init()                       down(dqio_mutex)
  *     down(dqio_mutex)                    journal_start()
  *
  */
@@ -2714,44 +2718,6 @@ static inline struct inode *dquot_to_inode(struct dquot *dquot)
 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
 }
 
-static int ext3_dquot_initialize(struct inode *inode, int type)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may create quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_initialize(inode, type);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext3_dquot_drop(struct inode *inode)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may delete quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		/*
-		 * We call dquot_drop() anyway to at least release references
-		 * to quota structures so that umount does not hang.
-		 */
-		dquot_drop(inode);
-		return PTR_ERR(handle);
-	}
-	ret = dquot_drop(inode);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
 static int ext3_write_dquot(struct dquot *dquot)
 {
 	int ret, err;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 175414ac2210..83b7be849bd5 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -498,7 +498,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext3_journal_dirty_metadata(handle, bh);
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -774,7 +774,7 @@ inserted:
 				/* The old block is released after updating
 				   the inode. */
 				error = -EDQUOT;
-				if (DQUOT_ALLOC_BLOCK(inode, 1))
+				if (vfs_dq_alloc_block(inode, 1))
 					goto cleanup;
 				error = ext3_journal_get_write_access(handle,
 								      new_bh);
@@ -848,7 +848,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	DQUOT_FREE_BLOCK(inode, 1);
+	vfs_dq_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
 	  filesystem; while there will be some performance gains from
 	  the delayed allocation and inode table readahead, the best
 	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
+	  filesystem, or formatting a new filesystem as an ext4
 	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc8..647e0d65a284 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9a50b8052dcf..53c72ad85877 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-				ext4_group_t block_group)
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
 {
 	ext4_fsblk_t tmp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	int used_blocks = sbi->s_itb_per_group + 2;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		struct ext4_group_desc *gdp;
-		struct buffer_head *bh;
-
-		gdp = ext4_get_group_desc(sb, block_group, &bh);
 		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
 					block_group))
 			used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 	/*
 	 * request to reload the buddy with the
@@ -536,7 +532,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	ext4_mb_free_blocks(handle, inode, block, count,
 			    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
-		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+		vfs_dq_free_block(inode, dquot_freed_blocks);
 	return;
 }
 
@@ -609,7 +605,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	    (*retries)++ > 3 ||
+	    !EXT4_SB(sb)->s_journal)
 		return 0;
 
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01af..b64789929a65 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 			 unsigned int offset)
 {
 	const char *error_msg = NULL;
-	const int rlen = ext4_rec_len_from_disk(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len)
-						< EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
 					break;
-				i += ext4_rec_len_from_disk(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
 					goto revalidate;
 				stored++;
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eba1c25..d0f15ef56de1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -20,6 +20,7 @@
 #include <linux/blkdev.h>
 #include <linux/magic.h>
 #include <linux/jbd2.h>
+#include <linux/quota.h>
 #include "ext4_i.h"
 
 /*
@@ -32,14 +33,6 @@
 #undef EXT4FS_DEBUG
 
 /*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS	8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS		1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
  * Debug code
  */
 #ifdef EXT4FS_DEBUG
@@ -53,8 +46,6 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
-#define EXT4_MULTIBLOCK_ALLOCATOR	1
-
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		1
 /* blocks already reserved */
@@ -179,8 +170,9 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-	__u32 free_inodes;
-	__u32 free_blocks;
+	atomic_t free_inodes;
+	atomic_t free_blocks;
+	atomic_t used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
@@ -248,6 +240,30 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
 
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+			   EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+			   EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & EXT4_REG_FLMASK;
+	else
+		return flags & EXT4_OTHER_FLMASK;
+}
+
 /*
  * Inode dynamic state flags
  */
@@ -255,6 +271,7 @@ struct flex_groups {
 #define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -302,7 +319,9 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -530,7 +549,7 @@ do {									       \
 #define EXT4_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
 #define EXT4_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
 #define EXT4_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION		0x10000	/* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC	0x10000	/* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT4_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
@@ -665,7 +684,8 @@ struct ext4_super_block {
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
 	__u8	s_reserved_char_pad2;
 	__le16  s_reserved_pad;
-	__u32   s_reserved[162];        /* Padding to the end of the block */
+	__le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
+	__u32   s_reserved[160];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -813,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
 
 /*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME	4
+
+/*
  * Structure of a directory entry
  */
 #define EXT4_NAME_LEN 255
@@ -864,24 +890,6 @@ struct ext4_dir_entry_2 {
 					 ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN		((1<<16)-1)
 
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-	unsigned len = le16_to_cpu(dlen);
-
-	if (len == EXT4_MAX_REC_LEN)
-		return 1 << 16;
-	return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-	if (len == (1 << 16))
-		return cpu_to_le16(EXT4_MAX_REC_LEN);
-	else if (len > (1 << 16))
-		BUG();
-	return cpu_to_le16(len);
-}
-
 /*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
@@ -969,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define	EXT4_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_proc,		\
-				&ext4_ui_proc_fops, &sbi->s_##var);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
 /*
  * Function prototypes
  */
@@ -1091,13 +1083,15 @@ extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern qsize_t ext4_get_reserved_space(struct inode *inode);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1105,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbbc..f0c3ec85bd48 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c4..4ce2187123aa 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
 /*
  * storage for cached extent
  */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
 	/* allocation reservation info for delalloc */
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a042..57b71fefbccf 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyblocks_counter;
-	struct blockgroup_lock s_blockgroup_lock;
+	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
-
-	/* root of the per fs reservation window tree */
-	spinlock_t s_rsv_window_lock;
-	struct rb_root s_rsv_window_root;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
 
 	/* Journaling */
 	struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
 
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 };
@@ -153,7 +155,7 @@ struct ext4_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2eab196875f..ac77d8b8251d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 	int depth;
 
 	if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	}
 
 	/* OK. use inode's group */
-	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block 
+		 * group for directories and special files.  Regular 
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves 
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
 		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 	return max;
 }
 
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+	ext4_fsblk_t block = ext_pblock(ext);
+	int len = ext4_ext_get_actual_len(ext);
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+			((block + len) > ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+				struct ext4_extent_idx *ext_idx)
+{
+	ext4_fsblk_t block = idx_pblock(ext_idx);
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+			(block > ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+				struct ext4_extent_header *eh,
+				int depth)
+{
+	struct ext4_extent *ext;
+	struct ext4_extent_idx *ext_idx;
+	unsigned short entries;
+	if (eh->eh_entries == 0)
+		return 1;
+
+	entries = le16_to_cpu(eh->eh_entries);
+
+	if (depth == 0) {
+		/* leaf entries */
+		ext = EXT_FIRST_EXTENT(eh);
+		while (entries) {
+			if (!ext4_valid_extent(inode, ext))
+				return 0;
+			ext++;
+			entries--;
+		}
+	} else {
+		ext_idx = EXT_FIRST_INDEX(eh);
+		while (entries) {
+			if (!ext4_valid_extent_idx(inode, ext_idx))
+				return 0;
+			ext_idx++;
+			entries--;
+		}
+	}
+	return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
 					struct ext4_extent_header *eh,
 					int depth)
 {
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
 		error_msg = "invalid eh_entries";
 		goto corrupted;
 	}
+	if (!ext4_valid_extent_entries(inode, eh, depth)) {
+		error_msg = "invalid extent entries";
+		goto corrupted;
+	}
 	return 0;
 
 corrupted:
 	ext4_error(inode->i_sb, function,
-			"bad header in inode #%lu: %s - magic %x, "
+			"bad header/extent in inode #%lu: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
 	return -EIO;
 }
 
-#define ext4_ext_check_header(inode, eh, depth)	\
-	__ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth)	\
+	__ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
 
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 	eh = ext_inode_hdr(inode);
 	depth = ext_depth(inode);
-	if (ext4_ext_check_header(inode, eh, depth))
-		return ERR_PTR(-EIO);
-
 
 	/* account possible depth increase */
 	if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	i = depth;
 	/* walk through the tree */
 	while (i) {
+		int need_to_validate = 0;
+
 		ext_debug("depth %d: num %d, max %d\n",
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
-		bh = sb_bread(inode->i_sb, path[ppos].p_block);
-		if (!bh)
+		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+		if (unlikely(!bh))
 			goto err;
-
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto err;
+			}
+			/* validate the extent entries */
+			need_to_validate = 1;
+		}
 		eh = ext_block_hdr(bh);
 		ppos++;
 		BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_hdr = eh;
 		i--;
 
-		if (ext4_ext_check_header(inode, eh, i))
+		if (need_to_validate && ext4_ext_check(inode, eh, i))
 			goto err;
 	}
 
@@ -1122,7 +1217,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
 	ext4_fsblk_t block;
-	int depth, ee_len;
+	int depth;	/* Note, NOT eh_depth; depth from top of tree */
+	int ee_len;
 
 	BUG_ON(path == NULL);
 	depth = path->p_depth;
@@ -1179,7 +1275,8 @@ got_index:
 		if (bh == NULL)
 			return -EIO;
 		eh = ext_block_hdr(bh);
-		if (ext4_ext_check_header(inode, eh, depth)) {
+		/* subtract from p_depth to get proper eh_depth */
+		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
 			put_bh(bh);
 			return -EIO;
 		}
@@ -1192,7 +1289,7 @@ got_index:
 	if (bh == NULL)
 		return -EIO;
 	eh = ext_block_hdr(bh);
-	if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
 		put_bh(bh);
 		return -EIO;
 	}
@@ -2135,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		return -ENOMEM;
 	}
 	path[0].p_hdr = ext_inode_hdr(inode);
-	if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
 	}
@@ -2189,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 				err = -EIO;
 				break;
 			}
-			if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+			if (ext4_ext_check(inode, ext_block_hdr(bh),
 							depth - i - 1)) {
 				err = -EIO;
 				break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a03..588af8c77246 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
+	if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+		ext4_alloc_da_blocks(inode);
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1))
+			(atomic_read(&inode->i_writecount) == 1) &&
+		        !EXT4_I(inode)->i_reserved_data_blocks)
 	{
 		down_write(&EXT4_I(inode)->i_data_sem);
 		ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4fb86a0061d0..47b84e8df568 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -188,8 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
-	int fatal = 0, err, count;
-	ext4_group_t flex_group;
+	int fatal = 0, err, count, cleared;
 
 	if (atomic_read(&inode->i_count) > 1) {
 		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -220,10 +219,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	DQUOT_INIT(inode);
+	vfs_dq_init(inode);
 	ext4_xattr_delete_inode(handle, inode);
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
@@ -248,8 +247,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		goto error_return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
-	if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
-					bit, bitmap_bh->b_data))
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	if (!cleared)
 		ext4_error(sb, "ext4_free_inode",
 			   "bit already cleared for inode %lu", ino);
 	else {
@@ -266,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 			if (is_directory) {
 				count = ext4_used_dirs_count(sb, gdp) - 1;
 				ext4_used_dirs_set(sb, gdp, count);
+				if (sbi->s_log_groups_per_flex) {
+					ext4_group_t f;
+
+					f = ext4_flex_group(sbi, block_group);
+					atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+				}
+
 			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
@@ -275,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 				percpu_counter_dec(&sbi->s_dirs_counter);
 
 			if (sbi->s_log_groups_per_flex) {
-				flex_group = ext4_flex_group(sbi, block_group);
-				spin_lock(sb_bgl_lock(sbi, flex_group));
-				sbi->s_flex_groups[flex_group].free_inodes++;
-				spin_unlock(sb_bgl_lock(sbi, flex_group));
+				ext4_group_t f;
+
+				f = ext4_flex_group(sbi, block_group);
+				atomic_inc(&sbi->s_flex_groups[f].free_inodes);
 			}
 		}
 		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -358,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 		sbi->s_log_groups_per_flex;
 
 find_close_to_parent:
-	flexbg_free_blocks = flex_group[best_flex].free_blocks;
+	flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
 	flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-	if (flex_group[best_flex].free_inodes &&
+	if (atomic_read(&flex_group[best_flex].free_inodes) &&
 	    flex_freeb_ratio > free_block_ratio)
 		goto found_flexbg;
 
@@ -373,24 +381,24 @@ find_close_to_parent:
 		if (i == parent_fbg_group || i == parent_fbg_group - 1)
 			continue;
 
-		flexbg_free_blocks = flex_group[i].free_blocks;
+		flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
 		flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
 
 		if (flex_freeb_ratio > free_block_ratio &&
-		    flex_group[i].free_inodes) {
+		    (atomic_read(&flex_group[i].free_inodes))) {
 			best_flex = i;
 			goto found_flexbg;
 		}
 
-		if (flex_group[best_flex].free_inodes == 0 ||
-		    (flex_group[i].free_blocks >
-		     flex_group[best_flex].free_blocks &&
-		     flex_group[i].free_inodes))
+		if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
+		    ((atomic_read(&flex_group[i].free_blocks) >
+		      atomic_read(&flex_group[best_flex].free_blocks)) &&
+		     atomic_read(&flex_group[i].free_inodes)))
 			best_flex = i;
 	}
 
-	if (!flex_group[best_flex].free_inodes ||
-	    !flex_group[best_flex].free_blocks)
+	if (!atomic_read(&flex_group[best_flex].free_inodes) ||
+	    !atomic_read(&flex_group[best_flex].free_blocks))
 		return -1;
 
 found_flexbg:
@@ -408,6 +416,42 @@ out:
 	return 0;
 }
 
+struct orlov_stats {
+	__u32 free_inodes;
+	__u32 free_blocks;
+	__u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg.  If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+		       int flex_size, struct orlov_stats *stats)
+{
+	struct ext4_group_desc *desc;
+	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
+
+	if (flex_size > 1) {
+		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+		stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+		return;
+	}
+
+	desc = ext4_get_group_desc(sb, g, NULL);
+	if (desc) {
+		stats->free_inodes = ext4_free_inodes_count(sb, desc);
+		stats->free_blocks = ext4_free_blks_count(sb, desc);
+		stats->used_dirs = ext4_used_dirs_count(sb, desc);
+	} else {
+		stats->free_inodes = 0;
+		stats->free_blocks = 0;
+		stats->used_dirs = 0;
+	}
+}
+
 /*
  * Orlov's allocator for directories.
  *
@@ -423,35 +467,34 @@ out:
  * it has too many directories already (max_dirs) or
  * it has too few free inodes left (min_inodes) or
  * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
  * Parent's group is preferred, if it doesn't satisfy these
  * conditions we search cyclically through the rest. If none
  * of the groups look good we just look for a group with more
  * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
  */
 
-#define INODE_COST 64
-#define BLOCK_COST 256
-
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-				ext4_group_t *group)
+			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_super_block *es = sbi->s_es;
 	ext4_group_t ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 	unsigned int freei, avefreei;
 	ext4_fsblk_t freeb, avefreeb;
-	ext4_fsblk_t blocks_per_dir;
 	unsigned int ndirs;
-	int max_debt, max_dirs, min_inodes;
+	int max_dirs, min_inodes;
 	ext4_grpblk_t min_blocks;
-	ext4_group_t i;
+	ext4_group_t i, grp, g;
 	struct ext4_group_desc *desc;
+	struct orlov_stats stats;
+	int flex_size = ext4_flex_bg_size(sbi);
+
+	if (flex_size > 1) {
+		ngroups = (ngroups + flex_size - 1) >>
+			sbi->s_log_groups_per_flex;
+		parent_group >>= sbi->s_log_groups_per_flex;
+	}
 
 	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
@@ -460,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	do_div(avefreeb, ngroups);
 	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
-	if ((parent == sb->s_root->d_inode) ||
-	    (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+	if (S_ISDIR(mode) &&
+	    ((parent == sb->s_root->d_inode) ||
+	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
 		int best_ndir = inodes_per_group;
-		ext4_group_t grp;
 		int ret = -1;
 
 		get_random_bytes(&grp, sizeof(grp));
 		parent_group = (unsigned)grp % ngroups;
 		for (i = 0; i < ngroups; i++) {
-			grp = (parent_group + i) % ngroups;
-			desc = ext4_get_group_desc(sb, grp, NULL);
-			if (!desc || !ext4_free_inodes_count(sb, desc))
+			g = (parent_group + i) % ngroups;
+			get_orlov_stats(sb, g, flex_size, &stats);
+			if (!stats.free_inodes)
 				continue;
-			if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+			if (stats.used_dirs >= best_ndir)
 				continue;
-			if (ext4_free_inodes_count(sb, desc) < avefreei)
+			if (stats.free_inodes < avefreei)
 				continue;
-			if (ext4_free_blks_count(sb, desc) < avefreeb)
+			if (stats.free_blocks < avefreeb)
 				continue;
-			*group = grp;
+			grp = g;
 			ret = 0;
-			best_ndir = ext4_used_dirs_count(sb, desc);
+			best_ndir = stats.used_dirs;
+		}
+		if (ret)
+			goto fallback;
+	found_flex_bg:
+		if (flex_size == 1) {
+			*group = grp;
+			return 0;
+		}
+
+		/*
+		 * We pack inodes at the beginning of the flexgroup's
+		 * inode tables.  Block allocation decisions will do
+		 * something similar, although regular files will
+		 * start at 2nd block group of the flexgroup.  See
+		 * ext4_ext_find_goal() and ext4_find_near().
+		 */
+		grp *= flex_size;
+		for (i = 0; i < flex_size; i++) {
+			if (grp+i >= sbi->s_groups_count)
+				break;
+			desc = ext4_get_group_desc(sb, grp+i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = grp+i;
+				return 0;
+			}
 		}
-		if (ret == 0)
-			return ret;
 		goto fallback;
 	}
 
-	blocks_per_dir = ext4_blocks_count(es) - freeb;
-	do_div(blocks_per_dir, ndirs);
-
 	max_dirs = ndirs / ngroups + inodes_per_group / 16;
-	min_inodes = avefreei - inodes_per_group / 4;
-	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
-	max_debt = EXT4_BLOCKS_PER_GROUP(sb);
-	max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
-	if (max_debt * INODE_COST > inodes_per_group)
-		max_debt = inodes_per_group / INODE_COST;
-	if (max_debt > 255)
-		max_debt = 255;
-	if (max_debt == 0)
-		max_debt = 1;
+	min_inodes = avefreei - inodes_per_group*flex_size / 4;
+	if (min_inodes < 1)
+		min_inodes = 1;
+	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+	/*
+	 * Start looking in the flex group where we last allocated an
+	 * inode for this parent directory
+	 */
+	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+		parent_group = EXT4_I(parent)->i_last_alloc_group;
+		if (flex_size > 1)
+			parent_group >>= sbi->s_log_groups_per_flex;
+	}
 
 	for (i = 0; i < ngroups; i++) {
-		*group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (!desc || !ext4_free_inodes_count(sb, desc))
+		grp = (parent_group + i) % ngroups;
+		get_orlov_stats(sb, grp, flex_size, &stats);
+		if (stats.used_dirs >= max_dirs)
 			continue;
-		if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+		if (stats.free_inodes < min_inodes)
 			continue;
-		if (ext4_free_inodes_count(sb, desc) < min_inodes)
+		if (stats.free_blocks < min_blocks)
 			continue;
-		if (ext4_free_blks_count(sb, desc) < min_blocks)
-			continue;
-		return 0;
+		goto found_flex_bg;
 	}
 
 fallback:
+	ngroups = sbi->s_groups_count;
+	avefreei = freei / ngroups;
+	parent_group = EXT4_I(parent)->i_block_group;
 	for (i = 0; i < ngroups; i++) {
-		*group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc(sb, *group, NULL);
+		grp = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc(sb, grp, NULL);
 		if (desc && ext4_free_inodes_count(sb, desc) &&
-			ext4_free_inodes_count(sb, desc) >= avefreei)
+		    ext4_free_inodes_count(sb, desc) >= avefreei) {
+			*group = grp;
 			return 0;
+		}
 	}
 
 	if (avefreei) {
@@ -540,12 +609,51 @@ fallback:
 }
 
 static int find_group_other(struct super_block *sb, struct inode *parent,
-				ext4_group_t *group)
+			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	struct ext4_group_desc *desc;
-	ext4_group_t i;
+	ext4_group_t i, last;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+	/*
+	 * Try to place the inode is the same flex group as its
+	 * parent.  If we can't find space, use the Orlov algorithm to
+	 * find another flex group, and store that information in the
+	 * parent directory's inode information so that use that flex
+	 * group for future allocations.
+	 */
+	if (flex_size > 1) {
+		int retry = 0;
+
+	try_again:
+		parent_group &= ~(flex_size-1);
+		last = parent_group + flex_size;
+		if (last > ngroups)
+			last = ngroups;
+		for  (i = parent_group; i < last; i++) {
+			desc = ext4_get_group_desc(sb, i, NULL);
+			if (desc && ext4_free_inodes_count(sb, desc)) {
+				*group = i;
+				return 0;
+			}
+		}
+		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+			retry = 1;
+			parent_group = EXT4_I(parent)->i_last_alloc_group;
+			goto try_again;
+		}
+		/*
+		 * If this didn't work, use the Orlov search algorithm
+		 * to find a new flex group; we pass in the mode to
+		 * avoid the topdir algorithms.
+		 */
+		*group = parent_group + flex_size;
+		if (*group > ngroups)
+			*group = 0;
+		return find_group_orlov(sb, parent, group, mode);
+	}
 
 	/*
 	 * Try to place the inode in its parent directory
@@ -663,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
 	if (S_ISDIR(mode)) {
 		count = ext4_used_dirs_count(sb, gdp) + 1;
 		ext4_used_dirs_set(sb, gdp, count);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		}
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
@@ -696,6 +809,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	struct inode *ret;
 	ext4_group_t i;
 	int free = 0;
+	static int once = 1;
 	ext4_group_t flex_group;
 
 	/* Cannot create files in a deleted directory */
@@ -713,8 +827,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	sbi = EXT4_SB(sb);
 	es = sbi->s_es;
 
-	if (sbi->s_log_groups_per_flex) {
+	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
 		ret2 = find_group_flex(sb, dir, &group);
+		if (ret2 == -1) {
+			ret2 = find_group_other(sb, dir, &group, mode);
+			if (ret2 == 0 && once)
+				once = 0;
+				printk(KERN_NOTICE "ext4: find_group_flex "
+				       "failed, fallback succeeded dir %lu\n",
+				       dir->i_ino);
+		}
 		goto got_group;
 	}
 
@@ -722,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		if (test_opt(sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
 		else
-			ret2 = find_group_orlov(sb, dir, &group);
+			ret2 = find_group_orlov(sb, dir, &group, mode);
 	} else
-		ret2 = find_group_other(sb, dir, &group);
+		ret2 = find_group_other(sb, dir, &group, mode);
 
 got_group:
+	EXT4_I(dir)->i_last_alloc_group = group;
 	err = -ENOSPC;
 	if (ret2 == -1)
 		goto out;
@@ -847,9 +970,7 @@ got:
 
 	if (sbi->s_log_groups_per_flex) {
 		flex_group = ext4_flex_group(sbi, group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_inodes--;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
 	inode->i_uid = current_fsuid();
@@ -874,19 +995,16 @@ got:
 	ei->i_disksize = 0;
 
 	/*
-	 * Don't inherit extent flag from directory. We set extent flag on
-	 * newly created directory and file only if -o extent mount option is
-	 * specified
+	 * Don't inherit extent flag from directory, amongst others. We set
+	 * extent flag on newly created directory and file only if -o extent
+	 * mount option is specified
 	 */
-	ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
-	if (S_ISLNK(mode))
-		ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
-	/* dirsync only applies to directories */
-	if (!S_ISDIR(mode))
-		ei->i_flags &= ~EXT4_DIRSYNC_FL;
+	ei->i_flags =
+		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
 	ei->i_block_group = group;
+	ei->i_last_alloc_group = ~0;
 
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
@@ -904,7 +1022,7 @@ got:
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -945,10 +1063,10 @@ really_out:
 	return ret;
 
 fail_free_drop:
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20be1329..a2e7952bc5f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+					EXT4_SB(inode->i_sb)->s_journal,
+					&EXT4_I(inode)->jinode,
+					new_size);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -369,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
 	return n;
 }
 
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+				 unsigned int *p, unsigned int max) {
+
+	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+	unsigned int *bref = p;
+	while (bref < p+max) {
+		if (unlikely(*bref >= maxblocks)) {
+			ext4_error(inode->i_sb, function,
+				   "block reference %u >= max (%u) "
+				   "in inode #%lu, offset=%d",
+				   *bref, maxblocks,
+				   inode->i_ino, (int)(bref-p));
+ 			return -EIO;
+ 		}
+		bref++;
+ 	}
+ 	return 0;
+}
+
+
+#define ext4_check_indirect_blockref(inode, bh)                         \
+        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_check_inode_blockref(inode)                                \
+        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+			      EXT4_NDIR_BLOCKS)
+
 /**
  *	ext4_get_branch - read the chain of indirect blocks leading to data
  *	@inode: inode in question
@@ -413,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
-		bh = sb_bread(sb, le32_to_cpu(p->key));
-		if (!bh)
+		bh = sb_getblk(sb, le32_to_cpu(p->key));
+		if (unlikely(!bh))
 			goto failure;
+                  
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto failure;
+			}
+			/* validate block references */
+			if (ext4_check_indirect_blockref(inode, bh)) {
+				put_bh(bh);
+				goto failure;
+			}
+		}
+		
 		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
@@ -457,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -472,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -973,6 +1031,17 @@ out:
 	return err;
 }
 
+qsize_t ext4_get_reserved_space(struct inode *inode)
+{
+	unsigned long long total;
+
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	total = EXT4_I(inode)->i_reserved_data_blocks +
+		EXT4_I(inode)->i_reserved_meta_blocks;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	return total;
+}
 /*
  * Calculate the number of metadata blocks need to reserve
  * to allocate @blocks for non extent file based file
@@ -1034,8 +1103,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 	/* update per-inode reservations */
 	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
 	EXT4_I(inode)->i_reserved_data_blocks -= used;
-
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	/*
+	 * free those over-booking quota for metadata blocks
+	 */
+	if (mdb_free)
+		vfs_dq_release_reservation_block(inode, mdb_free);
+
+	/*
+	 * If we have done all the pending block allocations and if
+	 * there aren't any writers on the inode, we can discard the
+	 * inode's preallocations.
+	 */
+	if (!total && (atomic_read(&inode->i_writecount) == 0))
+		ext4_discard_preallocations(inode);
 }
 
 /*
@@ -1366,6 +1448,10 @@ retry:
 		goto out;
 	}
 
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
@@ -1375,7 +1461,7 @@ retry:
 	*pagep = page;
 
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext4_get_block);
+				ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -1547,8 +1633,8 @@ static int ext4_journalled_write_end(struct file *file,
 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
 	int retries = 0;
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned long md_needed, mdblocks, total = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned long md_needed, mdblocks, total = 0;
 
 	/*
 	 * recalculate the amount of metadata blocks to reserve
@@ -1564,12 +1650,23 @@ repeat:
 	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
 	total = md_needed + nrblocks;
 
+	/*
+	 * Make quota reservation here to prevent quota overflow
+	 * later. Real quota accounting is done at pages writeout
+	 * time.
+	 */
+	if (vfs_dq_reserve_block(inode, total)) {
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		return -EDQUOT;
+	}
+
 	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
 		}
+		vfs_dq_release_reservation_block(inode, total);
 		return -ENOSPC;
 	}
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1623,6 +1720,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	vfs_dq_release_reservation_block(inode, release);
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -1652,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
 
 struct mpage_da_data {
 	struct inode *inode;
-	struct buffer_head lbh;			/* extent of blocks */
+	sector_t b_blocknr;		/* start block number of extent */
+	size_t b_size;			/* size of extent */
+	unsigned long b_state;		/* state of the extent */
 	unsigned long first_page, next_page;	/* extent of pages */
-	get_block_t *get_block;
 	struct writeback_control *wbc;
 	int io_done;
 	int pages_written;
@@ -1668,7 +1768,6 @@ struct mpage_da_data {
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
  * @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
  *
  * By the time mpage_da_submit_io() is called we expect all blocks
  * to be allocated. this may be wrong if allocation failed.
@@ -1688,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	/*
 	 * We need to start from the first_page to the next_page - 1
 	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->lbh.b_blocknr we would only be looking
+	 * If we look at mpd->b_blocknr we would only be looking
 	 * at the currently mapped buffer_heads.
 	 */
 	index = mpd->first_page;
@@ -1878,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
+#define		EXT4_DELALLOC_RSVED	1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int ret;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+	if (ret <= 0)
+		return ret;
+
+	bh_result->b_size = (ret << inode->i_blkbits);
+
+	if (ext4_should_order_data(inode)) {
+		int retval;
+		retval = ext4_jbd2_file_inode(handle, inode);
+		if (retval)
+			/*
+			 * Failed to add inode for ordered mode. Don't
+			 * update file size
+			 */
+			return retval;
+	}
+
+	/*
+	 * Update on-disk size along with block allocation we don't
+	 * use 'extend_disksize' as size may change within already
+	 * allocated block -bzzz
+	 */
+	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+	if (disksize > i_size_read(inode))
+		disksize = i_size_read(inode);
+	if (disksize > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, disksize);
+		ret = ext4_mark_inode_dirty(handle, inode);
+		return ret;
+	}
+	return 0;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
- * @mpd->lbh - bh describing space
- * @mpd->get_block - the filesystem's block mapper function
+ * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err = 0;
 	struct buffer_head new;
-	struct buffer_head *lbh = &mpd->lbh;
 	sector_t next;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
 	 */
-	if (buffer_mapped(lbh) && !buffer_delay(lbh))
+	if ((mpd->b_state  & (1 << BH_Mapped)) &&
+	    !(mpd->b_state & (1 << BH_Delay)))
 		return 0;
-	new.b_state = lbh->b_state;
+	new.b_state = mpd->b_state;
 	new.b_blocknr = 0;
-	new.b_size = lbh->b_size;
-	next = lbh->b_blocknr;
+	new.b_size = mpd->b_size;
+	next = mpd->b_blocknr;
 	/*
 	 * If we didn't accumulate anything
 	 * to write simply return
 	 */
 	if (!new.b_size)
 		return 0;
-	err = mpd->get_block(mpd->inode, next, &new, 1);
-	if (err) {
 
-		/* If get block returns with error
-		 * we simply return. Later writepage
-		 * will redirty the page and writepages
-		 * will find the dirty page again
+	err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+	if (err) {
+		/*
+		 * If get block returns with error we simply
+		 * return. Later writepage will redirty the page and
+		 * writepages will find the dirty page again
 		 */
 		if (err == -EAGAIN)
 			return 0;
 
 		if (err == -ENOSPC &&
-				ext4_count_free_blocks(mpd->inode->i_sb)) {
+		    ext4_count_free_blocks(mpd->inode->i_sb)) {
 			mpd->retval = err;
 			return 0;
 		}
 
 		/*
-		 * get block failure will cause us
-		 * to loop in writepages. Because
-		 * a_ops->writepage won't be able to
-		 * make progress. The page will be redirtied
-		 * by writepage and writepages will again
-		 * try to write the same.
+		 * get block failure will cause us to loop in
+		 * writepages, because a_ops->writepage won't be able
+		 * to make progress. The page will be redirtied by
+		 * writepage and writepages will again try to write
+		 * the same.
 		 */
 		printk(KERN_EMERG "%s block allocation failed for inode %lu "
 				  "at logical offset %llu with max blocks "
 				  "%zd with error %d\n",
 				  __func__, mpd->inode->i_ino,
 				  (unsigned long long)next,
-				  lbh->b_size >> mpd->inode->i_blkbits, err);
+				  mpd->b_size >> mpd->inode->i_blkbits, err);
 		printk(KERN_EMERG "This should not happen.!! "
 					"Data will be lost\n");
 		if (err == -ENOSPC) {
@@ -1947,7 +2089,7 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 		}
 		/* invlaidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
-				lbh->b_size >> mpd->inode->i_blkbits);
+				mpd->b_size >> mpd->inode->i_blkbits);
 		return err;
 	}
 	BUG_ON(new.b_size == 0);
@@ -1959,7 +2101,8 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * If blocks are delayed marked, we need to
 	 * put actual blocknr and drop delayed bit
 	 */
-	if (buffer_delay(lbh) || buffer_unwritten(lbh))
+	if ((mpd->b_state & (1 << BH_Delay)) ||
+	    (mpd->b_state & (1 << BH_Unwritten)))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
 	return 0;
@@ -1978,12 +2121,11 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
  * the function is used to collect contig. blocks in same state
  */
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-				   sector_t logical, struct buffer_head *bh)
+				   sector_t logical, size_t b_size,
+				   unsigned long b_state)
 {
 	sector_t next;
-	size_t b_size = bh->b_size;
-	struct buffer_head *lbh = &mpd->lbh;
-	int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 
 	/* check if thereserved journal credits might overflow */
 	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2010,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	/*
 	 * First block in the extent
 	 */
-	if (lbh->b_size == 0) {
-		lbh->b_blocknr = logical;
-		lbh->b_size = b_size;
-		lbh->b_state = bh->b_state & BH_FLAGS;
+	if (mpd->b_size == 0) {
+		mpd->b_blocknr = logical;
+		mpd->b_size = b_size;
+		mpd->b_state = b_state & BH_FLAGS;
 		return;
 	}
 
-	next = lbh->b_blocknr + nrblocks;
+	next = mpd->b_blocknr + nrblocks;
 	/*
 	 * Can we merge the block to our big extent?
 	 */
-	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
-		lbh->b_size += b_size;
+	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+		mpd->b_size += b_size;
 		return;
 	}
 
@@ -2051,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
 {
 	struct mpage_da_data *mpd = data;
 	struct inode *inode = mpd->inode;
-	struct buffer_head *bh, *head, fake;
+	struct buffer_head *bh, *head;
 	sector_t logical;
 
 	if (mpd->io_done) {
@@ -2093,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
 		/*
 		 * ... and blocks
 		 */
-		mpd->lbh.b_size = 0;
-		mpd->lbh.b_state = 0;
-		mpd->lbh.b_blocknr = 0;
+		mpd->b_size = 0;
+		mpd->b_state = 0;
+		mpd->b_blocknr = 0;
 	}
 
 	mpd->next_page = page->index + 1;
@@ -2103,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
 		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
 
 	if (!page_has_buffers(page)) {
-		/*
-		 * There is no attached buffer heads yet (mmap?)
-		 * we treat the page asfull of dirty blocks
-		 */
-		bh = &fake;
-		bh->b_size = PAGE_CACHE_SIZE;
-		bh->b_state = 0;
-		set_buffer_dirty(bh);
-		set_buffer_uptodate(bh);
-		mpage_add_bh_to_extent(mpd, logical, bh);
+		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
+				       (1 << BH_Dirty) | (1 << BH_Uptodate));
 		if (mpd->io_done)
 			return MPAGE_DA_EXTENT_TAIL;
 	} else {
@@ -2130,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
 			 * with the page in ext4_da_writepage
 			 */
 			if (buffer_dirty(bh) &&
-				(!buffer_mapped(bh) || buffer_delay(bh))) {
-				mpage_add_bh_to_extent(mpd, logical, bh);
+			    (!buffer_mapped(bh) || buffer_delay(bh))) {
+				mpage_add_bh_to_extent(mpd, logical,
+						       bh->b_size,
+						       bh->b_state);
 				if (mpd->io_done)
 					return MPAGE_DA_EXTENT_TAIL;
 			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2143,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
 				 * unmapped buffer_head later we need to
 				 * use the b_state flag of that buffer_head.
 				 */
-				if (mpd->lbh.b_size == 0)
-					mpd->lbh.b_state =
-						bh->b_state & BH_FLAGS;
+				if (mpd->b_size == 0)
+					mpd->b_state = bh->b_state & BH_FLAGS;
 			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
@@ -2155,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
 }
 
 /*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
-			       struct writeback_control *wbc,
-			       struct mpage_da_data *mpd)
-{
-	int ret;
-
-	if (!mpd->get_block)
-		return generic_writepages(mapping, wbc);
-
-	mpd->lbh.b_size = 0;
-	mpd->lbh.b_state = 0;
-	mpd->lbh.b_blocknr = 0;
-	mpd->first_page = 0;
-	mpd->next_page = 0;
-	mpd->io_done = 0;
-	mpd->pages_written = 0;
-	mpd->retval = 0;
-
-	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-	/*
-	 * Handle last extent of pages
-	 */
-	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
-		if (mpage_da_map_blocks(mpd) == 0)
-			mpage_da_submit_io(mpd);
-
-		mpd->io_done = 1;
-		ret = MPAGE_DA_EXTENT_TAIL;
-	}
-	wbc->nr_to_write -= mpd->pages_written;
-	return ret;
-}
-
-/*
  * this is a special callback for ->write_begin() only
  * it's intention is to return mapped block or reserve space
  */
@@ -2238,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 
 	return ret;
 }
-#define		EXT4_DELALLOC_RSVED	1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
-{
-	int ret;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	loff_t disksize = EXT4_I(inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-			bh_result, create, 0, EXT4_DELALLOC_RSVED);
-	if (ret > 0) {
-
-		bh_result->b_size = (ret << inode->i_blkbits);
-
-		if (ext4_should_order_data(inode)) {
-			int retval;
-			retval = ext4_jbd2_file_inode(handle, inode);
-			if (retval)
-				/*
-				 * Failed to add inode for ordered
-				 * mode. Don't update file size
-				 */
-				return retval;
-		}
-
-		/*
-		 * Update on-disk size along with block allocation
-		 * we don't use 'extend_disksize' as size may change
-		 * within already allocated block -bzzz
-		 */
-		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-		if (disksize > i_size_read(inode))
-			disksize = i_size_read(inode);
-		if (disksize > EXT4_I(inode)->i_disksize) {
-			ext4_update_i_disksize(inode, disksize);
-			ret = ext4_mark_inode_dirty(handle, inode);
-			return ret;
-		}
-		ret = 0;
-	}
-	return ret;
-}
 
 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
 {
@@ -2437,6 +2482,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
+	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
@@ -2488,9 +2534,15 @@ static int ext4_da_writepages(struct address_space *mapping,
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
-	if (wbc->range_cyclic)
+	range_cyclic = wbc->range_cyclic;
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index;
-	else
+		if (index)
+			cycled = 0;
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
+		wbc->range_end  = LLONG_MAX;
+		wbc->range_cyclic = 0;
+	} else
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
 	mpd.wbc = wbc;
@@ -2504,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	wbc->no_nrwrite_index_update = 1;
 	pages_skipped = wbc->pages_skipped;
 
+retry:
 	while (!ret && wbc->nr_to_write > 0) {
 
 		/*
@@ -2525,12 +2578,42 @@ static int ext4_da_writepages(struct address_space *mapping,
 			dump_stack();
 			goto out_writepages;
 		}
-		mpd.get_block = ext4_da_get_block_write;
-		ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+		/*
+		 * Now call __mpage_da_writepage to find the next
+		 * contiguous region of logical blocks that need
+		 * blocks to be allocated by ext4.  We don't actually
+		 * submit the blocks for I/O here, even though
+		 * write_cache_pages thinks it will, and will set the
+		 * pages as clean for write before calling
+		 * __mpage_da_writepage().
+		 */
+		mpd.b_size = 0;
+		mpd.b_state = 0;
+		mpd.b_blocknr = 0;
+		mpd.first_page = 0;
+		mpd.next_page = 0;
+		mpd.io_done = 0;
+		mpd.pages_written = 0;
+		mpd.retval = 0;
+		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+					&mpd);
+		/*
+		 * If we have a contigous extent of pages and we
+		 * haven't done the I/O yet, map the blocks and submit
+		 * them for I/O.
+		 */
+		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+			if (mpage_da_map_blocks(&mpd) == 0)
+				mpage_da_submit_io(&mpd);
+			mpd.io_done = 1;
+			ret = MPAGE_DA_EXTENT_TAIL;
+		}
+		wbc->nr_to_write -= mpd.pages_written;
 
 		ext4_journal_stop(handle);
 
-		if (mpd.retval == -ENOSPC) {
+		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
 			/* commit the transaction which would
 			 * free blocks released in the transaction
 			 * and try again
@@ -2546,6 +2629,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 			pages_written += mpd.pages_written;
 			wbc->pages_skipped = pages_skipped;
 			ret = 0;
+			io_done = 1;
 		} else if (wbc->nr_to_write)
 			/*
 			 * There is no more writeout needed
@@ -2554,6 +2638,13 @@ static int ext4_da_writepages(struct address_space *mapping,
 			 */
 			break;
 	}
+	if (!io_done && !cycled) {
+		cycled = 1;
+		index = 0;
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
+		wbc->range_end  = mapping->writeback_index - 1;
+		goto retry;
+	}
 	if (pages_skipped != wbc->pages_skipped)
 		printk(KERN_EMERG "This should not happen leaving %s "
 				"with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2652,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 
 	/* Update index */
 	index += pages_written;
+	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
 		 * set the writeback_index so that range_cyclic
@@ -2648,6 +2740,9 @@ retry:
 		ret = PTR_ERR(handle);
 		goto out;
 	}
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
@@ -2790,6 +2885,48 @@ out:
 	return;
 }
 
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+	if (!EXT4_I(inode)->i_reserved_data_blocks &&
+	    !EXT4_I(inode)->i_reserved_meta_blocks)
+		return 0;
+
+	/*
+	 * We do something simple for now.  The filemap_flush() will
+	 * also start triggering a write of the data blocks, which is
+	 * not strictly speaking necessary (and for users of
+	 * laptop_mode, not even desirable).  However, to do otherwise
+	 * would require replicating code paths in:
+	 * 
+	 * ext4_da_writepages() ->
+	 *    write_cache_pages() ---> (via passed in callback function)
+	 *        __mpage_da_writepage() -->
+	 *           mpage_add_bh_to_extent()
+	 *           mpage_da_map_blocks()
+	 *
+	 * The problem is that write_cache_pages(), located in
+	 * mm/page-writeback.c, marks pages clean in preparation for
+	 * doing I/O, which is not desirable if we're not planning on
+	 * doing I/O at all.
+	 *
+	 * We could call write_cache_pages(), and then redirty all of
+	 * the pages by calling redirty_page_for_writeback() but that
+	 * would be ugly in the extreme.  So instead we would need to
+	 * replicate parts of the code in the above functions,
+	 * simplifying them becuase we wouldn't actually intend to
+	 * write out the pages, but rather only collect contiguous
+	 * logical block extents, call the multi-block allocator, and
+	 * then update the buffer heads with the block allocations.
+	 * 
+	 * For now, though, we'll cheat by calling filemap_flush(),
+	 * which will map the blocks, and start the I/O, but not
+	 * actually wait for the I/O to complete.
+	 */
+	return filemap_flush(inode->i_mapping);
+}
 
 /*
  * bmap() is special.  It gets used by applications such as lilo and by
@@ -3812,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		ext4_ext_truncate(inode);
 		return;
@@ -4054,12 +4194,7 @@ make_io:
 			unsigned num;
 
 			table = ext4_inode_table(sb, gdp);
-			/* Make sure s_inode_readahead_blks is a power of 2 */
-			while (EXT4_SB(sb)->s_inode_readahead_blks &
-			       (EXT4_SB(sb)->s_inode_readahead_blks-1))
-				EXT4_SB(sb)->s_inode_readahead_blks = 
-				   (EXT4_SB(sb)->s_inode_readahead_blks &
-				    (EXT4_SB(sb)->s_inode_readahead_blks-1));
+			/* s_inode_readahead_blks is always a power of 2 */
 			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
 			if (table > b)
 				b = table;
@@ -4231,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 	ei->i_block_group = iloc.block_group;
+	ei->i_last_alloc_group = ~0;
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
 	 * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4273,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 
+	if (ei->i_flags & EXT4_EXTENTS_FL) {
+		/* Validate extent which is part of inode */
+		ret = ext4_ext_check_inode(inode);
+ 	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		   (S_ISLNK(inode->i_mode) &&
+		    !ext4_inode_is_fast_symlink(inode))) {
+	 	/* Validate block references which are part of inode */
+		ret = ext4_check_inode_blockref(inode);
+	}
+	if (ret) {
+ 		brelse(bh);
+ 		goto bad_inode;
+	}
+
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
@@ -4289,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
-	} else {
+	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 		inode->i_op = &ext4_special_inode_operations;
 		if (raw_inode->i_block[0])
 			init_special_inode(inode, inode->i_mode,
@@ -4297,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		else
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	} else {
+		brelse(bh);
+		ret = -EIO;
+		ext4_error(inode->i_sb, __func__, 
+			   "bogus i_mode (%o) for inode=%lu",
+			   inode->i_mode, inode->i_ino);
+		goto bad_inode;
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
@@ -4586,7 +4744,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
@@ -4965,7 +5123,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
@@ -5090,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
 	return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
@@ -5143,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		goto out_unlock;
 	ret = 0;
 out_unlock:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	up_read(&inode->i_alloc_sem);
 	return ret;
 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247a..91e75f7a9e73 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (err)
 			return err;
 
-		if (!S_ISDIR(inode->i_mode))
-			flags &= ~EXT4_DIRSYNC_FL;
+		flags = ext4_mask_flags(inode->i_mode, flags);
 
 		err = -EPERM;
 		mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
 		return err;
 	}
 
+	case EXT4_IOC_ALLOC_DA_BLKS:
+	{
+		int err;
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		err = ext4_alloc_da_blocks(inode);
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
+	}
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f6cbed..f871677a7984 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
  * The allocation request involve request for multiple number of blocks
  * near to the goal(block) value specified.
  *
- * During initialization phase of the allocator we decide to use the group
- * preallocation or inode preallocation depending on the size file. The
- * size of the file could be the resulting file size we would have after
- * allocation or the current file size which ever is larger. If the size is
- * less that sbi->s_mb_stream_request we select the group
- * preallocation. The default value of s_mb_stream_request is 16
- * blocks. This can also be tuned via
- * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
- * of number of blocks.
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
  *
  * The main motivation for having small file use group preallocation is to
- * ensure that we have small file closer in the disk.
+ * ensure that we have small files closer together on the disk.
  *
- * First stage the allocator looks at the inode prealloc list
- * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
- * this particular inode. The inode prealloc space is represented as:
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
  *
  * pa_lstart -> the logical start block for this prealloc space
  * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
  * list. In case of inode preallocation we follow a list of heuristics
  * based on file size. This can be found in ext4_mb_normalize_request. If
  * we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
  * 512 blocks. This can be tuned via
- * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
  * terms of number of blocks. If we have mounted the file system with -O
  * stripe=<value> option the group prealloc request is normalized to the
  * stripe value (sbi->s_stripe)
  *
- * The regular allocator(using the buddy cache) support few tunables.
+ * The regular allocator(using the buddy cache) supports few tunables.
  *
- * /proc/fs/ext4/<partition>/min_to_scan
- * /proc/fs/ext4/<partition>/max_to_scan
- * /proc/fs/ext4/<partition>/order2_req
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
  *
- * The regular allocator use buddy scan only if the request len is power of
+ * The regular allocator uses buddy scan only if the request len is power of
  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  * value of s_mb_order2_reqs can be tuned via
- * /proc/fs/ext4/<partition>/order2_req.  If the request len is equal to
+ * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
  * stripe size (sbi->s_stripe), we try to search for contigous block in
- * stripe size. This should result in better allocation on RAID setup. If
- * not we search in the specific group using bitmap for best extents. The
- * tunable min_to_scan and max_to_scan controll the behaviour here.
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
  * min_to_scan indicate how long the mballoc __must__ look for a best
- * extent and max_to_scanindicate how long the mballoc __can__ look for a
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
  * best extent in the found extents. Searching for the blocks starts with
  * the group specified as the goal value in allocation context via
  * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 
 
@@ -1447,7 +1446,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
 	struct ext4_free_extent *gex = &ac->ac_g_ex;
 
 	BUG_ON(ex->fe_len <= 0);
-	BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 	BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
 
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 {
 	unsigned free, fragments;
 	unsigned i, bits;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_desc *desc;
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
 			return 0;
 
+		/* Avoid using the first bg of a flexgroup for data files */
+		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+		    ((group % flex_size) == 0))
+			return 0;
+
 		bits = ac->ac_sb->s_blocksize_bits + 1;
 		for (i = ac->ac_2order; i <= bits; i++)
 			if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	/*
 	 * We search using buddy data only if the order of the request
 	 * is greater than equal to the sbi_s_mb_order2_reqs
-	 * You can tune it via /proc/fs/ext4/<partition>/order2_req
+	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
 	 */
 	if (i >= sbi->s_mb_order2_reqs) {
 		/*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		kfree(sbi->s_mb_maxs);
+		kfree(sbi->s_mb_offsets);
 		return -ENOMEM;
 	}
 
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
-	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 
 	if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
 
 	free_percpu(sbi->s_locality_groups);
 	ext4_mb_history_release(sb);
-	ext4_mb_destroy_per_dev_proc(sb);
 
 	return 0;
 }
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
 
-#define EXT4_MB_STATS_NAME		"stats"
-#define EXT4_MB_MAX_TO_SCAN_NAME	"max_to_scan"
-#define EXT4_MB_MIN_TO_SCAN_NAME	"min_to_scan"
-#define EXT4_MB_ORDER2_REQ		"order2_req"
-#define EXT4_MB_STREAM_REQ		"stream_req"
-#define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
-
-static int ext4_mb_init_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct proc_dir_entry *proc;
-
-	if (sbi->s_proc == NULL)
-		return -EINVAL;
-
-	EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
-	EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
-	EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
-	EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
-	EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
-	EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
-	return 0;
-
-err_out:
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-	return -ENOMEM;
-#else
-	return 0;
-#endif
-}
-
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-	if (sbi->s_proc == NULL)
-		return -EINVAL;
-
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-#endif
-	return 0;
-}
-
 int __init init_ext4_mballoc(void)
 {
 	ext4_pspace_cachep =
@@ -3086,16 +3034,18 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
 		/* release all the reserved blocks if non delalloc */
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
-	else
+	else {
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 						ac->ac_b_ex.fe_len);
+		/* convert reserved quota blocks to real quota blocks */
+		vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
+	}
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
 							  ac->ac_b_ex.fe_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_sub(ac->ac_b_ex.fe_len,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 
 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3113,7 +3063,7 @@ out_err:
  * here we normalize request for locality group
  * Group request are normalized to s_strip size if we set the same via mount
  * option. If not we set it to s_mb_group_prealloc which can be configured via
- * /proc/fs/ext4/<partition>/group_prealloc
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
  *
  * XXX: should we try to preallocate more than the group has now?
  */
@@ -3292,7 +3242,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	}
 	BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
 			start > ac->ac_o_ex.fe_logical);
-	BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 
 	/* now prepare goal request */
 
@@ -3589,6 +3539,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 			struct super_block *sb, struct ext4_prealloc_space *pa)
 {
 	ext4_group_t grp;
+	ext4_fsblk_t grp_blk;
 
 	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
 		return;
@@ -3603,8 +3554,15 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	pa->pa_deleted = 1;
 	spin_unlock(&pa->pa_lock);
 
-	/* -1 is to protect from crossing allocation group */
-	ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
+	grp_blk = pa->pa_pstart;
+	/* 
+	 * If doing group-based preallocation, pa_pstart may be in the
+	 * next group when pa is used up
+	 */
+	if (pa->pa_type == MB_GROUP_PA)
+		grp_blk--;
+
+	ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
 
 	/*
 	 * possible race:
@@ -3693,8 +3651,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 	pa->pa_free = pa->pa_len;
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
-	pa->pa_linear = 0;
+	pa->pa_type = MB_INODE_PA;
 
 	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3755,8 +3715,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
 	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
-	pa->pa_linear = 1;
+	pa->pa_type = MB_GROUP_PA;
 
 	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
 		 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4010,7 +3971,7 @@ repeat:
 		list_del_rcu(&pa->pa_inode_list);
 		spin_unlock(pa->pa_obj_lock);
 
-		if (pa->pa_linear)
+		if (pa->pa_type == MB_GROUP_PA)
 			ext4_mb_release_group_pa(&e4b, pa, ac);
 		else
 			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4110,7 +4071,7 @@ repeat:
 	spin_unlock(&ei->i_prealloc_lock);
 
 	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
-		BUG_ON(pa->pa_linear != 0);
+		BUG_ON(pa->pa_type != MB_INODE_PA);
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 
 		err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4221,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
  * file is determined by the current size or the resulting size after
  * allocation which ever is larger
  *
- * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
  */
 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 {
@@ -4362,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 			continue;
 		}
 		/* only lg prealloc space */
-		BUG_ON(!pa->pa_linear);
+		BUG_ON(pa->pa_type != MB_GROUP_PA);
 
 		/* seems this one can be freed ... */
 		pa->pa_deleted = 1;
@@ -4431,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
 						pa_inode_list) {
 		spin_lock(&tmp_pa->pa_lock);
 		if (tmp_pa->pa_deleted) {
-			spin_unlock(&pa->pa_lock);
+			spin_unlock(&tmp_pa->pa_lock);
 			continue;
 		}
 		if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4468,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
 	struct ext4_prealloc_space *pa = ac->ac_pa;
 	if (pa) {
-		if (pa->pa_linear) {
+		if (pa->pa_type == MB_GROUP_PA) {
 			/* see comment in ext4_mb_use_group_pa() */
 			spin_lock(&pa->pa_lock);
 			pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4476,23 +4437,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 			pa->pa_free -= ac->ac_b_ex.fe_len;
 			pa->pa_len -= ac->ac_b_ex.fe_len;
 			spin_unlock(&pa->pa_lock);
-			/*
-			 * We want to add the pa to the right bucket.
-			 * Remove it from the list and while adding
-			 * make sure the list to which we are adding
-			 * doesn't grow big.
-			 */
-			if (likely(pa->pa_free)) {
-				spin_lock(pa->pa_obj_lock);
-				list_del_rcu(&pa->pa_inode_list);
-				spin_unlock(pa->pa_obj_lock);
-				ext4_mb_add_n_trim(ac);
-			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->alloc_semp)
 		up_read(ac->alloc_semp);
+	if (pa) {
+		/*
+		 * We want to add the pa to the right bucket.
+		 * Remove it from the list and while adding
+		 * make sure the list to which we are adding
+		 * doesn't grow big.  We need to release
+		 * alloc_semp before calling ext4_mb_add_n_trim()
+		 */
+		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+			spin_lock(pa->pa_obj_lock);
+			list_del_rcu(&pa->pa_inode_list);
+			spin_unlock(pa->pa_obj_lock);
+			ext4_mb_add_n_trim(ac);
+		}
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
+	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
 	if (ac->ac_buddy_page)
@@ -4533,7 +4497,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
-	unsigned int inquota;
+	unsigned int inquota = 0;
 	unsigned int reserv_blks = 0;
 
 	sb = ar->inode->i_sb;
@@ -4551,9 +4515,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		   (unsigned long long) ar->pleft,
 		   (unsigned long long) ar->pright);
 
-	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
-		/*
-		 * With delalloc we already reserved the blocks
+	/*
+	 * For delayed allocation, we could skip the ENOSPC and
+	 * EDQUOT check, as blocks and quotas have been already
+	 * reserved when data being copied into pagecache.
+	 */
+	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+	else {
+		/* Without delayed allocation we need to verify
+		 * there is enough free blocks to do block allocation
+		 * and verify allocation doesn't exceed the quota limits.
 		 */
 		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
 			/* let others to free the space */
@@ -4565,19 +4537,16 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			return 0;
 		}
 		reserv_blks = ar->len;
+		while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) {
+			ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+			ar->len--;
+		}
+		inquota = ar->len;
+		if (ar->len == 0) {
+			*errp = -EDQUOT;
+			goto out3;
+		}
 	}
-	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
-		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
-		ar->len--;
-	}
-	if (ar->len == 0) {
-		*errp = -EDQUOT;
-		goto out3;
-	}
-	inquota = ar->len;
-
-	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
-		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (!ac) {
@@ -4643,8 +4612,8 @@ repeat:
 out2:
 	kmem_cache_free(ext4_ac_cachep, ac);
 out1:
-	if (ar->len < inquota)
-		DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+	if (inquota && ar->len < inquota)
+		vfs_dq_free_block(ar->inode, inquota - ar->len);
 out3:
 	if (!ar->len) {
 		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4917,9 +4886,7 @@ do_more:
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += count;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 
 	ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf14..dd9e6cd5f6cf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
 	ext4_lblk_t		pa_lstart;	/* log. block */
 	unsigned short		pa_len;		/* len of preallocated chunk */
 	unsigned short		pa_free;	/* how many blocks are free */
-	unsigned short		pa_linear;	/* consumed in one direction
-						 * strictly, for grp prealloc */
+	unsigned short		pa_type;	/* pa type. inode or group */
 	spinlock_t		*pa_obj_lock;
 	struct inode		*pa_inode;	/* hack, for history only */
 };
 
+enum {
+	MB_INODE_PA = 0,
+	MB_GROUP_PA = 1
+};
 
 struct ext4_free_extent {
 	ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
-struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca25e35..fe64d9f79852 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
 					+ 1);
 	if (IS_ERR(handle)) {
 		retval = PTR_ERR(handle);
-		goto err_out;
+		return retval;
 	}
 	tmp_inode = ext4_new_inode(handle,
 				inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
 	if (IS_ERR(tmp_inode)) {
 		retval = -ENOMEM;
 		ext4_journal_stop(handle);
-		tmp_inode = NULL;
-		goto err_out;
+		return retval;
 	}
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
@@ -618,8 +617,7 @@ err_out:
 
 	ext4_journal_stop(handle);
 
-	if (tmp_inode)
-		iput(tmp_inode);
+	iput(tmp_inode);
 
 	return retval;
 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ba702bd7910d..22098e1cd085 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
 				 struct dx_frame *frame,
 				 int *err);
 static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
-		struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
+		struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block(struct dx_frame *frame,
 					u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
+unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+	unsigned len = le16_to_cpu(dlen);
+
+	if (len == EXT4_MAX_REC_LEN || len == 0)
+		return blocksize;
+	return (len & 65532) | ((len & 3) << 16);
+}
+  
+__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+		BUG();
+	if (len < 65536)
+		return cpu_to_le16(len);
+	if (len == blocksize) {
+		if (blocksize == 65536)
+			return cpu_to_le16(EXT4_MAX_REC_LEN);
+		else 
+			return cpu_to_le16(0);
+	}
+	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+}
+
 /*
  * p is at least 6 bytes before the end of page
  */
 static inline struct ext4_dir_entry_2 *
-ext4_next_entry(struct ext4_dir_entry_2 *p)
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
 {
 	return (struct ext4_dir_entry_2 *)((char *)p +
-		ext4_rec_len_from_disk(p->rec_len));
+		ext4_rec_len_from_disk(p->rec_len, blocksize));
 }
 
 /*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
 			space += EXT4_DIR_REC_LEN(de->name_len);
 			names++;
 		}
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, size);
 	}
 	printk("(%i)\n", names);
 	return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	top = (struct ext4_dir_entry_2 *) ((char *) de +
 					   dir->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(0));
-	for (; de < top; de = ext4_next_entry(de)) {
+	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
 		if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 					(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
 						+((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	}
 	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
 		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, dir->i_sb->s_blocksize);
 		if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
 			goto errout;
 		count++;
@@ -713,15 +737,15 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
-			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+		       struct dx_hash_info *hinfo,
+		       struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
-	{
+	while ((char *) de < base + blocksize) {
 		if (de->name_len && de->inode) {
 			ext4fs_dirhash(de->name, de->name_len, &h);
 			map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
 			cond_resched();
 		}
 		/* XXX: do we need to check rec_len == 0 case? -Chris */
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, blocksize);
 	}
 	return count;
 }
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
 			return 1;
 		}
 		/* prevent looping on a bad block */
-		de_len = ext4_rec_len_from_disk(de->rec_len);
+		de_len = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 		if (de_len <= 0)
 			return -1;
 		offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		de = (struct ext4_dir_entry_2 *) bh->b_data;
 		top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
 				       EXT4_DIR_REC_LEN(0));
-		for (; de < top; de = ext4_next_entry(de)) {
+		for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
 			int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
 				  + ((char *) de - bh->b_data);
 
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext4_error(dir->i_sb, __func__,
+						"deleted inode referenced: %u",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
  * Returns pointer to last entry moved.
  */
 static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+		unsigned blocksize)
 {
 	unsigned rec_len = 0;
 
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext4_dir_entry_2 *) to)->rec_len =
-				ext4_rec_len_to_disk(rec_len);
+				ext4_rec_len_to_disk(rec_len, blocksize);
 		de->inode = 0;
 		map++;
 		to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
 {
 	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
-		next = ext4_next_entry(de);
+	while ((char*)de < base + blocksize) {
+		next = ext4_next_entry(de, blocksize);
 		if (de->inode && de->name_len) {
 			rec_len = EXT4_DIR_REC_LEN(de->name_len);
 			if (de > to)
 				memmove(to, de, rec_len);
-			to->rec_len = ext4_rec_len_to_disk(rec_len);
+			to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
 			prev = to;
 			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
 		}
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 					hash2, split, count-split));
 
 	/* Fancy dance to stay within two buffers */
-	de2 = dx_move_dirents(data1, data2, map + split, count - split);
+	de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
 	de = dx_pack_dirents(data1, blocksize);
-	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
-	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+					   blocksize);
+	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+					    blocksize);
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
 
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	const char	*name = dentry->d_name.name;
 	int		namelen = dentry->d_name.len;
 	unsigned int	offset = 0;
+	unsigned int	blocksize = dir->i_sb->s_blocksize;
 	unsigned short	reclen;
 	int		nlen, rlen, err;
 	char		*top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	reclen = EXT4_DIR_REC_LEN(namelen);
 	if (!de) {
 		de = (struct ext4_dir_entry_2 *)bh->b_data;
-		top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+		top = bh->b_data + blocksize - reclen;
 		while ((char *) de <= top) {
 			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
 						  bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 				return -EEXIST;
 			}
 			nlen = EXT4_DIR_REC_LEN(de->name_len);
-			rlen = ext4_rec_len_from_disk(de->rec_len);
+			rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
 			if ((de->inode? rlen - nlen: rlen) >= reclen)
 				break;
 			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 
 	/* By now the buffer is marked for journaling */
 	nlen = EXT4_DIR_REC_LEN(de->name_len);
-	rlen = ext4_rec_len_from_disk(de->rec_len);
+	rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
 	if (de->inode) {
 		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
-		de->rec_len = ext4_rec_len_to_disk(nlen);
+		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
+		de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
 		de = de1;
 	}
 	de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	/* The 0th block becomes the root, move the dirents out */
 	fde = &root->dotdot;
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
-		ext4_rec_len_from_disk(fde->rec_len));
+		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
 		ext4_error(dir->i_sb, __func__,
 			   "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	memcpy (data1, de, len);
 	de = (struct ext4_dir_entry_2 *) data1;
 	top = data1 + len;
-	while ((char *)(de2 = ext4_next_entry(de)) < top)
+	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
 		de = de2;
-	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+					   blocksize);
 	/* Initialize the root; the dot dirents already exist */
 	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+					   blocksize);
 	memset (&root->info, 0, sizeof(root->info));
 	root->info.info_length = sizeof(root->info);
 	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = ext4_rec_len_to_disk(blocksize);
+	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
-		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
+		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+							   sb->s_blocksize);
 		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
 			     struct buffer_head *bh)
 {
 	struct ext4_dir_entry_2 *de, *pde;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
 	int i;
 
 	i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
 			ext4_journal_get_write_access(handle, bh);
 			if (pde)
 				pde->rec_len = ext4_rec_len_to_disk(
-					ext4_rec_len_from_disk(pde->rec_len) +
-					ext4_rec_len_from_disk(de->rec_len));
+					ext4_rec_len_from_disk(pde->rec_len,
+							       blocksize) +
+					ext4_rec_len_from_disk(de->rec_len,
+							       blocksize),
+					blocksize);
 			else
 				de->inode = 0;
 			dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
 			ext4_handle_dirty_metadata(handle, dir, bh);
 			return 0;
 		}
-		i += ext4_rec_len_from_disk(de->rec_len);
+		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
 		pde = de;
-		de = ext4_next_entry(de);
+		de = ext4_next_entry(de, blocksize);
 	}
 	return -ENOENT;
 }
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct inode *inode;
 	struct buffer_head *dir_block;
 	struct ext4_dir_entry_2 *de;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
 	int err, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
 	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
-	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
+	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+					   blocksize);
 	strcpy(de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-	de = ext4_next_entry(de);
+	de = ext4_next_entry(de, blocksize);
 	de->inode = cpu_to_le32(dir->i_ino);
-	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
-						EXT4_DIR_REC_LEN(1));
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+					   blocksize);
 	de->name_len = 2;
 	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
 		return 1;
 	}
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
-	de1 = ext4_next_entry(de);
+	de1 = ext4_next_entry(de, sb->s_blocksize);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
 			strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
 		brelse(bh);
 		return 1;
 	}
-	offset = ext4_rec_len_from_disk(de->rec_len) +
-		 ext4_rec_len_from_disk(de1->rec_len);
-	de = ext4_next_entry(de1);
+	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+	de = ext4_next_entry(de1, sb->s_blocksize);
 	while (offset < inode->i_size) {
 		if (!bh ||
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
 			brelse(bh);
 			return 0;
 		}
-		offset += ext4_rec_len_from_disk(de->rec_len);
-		de = ext4_next_entry(de);
+		offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+		de = ext4_next_entry(de, sb->s_blocksize);
 	}
 	brelse(bh);
 	return 1;
@@ -2092,7 +2138,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2151,7 +2197,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2297,8 +2343,8 @@ retry:
 	return err;
 }
 
-#define PARENT_INO(buffer) \
-	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+#define PARENT_INO(buffer, size) \
+	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
 
 /*
  * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,14 +2357,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct inode *old_inode, *new_inode;
 	struct buffer_head *old_bh, *new_bh, *dir_bh;
 	struct ext4_dir_entry_2 *old_de, *new_de;
-	int retval;
+	int retval, force_da_alloc = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	if (new_dentry->d_inode)
-		DQUOT_INIT(new_dentry->d_inode);
+		vfs_dq_init(new_dentry->d_inode);
 	handle = ext4_journal_start(old_dir, 2 *
 					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
-		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+		if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+				old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
 		if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (dir_bh) {
 		BUFFER_TRACE(dir_bh, "get_write_access");
 		ext4_journal_get_write_access(handle, dir_bh);
-		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+						cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
 		ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
 		ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		ext4_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext4_orphan_add(handle, new_inode);
+		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+			force_da_alloc = 1;
 	}
 	retval = 0;
 
@@ -2457,6 +2507,8 @@ end_rename:
 	brelse(old_bh);
 	brelse(new_bh);
 	ext4_journal_stop(handle);
+	if (retval == 0 && force_da_alloc)
+		ext4_alloc_da_blocks(old_inode);
 	return retval;
 }
 
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd658..546c7dd869e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, input->group);
-		sbi->s_flex_groups[flex_group].free_blocks +=
-			input->free_blocks_count;
-		sbi->s_flex_groups[flex_group].free_inodes +=
-			EXT4_INODES_PER_GROUP(sb);
+		atomic_add(input->free_blocks_count,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+		atomic_add(EXT4_INODES_PER_GROUP(sb),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
 	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045e..9987bba99db3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/ctype.h>
 #include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
@@ -48,6 +49,7 @@
 #include "group.h"
 
 struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
 		ext4_commit_super(sb, es, 1);
 	}
 	if (sbi->s_proc) {
-		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
+	kobject_del(&sbi->s_kobj);
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
 		ext4_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
+	/*
+	 * Now that we are completely done shutting down the
+	 * superblock, we need to actually destroy the kobject.
+	 */
+	unlock_kernel();
+	unlock_super(sb);
+	kobject_put(&sbi->s_kobj);
+	wait_for_completion(&sbi->s_kobj_unregister);
+	lock_super(sb);
+	lock_kernel();
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return;
 }
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",noacl");
 #endif
-	if (!test_opt(sb, RESERVATION))
-		seq_puts(seq, ",noreservation");
 	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
 
+	if (test_opt(sb, NO_AUTO_DA_ALLOC))
+		seq_puts(seq, ",noauto_da_alloc");
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -926,8 +940,6 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
-static int ext4_dquot_initialize(struct inode *inode, int type);
-static int ext4_dquot_drop(struct inode *inode);
 static int ext4_write_dquot(struct dquot *dquot);
 static int ext4_acquire_dquot(struct dquot *dquot);
 static int ext4_release_dquot(struct dquot *dquot);
@@ -942,9 +954,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
 static struct dquot_operations ext4_quota_operations = {
-	.initialize	= ext4_dquot_initialize,
-	.drop		= ext4_dquot_drop,
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
+	.reserve_space	= dquot_reserve_space,
+	.claim_space	= dquot_claim_space,
+	.release_rsv	= dquot_release_reserved_space,
+	.get_reserved_space = ext4_get_reserved_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
 	.free_inode	= dquot_free_inode,
@@ -1002,7 +1018,7 @@ enum {
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
@@ -1010,8 +1026,8 @@ enum {
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_i_version,
+	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
+	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
@@ -1037,8 +1053,6 @@ static const match_table_t tokens = {
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
-	{Opt_reservation, "reservation"},
-	{Opt_noreservation, "noreservation"},
 	{Opt_noload, "noload"},
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
@@ -1066,6 +1080,8 @@ static const match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_i_version, "i_version"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
@@ -1073,6 +1089,9 @@ static const match_table_t tokens = {
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_journal_ioprio, "journal_ioprio=%u"},
+	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
+	{Opt_auto_da_alloc, "auto_da_alloc"},
+	{Opt_noauto_da_alloc, "noauto_da_alloc"},
 	{Opt_err, NULL},
 };
 
@@ -1205,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
 			       "not supported\n");
 			break;
 #endif
-		case Opt_reservation:
-			set_opt(sbi->s_mount_opt, RESERVATION);
-			break;
-		case Opt_noreservation:
-			clear_opt(sbi->s_mount_opt, RESERVATION);
-			break;
 		case Opt_journal_update:
 			/* @@@ FIXME */
 			/* Eventually we will want to be able to create
@@ -1413,9 +1426,14 @@ set_qf_format:
 		case Opt_abort:
 			set_opt(sbi->s_mount_opt, ABORT);
 			break;
+		case Opt_nobarrier:
+			clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option))
-				return 0;
+			if (match_int(&args[0], &option)) {
+				set_opt(sbi->s_mount_opt, BARRIER);
+				break;
+			}
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1461,6 +1479,11 @@ set_qf_format:
 				return 0;
 			if (option < 0 || option > (1 << 30))
 				return 0;
+			if (option & (option - 1)) {
+				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
+				       " must be a power of 2\n");
+				return 0;
+			}
 			sbi->s_inode_readahead_blks = option;
 			break;
 		case Opt_journal_ioprio:
@@ -1471,6 +1494,19 @@ set_qf_format:
 			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
 							    option);
 			break;
+		case Opt_noauto_da_alloc:
+			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+			break;
+		case Opt_auto_da_alloc:
+			if (match_int(&args[0], &option)) {
+				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+				break;
+			}
+			if (option)
+				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+			else
+				set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+			break;
 		default:
 			printk(KERN_ERR
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1610,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, &bh);
 
 		flex_group = ext4_flex_group(sbi, i);
-		sbi->s_flex_groups[flex_group].free_inodes +=
-			ext4_free_inodes_count(sb, gdp);
-		sbi->s_flex_groups[flex_group].free_blocks +=
-			ext4_free_blks_count(sb, gdp);
+		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
+			   ext4_free_inodes_count(sb, gdp));
+		atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
+			   ext4_free_blks_count(sb, gdp));
+		atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+			   ext4_used_dirs_count(sb, gdp));
 	}
 
 	return 1;
@@ -1802,7 +1840,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		}
 
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
 				"%s: truncating inode %lu to %lld bytes\n",
@@ -1989,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 	return 0;
 }
 
+/* sysfs supprt */
+
+struct ext4_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+			 const char *, size_t);
+	int offset;
+};
+
+static int parse_strtoul(const char *buf,
+		unsigned long max, unsigned long *value)
+{
+	char *endp;
+
+	while (*buf && isspace(*buf))
+		buf++;
+	*value = simple_strtoul(buf, &endp, 0);
+	while (*endp && isspace(*endp))
+		endp++;
+	if (*endp || *value > max)
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+					 struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n",
+			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			 sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			sbi->s_kbytes_written + 
+			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			  EXT4_SB(sb)->s_sectors_written_start) >> 1));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi,
+					  const char *buf, size_t count)
+{
+	unsigned long t;
+
+	if (parse_strtoul(buf, 0x40000000, &t))
+		return -EINVAL;
+
+	/* inode_readahead_blks must be a power of 2 */
+	if (t & (t-1))
+		return -EINVAL;
+
+	sbi->s_inode_readahead_blks = t;
+	return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+				struct ext4_sb_info *sbi, char *buf)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+			    struct ext4_sb_info *sbi,
+			    const char *buf, size_t count)
+{
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned long t;
+
+	if (parse_strtoul(buf, 0xffffffff, &t))
+		return -EINVAL;
+	*ui = t;
+	return count;
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show	= _show,					\
+	.store	= _store,					\
+	.offset = offsetof(struct ext4_sb_info, _elname),	\
+}
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RW_ATTR_SBI_UI(name, elname)	\
+	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+		 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+
+static struct attribute *ext4_attrs[] = {
+	ATTR_LIST(delayed_allocation_blocks),
+	ATTR_LIST(session_write_kbytes),
+	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(inode_readahead_blks),
+	ATTR_LIST(mb_stats),
+	ATTR_LIST(mb_max_to_scan),
+	ATTR_LIST(mb_min_to_scan),
+	ATTR_LIST(mb_order2_req),
+	ATTR_LIST(mb_stream_req),
+	ATTR_LIST(mb_group_prealloc),
+	NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+	return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+						s_kobj);
+	complete(&sbi->s_kobj_unregister);
+}
+
+
+static struct sysfs_ops ext4_attr_ops = {
+	.show	= ext4_attr_show,
+	.store	= ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+	.default_attrs	= ext4_attrs,
+	.sysfs_ops	= &ext4_attr_ops,
+	.release	= ext4_sb_release,
+};
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
@@ -2019,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
+	sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+						      sectors[1]);
 
 	unlock_kernel();
 
@@ -2062,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = le16_to_cpu(es->s_magic);
 	if (sb->s_magic != EXT4_SUPER_MAGIC)
 		goto cantfind_ext4;
+	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
 
 	/* Set defaults before we parse the mount options */
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2099,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
-	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, BARRIER);
 
 	/*
@@ -2323,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_PROC_FS
 	if (ext4_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
-	if (sbi->s_proc)
-		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
-				 &ext4_ui_proc_fops,
-				 &sbi->s_inode_readahead_blks);
 #endif
 
-	bgl_lock_init(&sbi->s_blockgroup_lock);
+	bgl_lock_init(sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logical_sb_block, i);
@@ -2562,6 +2779,16 @@ no_journal:
 		goto failed_mount4;
 	}
 
+	sbi->s_kobj.kset = ext4_kset;
+	init_completion(&sbi->s_kobj_unregister);
+	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+				   "%s", sb->s_id);
+	if (err) {
+		ext4_mb_release(sb);
+		ext4_ext_release(sb);
+		goto failed_mount4;
+	};
+
 	/*
 	 * akpm: core read_super() calls in here with the superblock locked.
 	 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2616,7 +2843,6 @@ failed_mount2:
 	kfree(sbi->s_group_desc);
 failed_mount:
 	if (sbi->s_proc) {
-		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
 #ifdef CONFIG_QUOTA
@@ -2911,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
 		set_buffer_uptodate(sbh);
 	}
 	es->s_wtime = cpu_to_le32(get_seconds());
+	es->s_kbytes_written =
+		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeblocks_counter));
 	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3046,14 +3276,17 @@ static void ext4_write_super(struct super_block *sb)
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	int ret = 0;
+	tid_t target;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (EXT4_SB(sb)->s_journal) {
-		if (wait)
-			ret = ext4_force_commit(sb);
-		else
- 			jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
+					      &target)) {
+			if (wait)
+				jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
+						     target);
+		}
 	} else {
 		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
 	}
@@ -3088,7 +3321,6 @@ static int ext4_freeze(struct super_block *sb)
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
 		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
 		if (error)
 			goto out;
@@ -3365,8 +3597,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
  * is locked for write. Otherwise the are possible deadlocks:
  * Process 1                         Process 2
  * ext4_create()                     quota_sync()
- *   jbd2_journal_start()                   write_dquot()
- *   DQUOT_INIT()                        down(dqio_mutex)
+ *   jbd2_journal_start()                  write_dquot()
+ *   vfs_dq_init()                         down(dqio_mutex)
  *     down(dqio_mutex)                    jbd2_journal_start()
  *
  */
@@ -3378,44 +3610,6 @@ static inline struct inode *dquot_to_inode(struct dquot *dquot)
 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
 }
 
-static int ext4_dquot_initialize(struct inode *inode, int type)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may create quota structure so we need to reserve enough blocks */
-	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_initialize(inode, type);
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext4_dquot_drop(struct inode *inode)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may delete quota structure so we need to reserve enough blocks */
-	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		/*
-		 * We call dquot_drop() anyway to at least release references
-		 * to quota structures so that umount does not hang.
-		 */
-		dquot_drop(inode);
-		return PTR_ERR(handle);
-	}
-	ret = dquot_drop(inode);
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
 static int ext4_write_dquot(struct dquot *dquot)
 {
 	int ret, err;
@@ -3681,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
 }
 
-#ifdef CONFIG_PROC_FS
-static int ext4_ui_proc_show(struct seq_file *m, void *v)
-{
-	unsigned int *p = m->private;
-
-	seq_printf(m, "%u\n", *p);
-	return 0;
-}
-
-static int ext4_ui_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
-}
-
-static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
-			       size_t cnt, loff_t *ppos)
-{
-	unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
-	char str[32];
-
-	if (cnt >= sizeof(str))
-		return -EINVAL;
-	if (copy_from_user(str, buf, cnt))
-		return -EFAULT;
-
-	*p = simple_strtoul(str, NULL, 0);
-	return cnt;
-}
-
-const struct file_operations ext4_ui_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ext4_ui_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-	.write		= ext4_ui_proc_write,
-};
-#endif
-
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
@@ -3753,6 +3908,9 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+	if (!ext4_kset)
+		return -ENOMEM;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 	err = init_ext4_mballoc();
 	if (err)
@@ -3794,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
 	exit_ext4_xattr();
 	exit_ext4_mballoc();
 	remove_proc_entry("fs/ext4", NULL);
+	kset_unregister(ext4_kset);
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 157ce6589c54..62b31c246994 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -490,7 +490,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (IS_SYNC(inode))
 			ext4_handle_sync(handle);
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -784,7 +784,7 @@ inserted:
 				/* The old block is released after updating
 				   the inode. */
 				error = -EDQUOT;
-				if (DQUOT_ALLOC_BLOCK(inode, 1))
+				if (vfs_dq_alloc_block(inode, 1))
 					goto cleanup;
 				error = ext4_journal_get_write_access(handle,
 								      new_bh);
@@ -860,7 +860,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	DQUOT_FREE_BLOCK(inode, 1);
+	vfs_dq_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6b74d09adbe5..296785a0dec8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 	sector_t blocknr;
 
 	/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
-	mutex_lock(&mapping->host->i_mutex);
+	down_read(&mapping->host->i_alloc_sem);
 	blocknr = generic_block_bmap(mapping, block, fat_get_block);
-	mutex_unlock(&mapping->host->i_mutex);
+	up_read(&mapping->host->i_alloc_sem);
 
 	return blocknr;
 }
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
 
 static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	/* If the count of free cluster is still unknown, counts it here. */
 	if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
 	buf->f_bfree = sbi->free_clusters;
 	buf->f_bavail = sbi->free_clusters;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = sbi->options.isvfat ? 260 : 12;
 
 	return 0;
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 
 	opts->fs_uid = current_uid();
 	opts->fs_gid = current_gid();
-	opts->fs_fmask = opts->fs_dmask = current->fs->umask;
+	opts->fs_fmask = current_umask();
 	opts->allow_utime = -1;
 	opts->codepage = fat_default_codepage;
 	opts->iocharset = fat_default_iocharset;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 7ba03a4acbe0..da3f361a37dd 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -188,7 +188,7 @@ old_compare:
 	goto out;
 }
 
-static struct dentry_operations msdos_dentry_operations = {
+static const struct dentry_operations msdos_dentry_operations = {
 	.d_hash		= msdos_hash,
 	.d_compare	= msdos_cmp,
 };
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 8ae32e37673c..a0e00e3a46e9 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -166,13 +166,13 @@ static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	return 1;
 }
 
-static struct dentry_operations vfat_ci_dentry_ops = {
+static const struct dentry_operations vfat_ci_dentry_ops = {
 	.d_revalidate	= vfat_revalidate_ci,
 	.d_hash		= vfat_hashi,
 	.d_compare	= vfat_cmpi,
 };
 
-static struct dentry_operations vfat_dentry_ops = {
+static const struct dentry_operations vfat_dentry_ops = {
 	.d_revalidate	= vfat_revalidate,
 	.d_hash		= vfat_hash,
 	.d_compare	= vfat_cmp,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bd215cc791da..cc8e4de2fee5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -141,7 +141,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
 	return ret;
 }
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -177,21 +177,21 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		return error;
 
 	/*
-	 * We still need a lock here for now to keep multiple FASYNC calls
-	 * from racing with each other.
+	 * ->fasync() is responsible for setting the FASYNC bit.
 	 */
-	lock_kernel();
-	if ((arg ^ filp->f_flags) & FASYNC) {
-		if (filp->f_op && filp->f_op->fasync) {
-			error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
-			if (error < 0)
-				goto out;
-		}
+	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
+			filp->f_op->fasync) {
+		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
+		if (error < 0)
+			goto out;
+		if (error > 0)
+			error = 0;
 	}
-
+	spin_lock(&filp->f_lock);
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
+	spin_unlock(&filp->f_lock);
+
  out:
-	unlock_kernel();
 	return error;
 }
 
@@ -516,7 +516,7 @@ static DEFINE_RWLOCK(fasync_lock);
 static struct kmem_cache *fasync_cache __read_mostly;
 
 /*
- * fasync_helper() is used by some character device drivers (mainly mice)
+ * fasync_helper() is used by almost all character device drivers
  * to set up the fasync queue. It returns negative on error, 0 if it did
  * no changes and positive if it added/deleted the entry.
  */
@@ -531,6 +531,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 		if (!new)
 			return -ENOMEM;
 	}
+
+	/*
+	 * We need to take f_lock first since it's not an IRQ-safe
+	 * lock.
+	 */
+	spin_lock(&filp->f_lock);
 	write_lock_irq(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file == filp) {
@@ -555,7 +561,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 		result = 1;
 	}
 out:
+	if (on)
+		filp->f_flags |= FASYNC;
+	else
+		filp->f_flags &= ~FASYNC;
 	write_unlock_irq(&fasync_lock);
+	spin_unlock(&filp->f_lock);
 	return result;
 }
 
diff --git a/fs/file_table.c b/fs/file_table.c
index bbeeac6efa1a..54018fe48840 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/eventpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
@@ -127,6 +128,7 @@ struct file *get_empty_filp(void)
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_cred = get_cred(cred);
+	spin_lock_init(&f->f_lock);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
@@ -167,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
 		fmode_t mode, const struct file_operations *fop)
 {
 	struct file *file;
-	struct path;
 
 	file = get_empty_filp();
 	if (!file)
@@ -279,6 +280,7 @@ void __fput(struct file *file)
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
+	ima_file_free(file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62fd17f..91013ff7dd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
 		struct inode *tail_inode;
 
 		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (!time_after_eq(inode->dirtied_when,
+		if (time_before(inode->dirtied_when,
 				tail_inode->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
 
+static bool inode_dirtied_after(struct inode *inode, unsigned long t)
+{
+	bool ret = time_after(inode->dirtied_when, t);
+#ifndef CONFIG_64BIT
+	/*
+	 * For inodes being constantly redirtied, dirtied_when can get stuck.
+	 * It _appears_ to be in the future, but is actually in distant past.
+	 * This test is necessary to prevent such wrapped-around relative times
+	 * from permanently stopping the whole pdflush writeback.
+	 */
+	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
+#endif
+	return ret;
+}
+
 /*
  * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
  */
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 		struct inode *inode = list_entry(delaying_queue->prev,
 						struct inode, i_list);
 		if (older_than_this &&
-			time_after(inode->dirtied_when, *older_than_this))
+		    inode_dirtied_after(inode, *older_than_this))
 			break;
 		list_move(&inode->i_list, dispatch_queue);
 	}
@@ -274,6 +289,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	BUG_ON(inode->i_state & I_SYNC);
+	WARN_ON(inode->i_state & I_NEW);
 
 	/* Set I_SYNC, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
@@ -298,6 +314,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (!(inode->i_state & I_DIRTY) &&
@@ -418,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * If older_than_this is non-NULL, then only write out inodes which
  * had their first dirtying at a time earlier than *older_than_this.
  *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * If we're a pdflush thread, then implement pdflush collision avoidance
  * against the entire list.
  *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
@@ -470,6 +487,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			break;
 		}
 
+		if (inode->i_state & I_NEW) {
+			requeue_io(inode);
+			continue;
+		}
+
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
 			if (!sb_is_blkdev_sb(sb))
@@ -485,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			continue;		/* blockdev has wrong queue */
 		}
 
-		/* Was this inode dirtied after sync_sb_inodes was called? */
-		if (time_after(inode->dirtied_when, start))
+		/*
+		 * Was this inode dirtied after sync_sb_inodes was called?
+		 * This keeps sync from extra jobs and livelock.
+		 */
+		if (inode_dirtied_after(inode, start))
 			break;
 
 		/* Is another pdflush already flushing this queue? */
@@ -531,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 			struct address_space *mapping;
 
-			if (inode->i_state & (I_FREEING|I_WILL_FREE))
+			if (inode->i_state &
+					(I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 				continue;
 			mapping = inode->i_mapping;
 			if (mapping->nrpages == 0)
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..eee059052db5
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/fs_struct.h>
+
+/*
+ * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_root(struct fs_struct *fs, struct path *path)
+{
+	struct path old_root;
+
+	write_lock(&fs->lock);
+	old_root = fs->root;
+	fs->root = *path;
+	path_get(path);
+	write_unlock(&fs->lock);
+	if (old_root.dentry)
+		path_put(&old_root);
+}
+
+/*
+ * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_pwd(struct fs_struct *fs, struct path *path)
+{
+	struct path old_pwd;
+
+	write_lock(&fs->lock);
+	old_pwd = fs->pwd;
+	fs->pwd = *path;
+	path_get(path);
+	write_unlock(&fs->lock);
+
+	if (old_pwd.dentry)
+		path_put(&old_pwd);
+}
+
+void chroot_fs_refs(struct path *old_root, struct path *new_root)
+{
+	struct task_struct *g, *p;
+	struct fs_struct *fs;
+	int count = 0;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		task_lock(p);
+		fs = p->fs;
+		if (fs) {
+			write_lock(&fs->lock);
+			if (fs->root.dentry == old_root->dentry
+			    && fs->root.mnt == old_root->mnt) {
+				path_get(new_root);
+				fs->root = *new_root;
+				count++;
+			}
+			if (fs->pwd.dentry == old_root->dentry
+			    && fs->pwd.mnt == old_root->mnt) {
+				path_get(new_root);
+				fs->pwd = *new_root;
+				count++;
+			}
+			write_unlock(&fs->lock);
+		}
+		task_unlock(p);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+	while (count--)
+		path_put(old_root);
+}
+
+void free_fs_struct(struct fs_struct *fs)
+{
+	path_put(&fs->root);
+	path_put(&fs->pwd);
+	kmem_cache_free(fs_cachep, fs);
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+	struct fs_struct *fs = tsk->fs;
+
+	if (fs) {
+		int kill;
+		task_lock(tsk);
+		write_lock(&fs->lock);
+		tsk->fs = NULL;
+		kill = !--fs->users;
+		write_unlock(&fs->lock);
+		task_unlock(tsk);
+		if (kill)
+			free_fs_struct(fs);
+	}
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+	/* We don't need to lock fs - think why ;-) */
+	if (fs) {
+		fs->users = 1;
+		fs->in_exec = 0;
+		rwlock_init(&fs->lock);
+		fs->umask = old->umask;
+		read_lock(&old->lock);
+		fs->root = old->root;
+		path_get(&old->root);
+		fs->pwd = old->pwd;
+		path_get(&old->pwd);
+		read_unlock(&old->lock);
+	}
+	return fs;
+}
+
+int unshare_fs_struct(void)
+{
+	struct fs_struct *fs = current->fs;
+	struct fs_struct *new_fs = copy_fs_struct(fs);
+	int kill;
+
+	if (!new_fs)
+		return -ENOMEM;
+
+	task_lock(current);
+	write_lock(&fs->lock);
+	kill = !--fs->users;
+	current->fs = new_fs;
+	write_unlock(&fs->lock);
+	task_unlock(current);
+
+	if (kill)
+		free_fs_struct(fs);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unshare_fs_struct);
+
+int current_umask(void)
+{
+	return current->fs->umask;
+}
+EXPORT_SYMBOL(current_umask);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+	.users		= 1,
+	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
+	.umask		= 0022,
+};
+
+void daemonize_fs_struct(void)
+{
+	struct fs_struct *fs = current->fs;
+
+	if (fs) {
+		int kill;
+
+		task_lock(current);
+
+		write_lock(&init_fs.lock);
+		init_fs.users++;
+		write_unlock(&init_fs.lock);
+
+		write_lock(&fs->lock);
+		current->fs = &init_fs;
+		kill = !--fs->users;
+		write_unlock(&fs->lock);
+
+		task_unlock(current);
+		if (kill)
+			free_fs_struct(fs);
+	}
+}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 000000000000..9bbb8ce7bea0
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
+
+config FSCACHE
+	tristate "General filesystem local caching manager"
+	depends on EXPERIMENTAL
+	select SLOW_WORK
+	help
+	  This option enables a generic filesystem caching manager that can be
+	  used by various network and other filesystems to cache data locally.
+	  Different sorts of caches can be plugged in, depending on the
+	  resources available.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_STATS
+	bool "Gather statistical information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes statistical information to be gathered on local
+	  caching and exported through file:
+
+		/proc/fs/fscache/stats
+
+	  The gathering of statistics adds a certain amount of overhead to
+	  execution as there are a quite a few stats gathered, and on a
+	  multi-CPU system these may be on cachelines that keep bouncing
+	  between CPUs.  On the other hand, the stats are very useful for
+	  debugging purposes.  Saying 'Y' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_HISTOGRAM
+	bool "Gather latency information on local caching"
+	depends on FSCACHE && PROC_FS
+	help
+	  This option causes latency information to be gathered on local
+	  caching and exported through file:
+
+		/proc/fs/fscache/histogram
+
+	  The generation of this histogram adds a certain amount of overhead to
+	  execution as there are a number of points at which data is gathered,
+	  and on a multi-CPU system these may be on cachelines that keep
+	  bouncing between CPUs.  On the other hand, the histogram may be
+	  useful for debugging purposes.  Saying 'N' here is recommended.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_DEBUG
+	bool "Debug FS-Cache"
+	depends on FSCACHE
+	help
+	  This permits debugging to be dynamically enabled in the local caching
+	  management module.  If this is set, the debugging output may be
+	  enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+	  See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 000000000000..91571b95aacc
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
+#
+# Makefile for general filesystem caching code
+#
+
+fscache-y := \
+	cache.o \
+	cookie.o \
+	fsdef.o \
+	main.o \
+	netfs.o \
+	object.o \
+	operation.o \
+	page.o
+
+fscache-$(CONFIG_PROC_FS) += proc.o
+fscache-$(CONFIG_FSCACHE_STATS) += stats.o
+fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+
+obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 000000000000..e21985bbb1fb
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
+/* FS-Cache cache handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+LIST_HEAD(fscache_cache_list);
+DECLARE_RWSEM(fscache_addremove_sem);
+DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
+EXPORT_SYMBOL(fscache_cache_cleared_wq);
+
+static LIST_HEAD(fscache_cache_tag_list);
+
+/*
+ * look up a cache tag
+ */
+struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
+{
+	struct fscache_cache_tag *tag, *xtag;
+
+	/* firstly check for the existence of the tag under read lock */
+	down_read(&fscache_addremove_sem);
+
+	list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+		if (strcmp(tag->name, name) == 0) {
+			atomic_inc(&tag->usage);
+			up_read(&fscache_addremove_sem);
+			return tag;
+		}
+	}
+
+	up_read(&fscache_addremove_sem);
+
+	/* the tag does not exist - create a candidate */
+	xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
+	if (!xtag)
+		/* return a dummy tag if out of memory */
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&xtag->usage, 1);
+	strcpy(xtag->name, name);
+
+	/* write lock, search again and add if still not present */
+	down_write(&fscache_addremove_sem);
+
+	list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+		if (strcmp(tag->name, name) == 0) {
+			atomic_inc(&tag->usage);
+			up_write(&fscache_addremove_sem);
+			kfree(xtag);
+			return tag;
+		}
+	}
+
+	list_add_tail(&xtag->link, &fscache_cache_tag_list);
+	up_write(&fscache_addremove_sem);
+	return xtag;
+}
+
+/*
+ * release a reference to a cache tag
+ */
+void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
+{
+	if (tag != ERR_PTR(-ENOMEM)) {
+		down_write(&fscache_addremove_sem);
+
+		if (atomic_dec_and_test(&tag->usage))
+			list_del_init(&tag->link);
+		else
+			tag = NULL;
+
+		up_write(&fscache_addremove_sem);
+
+		kfree(tag);
+	}
+}
+
+/*
+ * select a cache in which to store an object
+ * - the cache addremove semaphore must be at least read-locked by the caller
+ * - the object will never be an index
+ */
+struct fscache_cache *fscache_select_cache_for_object(
+	struct fscache_cookie *cookie)
+{
+	struct fscache_cache_tag *tag;
+	struct fscache_object *object;
+	struct fscache_cache *cache;
+
+	_enter("");
+
+	if (list_empty(&fscache_cache_list)) {
+		_leave(" = NULL [no cache]");
+		return NULL;
+	}
+
+	/* we check the parent to determine the cache to use */
+	spin_lock(&cookie->lock);
+
+	/* the first in the parent's backing list should be the preferred
+	 * cache */
+	if (!hlist_empty(&cookie->backing_objects)) {
+		object = hlist_entry(cookie->backing_objects.first,
+				     struct fscache_object, cookie_link);
+
+		cache = object->cache;
+		if (object->state >= FSCACHE_OBJECT_DYING ||
+		    test_bit(FSCACHE_IOERROR, &cache->flags))
+			cache = NULL;
+
+		spin_unlock(&cookie->lock);
+		_leave(" = %p [parent]", cache);
+		return cache;
+	}
+
+	/* the parent is unbacked */
+	if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		/* cookie not an index and is unbacked */
+		spin_unlock(&cookie->lock);
+		_leave(" = NULL [cookie ub,ni]");
+		return NULL;
+	}
+
+	spin_unlock(&cookie->lock);
+
+	if (!cookie->def->select_cache)
+		goto no_preference;
+
+	/* ask the netfs for its preference */
+	tag = cookie->def->select_cache(cookie->parent->netfs_data,
+					cookie->netfs_data);
+	if (!tag)
+		goto no_preference;
+
+	if (tag == ERR_PTR(-ENOMEM)) {
+		_leave(" = NULL [nomem tag]");
+		return NULL;
+	}
+
+	if (!tag->cache) {
+		_leave(" = NULL [unbacked tag]");
+		return NULL;
+	}
+
+	if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
+		return NULL;
+
+	_leave(" = %p [specific]", tag->cache);
+	return tag->cache;
+
+no_preference:
+	/* netfs has no preference - just select first cache */
+	cache = list_entry(fscache_cache_list.next,
+			   struct fscache_cache, link);
+	_leave(" = %p [first]", cache);
+	return cache;
+}
+
+/**
+ * fscache_init_cache - Initialise a cache record
+ * @cache: The cache record to be initialised
+ * @ops: The cache operations to be installed in that record
+ * @idfmt: Format string to define identifier
+ * @...: sprintf-style arguments
+ *
+ * Initialise a record of a cache and fill in the name.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_init_cache(struct fscache_cache *cache,
+			const struct fscache_cache_ops *ops,
+			const char *idfmt,
+			...)
+{
+	va_list va;
+
+	memset(cache, 0, sizeof(*cache));
+
+	cache->ops = ops;
+
+	va_start(va, idfmt);
+	vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
+	va_end(va);
+
+	INIT_WORK(&cache->op_gc, fscache_operation_gc);
+	INIT_LIST_HEAD(&cache->link);
+	INIT_LIST_HEAD(&cache->object_list);
+	INIT_LIST_HEAD(&cache->op_gc_list);
+	spin_lock_init(&cache->object_list_lock);
+	spin_lock_init(&cache->op_gc_list_lock);
+}
+EXPORT_SYMBOL(fscache_init_cache);
+
+/**
+ * fscache_add_cache - Declare a cache as being open for business
+ * @cache: The record describing the cache
+ * @ifsdef: The record of the cache object describing the top-level index
+ * @tagname: The tag describing this cache
+ *
+ * Add a cache to the system, making it available for netfs's to use.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+int fscache_add_cache(struct fscache_cache *cache,
+		      struct fscache_object *ifsdef,
+		      const char *tagname)
+{
+	struct fscache_cache_tag *tag;
+
+	BUG_ON(!cache->ops);
+	BUG_ON(!ifsdef);
+
+	cache->flags = 0;
+	ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+	ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+
+	if (!tagname)
+		tagname = cache->identifier;
+
+	BUG_ON(!tagname[0]);
+
+	_enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
+
+	/* we use the cache tag to uniquely identify caches */
+	tag = __fscache_lookup_cache_tag(tagname);
+	if (IS_ERR(tag))
+		goto nomem;
+
+	if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
+		goto tag_in_use;
+
+	cache->kobj = kobject_create_and_add(tagname, fscache_root);
+	if (!cache->kobj)
+		goto error;
+
+	ifsdef->cookie = &fscache_fsdef_index;
+	ifsdef->cache = cache;
+	cache->fsdef = ifsdef;
+
+	down_write(&fscache_addremove_sem);
+
+	tag->cache = cache;
+	cache->tag = tag;
+
+	/* add the cache to the list */
+	list_add(&cache->link, &fscache_cache_list);
+
+	/* add the cache's netfs definition index object to the cache's
+	 * list */
+	spin_lock(&cache->object_list_lock);
+	list_add_tail(&ifsdef->cache_link, &cache->object_list);
+	spin_unlock(&cache->object_list_lock);
+
+	/* add the cache's netfs definition index object to the top level index
+	 * cookie as a known backing object */
+	spin_lock(&fscache_fsdef_index.lock);
+
+	hlist_add_head(&ifsdef->cookie_link,
+		       &fscache_fsdef_index.backing_objects);
+
+	atomic_inc(&fscache_fsdef_index.usage);
+
+	/* done */
+	spin_unlock(&fscache_fsdef_index.lock);
+	up_write(&fscache_addremove_sem);
+
+	printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
+	       cache->tag->name, cache->ops->name);
+	kobject_uevent(cache->kobj, KOBJ_ADD);
+
+	_leave(" = 0 [%s]", cache->identifier);
+	return 0;
+
+tag_in_use:
+	printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
+	__fscache_release_cache_tag(tag);
+	_leave(" = -EXIST");
+	return -EEXIST;
+
+error:
+	__fscache_release_cache_tag(tag);
+	_leave(" = -EINVAL");
+	return -EINVAL;
+
+nomem:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(fscache_add_cache);
+
+/**
+ * fscache_io_error - Note a cache I/O error
+ * @cache: The record describing the cache
+ *
+ * Note that an I/O error occurred in a cache and that it should no longer be
+ * used for anything.  This also reports the error into the kernel log.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_io_error(struct fscache_cache *cache)
+{
+	set_bit(FSCACHE_IOERROR, &cache->flags);
+
+	printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
+	       cache->ops->name);
+}
+EXPORT_SYMBOL(fscache_io_error);
+
+/*
+ * request withdrawal of all the objects in a cache
+ * - all the objects being withdrawn are moved onto the supplied list
+ */
+static void fscache_withdraw_all_objects(struct fscache_cache *cache,
+					 struct list_head *dying_objects)
+{
+	struct fscache_object *object;
+
+	spin_lock(&cache->object_list_lock);
+
+	while (!list_empty(&cache->object_list)) {
+		object = list_entry(cache->object_list.next,
+				    struct fscache_object, cache_link);
+		list_move_tail(&object->cache_link, dying_objects);
+
+		_debug("withdraw %p", object->cookie);
+
+		spin_lock(&object->lock);
+		spin_unlock(&cache->object_list_lock);
+		fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
+		spin_unlock(&object->lock);
+
+		cond_resched();
+		spin_lock(&cache->object_list_lock);
+	}
+
+	spin_unlock(&cache->object_list_lock);
+}
+
+/**
+ * fscache_withdraw_cache - Withdraw a cache from the active service
+ * @cache: The record describing the cache
+ *
+ * Withdraw a cache from service, unbinding all its cache objects from the
+ * netfs cookies they're currently representing.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_withdraw_cache(struct fscache_cache *cache)
+{
+	LIST_HEAD(dying_objects);
+
+	_enter("");
+
+	printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
+	       cache->tag->name);
+
+	/* make the cache unavailable for cookie acquisition */
+	if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
+		BUG();
+
+	down_write(&fscache_addremove_sem);
+	list_del_init(&cache->link);
+	cache->tag->cache = NULL;
+	up_write(&fscache_addremove_sem);
+
+	/* make sure all pages pinned by operations on behalf of the netfs are
+	 * written to disk */
+	cache->ops->sync_cache(cache);
+
+	/* dissociate all the netfs pages backed by this cache from the block
+	 * mappings in the cache */
+	cache->ops->dissociate_pages(cache);
+
+	/* we now have to destroy all the active objects pertaining to this
+	 * cache - which we do by passing them off to thread pool to be
+	 * disposed of */
+	_debug("destroy");
+
+	fscache_withdraw_all_objects(cache, &dying_objects);
+
+	/* wait for all extant objects to finish their outstanding operations
+	 * and go away */
+	_debug("wait for finish");
+	wait_event(fscache_cache_cleared_wq,
+		   atomic_read(&cache->object_count) == 0);
+	_debug("wait for clearance");
+	wait_event(fscache_cache_cleared_wq,
+		   list_empty(&cache->object_list));
+	_debug("cleared");
+	ASSERT(list_empty(&dying_objects));
+
+	kobject_put(cache->kobj);
+
+	clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
+	fscache_release_cache_tag(cache->tag);
+	cache->tag = NULL;
+
+	_leave("");
+}
+EXPORT_SYMBOL(fscache_withdraw_cache);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 000000000000..72fd18f6c71f
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
+/* netfs cookie management
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for more information on
+ * the netfs API.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct kmem_cache *fscache_cookie_jar;
+
+static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+static int fscache_alloc_object(struct fscache_cache *cache,
+				struct fscache_cookie *cookie);
+static int fscache_attach_object(struct fscache_cookie *cookie,
+				 struct fscache_object *object);
+
+/*
+ * initialise an cookie jar slab element prior to any use
+ */
+void fscache_cookie_init_once(void *_cookie)
+{
+	struct fscache_cookie *cookie = _cookie;
+
+	memset(cookie, 0, sizeof(*cookie));
+	spin_lock_init(&cookie->lock);
+	INIT_HLIST_HEAD(&cookie->backing_objects);
+}
+
+/*
+ * request a cookie to represent an object (index, datafile, xattr, etc)
+ * - parent specifies the parent object
+ *   - the top level index cookie for each netfs is stored in the fscache_netfs
+ *     struct upon registration
+ * - def points to the definition
+ * - the netfs_data will be passed to the functions pointed to in *def
+ * - all attached caches will be searched to see if they contain this object
+ * - index objects aren't stored on disk until there's a dependent file that
+ *   needs storing
+ * - other objects are stored in a selected cache immediately, and all the
+ *   indices forming the path to it are instantiated if necessary
+ * - we never let on to the netfs about errors
+ *   - we may set a negative cookie pointer, but that's okay
+ */
+struct fscache_cookie *__fscache_acquire_cookie(
+	struct fscache_cookie *parent,
+	const struct fscache_cookie_def *def,
+	void *netfs_data)
+{
+	struct fscache_cookie *cookie;
+
+	BUG_ON(!def);
+
+	_enter("{%s},{%s},%p",
+	       parent ? (char *) parent->def->name : "<no-parent>",
+	       def->name, netfs_data);
+
+	fscache_stat(&fscache_n_acquires);
+
+	/* if there's no parent cookie, then we don't create one here either */
+	if (!parent) {
+		fscache_stat(&fscache_n_acquires_null);
+		_leave(" [no parent]");
+		return NULL;
+	}
+
+	/* validate the definition */
+	BUG_ON(!def->get_key);
+	BUG_ON(!def->name[0]);
+
+	BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
+	       parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
+
+	/* allocate and initialise a cookie */
+	cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
+	if (!cookie) {
+		fscache_stat(&fscache_n_acquires_oom);
+		_leave(" [ENOMEM]");
+		return NULL;
+	}
+
+	atomic_set(&cookie->usage, 1);
+	atomic_set(&cookie->n_children, 0);
+
+	atomic_inc(&parent->usage);
+	atomic_inc(&parent->n_children);
+
+	cookie->def		= def;
+	cookie->parent		= parent;
+	cookie->netfs_data	= netfs_data;
+	cookie->flags		= 0;
+
+	INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
+
+	switch (cookie->def->type) {
+	case FSCACHE_COOKIE_TYPE_INDEX:
+		fscache_stat(&fscache_n_cookie_index);
+		break;
+	case FSCACHE_COOKIE_TYPE_DATAFILE:
+		fscache_stat(&fscache_n_cookie_data);
+		break;
+	default:
+		fscache_stat(&fscache_n_cookie_special);
+		break;
+	}
+
+	/* if the object is an index then we need do nothing more here - we
+	 * create indices on disk when we need them as an index may exist in
+	 * multiple caches */
+	if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		if (fscache_acquire_non_index_cookie(cookie) < 0) {
+			atomic_dec(&parent->n_children);
+			__fscache_cookie_put(cookie);
+			fscache_stat(&fscache_n_acquires_nobufs);
+			_leave(" = NULL");
+			return NULL;
+		}
+	}
+
+	fscache_stat(&fscache_n_acquires_ok);
+	_leave(" = %p", cookie);
+	return cookie;
+}
+EXPORT_SYMBOL(__fscache_acquire_cookie);
+
+/*
+ * acquire a non-index cookie
+ * - this must make sure the index chain is instantiated and instantiate the
+ *   object representation too
+ */
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+	struct fscache_cache *cache;
+	uint64_t i_size;
+	int ret;
+
+	_enter("");
+
+	cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
+
+	/* now we need to see whether the backing objects for this cookie yet
+	 * exist, if not there'll be nothing to search */
+	down_read(&fscache_addremove_sem);
+
+	if (list_empty(&fscache_cache_list)) {
+		up_read(&fscache_addremove_sem);
+		_leave(" = 0 [no caches]");
+		return 0;
+	}
+
+	/* select a cache in which to store the object */
+	cache = fscache_select_cache_for_object(cookie->parent);
+	if (!cache) {
+		up_read(&fscache_addremove_sem);
+		fscache_stat(&fscache_n_acquires_no_cache);
+		_leave(" = -ENOMEDIUM [no cache]");
+		return -ENOMEDIUM;
+	}
+
+	_debug("cache %s", cache->tag->name);
+
+	cookie->flags =
+		(1 << FSCACHE_COOKIE_LOOKING_UP) |
+		(1 << FSCACHE_COOKIE_CREATING) |
+		(1 << FSCACHE_COOKIE_NO_DATA_YET);
+
+	/* ask the cache to allocate objects for this cookie and its parent
+	 * chain */
+	ret = fscache_alloc_object(cache, cookie);
+	if (ret < 0) {
+		up_read(&fscache_addremove_sem);
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* pass on how big the object we're caching is supposed to be */
+	cookie->def->get_attr(cookie->netfs_data, &i_size);
+
+	spin_lock(&cookie->lock);
+	if (hlist_empty(&cookie->backing_objects)) {
+		spin_unlock(&cookie->lock);
+		goto unavailable;
+	}
+
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	fscache_set_store_limit(object, i_size);
+
+	/* initiate the process of looking up all the objects in the chain
+	 * (done by fscache_initialise_object()) */
+	fscache_enqueue_object(object);
+
+	spin_unlock(&cookie->lock);
+
+	/* we may be required to wait for lookup to complete at this point */
+	if (!fscache_defer_lookup) {
+		_debug("non-deferred lookup %p", &cookie->flags);
+		wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+		_debug("complete");
+		if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
+			goto unavailable;
+	}
+
+	up_read(&fscache_addremove_sem);
+	_leave(" = 0 [deferred]");
+	return 0;
+
+unavailable:
+	up_read(&fscache_addremove_sem);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+
+/*
+ * recursively allocate cache object records for a cookie/cache combination
+ * - caller must be holding the addremove sem
+ */
+static int fscache_alloc_object(struct fscache_cache *cache,
+				struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+	struct hlist_node *_n;
+	int ret;
+
+	_enter("%p,%p{%s}", cache, cookie, cookie->def->name);
+
+	spin_lock(&cookie->lock);
+	hlist_for_each_entry(object, _n, &cookie->backing_objects,
+			     cookie_link) {
+		if (object->cache == cache)
+			goto object_already_extant;
+	}
+	spin_unlock(&cookie->lock);
+
+	/* ask the cache to allocate an object (we may end up with duplicate
+	 * objects at this stage, but we sort that out later) */
+	object = cache->ops->alloc_object(cache, cookie);
+	if (IS_ERR(object)) {
+		fscache_stat(&fscache_n_object_no_alloc);
+		ret = PTR_ERR(object);
+		goto error;
+	}
+
+	fscache_stat(&fscache_n_object_alloc);
+
+	object->debug_id = atomic_inc_return(&fscache_object_debug_id);
+
+	_debug("ALLOC OBJ%x: %s {%lx}",
+	       object->debug_id, cookie->def->name, object->events);
+
+	ret = fscache_alloc_object(cache, cookie->parent);
+	if (ret < 0)
+		goto error_put;
+
+	/* only attach if we managed to allocate all we needed, otherwise
+	 * discard the object we just allocated and instead use the one
+	 * attached to the cookie */
+	if (fscache_attach_object(cookie, object) < 0)
+		cache->ops->put_object(object);
+
+	_leave(" = 0");
+	return 0;
+
+object_already_extant:
+	ret = -ENOBUFS;
+	if (object->state >= FSCACHE_OBJECT_DYING) {
+		spin_unlock(&cookie->lock);
+		goto error;
+	}
+	spin_unlock(&cookie->lock);
+	_leave(" = 0 [found]");
+	return 0;
+
+error_put:
+	cache->ops->put_object(object);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * attach a cache object to a cookie
+ */
+static int fscache_attach_object(struct fscache_cookie *cookie,
+				 struct fscache_object *object)
+{
+	struct fscache_object *p;
+	struct fscache_cache *cache = object->cache;
+	struct hlist_node *_n;
+	int ret;
+
+	_enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
+
+	spin_lock(&cookie->lock);
+
+	/* there may be multiple initial creations of this object, but we only
+	 * want one */
+	ret = -EEXIST;
+	hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
+		if (p->cache == object->cache) {
+			if (p->state >= FSCACHE_OBJECT_DYING)
+				ret = -ENOBUFS;
+			goto cant_attach_object;
+		}
+	}
+
+	/* pin the parent object */
+	spin_lock_nested(&cookie->parent->lock, 1);
+	hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
+			     cookie_link) {
+		if (p->cache == object->cache) {
+			if (p->state >= FSCACHE_OBJECT_DYING) {
+				ret = -ENOBUFS;
+				spin_unlock(&cookie->parent->lock);
+				goto cant_attach_object;
+			}
+			object->parent = p;
+			spin_lock(&p->lock);
+			p->n_children++;
+			spin_unlock(&p->lock);
+			break;
+		}
+	}
+	spin_unlock(&cookie->parent->lock);
+
+	/* attach to the cache's object list */
+	if (list_empty(&object->cache_link)) {
+		spin_lock(&cache->object_list_lock);
+		list_add(&object->cache_link, &cache->object_list);
+		spin_unlock(&cache->object_list_lock);
+	}
+
+	/* attach to the cookie */
+	object->cookie = cookie;
+	atomic_inc(&cookie->usage);
+	hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+	ret = 0;
+
+cant_attach_object:
+	spin_unlock(&cookie->lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * update the index entries backing a cookie
+ */
+void __fscache_update_cookie(struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+	struct hlist_node *_p;
+
+	fscache_stat(&fscache_n_updates);
+
+	if (!cookie) {
+		fscache_stat(&fscache_n_updates_null);
+		_leave(" [no cookie]");
+		return;
+	}
+
+	_enter("{%s}", cookie->def->name);
+
+	BUG_ON(!cookie->def->get_aux);
+
+	spin_lock(&cookie->lock);
+
+	/* update the index entry on disk in each cache backing this cookie */
+	hlist_for_each_entry(object, _p,
+			     &cookie->backing_objects, cookie_link) {
+		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+	}
+
+	spin_unlock(&cookie->lock);
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_update_cookie);
+
+/*
+ * release a cookie back to the cache
+ * - the object will be marked as recyclable on disk if retire is true
+ * - all dependents of this cookie must have already been unregistered
+ *   (indices/files/pages)
+ */
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+{
+	struct fscache_cache *cache;
+	struct fscache_object *object;
+	unsigned long event;
+
+	fscache_stat(&fscache_n_relinquishes);
+
+	if (!cookie) {
+		fscache_stat(&fscache_n_relinquishes_null);
+		_leave(" [no cookie]");
+		return;
+	}
+
+	_enter("%p{%s,%p},%d",
+	       cookie, cookie->def->name, cookie->netfs_data, retire);
+
+	if (atomic_read(&cookie->n_children) != 0) {
+		printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
+		       cookie->def->name);
+		BUG();
+	}
+
+	/* wait for the cookie to finish being instantiated (or to fail) */
+	if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
+		fscache_stat(&fscache_n_relinquishes_waitcrt);
+		wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
+			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+	}
+
+	event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+
+	/* detach pointers back to the netfs */
+	spin_lock(&cookie->lock);
+
+	cookie->netfs_data	= NULL;
+	cookie->def		= NULL;
+
+	/* break links with all the active objects */
+	while (!hlist_empty(&cookie->backing_objects)) {
+		object = hlist_entry(cookie->backing_objects.first,
+				     struct fscache_object,
+				     cookie_link);
+
+		_debug("RELEASE OBJ%x", object->debug_id);
+
+		/* detach each cache object from the object cookie */
+		spin_lock(&object->lock);
+		hlist_del_init(&object->cookie_link);
+
+		cache = object->cache;
+		object->cookie = NULL;
+		fscache_raise_event(object, event);
+		spin_unlock(&object->lock);
+
+		if (atomic_dec_and_test(&cookie->usage))
+			/* the cookie refcount shouldn't be reduced to 0 yet */
+			BUG();
+	}
+
+	spin_unlock(&cookie->lock);
+
+	if (cookie->parent) {
+		ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
+		ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
+		atomic_dec(&cookie->parent->n_children);
+	}
+
+	/* finally dispose of the cookie */
+	ASSERTCMP(atomic_read(&cookie->usage), >, 0);
+	fscache_cookie_put(cookie);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_relinquish_cookie);
+
+/*
+ * destroy a cookie
+ */
+void __fscache_cookie_put(struct fscache_cookie *cookie)
+{
+	struct fscache_cookie *parent;
+
+	_enter("%p", cookie);
+
+	for (;;) {
+		_debug("FREE COOKIE %p", cookie);
+		parent = cookie->parent;
+		BUG_ON(!hlist_empty(&cookie->backing_objects));
+		kmem_cache_free(fscache_cookie_jar, cookie);
+
+		if (!parent)
+			break;
+
+		cookie = parent;
+		BUG_ON(atomic_read(&cookie->usage) <= 0);
+		if (!atomic_dec_and_test(&cookie->usage))
+			break;
+	}
+
+	_leave("");
+}
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 000000000000..f5b4baee7352
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
+/* Filesystem index definition
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include "internal.h"
+
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax);
+
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax);
+
+static
+enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
+						    const void *data,
+						    uint16_t datalen);
+
+/*
+ * The root index is owned by FS-Cache itself.
+ *
+ * When a netfs requests caching facilities, FS-Cache will, if one doesn't
+ * already exist, create an entry in the root index with the key being the name
+ * of the netfs ("AFS" for example), and the auxiliary data holding the index
+ * structure version supplied by the netfs:
+ *
+ *				     FSDEF
+ *				       |
+ *				 +-----------+
+ *				 |           |
+ *				NFS         AFS
+ *			       [v=1]       [v=1]
+ *
+ * If an entry with the appropriate name does already exist, the version is
+ * compared.  If the version is different, the entire subtree from that entry
+ * will be discarded and a new entry created.
+ *
+ * The new entry will be an index, and a cookie referring to it will be passed
+ * to the netfs.  This is then the root handle by which the netfs accesses the
+ * cache.  It can create whatever objects it likes in that index, including
+ * further indices.
+ */
+static struct fscache_cookie_def fscache_fsdef_index_def = {
+	.name		= ".FS-Cache",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+};
+
+struct fscache_cookie fscache_fsdef_index = {
+	.usage		= ATOMIC_INIT(1),
+	.lock		= __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
+	.backing_objects = HLIST_HEAD_INIT,
+	.def		= &fscache_fsdef_index_def,
+};
+EXPORT_SYMBOL(fscache_fsdef_index);
+
+/*
+ * Definition of an entry in the root index.  Each entry is an index, keyed to
+ * a specific netfs and only applicable to a particular version of the index
+ * structure used by that netfs.
+ */
+struct fscache_cookie_def fscache_fsdef_netfs_def = {
+	.name		= "FSDEF.netfs",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= fscache_fsdef_netfs_get_key,
+	.get_aux	= fscache_fsdef_netfs_get_aux,
+	.check_aux	= fscache_fsdef_netfs_check_aux,
+};
+
+/*
+ * get the key data for an FSDEF index record - this is the name of the netfs
+ * for which this entry is created
+ */
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct fscache_netfs *netfs = cookie_netfs_data;
+	unsigned klen;
+
+	_enter("{%s.%u},", netfs->name, netfs->version);
+
+	klen = strlen(netfs->name);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, netfs->name, klen);
+	return klen;
+}
+
+/*
+ * get the auxiliary data for an FSDEF index record - this is the index
+ * structure version number of the netfs for which this version is created
+ */
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct fscache_netfs *netfs = cookie_netfs_data;
+	unsigned dlen;
+
+	_enter("{%s.%u},", netfs->name, netfs->version);
+
+	dlen = sizeof(uint32_t);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &netfs->version, dlen);
+	return dlen;
+}
+
+/*
+ * check that the index structure version number stored in the auxiliary data
+ * matches the one the netfs gave us
+ */
+static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
+	void *cookie_netfs_data,
+	const void *data,
+	uint16_t datalen)
+{
+	struct fscache_netfs *netfs = cookie_netfs_data;
+	uint32_t version;
+
+	_enter("{%s},,%hu", netfs->name, datalen);
+
+	if (datalen != sizeof(version)) {
+		_leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	memcpy(&version, data, sizeof(version));
+	if (version != netfs->version) {
+		_leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
+}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 000000000000..bad496748a59
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
+/* FS-Cache latency histogram
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t fscache_obj_instantiate_histogram[HZ];
+atomic_t fscache_objs_histogram[HZ];
+atomic_t fscache_ops_histogram[HZ];
+atomic_t fscache_retrieval_delay_histogram[HZ];
+atomic_t fscache_retrieval_histogram[HZ];
+
+/*
+ * display the time-taken histogram
+ */
+static int fscache_histogram_show(struct seq_file *m, void *v)
+{
+	unsigned long index;
+	unsigned n[5], t;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "JIFS  SECS  OBJ INST  OP RUNS   OBJ RUNS "
+			 " RETRV DLY RETRIEVLS\n");
+		return 0;
+	case 2:
+		seq_puts(m, "===== ===== ========= ========= ========="
+			 " ========= =========\n");
+		return 0;
+	default:
+		index = (unsigned long) v - 3;
+		n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
+		n[1] = atomic_read(&fscache_ops_histogram[index]);
+		n[2] = atomic_read(&fscache_objs_histogram[index]);
+		n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
+		n[4] = atomic_read(&fscache_retrieval_histogram[index]);
+		if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
+			return 0;
+
+		t = (index * 1000) / HZ;
+
+		seq_printf(m, "%4lu  0.%03u %9u %9u %9u %9u %9u\n",
+			   index, t, n[0], n[1], n[2], n[3], n[4]);
+		return 0;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+	if ((unsigned long long)*_pos >= HZ + 2)
+		return NULL;
+	if (*_pos == 0)
+		*_pos = 1;
+	return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return (unsigned long long)*pos > HZ + 2 ?
+		NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations fscache_histogram_ops = {
+	.start		= fscache_histogram_start,
+	.stop		= fscache_histogram_stop,
+	.next		= fscache_histogram_next,
+	.show		= fscache_histogram_show,
+};
+
+/*
+ * open "/proc/fs/fscache/histogram" to provide latency data
+ */
+static int fscache_histogram_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &fscache_histogram_ops);
+}
+
+const struct file_operations fscache_histogram_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fscache_histogram_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 000000000000..e0cbd16f6dc9
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Lock order, in the order in which multiple locks should be obtained:
+ * - fscache_addremove_sem
+ * - cookie->lock
+ * - cookie->parent->lock
+ * - cache->object_list_lock
+ * - object->lock
+ * - object->parent->lock
+ * - fscache_thread_lock
+ *
+ */
+
+#include <linux/fscache-cache.h>
+#include <linux/sched.h>
+
+#define FSCACHE_MIN_THREADS	4
+#define FSCACHE_MAX_THREADS	32
+
+/*
+ * fsc-cache.c
+ */
+extern struct list_head fscache_cache_list;
+extern struct rw_semaphore fscache_addremove_sem;
+
+extern struct fscache_cache *fscache_select_cache_for_object(
+	struct fscache_cookie *);
+
+/*
+ * fsc-cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+
+extern void fscache_cookie_init_once(void *);
+extern void __fscache_cookie_put(struct fscache_cookie *);
+
+/*
+ * fsc-fsdef.c
+ */
+extern struct fscache_cookie fscache_fsdef_index;
+extern struct fscache_cookie_def fscache_fsdef_netfs_def;
+
+/*
+ * fsc-histogram.c
+ */
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+extern atomic_t fscache_obj_instantiate_histogram[HZ];
+extern atomic_t fscache_objs_histogram[HZ];
+extern atomic_t fscache_ops_histogram[HZ];
+extern atomic_t fscache_retrieval_delay_histogram[HZ];
+extern atomic_t fscache_retrieval_histogram[HZ];
+
+static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
+{
+	unsigned long jif = jiffies - start_jif;
+	if (jif >= HZ)
+		jif = HZ - 1;
+	atomic_inc(&histogram[jif]);
+}
+
+extern const struct file_operations fscache_histogram_fops;
+
+#else
+#define fscache_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * fsc-main.c
+ */
+extern unsigned fscache_defer_lookup;
+extern unsigned fscache_defer_create;
+extern unsigned fscache_debug;
+extern struct kobject *fscache_root;
+
+extern int fscache_wait_bit(void *);
+extern int fscache_wait_bit_interruptible(void *);
+
+/*
+ * fsc-object.c
+ */
+extern void fscache_withdrawing_object(struct fscache_cache *,
+				       struct fscache_object *);
+extern void fscache_enqueue_object(struct fscache_object *);
+
+/*
+ * fsc-operation.c
+ */
+extern int fscache_submit_exclusive_op(struct fscache_object *,
+				       struct fscache_operation *);
+extern int fscache_submit_op(struct fscache_object *,
+			     struct fscache_operation *);
+extern void fscache_abort_object(struct fscache_object *);
+extern void fscache_start_operations(struct fscache_object *);
+extern void fscache_operation_gc(struct work_struct *);
+
+/*
+ * fsc-proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init()	(0)
+#define fscache_proc_cleanup()	do {} while (0)
+#endif
+
+/*
+ * fsc-stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
+extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
+
+extern atomic_t fscache_n_op_pend;
+extern atomic_t fscache_n_op_run;
+extern atomic_t fscache_n_op_enqueue;
+extern atomic_t fscache_n_op_deferred_release;
+extern atomic_t fscache_n_op_release;
+extern atomic_t fscache_n_op_gc;
+
+extern atomic_t fscache_n_attr_changed;
+extern atomic_t fscache_n_attr_changed_ok;
+extern atomic_t fscache_n_attr_changed_nobufs;
+extern atomic_t fscache_n_attr_changed_nomem;
+extern atomic_t fscache_n_attr_changed_calls;
+
+extern atomic_t fscache_n_allocs;
+extern atomic_t fscache_n_allocs_ok;
+extern atomic_t fscache_n_allocs_wait;
+extern atomic_t fscache_n_allocs_nobufs;
+extern atomic_t fscache_n_alloc_ops;
+extern atomic_t fscache_n_alloc_op_waits;
+
+extern atomic_t fscache_n_retrievals;
+extern atomic_t fscache_n_retrievals_ok;
+extern atomic_t fscache_n_retrievals_wait;
+extern atomic_t fscache_n_retrievals_nodata;
+extern atomic_t fscache_n_retrievals_nobufs;
+extern atomic_t fscache_n_retrievals_intr;
+extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrieval_ops;
+extern atomic_t fscache_n_retrieval_op_waits;
+
+extern atomic_t fscache_n_stores;
+extern atomic_t fscache_n_stores_ok;
+extern atomic_t fscache_n_stores_again;
+extern atomic_t fscache_n_stores_nobufs;
+extern atomic_t fscache_n_stores_oom;
+extern atomic_t fscache_n_store_ops;
+extern atomic_t fscache_n_store_calls;
+
+extern atomic_t fscache_n_marks;
+extern atomic_t fscache_n_uncaches;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_null;
+extern atomic_t fscache_n_acquires_no_cache;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_nobufs;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_updates;
+extern atomic_t fscache_n_updates_null;
+extern atomic_t fscache_n_updates_run;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_null;
+extern atomic_t fscache_n_relinquishes_waitcrt;
+
+extern atomic_t fscache_n_cookie_index;
+extern atomic_t fscache_n_cookie_data;
+extern atomic_t fscache_n_cookie_special;
+
+extern atomic_t fscache_n_object_alloc;
+extern atomic_t fscache_n_object_no_alloc;
+extern atomic_t fscache_n_object_lookups;
+extern atomic_t fscache_n_object_lookups_negative;
+extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_created;
+extern atomic_t fscache_n_object_avail;
+extern atomic_t fscache_n_object_dead;
+
+extern atomic_t fscache_n_checkaux_none;
+extern atomic_t fscache_n_checkaux_okay;
+extern atomic_t fscache_n_checkaux_update;
+extern atomic_t fscache_n_checkaux_obsolete;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+	atomic_inc(stat);
+}
+
+extern const struct file_operations fscache_stats_fops;
+#else
+
+#define fscache_stat(stat) do {} while (0)
+#endif
+
+/*
+ * raise an event on an object
+ * - if the event is not masked for that object, then the object is
+ *   queued for attention by the thread pool.
+ */
+static inline void fscache_raise_event(struct fscache_object *object,
+				       unsigned event)
+{
+	if (!test_and_set_bit(event, &object->events) &&
+	    test_bit(event, &object->event_mask))
+		fscache_enqueue_object(object);
+}
+
+/*
+ * drop a reference to a cookie
+ */
+static inline void fscache_cookie_put(struct fscache_cookie *cookie)
+{
+	BUG_ON(atomic_read(&cookie->usage) <= 0);
+	if (atomic_dec_and_test(&cookie->usage))
+		__fscache_cookie_put(cookie);
+}
+
+/*
+ * get an extra reference to a netfs retrieval context
+ */
+static inline
+void *fscache_get_context(struct fscache_cookie *cookie, void *context)
+{
+	if (cookie->def->get_context)
+		cookie->def->get_context(cookie->netfs_data, context);
+	return context;
+}
+
+/*
+ * release a reference to a netfs retrieval context
+ */
+static inline
+void fscache_put_context(struct fscache_cookie *cookie, void *context)
+{
+	if (cookie->def->put_context)
+		cookie->def->put_context(cookie->netfs_data, context);
+}
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline __attribute__((format(printf, 1, 2)))
+void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+
+#ifdef __KDEBUG
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_FSCACHE_DEBUG)
+#define _enter(FMT, ...)			\
+do {						\
+	if (__do_kdebug(ENTER))			\
+		kenter(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#define _leave(FMT, ...)			\
+do {						\
+	if (__do_kdebug(LEAVE))			\
+		kleave(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#define _debug(FMT, ...)			\
+do {						\
+	if (__do_kdebug(DEBUG))			\
+		kdebug(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#else
+#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#endif
+
+/*
+ * determine whether a particular optional debugging point should be logged
+ * - we need to go through three steps to persuade cpp to correctly join the
+ *   shorthand in FSCACHE_DEBUG_LEVEL with its prefix
+ */
+#define ____do_kdebug(LEVEL, POINT) \
+	unlikely((fscache_debug & \
+		  (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
+#define ___do_kdebug(LEVEL, POINT) \
+	____do_kdebug(LEVEL, POINT)
+#define __do_kdebug(POINT) \
+	___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
+
+#define FSCACHE_DEBUG_CACHE	0
+#define FSCACHE_DEBUG_COOKIE	1
+#define FSCACHE_DEBUG_PAGE	2
+#define FSCACHE_DEBUG_OPERATION	3
+
+#define FSCACHE_POINT_ENTER	1
+#define FSCACHE_POINT_LEAVE	2
+#define FSCACHE_POINT_DEBUG	4
+
+#ifndef FSCACHE_DEBUG_LEVEL
+#define FSCACHE_DEBUG_LEVEL CACHE
+#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X)							\
+do {									\
+	if (unlikely(!(X))) {						\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "FS-Cache: Assertion failed\n");	\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTCMP(X, OP, Y)						\
+do {									\
+	if (unlikely(!((X) OP (Y)))) {					\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "FS-Cache: Assertion failed\n");	\
+		printk(KERN_ERR "%lx " #OP " %lx is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIF(C, X)							\
+do {									\
+	if (unlikely((C) && !(X))) {					\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "FS-Cache: Assertion failed\n");	\
+		BUG();							\
+	}								\
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y)					\
+do {									\
+	if (unlikely((C) && !((X) OP (Y)))) {				\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "FS-Cache: Assertion failed\n");	\
+		printk(KERN_ERR "%lx " #OP " %lx is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while (0)
+
+#else
+
+#define ASSERT(X)			do {} while (0)
+#define ASSERTCMP(X, OP, Y)		do {} while (0)
+#define ASSERTIF(C, X)			do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y)	do {} while (0)
+
+#endif /* assert or not */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 000000000000..4de41b597499
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
+/* General filesystem local caching manager
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("FS Cache Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned fscache_defer_lookup = 1;
+module_param_named(defer_lookup, fscache_defer_lookup, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_lookup,
+		 "Defer cookie lookup to background thread");
+
+unsigned fscache_defer_create = 1;
+module_param_named(defer_create, fscache_defer_create, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_create,
+		 "Defer cookie creation to background thread");
+
+unsigned fscache_debug;
+module_param_named(debug, fscache_debug, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_debug,
+		 "FS-Cache debugging mask");
+
+struct kobject *fscache_root;
+
+/*
+ * initialise the fs caching module
+ */
+static int __init fscache_init(void)
+{
+	int ret;
+
+	ret = slow_work_register_user();
+	if (ret < 0)
+		goto error_slow_work;
+
+	ret = fscache_proc_init();
+	if (ret < 0)
+		goto error_proc;
+
+	fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
+					       sizeof(struct fscache_cookie),
+					       0,
+					       0,
+					       fscache_cookie_init_once);
+	if (!fscache_cookie_jar) {
+		printk(KERN_NOTICE
+		       "FS-Cache: Failed to allocate a cookie jar\n");
+		ret = -ENOMEM;
+		goto error_cookie_jar;
+	}
+
+	fscache_root = kobject_create_and_add("fscache", kernel_kobj);
+	if (!fscache_root)
+		goto error_kobj;
+
+	printk(KERN_NOTICE "FS-Cache: Loaded\n");
+	return 0;
+
+error_kobj:
+	kmem_cache_destroy(fscache_cookie_jar);
+error_cookie_jar:
+	fscache_proc_cleanup();
+error_proc:
+	slow_work_unregister_user();
+error_slow_work:
+	return ret;
+}
+
+fs_initcall(fscache_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit fscache_exit(void)
+{
+	_enter("");
+
+	kobject_put(fscache_root);
+	kmem_cache_destroy(fscache_cookie_jar);
+	fscache_proc_cleanup();
+	slow_work_unregister_user();
+	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
+}
+
+module_exit(fscache_exit);
+
+/*
+ * wait_on_bit() sleep function for uninterruptible waiting
+ */
+int fscache_wait_bit(void *flags)
+{
+	schedule();
+	return 0;
+}
+EXPORT_SYMBOL(fscache_wait_bit);
+
+/*
+ * wait_on_bit() sleep function for interruptible waiting
+ */
+int fscache_wait_bit_interruptible(void *flags)
+{
+	schedule();
+	return signal_pending(current);
+}
+EXPORT_SYMBOL(fscache_wait_bit_interruptible);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 000000000000..e028b8eb1c40
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
+/* FS-Cache netfs (client) registration
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static LIST_HEAD(fscache_netfs_list);
+
+/*
+ * register a network filesystem for caching
+ */
+int __fscache_register_netfs(struct fscache_netfs *netfs)
+{
+	struct fscache_netfs *ptr;
+	int ret;
+
+	_enter("{%s}", netfs->name);
+
+	INIT_LIST_HEAD(&netfs->link);
+
+	/* allocate a cookie for the primary index */
+	netfs->primary_index =
+		kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+
+	if (!netfs->primary_index) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	/* initialise the primary index cookie */
+	atomic_set(&netfs->primary_index->usage, 1);
+	atomic_set(&netfs->primary_index->n_children, 0);
+
+	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
+	netfs->primary_index->parent		= &fscache_fsdef_index;
+	netfs->primary_index->netfs_data	= netfs;
+
+	atomic_inc(&netfs->primary_index->parent->usage);
+	atomic_inc(&netfs->primary_index->parent->n_children);
+
+	spin_lock_init(&netfs->primary_index->lock);
+	INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+
+	/* check the netfs type is not already present */
+	down_write(&fscache_addremove_sem);
+
+	ret = -EEXIST;
+	list_for_each_entry(ptr, &fscache_netfs_list, link) {
+		if (strcmp(ptr->name, netfs->name) == 0)
+			goto already_registered;
+	}
+
+	list_add(&netfs->link, &fscache_netfs_list);
+	ret = 0;
+
+	printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
+	       netfs->name);
+
+already_registered:
+	up_write(&fscache_addremove_sem);
+
+	if (ret < 0) {
+		netfs->primary_index->parent = NULL;
+		__fscache_cookie_put(netfs->primary_index);
+		netfs->primary_index = NULL;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(__fscache_register_netfs);
+
+/*
+ * unregister a network filesystem from the cache
+ * - all cookies must have been released first
+ */
+void __fscache_unregister_netfs(struct fscache_netfs *netfs)
+{
+	_enter("{%s.%u}", netfs->name, netfs->version);
+
+	down_write(&fscache_addremove_sem);
+
+	list_del(&netfs->link);
+	fscache_relinquish_cookie(netfs->primary_index, 0);
+
+	up_write(&fscache_addremove_sem);
+
+	printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
+	       netfs->name);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 000000000000..392a41b1b79d
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
+/* FS-Cache object state machine handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/object.txt for a description of the
+ * object state machine and the in-kernel representations.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include "internal.h"
+
+const char *fscache_object_states[] = {
+	[FSCACHE_OBJECT_INIT]		= "OBJECT_INIT",
+	[FSCACHE_OBJECT_LOOKING_UP]	= "OBJECT_LOOKING_UP",
+	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",
+	[FSCACHE_OBJECT_AVAILABLE]	= "OBJECT_AVAILABLE",
+	[FSCACHE_OBJECT_ACTIVE]		= "OBJECT_ACTIVE",
+	[FSCACHE_OBJECT_UPDATING]	= "OBJECT_UPDATING",
+	[FSCACHE_OBJECT_DYING]		= "OBJECT_DYING",
+	[FSCACHE_OBJECT_LC_DYING]	= "OBJECT_LC_DYING",
+	[FSCACHE_OBJECT_ABORT_INIT]	= "OBJECT_ABORT_INIT",
+	[FSCACHE_OBJECT_RELEASING]	= "OBJECT_RELEASING",
+	[FSCACHE_OBJECT_RECYCLING]	= "OBJECT_RECYCLING",
+	[FSCACHE_OBJECT_WITHDRAWING]	= "OBJECT_WITHDRAWING",
+	[FSCACHE_OBJECT_DEAD]		= "OBJECT_DEAD",
+};
+EXPORT_SYMBOL(fscache_object_states);
+
+static void fscache_object_slow_work_put_ref(struct slow_work *);
+static int  fscache_object_slow_work_get_ref(struct slow_work *);
+static void fscache_object_slow_work_execute(struct slow_work *);
+static void fscache_initialise_object(struct fscache_object *);
+static void fscache_lookup_object(struct fscache_object *);
+static void fscache_object_available(struct fscache_object *);
+static void fscache_release_object(struct fscache_object *);
+static void fscache_withdraw_object(struct fscache_object *);
+static void fscache_enqueue_dependents(struct fscache_object *);
+static void fscache_dequeue_object(struct fscache_object *);
+
+const struct slow_work_ops fscache_object_slow_work_ops = {
+	.get_ref	= fscache_object_slow_work_get_ref,
+	.put_ref	= fscache_object_slow_work_put_ref,
+	.execute	= fscache_object_slow_work_execute,
+};
+EXPORT_SYMBOL(fscache_object_slow_work_ops);
+
+/*
+ * we need to notify the parent when an op completes that we had outstanding
+ * upon it
+ */
+static inline void fscache_done_parent_op(struct fscache_object *object)
+{
+	struct fscache_object *parent = object->parent;
+
+	_enter("OBJ%x {OBJ%x,%x}",
+	       object->debug_id, parent->debug_id, parent->n_ops);
+
+	spin_lock_nested(&parent->lock, 1);
+	parent->n_ops--;
+	parent->n_obj_ops--;
+	if (parent->n_ops == 0)
+		fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+	spin_unlock(&parent->lock);
+}
+
+/*
+ * process events that have been sent to an object's state machine
+ * - initiates parent lookup
+ * - does object lookup
+ * - does object creation
+ * - does object recycling and retirement
+ * - does object withdrawal
+ */
+static void fscache_object_state_machine(struct fscache_object *object)
+{
+	enum fscache_object_state new_state;
+
+	ASSERT(object != NULL);
+
+	_enter("{OBJ%x,%s,%lx}",
+	       object->debug_id, fscache_object_states[object->state],
+	       object->events);
+
+	switch (object->state) {
+		/* wait for the parent object to become ready */
+	case FSCACHE_OBJECT_INIT:
+		object->event_mask =
+			ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+		fscache_initialise_object(object);
+		goto done;
+
+		/* look up the object metadata on disk */
+	case FSCACHE_OBJECT_LOOKING_UP:
+		fscache_lookup_object(object);
+		goto lookup_transit;
+
+		/* create the object metadata on disk */
+	case FSCACHE_OBJECT_CREATING:
+		fscache_lookup_object(object);
+		goto lookup_transit;
+
+		/* handle an object becoming available; start pending
+		 * operations and queue dependent operations for processing */
+	case FSCACHE_OBJECT_AVAILABLE:
+		fscache_object_available(object);
+		goto active_transit;
+
+		/* normal running state */
+	case FSCACHE_OBJECT_ACTIVE:
+		goto active_transit;
+
+		/* update the object metadata on disk */
+	case FSCACHE_OBJECT_UPDATING:
+		clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
+		fscache_stat(&fscache_n_updates_run);
+		object->cache->ops->update_object(object);
+		goto active_transit;
+
+		/* handle an object dying during lookup or creation */
+	case FSCACHE_OBJECT_LC_DYING:
+		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+		object->cache->ops->lookup_complete(object);
+
+		spin_lock(&object->lock);
+		object->state = FSCACHE_OBJECT_DYING;
+		if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+				       &object->cookie->flags))
+			wake_up_bit(&object->cookie->flags,
+				    FSCACHE_COOKIE_CREATING);
+		spin_unlock(&object->lock);
+
+		fscache_done_parent_op(object);
+
+		/* wait for completion of all active operations on this object
+		 * and the death of all child objects of this object */
+	case FSCACHE_OBJECT_DYING:
+	dying:
+		clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
+		spin_lock(&object->lock);
+		_debug("dying OBJ%x {%d,%d}",
+		       object->debug_id, object->n_ops, object->n_children);
+		if (object->n_ops == 0 && object->n_children == 0) {
+			object->event_mask &=
+				~(1 << FSCACHE_OBJECT_EV_CLEARED);
+			object->event_mask |=
+				(1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+				(1 << FSCACHE_OBJECT_EV_RETIRE) |
+				(1 << FSCACHE_OBJECT_EV_RELEASE) |
+				(1 << FSCACHE_OBJECT_EV_ERROR);
+		} else {
+			object->event_mask &=
+				~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+				  (1 << FSCACHE_OBJECT_EV_RETIRE) |
+				  (1 << FSCACHE_OBJECT_EV_RELEASE) |
+				  (1 << FSCACHE_OBJECT_EV_ERROR));
+			object->event_mask |=
+				1 << FSCACHE_OBJECT_EV_CLEARED;
+		}
+		spin_unlock(&object->lock);
+		fscache_enqueue_dependents(object);
+		goto terminal_transit;
+
+		/* handle an abort during initialisation */
+	case FSCACHE_OBJECT_ABORT_INIT:
+		_debug("handle abort init %lx", object->events);
+		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+
+		spin_lock(&object->lock);
+		fscache_dequeue_object(object);
+
+		object->state = FSCACHE_OBJECT_DYING;
+		if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+				       &object->cookie->flags))
+			wake_up_bit(&object->cookie->flags,
+				    FSCACHE_COOKIE_CREATING);
+		spin_unlock(&object->lock);
+		goto dying;
+
+		/* handle the netfs releasing an object and possibly marking it
+		 * obsolete too */
+	case FSCACHE_OBJECT_RELEASING:
+	case FSCACHE_OBJECT_RECYCLING:
+		object->event_mask &=
+			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
+			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
+			  (1 << FSCACHE_OBJECT_EV_ERROR));
+		fscache_release_object(object);
+		spin_lock(&object->lock);
+		object->state = FSCACHE_OBJECT_DEAD;
+		spin_unlock(&object->lock);
+		fscache_stat(&fscache_n_object_dead);
+		goto terminal_transit;
+
+		/* handle the parent cache of this object being withdrawn from
+		 * active service */
+	case FSCACHE_OBJECT_WITHDRAWING:
+		object->event_mask &=
+			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
+			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
+			  (1 << FSCACHE_OBJECT_EV_ERROR));
+		fscache_withdraw_object(object);
+		spin_lock(&object->lock);
+		object->state = FSCACHE_OBJECT_DEAD;
+		spin_unlock(&object->lock);
+		fscache_stat(&fscache_n_object_dead);
+		goto terminal_transit;
+
+		/* complain about the object being woken up once it is
+		 * deceased */
+	case FSCACHE_OBJECT_DEAD:
+		printk(KERN_ERR "FS-Cache:"
+		       " Unexpected event in dead state %lx\n",
+		       object->events & object->event_mask);
+		BUG();
+
+	default:
+		printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
+		       object->state);
+		BUG();
+	}
+
+	/* determine the transition from a lookup state */
+lookup_transit:
+	switch (fls(object->events & object->event_mask) - 1) {
+	case FSCACHE_OBJECT_EV_WITHDRAW:
+	case FSCACHE_OBJECT_EV_RETIRE:
+	case FSCACHE_OBJECT_EV_RELEASE:
+	case FSCACHE_OBJECT_EV_ERROR:
+		new_state = FSCACHE_OBJECT_LC_DYING;
+		goto change_state;
+	case FSCACHE_OBJECT_EV_REQUEUE:
+		goto done;
+	case -1:
+		goto done; /* sleep until event */
+	default:
+		goto unsupported_event;
+	}
+
+	/* determine the transition from an active state */
+active_transit:
+	switch (fls(object->events & object->event_mask) - 1) {
+	case FSCACHE_OBJECT_EV_WITHDRAW:
+	case FSCACHE_OBJECT_EV_RETIRE:
+	case FSCACHE_OBJECT_EV_RELEASE:
+	case FSCACHE_OBJECT_EV_ERROR:
+		new_state = FSCACHE_OBJECT_DYING;
+		goto change_state;
+	case FSCACHE_OBJECT_EV_UPDATE:
+		new_state = FSCACHE_OBJECT_UPDATING;
+		goto change_state;
+	case -1:
+		new_state = FSCACHE_OBJECT_ACTIVE;
+		goto change_state; /* sleep until event */
+	default:
+		goto unsupported_event;
+	}
+
+	/* determine the transition from a terminal state */
+terminal_transit:
+	switch (fls(object->events & object->event_mask) - 1) {
+	case FSCACHE_OBJECT_EV_WITHDRAW:
+		new_state = FSCACHE_OBJECT_WITHDRAWING;
+		goto change_state;
+	case FSCACHE_OBJECT_EV_RETIRE:
+		new_state = FSCACHE_OBJECT_RECYCLING;
+		goto change_state;
+	case FSCACHE_OBJECT_EV_RELEASE:
+		new_state = FSCACHE_OBJECT_RELEASING;
+		goto change_state;
+	case FSCACHE_OBJECT_EV_ERROR:
+		new_state = FSCACHE_OBJECT_WITHDRAWING;
+		goto change_state;
+	case FSCACHE_OBJECT_EV_CLEARED:
+		new_state = FSCACHE_OBJECT_DYING;
+		goto change_state;
+	case -1:
+		goto done; /* sleep until event */
+	default:
+		goto unsupported_event;
+	}
+
+change_state:
+	spin_lock(&object->lock);
+	object->state = new_state;
+	spin_unlock(&object->lock);
+
+done:
+	_leave(" [->%s]", fscache_object_states[object->state]);
+	return;
+
+unsupported_event:
+	printk(KERN_ERR "FS-Cache:"
+	       " Unsupported event %lx [mask %lx] in state %s\n",
+	       object->events, object->event_mask,
+	       fscache_object_states[object->state]);
+	BUG();
+}
+
+/*
+ * execute an object
+ */
+static void fscache_object_slow_work_execute(struct slow_work *work)
+{
+	struct fscache_object *object =
+		container_of(work, struct fscache_object, work);
+	unsigned long start;
+
+	_enter("{OBJ%x}", object->debug_id);
+
+	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+
+	start = jiffies;
+	fscache_object_state_machine(object);
+	fscache_hist(fscache_objs_histogram, start);
+	if (object->events & object->event_mask)
+		fscache_enqueue_object(object);
+}
+
+/*
+ * initialise an object
+ * - check the specified object's parent to see if we can make use of it
+ *   immediately to do a creation
+ * - we may need to start the process of creating a parent and we need to wait
+ *   for the parent's lookup and creation to complete if it's not there yet
+ * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
+ *   leaf-most cookies of the object and all its children
+ */
+static void fscache_initialise_object(struct fscache_object *object)
+{
+	struct fscache_object *parent;
+
+	_enter("");
+	ASSERT(object->cookie != NULL);
+	ASSERT(object->cookie->parent != NULL);
+	ASSERT(list_empty(&object->work.link));
+
+	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
+			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
+			      (1 << FSCACHE_OBJECT_EV_RETIRE) |
+			      (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
+		_debug("abort init %lx", object->events);
+		spin_lock(&object->lock);
+		object->state = FSCACHE_OBJECT_ABORT_INIT;
+		spin_unlock(&object->lock);
+		return;
+	}
+
+	spin_lock(&object->cookie->lock);
+	spin_lock_nested(&object->cookie->parent->lock, 1);
+
+	parent = object->parent;
+	if (!parent) {
+		_debug("no parent");
+		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+	} else {
+		spin_lock(&object->lock);
+		spin_lock_nested(&parent->lock, 1);
+		_debug("parent %s", fscache_object_states[parent->state]);
+
+		if (parent->state >= FSCACHE_OBJECT_DYING) {
+			_debug("bad parent");
+			set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+		} else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
+			_debug("wait");
+
+			/* we may get woken up in this state by child objects
+			 * binding on to us, so we need to make sure we don't
+			 * add ourself to the list multiple times */
+			if (list_empty(&object->dep_link)) {
+				object->cache->ops->grab_object(object);
+				list_add(&object->dep_link,
+					 &parent->dependents);
+
+				/* fscache_acquire_non_index_cookie() uses this
+				 * to wake the chain up */
+				if (parent->state == FSCACHE_OBJECT_INIT)
+					fscache_enqueue_object(parent);
+			}
+		} else {
+			_debug("go");
+			parent->n_ops++;
+			parent->n_obj_ops++;
+			object->lookup_jif = jiffies;
+			object->state = FSCACHE_OBJECT_LOOKING_UP;
+			set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+		}
+
+		spin_unlock(&parent->lock);
+		spin_unlock(&object->lock);
+	}
+
+	spin_unlock(&object->cookie->parent->lock);
+	spin_unlock(&object->cookie->lock);
+	_leave("");
+}
+
+/*
+ * look an object up in the cache from which it was allocated
+ * - we hold an "access lock" on the parent object, so the parent object cannot
+ *   be withdrawn by either party till we've finished
+ * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
+ *   leaf-most cookies of the object and all its children
+ */
+static void fscache_lookup_object(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_object *parent;
+
+	_enter("");
+
+	parent = object->parent;
+	ASSERT(parent != NULL);
+	ASSERTCMP(parent->n_ops, >, 0);
+	ASSERTCMP(parent->n_obj_ops, >, 0);
+
+	/* make sure the parent is still available */
+	ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
+
+	if (parent->state >= FSCACHE_OBJECT_DYING ||
+	    test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+		_debug("unavailable");
+		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+		_leave("");
+		return;
+	}
+
+	_debug("LOOKUP \"%s/%s\" in \"%s\"",
+	       parent->cookie->def->name, cookie->def->name,
+	       object->cache->tag->name);
+
+	fscache_stat(&fscache_n_object_lookups);
+	object->cache->ops->lookup_object(object);
+
+	if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
+		set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+
+	_leave("");
+}
+
+/**
+ * fscache_object_lookup_negative - Note negative cookie lookup
+ * @object: Object pointing to cookie to mark
+ *
+ * Note negative lookup, permitting those waiting to read data from an already
+ * existing backing object to continue as there's no data for them to read.
+ */
+void fscache_object_lookup_negative(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie = object->cookie;
+
+	_enter("{OBJ%x,%s}",
+	       object->debug_id, fscache_object_states[object->state]);
+
+	spin_lock(&object->lock);
+	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+		fscache_stat(&fscache_n_object_lookups_negative);
+
+		/* transit here to allow write requests to begin stacking up
+		 * and read requests to begin returning ENODATA */
+		object->state = FSCACHE_OBJECT_CREATING;
+		spin_unlock(&object->lock);
+
+		set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+		_debug("wake up lookup %p", &cookie->flags);
+		smp_mb__before_clear_bit();
+		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+		smp_mb__after_clear_bit();
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	} else {
+		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
+		spin_unlock(&object->lock);
+	}
+
+	_leave("");
+}
+EXPORT_SYMBOL(fscache_object_lookup_negative);
+
+/**
+ * fscache_obtained_object - Note successful object lookup or creation
+ * @object: Object pointing to cookie to mark
+ *
+ * Note successful lookup and/or creation, permitting those waiting to write
+ * data to a backing object to continue.
+ *
+ * Note that after calling this, an object's cookie may be relinquished by the
+ * netfs, and so must be accessed with object lock held.
+ */
+void fscache_obtained_object(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie = object->cookie;
+
+	_enter("{OBJ%x,%s}",
+	       object->debug_id, fscache_object_states[object->state]);
+
+	/* if we were still looking up, then we must have a positive lookup
+	 * result, in which case there may be data available */
+	spin_lock(&object->lock);
+	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+		fscache_stat(&fscache_n_object_lookups_positive);
+
+		clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+		object->state = FSCACHE_OBJECT_AVAILABLE;
+		spin_unlock(&object->lock);
+
+		smp_mb__before_clear_bit();
+		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+		smp_mb__after_clear_bit();
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	} else {
+		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
+		fscache_stat(&fscache_n_object_created);
+
+		object->state = FSCACHE_OBJECT_AVAILABLE;
+		spin_unlock(&object->lock);
+		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+		smp_wmb();
+	}
+
+	if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
+
+	_leave("");
+}
+EXPORT_SYMBOL(fscache_obtained_object);
+
+/*
+ * handle an object that has just become available
+ */
+static void fscache_object_available(struct fscache_object *object)
+{
+	_enter("{OBJ%x}", object->debug_id);
+
+	spin_lock(&object->lock);
+
+	if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
+		wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+
+	fscache_done_parent_op(object);
+	if (object->n_in_progress == 0) {
+		if (object->n_ops > 0) {
+			ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
+			ASSERTIF(object->n_ops > object->n_obj_ops,
+				 !list_empty(&object->pending_ops));
+			fscache_start_operations(object);
+		} else {
+			ASSERT(list_empty(&object->pending_ops));
+		}
+	}
+	spin_unlock(&object->lock);
+
+	object->cache->ops->lookup_complete(object);
+	fscache_enqueue_dependents(object);
+
+	fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
+	fscache_stat(&fscache_n_object_avail);
+
+	_leave("");
+}
+
+/*
+ * drop an object's attachments
+ */
+static void fscache_drop_object(struct fscache_object *object)
+{
+	struct fscache_object *parent = object->parent;
+	struct fscache_cache *cache = object->cache;
+
+	_enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+
+	spin_lock(&cache->object_list_lock);
+	list_del_init(&object->cache_link);
+	spin_unlock(&cache->object_list_lock);
+
+	cache->ops->drop_object(object);
+
+	if (parent) {
+		_debug("release parent OBJ%x {%d}",
+		       parent->debug_id, parent->n_children);
+
+		spin_lock(&parent->lock);
+		parent->n_children--;
+		if (parent->n_children == 0)
+			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+		spin_unlock(&parent->lock);
+		object->parent = NULL;
+	}
+
+	/* this just shifts the object release to the slow work processor */
+	object->cache->ops->put_object(object);
+
+	_leave("");
+}
+
+/*
+ * release or recycle an object that the netfs has discarded
+ */
+static void fscache_release_object(struct fscache_object *object)
+{
+	_enter("");
+
+	fscache_drop_object(object);
+}
+
+/*
+ * withdraw an object from active service
+ */
+static void fscache_withdraw_object(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie;
+	bool detached;
+
+	_enter("");
+
+	spin_lock(&object->lock);
+	cookie = object->cookie;
+	if (cookie) {
+		/* need to get the cookie lock before the object lock, starting
+		 * from the object pointer */
+		atomic_inc(&cookie->usage);
+		spin_unlock(&object->lock);
+
+		detached = false;
+		spin_lock(&cookie->lock);
+		spin_lock(&object->lock);
+
+		if (object->cookie == cookie) {
+			hlist_del_init(&object->cookie_link);
+			object->cookie = NULL;
+			detached = true;
+		}
+		spin_unlock(&cookie->lock);
+		fscache_cookie_put(cookie);
+		if (detached)
+			fscache_cookie_put(cookie);
+	}
+
+	spin_unlock(&object->lock);
+
+	fscache_drop_object(object);
+}
+
+/*
+ * withdraw an object from active service at the behest of the cache
+ * - need break the links to a cached object cookie
+ * - called under two situations:
+ *   (1) recycler decides to reclaim an in-use object
+ *   (2) a cache is unmounted
+ * - have to take care as the cookie can be being relinquished by the netfs
+ *   simultaneously
+ * - the object is pinned by the caller holding a refcount on it
+ */
+void fscache_withdrawing_object(struct fscache_cache *cache,
+				struct fscache_object *object)
+{
+	bool enqueue = false;
+
+	_enter(",OBJ%x", object->debug_id);
+
+	spin_lock(&object->lock);
+	if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
+		object->state = FSCACHE_OBJECT_WITHDRAWING;
+		enqueue = true;
+	}
+	spin_unlock(&object->lock);
+
+	if (enqueue)
+		fscache_enqueue_object(object);
+
+	_leave("");
+}
+
+/*
+ * allow the slow work item processor to get a ref on an object
+ */
+static int fscache_object_slow_work_get_ref(struct slow_work *work)
+{
+	struct fscache_object *object =
+		container_of(work, struct fscache_object, work);
+
+	return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+}
+
+/*
+ * allow the slow work item processor to discard a ref on a work item
+ */
+static void fscache_object_slow_work_put_ref(struct slow_work *work)
+{
+	struct fscache_object *object =
+		container_of(work, struct fscache_object, work);
+
+	return object->cache->ops->put_object(object);
+}
+
+/*
+ * enqueue an object for metadata-type processing
+ */
+void fscache_enqueue_object(struct fscache_object *object)
+{
+	_enter("{OBJ%x}", object->debug_id);
+
+	slow_work_enqueue(&object->work);
+}
+
+/*
+ * enqueue the dependents of an object for metadata-type processing
+ * - the caller must hold the object's lock
+ * - this may cause an already locked object to wind up being processed again
+ */
+static void fscache_enqueue_dependents(struct fscache_object *object)
+{
+	struct fscache_object *dep;
+
+	_enter("{OBJ%x}", object->debug_id);
+
+	if (list_empty(&object->dependents))
+		return;
+
+	spin_lock(&object->lock);
+
+	while (!list_empty(&object->dependents)) {
+		dep = list_entry(object->dependents.next,
+				 struct fscache_object, dep_link);
+		list_del_init(&dep->dep_link);
+
+
+		/* sort onto appropriate lists */
+		fscache_enqueue_object(dep);
+		dep->cache->ops->put_object(dep);
+
+		if (!list_empty(&object->dependents))
+			cond_resched_lock(&object->lock);
+	}
+
+	spin_unlock(&object->lock);
+}
+
+/*
+ * remove an object from whatever queue it's waiting on
+ * - the caller must hold object->lock
+ */
+void fscache_dequeue_object(struct fscache_object *object)
+{
+	_enter("{OBJ%x}", object->debug_id);
+
+	if (!list_empty(&object->dep_link)) {
+		spin_lock(&object->parent->lock);
+		list_del_init(&object->dep_link);
+		spin_unlock(&object->parent->lock);
+	}
+
+	_leave("");
+}
+
+/**
+ * fscache_check_aux - Ask the netfs whether an object on disk is still valid
+ * @object: The object to ask about
+ * @data: The auxiliary data for the object
+ * @datalen: The size of the auxiliary data
+ *
+ * This function consults the netfs about the coherency state of an object
+ */
+enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
+					const void *data, uint16_t datalen)
+{
+	enum fscache_checkaux result;
+
+	if (!object->cookie->def->check_aux) {
+		fscache_stat(&fscache_n_checkaux_none);
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	result = object->cookie->def->check_aux(object->cookie->netfs_data,
+						data, datalen);
+	switch (result) {
+		/* entry okay as is */
+	case FSCACHE_CHECKAUX_OKAY:
+		fscache_stat(&fscache_n_checkaux_okay);
+		break;
+
+		/* entry requires update */
+	case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+		fscache_stat(&fscache_n_checkaux_update);
+		break;
+
+		/* entry requires deletion */
+	case FSCACHE_CHECKAUX_OBSOLETE:
+		fscache_stat(&fscache_n_checkaux_obsolete);
+		break;
+
+	default:
+		BUG();
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(fscache_check_aux);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 000000000000..e7f8d53b8b6b
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
+/* FS-Cache worker operation management routines
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/operations.txt
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include "internal.h"
+
+atomic_t fscache_op_debug_id;
+EXPORT_SYMBOL(fscache_op_debug_id);
+
+/**
+ * fscache_enqueue_operation - Enqueue an operation for processing
+ * @op: The operation to enqueue
+ *
+ * Enqueue an operation for processing by the FS-Cache thread pool.
+ *
+ * This will get its own ref on the object.
+ */
+void fscache_enqueue_operation(struct fscache_operation *op)
+{
+	_enter("{OBJ%x OP%x,%u}",
+	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERT(op->processor != NULL);
+	ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+	if (list_empty(&op->pend_link)) {
+		switch (op->flags & FSCACHE_OP_TYPE) {
+		case FSCACHE_OP_FAST:
+			_debug("queue fast");
+			atomic_inc(&op->usage);
+			if (!schedule_work(&op->fast_work))
+				fscache_put_operation(op);
+			break;
+		case FSCACHE_OP_SLOW:
+			_debug("queue slow");
+			slow_work_enqueue(&op->slow_work);
+			break;
+		case FSCACHE_OP_MYTHREAD:
+			_debug("queue for caller's attention");
+			break;
+		default:
+			printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
+			       op->flags);
+			BUG();
+			break;
+		}
+		fscache_stat(&fscache_n_op_enqueue);
+	}
+}
+EXPORT_SYMBOL(fscache_enqueue_operation);
+
+/*
+ * start an op running
+ */
+static void fscache_run_op(struct fscache_object *object,
+			   struct fscache_operation *op)
+{
+	object->n_in_progress++;
+	if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+		wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+	if (op->processor)
+		fscache_enqueue_operation(op);
+	fscache_stat(&fscache_n_op_run);
+}
+
+/*
+ * submit an exclusive operation for an object
+ * - other ops are excluded from running simultaneously with this one
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_exclusive_op(struct fscache_object *object,
+				struct fscache_operation *op)
+{
+	int ret;
+
+	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+
+	spin_lock(&object->lock);
+	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+
+	ret = -ENOBUFS;
+	if (fscache_object_is_active(object)) {
+		op->object = object;
+		object->n_ops++;
+		object->n_exclusive++;	/* reads and writes must wait */
+
+		if (object->n_ops > 0) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+		} else if (!list_empty(&object->pending_ops)) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+			fscache_start_operations(object);
+		} else {
+			ASSERTCMP(object->n_in_progress, ==, 0);
+			fscache_run_op(object, op);
+		}
+
+		/* need to issue a new write op after this */
+		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+		ret = 0;
+	} else if (object->state == FSCACHE_OBJECT_CREATING) {
+		op->object = object;
+		object->n_ops++;
+		object->n_exclusive++;	/* reads and writes must wait */
+		atomic_inc(&op->usage);
+		list_add_tail(&op->pend_link, &object->pending_ops);
+		fscache_stat(&fscache_n_op_pend);
+		ret = 0;
+	} else {
+		/* not allowed to submit ops in any other state */
+		BUG();
+	}
+
+	spin_unlock(&object->lock);
+	return ret;
+}
+
+/*
+ * report an unexpected submission
+ */
+static void fscache_report_unexpected_submission(struct fscache_object *object,
+						 struct fscache_operation *op,
+						 unsigned long ostate)
+{
+	static bool once_only;
+	struct fscache_operation *p;
+	unsigned n;
+
+	if (once_only)
+		return;
+	once_only = true;
+
+	kdebug("unexpected submission OP%x [OBJ%x %s]",
+	       op->debug_id, object->debug_id,
+	       fscache_object_states[object->state]);
+	kdebug("objstate=%s [%s]",
+	       fscache_object_states[object->state],
+	       fscache_object_states[ostate]);
+	kdebug("objflags=%lx", object->flags);
+	kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
+	kdebug("ops=%u inp=%u exc=%u",
+	       object->n_ops, object->n_in_progress, object->n_exclusive);
+
+	if (!list_empty(&object->pending_ops)) {
+		n = 0;
+		list_for_each_entry(p, &object->pending_ops, pend_link) {
+			ASSERTCMP(p->object, ==, object);
+			kdebug("%p %p", op->processor, op->release);
+			n++;
+		}
+
+		kdebug("n=%u", n);
+	}
+
+	dump_stack();
+}
+
+/*
+ * submit an operation for an object
+ * - objects may be submitted only in the following states:
+ *   - during object creation (write ops may be submitted)
+ *   - whilst the object is active
+ *   - after an I/O error incurred in one of the two above states (op rejected)
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_op(struct fscache_object *object,
+		      struct fscache_operation *op)
+{
+	unsigned long ostate;
+	int ret;
+
+	_enter("{OBJ%x OP%x},{%u}",
+	       object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+	spin_lock(&object->lock);
+	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+
+	ostate = object->state;
+	smp_rmb();
+
+	if (fscache_object_is_active(object)) {
+		op->object = object;
+		object->n_ops++;
+
+		if (object->n_exclusive > 0) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+		} else if (!list_empty(&object->pending_ops)) {
+			atomic_inc(&op->usage);
+			list_add_tail(&op->pend_link, &object->pending_ops);
+			fscache_stat(&fscache_n_op_pend);
+			fscache_start_operations(object);
+		} else {
+			ASSERTCMP(object->n_exclusive, ==, 0);
+			fscache_run_op(object, op);
+		}
+		ret = 0;
+	} else if (object->state == FSCACHE_OBJECT_CREATING) {
+		op->object = object;
+		object->n_ops++;
+		atomic_inc(&op->usage);
+		list_add_tail(&op->pend_link, &object->pending_ops);
+		fscache_stat(&fscache_n_op_pend);
+		ret = 0;
+	} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+		fscache_report_unexpected_submission(object, op, ostate);
+		ASSERT(!fscache_object_is_active(object));
+		ret = -ENOBUFS;
+	} else {
+		ret = -ENOBUFS;
+	}
+
+	spin_unlock(&object->lock);
+	return ret;
+}
+
+/*
+ * queue an object for withdrawal on error, aborting all following asynchronous
+ * operations
+ */
+void fscache_abort_object(struct fscache_object *object)
+{
+	_enter("{OBJ%x}", object->debug_id);
+
+	fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+}
+
+/*
+ * jump start the operation processing on an object
+ * - caller must hold object->lock
+ */
+void fscache_start_operations(struct fscache_object *object)
+{
+	struct fscache_operation *op;
+	bool stop = false;
+
+	while (!list_empty(&object->pending_ops) && !stop) {
+		op = list_entry(object->pending_ops.next,
+				struct fscache_operation, pend_link);
+
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+			if (object->n_in_progress > 0)
+				break;
+			stop = true;
+		}
+		list_del_init(&op->pend_link);
+		object->n_in_progress++;
+
+		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+			wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+		if (op->processor)
+			fscache_enqueue_operation(op);
+
+		/* the pending queue was holding a ref on the object */
+		fscache_put_operation(op);
+	}
+
+	ASSERTCMP(object->n_in_progress, <=, object->n_ops);
+
+	_debug("woke %d ops on OBJ%x",
+	       object->n_in_progress, object->debug_id);
+}
+
+/*
+ * release an operation
+ * - queues pending ops if this is the last in-progress op
+ */
+void fscache_put_operation(struct fscache_operation *op)
+{
+	struct fscache_object *object;
+	struct fscache_cache *cache;
+
+	_enter("{OBJ%x OP%x,%d}",
+	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+	if (!atomic_dec_and_test(&op->usage))
+		return;
+
+	_debug("PUT OP");
+	if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
+		BUG();
+
+	fscache_stat(&fscache_n_op_release);
+
+	if (op->release) {
+		op->release(op);
+		op->release = NULL;
+	}
+
+	object = op->object;
+
+	/* now... we may get called with the object spinlock held, so we
+	 * complete the cleanup here only if we can immediately acquire the
+	 * lock, and defer it otherwise */
+	if (!spin_trylock(&object->lock)) {
+		_debug("defer put");
+		fscache_stat(&fscache_n_op_deferred_release);
+
+		cache = object->cache;
+		spin_lock(&cache->op_gc_list_lock);
+		list_add_tail(&op->pend_link, &cache->op_gc_list);
+		spin_unlock(&cache->op_gc_list_lock);
+		schedule_work(&cache->op_gc);
+		_leave(" [defer]");
+		return;
+	}
+
+	if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+		ASSERTCMP(object->n_exclusive, >, 0);
+		object->n_exclusive--;
+	}
+
+	ASSERTCMP(object->n_in_progress, >, 0);
+	object->n_in_progress--;
+	if (object->n_in_progress == 0)
+		fscache_start_operations(object);
+
+	ASSERTCMP(object->n_ops, >, 0);
+	object->n_ops--;
+	if (object->n_ops == 0)
+		fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+	spin_unlock(&object->lock);
+
+	kfree(op);
+	_leave(" [done]");
+}
+EXPORT_SYMBOL(fscache_put_operation);
+
+/*
+ * garbage collect operations that have had their release deferred
+ */
+void fscache_operation_gc(struct work_struct *work)
+{
+	struct fscache_operation *op;
+	struct fscache_object *object;
+	struct fscache_cache *cache =
+		container_of(work, struct fscache_cache, op_gc);
+	int count = 0;
+
+	_enter("");
+
+	do {
+		spin_lock(&cache->op_gc_list_lock);
+		if (list_empty(&cache->op_gc_list)) {
+			spin_unlock(&cache->op_gc_list_lock);
+			break;
+		}
+
+		op = list_entry(cache->op_gc_list.next,
+				struct fscache_operation, pend_link);
+		list_del(&op->pend_link);
+		spin_unlock(&cache->op_gc_list_lock);
+
+		object = op->object;
+
+		_debug("GC DEFERRED REL OBJ%x OP%x",
+		       object->debug_id, op->debug_id);
+		fscache_stat(&fscache_n_op_gc);
+
+		ASSERTCMP(atomic_read(&op->usage), ==, 0);
+
+		spin_lock(&object->lock);
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+			ASSERTCMP(object->n_exclusive, >, 0);
+			object->n_exclusive--;
+		}
+
+		ASSERTCMP(object->n_in_progress, >, 0);
+		object->n_in_progress--;
+		if (object->n_in_progress == 0)
+			fscache_start_operations(object);
+
+		ASSERTCMP(object->n_ops, >, 0);
+		object->n_ops--;
+		if (object->n_ops == 0)
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+		spin_unlock(&object->lock);
+
+	} while (count++ < 20);
+
+	if (!list_empty(&cache->op_gc_list))
+		schedule_work(&cache->op_gc);
+
+	_leave("");
+}
+
+/*
+ * allow the slow work item processor to get a ref on an operation
+ */
+static int fscache_op_get_ref(struct slow_work *work)
+{
+	struct fscache_operation *op =
+		container_of(work, struct fscache_operation, slow_work);
+
+	atomic_inc(&op->usage);
+	return 0;
+}
+
+/*
+ * allow the slow work item processor to discard a ref on an operation
+ */
+static void fscache_op_put_ref(struct slow_work *work)
+{
+	struct fscache_operation *op =
+		container_of(work, struct fscache_operation, slow_work);
+
+	fscache_put_operation(op);
+}
+
+/*
+ * execute an operation using the slow thread pool to provide processing context
+ * - the caller holds a ref to this object, so we don't need to hold one
+ */
+static void fscache_op_execute(struct slow_work *work)
+{
+	struct fscache_operation *op =
+		container_of(work, struct fscache_operation, slow_work);
+	unsigned long start;
+
+	_enter("{OBJ%x OP%x,%d}",
+	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+	ASSERT(op->processor != NULL);
+	start = jiffies;
+	op->processor(op);
+	fscache_hist(fscache_ops_histogram, start);
+
+	_leave("");
+}
+
+const struct slow_work_ops fscache_op_slow_work_ops = {
+	.get_ref	= fscache_op_get_ref,
+	.put_ref	= fscache_op_put_ref,
+	.execute	= fscache_op_execute,
+};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 000000000000..2568e0eb644f
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
+/* Cache page management and data I/O routines
+ *
+ * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL PAGE
+#include <linux/module.h>
+#include <linux/fscache-cache.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * check to see if a page is being written to the cache
+ */
+bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+	void *val;
+
+	rcu_read_lock();
+	val = radix_tree_lookup(&cookie->stores, page->index);
+	rcu_read_unlock();
+
+	return val != NULL;
+}
+EXPORT_SYMBOL(__fscache_check_page_write);
+
+/*
+ * wait for a page to finish being written to the cache
+ */
+void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+	wait_event(*wq, !__fscache_check_page_write(cookie, page));
+}
+EXPORT_SYMBOL(__fscache_wait_on_page_write);
+
+/*
+ * note that a page has finished being written to the cache
+ */
+static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+	struct page *xpage;
+
+	spin_lock(&cookie->lock);
+	xpage = radix_tree_delete(&cookie->stores, page->index);
+	spin_unlock(&cookie->lock);
+	ASSERT(xpage != NULL);
+
+	wake_up_bit(&cookie->flags, 0);
+}
+
+/*
+ * actually apply the changed attributes to a cache object
+ */
+static void fscache_attr_changed_op(struct fscache_operation *op)
+{
+	struct fscache_object *object = op->object;
+
+	_enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
+
+	fscache_stat(&fscache_n_attr_changed_calls);
+
+	if (fscache_object_is_active(object) &&
+	    object->cache->ops->attr_changed(object) < 0)
+		fscache_abort_object(object);
+
+	_leave("");
+}
+
+/*
+ * notification that the attributes on an object have changed
+ */
+int __fscache_attr_changed(struct fscache_cookie *cookie)
+{
+	struct fscache_operation *op;
+	struct fscache_object *object;
+
+	_enter("%p", cookie);
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+
+	fscache_stat(&fscache_n_attr_changed);
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op) {
+		fscache_stat(&fscache_n_attr_changed_nomem);
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	fscache_operation_init(op, NULL);
+	fscache_operation_init_slow(op, fscache_attr_changed_op);
+	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+
+	spin_lock(&cookie->lock);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	if (fscache_submit_exclusive_op(object, op) < 0)
+		goto nobufs;
+	spin_unlock(&cookie->lock);
+	fscache_stat(&fscache_n_attr_changed_ok);
+	fscache_put_operation(op);
+	_leave(" = 0");
+	return 0;
+
+nobufs:
+	spin_unlock(&cookie->lock);
+	kfree(op);
+	fscache_stat(&fscache_n_attr_changed_nobufs);
+	_leave(" = %d", -ENOBUFS);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_attr_changed);
+
+/*
+ * handle secondary execution given to a retrieval op on behalf of the
+ * cache
+ */
+static void fscache_retrieval_work(struct work_struct *work)
+{
+	struct fscache_retrieval *op =
+		container_of(work, struct fscache_retrieval, op.fast_work);
+	unsigned long start;
+
+	_enter("{OP%x}", op->op.debug_id);
+
+	start = jiffies;
+	op->op.processor(&op->op);
+	fscache_hist(fscache_ops_histogram, start);
+	fscache_put_operation(&op->op);
+}
+
+/*
+ * release a retrieval op reference
+ */
+static void fscache_release_retrieval_op(struct fscache_operation *_op)
+{
+	struct fscache_retrieval *op =
+		container_of(_op, struct fscache_retrieval, op);
+
+	_enter("{OP%x}", op->op.debug_id);
+
+	fscache_hist(fscache_retrieval_histogram, op->start_time);
+	if (op->context)
+		fscache_put_context(op->op.object->cookie, op->context);
+
+	_leave("");
+}
+
+/*
+ * allocate a retrieval op
+ */
+static struct fscache_retrieval *fscache_alloc_retrieval(
+	struct address_space *mapping,
+	fscache_rw_complete_t end_io_func,
+	void *context)
+{
+	struct fscache_retrieval *op;
+
+	/* allocate a retrieval operation and attempt to submit it */
+	op = kzalloc(sizeof(*op), GFP_NOIO);
+	if (!op) {
+		fscache_stat(&fscache_n_retrievals_nomem);
+		return NULL;
+	}
+
+	fscache_operation_init(&op->op, fscache_release_retrieval_op);
+	op->op.flags	= FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+	op->mapping	= mapping;
+	op->end_io_func	= end_io_func;
+	op->context	= context;
+	op->start_time	= jiffies;
+	INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
+	INIT_LIST_HEAD(&op->to_do);
+	return op;
+}
+
+/*
+ * wait for a deferred lookup to complete
+ */
+static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
+{
+	unsigned long jif;
+
+	_enter("");
+
+	if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
+		_leave(" = 0 [imm]");
+		return 0;
+	}
+
+	fscache_stat(&fscache_n_retrievals_wait);
+
+	jif = jiffies;
+	if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+			fscache_wait_bit_interruptible,
+			TASK_INTERRUPTIBLE) != 0) {
+		fscache_stat(&fscache_n_retrievals_intr);
+		_leave(" = -ERESTARTSYS");
+		return -ERESTARTSYS;
+	}
+
+	ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
+
+	smp_rmb();
+	fscache_hist(fscache_retrieval_delay_histogram, jif);
+	_leave(" = 0 [dly]");
+	return 0;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - we return:
+ *   -ENOMEM	- out of memory, nothing done
+ *   -ERESTARTSYS - interrupted
+ *   -ENOBUFS	- no backing object available in which to cache the block
+ *   -ENODATA	- no data available in the backing object for this block
+ *   0		- dispatched a read - it'll call end_io_func() when finished
+ */
+int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+				 struct page *page,
+				 fscache_rw_complete_t end_io_func,
+				 void *context,
+				 gfp_t gfp)
+{
+	struct fscache_retrieval *op;
+	struct fscache_object *object;
+	int ret;
+
+	_enter("%p,%p,,,", cookie, page);
+
+	fscache_stat(&fscache_n_retrievals);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(page, !=, NULL);
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+	if (!op) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	spin_lock(&cookie->lock);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs_unlock;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto nobufs_unlock;
+	spin_unlock(&cookie->lock);
+
+	fscache_stat(&fscache_n_retrieval_ops);
+
+	/* pin the netfs read context in case we need to do the actual netfs
+	 * read because we've encountered a cache read failure */
+	fscache_get_context(object->cookie, op->context);
+
+	/* we wait for the operation to become active, and then process it
+	 * *here*, in this thread, and not in the thread pool */
+	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+		_debug(">>> WT");
+		fscache_stat(&fscache_n_retrieval_op_waits);
+		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+		_debug("<<< GO");
+	}
+
+	/* ask the cache to honour the operation */
+	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+		ret = object->cache->ops->allocate_page(op, page, gfp);
+		if (ret == 0)
+			ret = -ENODATA;
+	} else {
+		ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
+	}
+
+	if (ret == -ENOMEM)
+		fscache_stat(&fscache_n_retrievals_nomem);
+	else if (ret == -ERESTARTSYS)
+		fscache_stat(&fscache_n_retrievals_intr);
+	else if (ret == -ENODATA)
+		fscache_stat(&fscache_n_retrievals_nodata);
+	else if (ret < 0)
+		fscache_stat(&fscache_n_retrievals_nobufs);
+	else
+		fscache_stat(&fscache_n_retrievals_ok);
+
+	fscache_put_retrieval(op);
+	_leave(" = %d", ret);
+	return ret;
+
+nobufs_unlock:
+	spin_unlock(&cookie->lock);
+	kfree(op);
+nobufs:
+	fscache_stat(&fscache_n_retrievals_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_page);
+
+/*
+ * read a list of page from the cache or allocate a block in which to store
+ * them
+ * - we return:
+ *   -ENOMEM	- out of memory, some pages may be being read
+ *   -ERESTARTSYS - interrupted, some pages may be being read
+ *   -ENOBUFS	- no backing object or space available in which to cache any
+ *                pages not being read
+ *   -ENODATA	- no data available in the backing object for some or all of
+ *                the pages
+ *   0		- dispatched a read on all pages
+ *
+ * end_io_func() will be called for each page read from the cache as it is
+ * finishes being read
+ *
+ * any pages for which a read is dispatched will be removed from pages and
+ * nr_pages
+ */
+int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned *nr_pages,
+				  fscache_rw_complete_t end_io_func,
+				  void *context,
+				  gfp_t gfp)
+{
+	fscache_pages_retrieval_func_t func;
+	struct fscache_retrieval *op;
+	struct fscache_object *object;
+	int ret;
+
+	_enter("%p,,%d,,,", cookie, *nr_pages);
+
+	fscache_stat(&fscache_n_retrievals);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(*nr_pages, >, 0);
+	ASSERT(!list_empty(pages));
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	op = fscache_alloc_retrieval(mapping, end_io_func, context);
+	if (!op)
+		return -ENOMEM;
+
+	spin_lock(&cookie->lock);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs_unlock;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto nobufs_unlock;
+	spin_unlock(&cookie->lock);
+
+	fscache_stat(&fscache_n_retrieval_ops);
+
+	/* pin the netfs read context in case we need to do the actual netfs
+	 * read because we've encountered a cache read failure */
+	fscache_get_context(object->cookie, op->context);
+
+	/* we wait for the operation to become active, and then process it
+	 * *here*, in this thread, and not in the thread pool */
+	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+		_debug(">>> WT");
+		fscache_stat(&fscache_n_retrieval_op_waits);
+		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+		_debug("<<< GO");
+	}
+
+	/* ask the cache to honour the operation */
+	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
+		func = object->cache->ops->allocate_pages;
+	else
+		func = object->cache->ops->read_or_alloc_pages;
+	ret = func(op, pages, nr_pages, gfp);
+
+	if (ret == -ENOMEM)
+		fscache_stat(&fscache_n_retrievals_nomem);
+	else if (ret == -ERESTARTSYS)
+		fscache_stat(&fscache_n_retrievals_intr);
+	else if (ret == -ENODATA)
+		fscache_stat(&fscache_n_retrievals_nodata);
+	else if (ret < 0)
+		fscache_stat(&fscache_n_retrievals_nobufs);
+	else
+		fscache_stat(&fscache_n_retrievals_ok);
+
+	fscache_put_retrieval(op);
+	_leave(" = %d", ret);
+	return ret;
+
+nobufs_unlock:
+	spin_unlock(&cookie->lock);
+	kfree(op);
+nobufs:
+	fscache_stat(&fscache_n_retrievals_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
+
+/*
+ * allocate a block in the cache on which to store a page
+ * - we return:
+ *   -ENOMEM	- out of memory, nothing done
+ *   -ERESTARTSYS - interrupted
+ *   -ENOBUFS	- no backing object available in which to cache the block
+ *   0		- block allocated
+ */
+int __fscache_alloc_page(struct fscache_cookie *cookie,
+			 struct page *page,
+			 gfp_t gfp)
+{
+	struct fscache_retrieval *op;
+	struct fscache_object *object;
+	int ret;
+
+	_enter("%p,%p,,,", cookie, page);
+
+	fscache_stat(&fscache_n_allocs);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(page, !=, NULL);
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+	if (!op)
+		return -ENOMEM;
+
+	spin_lock(&cookie->lock);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs_unlock;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto nobufs_unlock;
+	spin_unlock(&cookie->lock);
+
+	fscache_stat(&fscache_n_alloc_ops);
+
+	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+		_debug(">>> WT");
+		fscache_stat(&fscache_n_alloc_op_waits);
+		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+		_debug("<<< GO");
+	}
+
+	/* ask the cache to honour the operation */
+	ret = object->cache->ops->allocate_page(op, page, gfp);
+
+	if (ret < 0)
+		fscache_stat(&fscache_n_allocs_nobufs);
+	else
+		fscache_stat(&fscache_n_allocs_ok);
+
+	fscache_put_retrieval(op);
+	_leave(" = %d", ret);
+	return ret;
+
+nobufs_unlock:
+	spin_unlock(&cookie->lock);
+	kfree(op);
+nobufs:
+	fscache_stat(&fscache_n_allocs_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_alloc_page);
+
+/*
+ * release a write op reference
+ */
+static void fscache_release_write_op(struct fscache_operation *_op)
+{
+	_enter("{OP%x}", _op->debug_id);
+}
+
+/*
+ * perform the background storage of a page into the cache
+ */
+static void fscache_write_op(struct fscache_operation *_op)
+{
+	struct fscache_storage *op =
+		container_of(_op, struct fscache_storage, op);
+	struct fscache_object *object = op->op.object;
+	struct fscache_cookie *cookie = object->cookie;
+	struct page *page;
+	unsigned n;
+	void *results[1];
+	int ret;
+
+	_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
+
+	spin_lock(&cookie->lock);
+	spin_lock(&object->lock);
+
+	if (!fscache_object_is_active(object)) {
+		spin_unlock(&object->lock);
+		spin_unlock(&cookie->lock);
+		_leave("");
+		return;
+	}
+
+	fscache_stat(&fscache_n_store_calls);
+
+	/* find a page to store */
+	page = NULL;
+	n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
+				       FSCACHE_COOKIE_PENDING_TAG);
+	if (n != 1)
+		goto superseded;
+	page = results[0];
+	_debug("gang %d [%lx]", n, page->index);
+	if (page->index > op->store_limit)
+		goto superseded;
+
+	radix_tree_tag_clear(&cookie->stores, page->index,
+			     FSCACHE_COOKIE_PENDING_TAG);
+
+	spin_unlock(&object->lock);
+	spin_unlock(&cookie->lock);
+
+	if (page) {
+		ret = object->cache->ops->write_page(op, page);
+		fscache_end_page_write(cookie, page);
+		page_cache_release(page);
+		if (ret < 0)
+			fscache_abort_object(object);
+		else
+			fscache_enqueue_operation(&op->op);
+	}
+
+	_leave("");
+	return;
+
+superseded:
+	/* this writer is going away and there aren't any more things to
+	 * write */
+	_debug("cease");
+	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+	spin_unlock(&object->lock);
+	spin_unlock(&cookie->lock);
+	_leave("");
+}
+
+/*
+ * request a page be stored in the cache
+ * - returns:
+ *   -ENOMEM	- out of memory, nothing done
+ *   -ENOBUFS	- no backing object available in which to cache the page
+ *   0		- dispatched a write - it'll call end_io_func() when finished
+ *
+ * if the cookie still has a backing object at this point, that object can be
+ * in one of a few states with respect to storage processing:
+ *
+ *  (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
+ *      set)
+ *
+ *	(a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
+ *	    fill op)
+ *
+ *	(b) writes deferred till post-creation (mark page for writing and
+ *	    return immediately)
+ *
+ *  (2) negative lookup, object created, initial fill being made from netfs
+ *      (FSCACHE_COOKIE_INITIAL_FILL is set)
+ *
+ *	(a) fill point not yet reached this page (mark page for writing and
+ *          return)
+ *
+ *	(b) fill point passed this page (queue op to store this page)
+ *
+ *  (3) object extant (queue op to store this page)
+ *
+ * any other state is invalid
+ */
+int __fscache_write_page(struct fscache_cookie *cookie,
+			 struct page *page,
+			 gfp_t gfp)
+{
+	struct fscache_storage *op;
+	struct fscache_object *object;
+	int ret;
+
+	_enter("%p,%x,", cookie, (u32) page->flags);
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERT(PageFsCache(page));
+
+	fscache_stat(&fscache_n_stores);
+
+	op = kzalloc(sizeof(*op), GFP_NOIO);
+	if (!op)
+		goto nomem;
+
+	fscache_operation_init(&op->op, fscache_release_write_op);
+	fscache_operation_init_slow(&op->op, fscache_write_op);
+	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+
+	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+	if (ret < 0)
+		goto nomem_free;
+
+	ret = -ENOBUFS;
+	spin_lock(&cookie->lock);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+	if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
+		goto nobufs;
+
+	/* add the page to the pending-storage radix tree on the backing
+	 * object */
+	spin_lock(&object->lock);
+
+	_debug("store limit %llx", (unsigned long long) object->store_limit);
+
+	ret = radix_tree_insert(&cookie->stores, page->index, page);
+	if (ret < 0) {
+		if (ret == -EEXIST)
+			goto already_queued;
+		_debug("insert failed %d", ret);
+		goto nobufs_unlock_obj;
+	}
+
+	radix_tree_tag_set(&cookie->stores, page->index,
+			   FSCACHE_COOKIE_PENDING_TAG);
+	page_cache_get(page);
+
+	/* we only want one writer at a time, but we do need to queue new
+	 * writers after exclusive ops */
+	if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
+		goto already_pending;
+
+	spin_unlock(&object->lock);
+
+	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
+	op->store_limit = object->store_limit;
+
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto submit_failed;
+
+	spin_unlock(&cookie->lock);
+	radix_tree_preload_end();
+	fscache_stat(&fscache_n_store_ops);
+	fscache_stat(&fscache_n_stores_ok);
+
+	/* the slow work queue now carries its own ref on the object */
+	fscache_put_operation(&op->op);
+	_leave(" = 0");
+	return 0;
+
+already_queued:
+	fscache_stat(&fscache_n_stores_again);
+already_pending:
+	spin_unlock(&object->lock);
+	spin_unlock(&cookie->lock);
+	radix_tree_preload_end();
+	kfree(op);
+	fscache_stat(&fscache_n_stores_ok);
+	_leave(" = 0");
+	return 0;
+
+submit_failed:
+	radix_tree_delete(&cookie->stores, page->index);
+	page_cache_release(page);
+	ret = -ENOBUFS;
+	goto nobufs;
+
+nobufs_unlock_obj:
+	spin_unlock(&object->lock);
+nobufs:
+	spin_unlock(&cookie->lock);
+	radix_tree_preload_end();
+	kfree(op);
+	fscache_stat(&fscache_n_stores_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+
+nomem_free:
+	kfree(op);
+nomem:
+	fscache_stat(&fscache_n_stores_oom);
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(__fscache_write_page);
+
+/*
+ * remove a page from the cache
+ */
+void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
+{
+	struct fscache_object *object;
+
+	_enter(",%p", page);
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+	ASSERTCMP(page, !=, NULL);
+
+	fscache_stat(&fscache_n_uncaches);
+
+	/* cache withdrawal may beat us to it */
+	if (!PageFsCache(page))
+		goto done;
+
+	/* get the object */
+	spin_lock(&cookie->lock);
+
+	if (hlist_empty(&cookie->backing_objects)) {
+		ClearPageFsCache(page);
+		goto done_unlock;
+	}
+
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	/* there might now be stuff on disk we could read */
+	clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+	/* only invoke the cache backend if we managed to mark the page
+	 * uncached here; this deals with synchronisation vs withdrawal */
+	if (TestClearPageFsCache(page) &&
+	    object->cache->ops->uncache_page) {
+		/* the cache backend releases the cookie lock */
+		object->cache->ops->uncache_page(object, page);
+		goto done;
+	}
+
+done_unlock:
+	spin_unlock(&cookie->lock);
+done:
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_uncache_page);
+
+/**
+ * fscache_mark_pages_cached - Mark pages as being cached
+ * @op: The retrieval op pages are being marked for
+ * @pagevec: The pages to be marked
+ *
+ * Mark a bunch of netfs pages as being cached.  After this is called,
+ * the netfs must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_pages_cached(struct fscache_retrieval *op,
+			       struct pagevec *pagevec)
+{
+	struct fscache_cookie *cookie = op->op.object->cookie;
+	unsigned long loop;
+
+#ifdef CONFIG_FSCACHE_STATS
+	atomic_add(pagevec->nr, &fscache_n_marks);
+#endif
+
+	for (loop = 0; loop < pagevec->nr; loop++) {
+		struct page *page = pagevec->pages[loop];
+
+		_debug("- mark %p{%lx}", page, page->index);
+		if (TestSetPageFsCache(page)) {
+			static bool once_only;
+			if (!once_only) {
+				once_only = true;
+				printk(KERN_WARNING "FS-Cache:"
+				       " Cookie type %s marked page %lx"
+				       " multiple times\n",
+				       cookie->def->name, page->index);
+			}
+		}
+	}
+
+	if (cookie->def->mark_pages_cached)
+		cookie->def->mark_pages_cached(cookie->netfs_data,
+					       op->mapping, pagevec);
+	pagevec_reinit(pagevec);
+}
+EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 000000000000..beeab44bc31a
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
+/* FS-Cache statistics viewing interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * initialise the /proc/fs/fscache/ directory
+ */
+int __init fscache_proc_init(void)
+{
+	_enter("");
+
+	if (!proc_mkdir("fs/fscache", NULL))
+		goto error_dir;
+
+#ifdef CONFIG_FSCACHE_STATS
+	if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
+			 &fscache_stats_fops))
+		goto error_stats;
+#endif
+
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+	if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
+			 &fscache_histogram_fops))
+		goto error_histogram;
+#endif
+
+	_leave(" = 0");
+	return 0;
+
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+error_histogram:
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+	remove_proc_entry("fs/fscache/stats", NULL);
+error_stats:
+#endif
+	remove_proc_entry("fs/fscache", NULL);
+error_dir:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/fscache/ directory
+ */
+void fscache_proc_cleanup(void)
+{
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+	remove_proc_entry("fs/fscache/histogram", NULL);
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+	remove_proc_entry("fs/fscache/stats", NULL);
+#endif
+	remove_proc_entry("fs/fscache", NULL);
+}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 000000000000..65deb99e756b
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
+/* FS-Cache statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * operation counters
+ */
+atomic_t fscache_n_op_pend;
+atomic_t fscache_n_op_run;
+atomic_t fscache_n_op_enqueue;
+atomic_t fscache_n_op_requeue;
+atomic_t fscache_n_op_deferred_release;
+atomic_t fscache_n_op_release;
+atomic_t fscache_n_op_gc;
+
+atomic_t fscache_n_attr_changed;
+atomic_t fscache_n_attr_changed_ok;
+atomic_t fscache_n_attr_changed_nobufs;
+atomic_t fscache_n_attr_changed_nomem;
+atomic_t fscache_n_attr_changed_calls;
+
+atomic_t fscache_n_allocs;
+atomic_t fscache_n_allocs_ok;
+atomic_t fscache_n_allocs_wait;
+atomic_t fscache_n_allocs_nobufs;
+atomic_t fscache_n_alloc_ops;
+atomic_t fscache_n_alloc_op_waits;
+
+atomic_t fscache_n_retrievals;
+atomic_t fscache_n_retrievals_ok;
+atomic_t fscache_n_retrievals_wait;
+atomic_t fscache_n_retrievals_nodata;
+atomic_t fscache_n_retrievals_nobufs;
+atomic_t fscache_n_retrievals_intr;
+atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrieval_ops;
+atomic_t fscache_n_retrieval_op_waits;
+
+atomic_t fscache_n_stores;
+atomic_t fscache_n_stores_ok;
+atomic_t fscache_n_stores_again;
+atomic_t fscache_n_stores_nobufs;
+atomic_t fscache_n_stores_oom;
+atomic_t fscache_n_store_ops;
+atomic_t fscache_n_store_calls;
+
+atomic_t fscache_n_marks;
+atomic_t fscache_n_uncaches;
+
+atomic_t fscache_n_acquires;
+atomic_t fscache_n_acquires_null;
+atomic_t fscache_n_acquires_no_cache;
+atomic_t fscache_n_acquires_ok;
+atomic_t fscache_n_acquires_nobufs;
+atomic_t fscache_n_acquires_oom;
+
+atomic_t fscache_n_updates;
+atomic_t fscache_n_updates_null;
+atomic_t fscache_n_updates_run;
+
+atomic_t fscache_n_relinquishes;
+atomic_t fscache_n_relinquishes_null;
+atomic_t fscache_n_relinquishes_waitcrt;
+
+atomic_t fscache_n_cookie_index;
+atomic_t fscache_n_cookie_data;
+atomic_t fscache_n_cookie_special;
+
+atomic_t fscache_n_object_alloc;
+atomic_t fscache_n_object_no_alloc;
+atomic_t fscache_n_object_lookups;
+atomic_t fscache_n_object_lookups_negative;
+atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_created;
+atomic_t fscache_n_object_avail;
+atomic_t fscache_n_object_dead;
+
+atomic_t fscache_n_checkaux_none;
+atomic_t fscache_n_checkaux_okay;
+atomic_t fscache_n_checkaux_update;
+atomic_t fscache_n_checkaux_obsolete;
+
+/*
+ * display the general statistics
+ */
+static int fscache_stats_show(struct seq_file *m, void *v)
+{
+	seq_puts(m, "FS-Cache statistics\n");
+
+	seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
+		   atomic_read(&fscache_n_cookie_index),
+		   atomic_read(&fscache_n_cookie_data),
+		   atomic_read(&fscache_n_cookie_special));
+
+	seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
+		   atomic_read(&fscache_n_object_alloc),
+		   atomic_read(&fscache_n_object_no_alloc),
+		   atomic_read(&fscache_n_object_avail),
+		   atomic_read(&fscache_n_object_dead));
+	seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
+		   atomic_read(&fscache_n_checkaux_none),
+		   atomic_read(&fscache_n_checkaux_okay),
+		   atomic_read(&fscache_n_checkaux_update),
+		   atomic_read(&fscache_n_checkaux_obsolete));
+
+	seq_printf(m, "Pages  : mrk=%u unc=%u\n",
+		   atomic_read(&fscache_n_marks),
+		   atomic_read(&fscache_n_uncaches));
+
+	seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
+		   " oom=%u\n",
+		   atomic_read(&fscache_n_acquires),
+		   atomic_read(&fscache_n_acquires_null),
+		   atomic_read(&fscache_n_acquires_no_cache),
+		   atomic_read(&fscache_n_acquires_ok),
+		   atomic_read(&fscache_n_acquires_nobufs),
+		   atomic_read(&fscache_n_acquires_oom));
+
+	seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
+		   atomic_read(&fscache_n_object_lookups),
+		   atomic_read(&fscache_n_object_lookups_negative),
+		   atomic_read(&fscache_n_object_lookups_positive),
+		   atomic_read(&fscache_n_object_created));
+
+	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
+		   atomic_read(&fscache_n_updates),
+		   atomic_read(&fscache_n_updates_null),
+		   atomic_read(&fscache_n_updates_run));
+
+	seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
+		   atomic_read(&fscache_n_relinquishes),
+		   atomic_read(&fscache_n_relinquishes_null),
+		   atomic_read(&fscache_n_relinquishes_waitcrt));
+
+	seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
+		   atomic_read(&fscache_n_attr_changed),
+		   atomic_read(&fscache_n_attr_changed_ok),
+		   atomic_read(&fscache_n_attr_changed_nobufs),
+		   atomic_read(&fscache_n_attr_changed_nomem),
+		   atomic_read(&fscache_n_attr_changed_calls));
+
+	seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
+		   atomic_read(&fscache_n_allocs),
+		   atomic_read(&fscache_n_allocs_ok),
+		   atomic_read(&fscache_n_allocs_wait),
+		   atomic_read(&fscache_n_allocs_nobufs));
+	seq_printf(m, "Allocs : ops=%u owt=%u\n",
+		   atomic_read(&fscache_n_alloc_ops),
+		   atomic_read(&fscache_n_alloc_op_waits));
+
+	seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
+		   " int=%u oom=%u\n",
+		   atomic_read(&fscache_n_retrievals),
+		   atomic_read(&fscache_n_retrievals_ok),
+		   atomic_read(&fscache_n_retrievals_wait),
+		   atomic_read(&fscache_n_retrievals_nodata),
+		   atomic_read(&fscache_n_retrievals_nobufs),
+		   atomic_read(&fscache_n_retrievals_intr),
+		   atomic_read(&fscache_n_retrievals_nomem));
+	seq_printf(m, "Retrvls: ops=%u owt=%u\n",
+		   atomic_read(&fscache_n_retrieval_ops),
+		   atomic_read(&fscache_n_retrieval_op_waits));
+
+	seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
+		   atomic_read(&fscache_n_stores),
+		   atomic_read(&fscache_n_stores_ok),
+		   atomic_read(&fscache_n_stores_again),
+		   atomic_read(&fscache_n_stores_nobufs),
+		   atomic_read(&fscache_n_stores_oom));
+	seq_printf(m, "Stores : ops=%u run=%u\n",
+		   atomic_read(&fscache_n_store_ops),
+		   atomic_read(&fscache_n_store_calls));
+
+	seq_printf(m, "Ops    : pend=%u run=%u enq=%u\n",
+		   atomic_read(&fscache_n_op_pend),
+		   atomic_read(&fscache_n_op_run),
+		   atomic_read(&fscache_n_op_enqueue));
+	seq_printf(m, "Ops    : dfr=%u rel=%u gc=%u\n",
+		   atomic_read(&fscache_n_op_deferred_release),
+		   atomic_read(&fscache_n_op_release),
+		   atomic_read(&fscache_n_op_gc));
+	return 0;
+}
+
+/*
+ * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
+ */
+static int fscache_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, fscache_stats_show, NULL);
+}
+
+const struct file_operations fscache_stats_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fscache_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fdff346e96fd..8b8eebc5614b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -224,7 +224,7 @@ static int invalid_nodeid(u64 nodeid)
 	return !nodeid || nodeid == FUSE_ROOT_ID;
 }
 
-struct dentry_operations fuse_dentry_operations = {
+const struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 };
 
@@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d9fdb7cec538..2b25133524a3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
-	req->out.argpages = 1;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
@@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	attr_ver = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	num_read = fuse_send_read(req, file, inode, pos, count, NULL);
@@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
@@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
 	inarg->flags = file ? file->f_flags : 0;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->in.argpages = 1;
 	req->in.numargs = 2;
 	if (fc->minor < 9)
 		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
@@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_offset = offset;
@@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	size_t count = 0;
 	int err;
 
+	req->in.argpages = 1;
 	req->page_offset = offset;
 
 	do {
@@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned nbytes, int write)
+			       unsigned *nbytesp, int write)
 {
+	unsigned nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
 
-	/* This doesn't work with nfsd */
-	if (!current->mm)
-		return -EPERM;
+	/* Special case for kernel I/O: can copy directly into the buffer */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		if (write)
+			req->in.args[1].value = (void *) user_addr;
+		else
+			req->out.args[0].value = (void *) user_addr;
+
+		return 0;
+	}
 
 	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
-	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+	npages = get_user_pages(current, current->mm, user_addr, npages, !write,
 				0, req->pages, NULL);
 	up_read(&current->mm->mmap_sem);
 	if (npages < 0)
@@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	req->num_pages = npages;
 	req->page_offset = offset;
+
+	if (write)
+		req->in.argpages = 1;
+	else
+		req->out.argpages = 1;
+
+	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+	*nbytesp = min(*nbytesp, nbytes);
+
 	return 0;
 }
 
@@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 
 	while (count) {
 		size_t nres;
-		size_t nbytes_limit = min(count, nmax);
-		size_t nbytes;
-		int err = fuse_get_user_pages(req, buf, nbytes_limit, !write);
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
-		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes_limit, nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes,
 					       current->files);
@@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
 
 	copy_highpage(tmp_page, page);
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
 	req->page_offset = 0;
@@ -1234,8 +1252,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
  * - sync(2)
  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
  */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	/*
 	 * Don't use page->mapping as it may become NULL from a
 	 * concurrent truncate.
@@ -1273,6 +1292,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Can't provide the coherency needed for MAP_SHARED */
+	if (vma->vm_flags & VM_MAYSHARE)
+		return -ENODEV;
+
+	return generic_file_mmap(file, vma);
+}
+
 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
 {
@@ -1465,7 +1493,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 	case SEEK_END:
 		retval = fuse_update_attributes(inode, NULL, file, NULL);
 		if (retval)
-			return retval;
+			goto exit;
 		offset += i_size_read(inode);
 		break;
 	case SEEK_CUR:
@@ -1479,6 +1507,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
+exit:
 	mutex_unlock(&inode->i_mutex);
 	return retval;
 }
@@ -1906,6 +1935,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= fuse_direct_read,
 	.write		= fuse_direct_write,
+	.mmap		= fuse_direct_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
 	.release	= fuse_release,
@@ -1915,7 +1945,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
-	/* no mmap and splice_read */
+	/* no splice_read */
 };
 
 static const struct address_space_operations fuse_file_aops  = {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5e64b815a5a1..6fc5aedaa0d5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -493,7 +493,7 @@ static inline u64 get_node_id(struct inode *inode)
 /** Device operations */
 extern const struct file_operations fuse_dev_operations;
 
-extern struct dentry_operations fuse_dentry_operations;
+extern const struct dentry_operations fuse_dentry_operations;
 
 /**
  * Get a filled in inode
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e747..e0b53aa7bbec 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
 	mode_t mode = inode->i_mode;
 	int error;
 
-	inode->i_mode = mode & ~current->fs->umask;
+	inode->i_mode = mode & ~current_umask();
 	if (!S_ISLNK(inode->i_mode))
 		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
 	if (acl) {
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index e563a6449811..3a981b7f64ca 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,10 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
 	depends on EXPERIMENTAL && (64BIT || LBD)
+	select DLM if GFS2_FS_LOCKING_DLM
+	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
+	select SYSFS if GFS2_FS_LOCKING_DLM
+	select IP_SCTP if DLM_SCTP
 	select FS_POSIX_ACL
 	select CRC32
 	help
@@ -18,17 +22,16 @@ config GFS2_FS
 	  the locking module below. Documentation and utilities for GFS2 can
 	  be found here: http://sources.redhat.com/cluster
 
-	  The "nolock" lock module is now built in to GFS2 by default.
+	  The "nolock" lock module is now built in to GFS2 by default. If
+	  you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
+	  networking.
 
 config GFS2_FS_LOCKING_DLM
-	tristate "GFS2 DLM locking module"
-	depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n)
-	select IP_SCTP if DLM_SCTP
-	select CONFIGFS_FS
-	select DLM
+	bool "GFS2 DLM locking"
+	depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG
 	help
 	  Multiple node locking module for GFS2
 
-	  Most users of GFS2 will require this module. It provides the locking
+	  Most users of GFS2 will require this. It provides the locking
 	  interface between GFS2 and the DLM, which is required to use GFS2
 	  in a cluster environment.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index c1b4ec6a9650..a851ea4bdf70 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,9 +1,9 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
-	glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
+	glops.o inode.o log.o lops.o main.o meta_io.o \
 	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
-obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
+gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
 
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index e335dceb6a4f..fa881bdc3d85 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -15,7 +15,6 @@
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -216,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (error)
 		return error;
 	if (!acl) {
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 11ffc56f1f81..3a5d3f883e10 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b7c8e5c70791..aef4d0c06748 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -60,7 +60,6 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/vmalloc.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index f114ba2b3557..dee9b03e5b37 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -14,7 +14,6 @@
 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 0d1c76d906ae..899763aed217 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6b983aef785d..3984e47d1d33 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -10,7 +10,6 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/delay.h>
 #include <linux/sort.h>
@@ -18,7 +17,6 @@
 #include <linux/kallsyms.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/list.h>
-#include <linux/lm_interface.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include <linux/rwsem.h>
@@ -155,13 +153,10 @@ static void glock_free(struct gfs2_glock *gl)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct inode *aspace = gl->gl_aspace;
 
-	if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
-		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
-
 	if (aspace)
 		gfs2_aspace_put(aspace);
 
-	kmem_cache_free(gfs2_glock_cachep, gl);
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
 }
 
 /**
@@ -172,6 +167,7 @@ static void glock_free(struct gfs2_glock *gl)
 
 static void gfs2_glock_hold(struct gfs2_glock *gl)
 {
+	GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
 	atomic_inc(&gl->gl_ref);
 }
 
@@ -211,17 +207,15 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 			atomic_dec(&lru_count);
 		}
 		spin_unlock(&lru_lock);
-		GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		glock_free(gl);
 		rv = 1;
 		goto out;
 	}
-	write_unlock(gl_lock_addr(gl->gl_hash));
 	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
 	if (atomic_read(&gl->gl_ref) == 2)
 		gfs2_glock_schedule_for_reclaim(gl);
+	write_unlock(gl_lock_addr(gl->gl_hash));
 out:
 	return rv;
 }
@@ -256,27 +250,6 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
 }
 
 /**
- * gfs2_glock_find() - Find glock by lock number
- * @sdp: The GFS2 superblock
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
-					  const struct lm_lockname *name)
-{
-	unsigned int hash = gl_hash(sdp, name);
-	struct gfs2_glock *gl;
-
-	read_lock(gl_lock_addr(hash));
-	gl = search_bucket(hash, sdp, name);
-	read_unlock(gl_lock_addr(hash));
-
-	return gl;
-}
-
-/**
  * may_grant - check if its ok to grant a new lock
  * @gl: The glock
  * @gh: The lock request which we wish to grant
@@ -523,7 +496,7 @@ out_locked:
 }
 
 static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-				 unsigned int cur_state, unsigned int req_state,
+				 unsigned int req_state,
 				 unsigned int flags)
 {
 	int ret = LM_OUT_ERROR;
@@ -532,7 +505,7 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
 		return req_state == LM_ST_UNLOCKED ? 0 : req_state;
 
 	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
 							 req_state, flags);
 	return ret;
 }
@@ -575,7 +548,7 @@ __acquires(&gl->gl_spin)
 	    gl->gl_state == LM_ST_DEFERRED) &&
 	    !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 		lck_flags |= LM_FLAG_TRY_1CB;
-	ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+	ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
 
 	if (!(ret & LM_OUT_ASYNC)) {
 		finish_xmote(gl, ret);
@@ -624,10 +597,11 @@ __acquires(&gl->gl_spin)
 
 	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
 
+	down_read(&gfs2_umount_flush_sem);
 	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_demote_state != gl->gl_state) {
 		if (find_first_holder(gl))
-			goto out;
+			goto out_unlock;
 		if (nonblock)
 			goto out_sched;
 		set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
@@ -638,23 +612,26 @@ __acquires(&gl->gl_spin)
 			gfs2_demote_wake(gl);
 		ret = do_promote(gl);
 		if (ret == 0)
-			goto out;
+			goto out_unlock;
 		if (ret == 2)
-			return;
+			goto out_sem;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 			do_error(gl, 0); /* Fail queued try locks */
 	}
 	do_xmote(gl, gh, gl->gl_target);
+out_sem:
+	up_read(&gfs2_umount_flush_sem);
 	return;
 
 out_sched:
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
-out:
+out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
+	goto out_sem;
 }
 
 static void glock_work_func(struct work_struct *work)
@@ -681,18 +658,6 @@ static void glock_work_func(struct work_struct *work)
 		gfs2_glock_put(gl);
 }
 
-static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		     void **lockp)
-{
-	int error = -EIO;
-	if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
-		return 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
-				sdp->sd_lockstruct.ls_lockspace, name, lockp);
-	return error;
-}
-
 /**
  * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
  * @sdp: The GFS2 superblock
@@ -719,10 +684,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl = search_bucket(hash, sdp, &name);
 	read_unlock(gl_lock_addr(hash));
 
-	if (gl || !create) {
-		*glp = gl;
+	*glp = gl;
+	if (gl)
 		return 0;
-	}
+	if (!create)
+		return -ENOENT;
 
 	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
 	if (!gl)
@@ -736,7 +702,9 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
 	gl->gl_hash = hash;
 	gl->gl_ops = glops;
-	gl->gl_stamp = jiffies;
+	snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number);
+	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
+	gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
@@ -753,10 +721,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		}
 	}
 
-	error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
-	if (error)
-		goto fail_aspace;
-
 	write_lock(gl_lock_addr(hash));
 	tmp = search_bucket(hash, sdp, &name);
 	if (tmp) {
@@ -772,9 +736,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 
 	return 0;
 
-fail_aspace:
-	if (gl->gl_aspace)
-		gfs2_aspace_put(gl->gl_aspace);
 fail:
 	kmem_cache_free(gfs2_glock_cachep, gl);
 	return error;
@@ -966,7 +927,7 @@ do_cancel:
 	if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
 		spin_unlock(&gl->gl_spin);
 		if (sdp->sd_lockstruct.ls_ops->lm_cancel)
-			sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+			sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
 		spin_lock(&gl->gl_spin);
 	}
 	return;
@@ -1051,7 +1012,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 			spin_lock(&gl->gl_spin);
 			clear_bit(GLF_LOCK, &gl->gl_flags);
 		}
-		gl->gl_stamp = jiffies;
 		if (list_empty(&gl->gl_holders) &&
 		    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1240,70 +1200,13 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 		gfs2_glock_dq_uninit(&ghs[x]);
 }
 
-static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
-{
-	int error = -EIO;
-	if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
-		return 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
-	return error;
-}
-
-/**
- * gfs2_lvb_hold - attach a LVB from a glock
- * @gl: The glock in question
- *
- */
-
-int gfs2_lvb_hold(struct gfs2_glock *gl)
-{
-	int error;
-
-	if (!atomic_read(&gl->gl_lvb_count)) {
-		error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
-		if (error) 
-			return error;
-		gfs2_glock_hold(gl);
-	}
-	atomic_inc(&gl->gl_lvb_count);
-
-	return 0;
-}
-
-/**
- * gfs2_lvb_unhold - detach a LVB from a glock
- * @gl: The glock in question
- *
- */
-
-void gfs2_lvb_unhold(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-
-	gfs2_glock_hold(gl);
-	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
-	if (atomic_dec_and_test(&gl->gl_lvb_count)) {
-		if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
-			sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
-		gl->gl_lvb = NULL;
-		gfs2_glock_put(gl);
-	}
-	gfs2_glock_put(gl);
-}
-
-static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
-			unsigned int state)
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 {
-	struct gfs2_glock *gl;
 	unsigned long delay = 0;
 	unsigned long holdtime;
 	unsigned long now = jiffies;
 
-	gl = gfs2_glock_find(sdp, name);
-	if (!gl)
-		return;
-
+	gfs2_glock_hold(gl);
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
 	if (time_before(now, holdtime))
 		delay = holdtime - now;
@@ -1317,74 +1220,33 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 		gfs2_glock_put(gl);
 }
 
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
-{
-	struct gfs2_jdesc *jd;
-
-	spin_lock(&sdp->sd_jindex_spin);
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_jid != jid)
-			continue;
-		jd->jd_dirty = 1;
-		break;
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-}
-
 /**
- * gfs2_glock_cb - Callback used by locking module
- * @sdp: Pointer to the superblock
- * @type: Type of callback
- * @data: Type dependent data pointer
+ * gfs2_glock_complete - Callback used by locking
+ * @gl: Pointer to the glock
+ * @ret: The return value from the dlm
  *
- * Called by the locking module when it wants to tell us something.
- * Either we need to drop a lock, one of our ASYNC requests completed, or
- * a journal from another client needs to be recovered.
  */
 
-void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
-	struct gfs2_sbd *sdp = cb_data;
-
-	switch (type) {
-	case LM_CB_NEED_E:
-		blocking_cb(sdp, data, LM_ST_UNLOCKED);
-		return;
-
-	case LM_CB_NEED_D:
-		blocking_cb(sdp, data, LM_ST_DEFERRED);
-		return;
-
-	case LM_CB_NEED_S:
-		blocking_cb(sdp, data, LM_ST_SHARED);
-		return;
-
-	case LM_CB_ASYNC: {
-		struct lm_async_cb *async = data;
-		struct gfs2_glock *gl;
-
-		down_read(&gfs2_umount_flush_sem);
-		gl = gfs2_glock_find(sdp, &async->lc_name);
-		if (gfs2_assert_warn(sdp, gl))
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	gl->gl_reply = ret;
+	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
+		struct gfs2_holder *gh;
+		spin_lock(&gl->gl_spin);
+		gh = find_first_waiter(gl);
+		if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
+		     (gl->gl_target != LM_ST_UNLOCKED)) ||
+		    ((ret & ~LM_OUT_ST_MASK) != 0))
+			set_bit(GLF_FROZEN, &gl->gl_flags);
+		spin_unlock(&gl->gl_spin);
+		if (test_bit(GLF_FROZEN, &gl->gl_flags))
 			return;
-		gl->gl_reply = async->lc_ret;
-		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-			gfs2_glock_put(gl);
-		up_read(&gfs2_umount_flush_sem);
-		return;
-	}
-
-	case LM_CB_NEED_RECOVERY:
-		gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
-		if (sdp->sd_recoverd_process)
-			wake_up_process(sdp->sd_recoverd_process);
-		return;
-
-	default:
-		gfs2_assert_warn(sdp, 0);
-		return;
 	}
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	gfs2_glock_hold(gl);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
 }
 
 /**
@@ -1515,6 +1377,25 @@ out:
 	return has_entries;
 }
 
+
+/**
+ * thaw_glock - thaw out a glock which has an unprocessed reply waiting
+ * @gl: The glock to thaw
+ *
+ * N.B. When we freeze a glock, we leave a ref to the glock outstanding,
+ * so this has to result in the ref count being dropped by one.
+ */
+
+static void thaw_glock(struct gfs2_glock *gl)
+{
+	if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+		return;
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	gfs2_glock_hold(gl);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
+}
+
 /**
  * clear_glock - look at a glock and see if we can free it from glock cache
  * @gl: the glock to look at
@@ -1540,6 +1421,20 @@ static void clear_glock(struct gfs2_glock *gl)
 }
 
 /**
+ * gfs2_glock_thaw - Thaw any frozen glocks
+ * @sdp: The super block
+ *
+ */
+
+void gfs2_glock_thaw(struct gfs2_sbd *sdp)
+{
+	unsigned x;
+
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(thaw_glock, sdp, x);
+}
+
+/**
  * gfs2_gl_hash_clear - Empty out the glock hash table
  * @sdp: the filesystem
  * @wait: wait until it's all gone
@@ -1619,7 +1514,7 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 	if (flags & LM_FLAG_NOEXP)
 		*p++ = 'e';
 	if (flags & LM_FLAG_ANY)
-		*p++ = 'a';
+		*p++ = 'A';
 	if (flags & LM_FLAG_PRIORITY)
 		*p++ = 'p';
 	if (flags & GL_ASYNC)
@@ -1683,6 +1578,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 		*p++ = 'i';
 	if (test_bit(GLF_REPLY_PENDING, gflags))
 		*p++ = 'r';
+	if (test_bit(GLF_INITIAL, gflags))
+		*p++ = 'I';
+	if (test_bit(GLF_FROZEN, gflags))
+		*p++ = 'F';
 	*p = 0;
 	return buf;
 }
@@ -1717,14 +1616,13 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
 		  gflags2str(gflags_buf, &gl->gl_flags),
 		  state2str(gl->gl_target),
 		  state2str(gl->gl_demote_state), dtime,
-		  atomic_read(&gl->gl_lvb_count),
 		  atomic_read(&gl->gl_ail_count),
 		  atomic_read(&gl->gl_ref));
 
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 543ec7ecfbda..a602a28f6f08 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -11,15 +11,130 @@
 #define __GLOCK_DOT_H__
 
 #include <linux/sched.h>
+#include <linux/parser.h>
 #include "incore.h"
 
-/* Flags for lock requests; used in gfs2_holder gh_flag field.
-   From lm_interface.h:
+/* Options for hostdata parser */
+
+enum {
+	Opt_jid,
+	Opt_id,
+	Opt_first,
+	Opt_nodir,
+	Opt_err,
+};
+
+/*
+ * lm_lockname types
+ */
+
+#define LM_TYPE_RESERVED	0x00
+#define LM_TYPE_NONDISK		0x01
+#define LM_TYPE_INODE		0x02
+#define LM_TYPE_RGRP		0x03
+#define LM_TYPE_META		0x04
+#define LM_TYPE_IOPEN		0x05
+#define LM_TYPE_FLOCK		0x06
+#define LM_TYPE_PLOCK		0x07
+#define LM_TYPE_QUOTA		0x08
+#define LM_TYPE_JOURNAL		0x09
+
+/*
+ * lm_lock() states
+ *
+ * SHARED is compatible with SHARED, not with DEFERRED or EX.
+ * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
+ */
+
+#define LM_ST_UNLOCKED		0
+#define LM_ST_EXCLUSIVE		1
+#define LM_ST_DEFERRED		2
+#define LM_ST_SHARED		3
+
+/*
+ * lm_lock() flags
+ *
+ * LM_FLAG_TRY
+ * Don't wait to acquire the lock if it can't be granted immediately.
+ *
+ * LM_FLAG_TRY_1CB
+ * Send one blocking callback if TRY is set and the lock is not granted.
+ *
+ * LM_FLAG_NOEXP
+ * GFS sets this flag on lock requests it makes while doing journal recovery.
+ * These special requests should not be blocked due to the recovery like
+ * ordinary locks would be.
+ *
+ * LM_FLAG_ANY
+ * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
+ * also be granted in SHARED.  The preferred state is whichever is compatible
+ * with other granted locks, or the specified state if no other locks exist.
+ *
+ * LM_FLAG_PRIORITY
+ * Override fairness considerations.  Suppose a lock is held in a shared state
+ * and there is a pending request for the deferred state.  A shared lock
+ * request with the priority flag would be allowed to bypass the deferred
+ * request and directly join the other shared lock.  A shared lock request
+ * without the priority flag might be forced to wait until the deferred
+ * requested had acquired and released the lock.
+ */
+
 #define LM_FLAG_TRY		0x00000001
 #define LM_FLAG_TRY_1CB		0x00000002
 #define LM_FLAG_NOEXP		0x00000004
 #define LM_FLAG_ANY		0x00000008
-#define LM_FLAG_PRIORITY	0x00000010 */
+#define LM_FLAG_PRIORITY	0x00000010
+#define GL_ASYNC		0x00000040
+#define GL_EXACT		0x00000080
+#define GL_SKIP			0x00000100
+#define GL_ATIME		0x00000200
+#define GL_NOCACHE		0x00000400
+  
+/*
+ * lm_lock() and lm_async_cb return flags
+ *
+ * LM_OUT_ST_MASK
+ * Masks the lower two bits of lock state in the returned value.
+ *
+ * LM_OUT_CANCELED
+ * The lock request was canceled.
+ *
+ * LM_OUT_ASYNC
+ * The result of the request will be returned in an LM_CB_ASYNC callback.
+ *
+ */
+
+#define LM_OUT_ST_MASK		0x00000003
+#define LM_OUT_CANCELED		0x00000008
+#define LM_OUT_ASYNC		0x00000080
+#define LM_OUT_ERROR		0x00000100
+
+/*
+ * lm_recovery_done() messages
+ */
+
+#define LM_RD_GAVEUP		308
+#define LM_RD_SUCCESS		309
+
+#define GLR_TRYFAILED		13
+
+struct lm_lockops {
+	const char *lm_proto_name;
+	int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
+ 	void (*lm_unmount) (struct gfs2_sbd *sdp);
+	void (*lm_withdraw) (struct gfs2_sbd *sdp);
+	void (*lm_put_lock) (struct kmem_cache *cachep, void *gl);
+	unsigned int (*lm_lock) (struct gfs2_glock *gl,
+				 unsigned int req_state, unsigned int flags);
+	void (*lm_cancel) (struct gfs2_glock *gl);
+	const match_table_t *lm_tokens;
+};
+
+#define LM_FLAG_TRY		0x00000001
+#define LM_FLAG_TRY_1CB		0x00000002
+#define LM_FLAG_NOEXP		0x00000004
+#define LM_FLAG_ANY		0x00000008
+#define LM_FLAG_PRIORITY	0x00000010
 
 #define GL_ASYNC		0x00000040
 #define GL_EXACT		0x00000080
@@ -128,10 +243,12 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
 int gfs2_lvb_hold(struct gfs2_glock *gl);
 void gfs2_lvb_unhold(struct gfs2_glock *gl);
 
-void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
+void gfs2_glock_thaw(struct gfs2_sbd *sdp);
 
 int __init gfs2_glock_init(void);
 void gfs2_glock_exit(void);
@@ -141,4 +258,6 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
 int gfs2_register_debugfs(void);
 void gfs2_unregister_debugfs(void);
 
+extern const struct lm_lockops gfs2_dlm_ops;
+
 #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 8522d3aa64fc..bf23a62aa925 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -12,7 +12,6 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/bio.h>
 
 #include "gfs2.h"
@@ -38,20 +37,25 @@
 static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	unsigned int blocks;
 	struct list_head *head = &gl->gl_ail_list;
 	struct gfs2_bufdata *bd;
 	struct buffer_head *bh;
-	int error;
+	struct gfs2_trans tr;
 
-	blocks = atomic_read(&gl->gl_ail_count);
-	if (!blocks)
-		return;
+	memset(&tr, 0, sizeof(tr));
+	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
 
-	error = gfs2_trans_begin(sdp, 0, blocks);
-	if (gfs2_assert_withdraw(sdp, !error))
+	if (!tr.tr_revokes)
 		return;
 
+	/* A shortened, inline version of gfs2_trans_begin() */
+	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
+	tr.tr_ip = (unsigned long)__builtin_return_address(0);
+	INIT_LIST_HEAD(&tr.tr_list_buf);
+	gfs2_log_reserve(sdp, tr.tr_reserved);
+	BUG_ON(current->journal_info);
+	current->journal_info = &tr;
+
 	gfs2_log_lock(sdp);
 	while (!list_empty(head)) {
 		bd = list_entry(head->next, struct gfs2_bufdata,
@@ -72,29 +76,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 }
 
 /**
- * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_pte_inval(struct gfs2_glock *gl)
-{
-	struct gfs2_inode *ip;
-	struct inode *inode;
-
-	ip = gl->gl_object;
-	inode = &ip->i_inode;
-	if (!ip || !S_ISREG(inode->i_mode))
-		return;
-
-	unmap_shared_mapping_range(inode->i_mapping, 0, 0);
-	if (test_bit(GIF_SW_PAGED, &ip->i_flags))
-		set_bit(GLF_DIRTY, &gl->gl_flags);
-
-}
-
-/**
- * meta_go_sync - sync out the metadata for this glock
+ * rgrp_go_sync - sync out the metadata for this glock
  * @gl: the glock
  *
  * Called when demoting or unlocking an EX glock.  We must flush
@@ -102,36 +84,42 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
  * not return to caller to demote/unlock the glock until I/O is complete.
  */
 
-static void meta_go_sync(struct gfs2_glock *gl)
+static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-	if (gl->gl_state != LM_ST_EXCLUSIVE)
+	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	int error;
+
+	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
+	BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
 
-	if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
-		gfs2_log_flush(gl->gl_sbd, gl);
-		gfs2_meta_sync(gl);
-		gfs2_ail_empty_gl(gl);
-	}
+	gfs2_log_flush(gl->gl_sbd, gl);
+	filemap_fdatawrite(metamapping);
+	error = filemap_fdatawait(metamapping);
+        mapping_set_error(metamapping, error);
+	gfs2_ail_empty_gl(gl);
 }
 
 /**
- * meta_go_inval - invalidate the metadata for this glock
+ * rgrp_go_inval - invalidate the metadata for this glock
  * @gl: the glock
  * @flags:
  *
+ * We never used LM_ST_DEFERRED with resource groups, so that we
+ * should always see the metadata flag set here.
+ *
  */
 
-static void meta_go_inval(struct gfs2_glock *gl, int flags)
+static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-	if (!(flags & DIO_METADATA))
-		return;
+	struct address_space *mapping = gl->gl_aspace->i_mapping;
 
-	gfs2_meta_inval(gl);
-	if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex))
-		gl->gl_sbd->sd_rindex_uptodate = 0;
-	else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
-		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+	BUG_ON(!(flags & DIO_METADATA));
+	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+	truncate_inode_pages(mapping, 0);
 
+	if (gl->gl_object) {
+		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
 		rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
 	}
 }
@@ -148,48 +136,54 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	struct address_space *metamapping = gl->gl_aspace->i_mapping;
 	int error;
 
-	if (gl->gl_state != LM_ST_UNLOCKED)
-		gfs2_pte_inval(gl);
-	if (gl->gl_state != LM_ST_EXCLUSIVE)
-		return;
-
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
 		ip = NULL;
+	if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+		unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
+		return;
 
-	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		gfs2_log_flush(gl->gl_sbd, gl);
-		filemap_fdatawrite(metamapping);
-		if (ip) {
-			struct address_space *mapping = ip->i_inode.i_mapping;
-			filemap_fdatawrite(mapping);
-			error = filemap_fdatawait(mapping);
-			mapping_set_error(mapping, error);
-		}
-		error = filemap_fdatawait(metamapping);
-		mapping_set_error(metamapping, error);
-		clear_bit(GLF_DIRTY, &gl->gl_flags);
-		gfs2_ail_empty_gl(gl);
+	BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
+
+	gfs2_log_flush(gl->gl_sbd, gl);
+	filemap_fdatawrite(metamapping);
+	if (ip) {
+		struct address_space *mapping = ip->i_inode.i_mapping;
+		filemap_fdatawrite(mapping);
+		error = filemap_fdatawait(mapping);
+		mapping_set_error(mapping, error);
 	}
+	error = filemap_fdatawait(metamapping);
+	mapping_set_error(metamapping, error);
+	gfs2_ail_empty_gl(gl);
 }
 
 /**
  * inode_go_inval - prepare a inode glock to be released
  * @gl: the glock
  * @flags:
+ * 
+ * Normally we invlidate everything, but if we are moving into
+ * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we
+ * can keep hold of the metadata, since it won't have changed.
  *
  */
 
 static void inode_go_inval(struct gfs2_glock *gl, int flags)
 {
 	struct gfs2_inode *ip = gl->gl_object;
-	int meta = (flags & DIO_METADATA);
 
-	if (meta) {
-		gfs2_meta_inval(gl);
+	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+
+	if (flags & DIO_METADATA) {
+		struct address_space *mapping = gl->gl_aspace->i_mapping;
+		truncate_inode_pages(mapping, 0);
 		if (ip)
 			set_bit(GIF_INVALID, &ip->i_flags);
 	}
 
+	if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
+		gl->gl_sbd->sd_rindex_uptodate = 0;
 	if (ip && S_ISREG(ip->i_inode.i_mode))
 		truncate_inode_pages(ip->i_inode.i_mapping, 0);
 }
@@ -390,20 +384,7 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
 	return 0;
 }
 
-/**
- * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-
-static int quota_go_demote_ok(const struct gfs2_glock *gl)
-{
-	return !atomic_read(&gl->gl_lvb_count);
-}
-
 const struct gfs2_glock_operations gfs2_meta_glops = {
-	.go_xmote_th = meta_go_sync,
 	.go_type = LM_TYPE_META,
 };
 
@@ -418,8 +399,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
-	.go_xmote_th = meta_go_sync,
-	.go_inval = meta_go_inval,
+	.go_xmote_th = rgrp_go_sync,
+	.go_inval = rgrp_go_inval,
 	.go_demote_ok = rgrp_go_demote_ok,
 	.go_lock = rgrp_go_lock,
 	.go_unlock = rgrp_go_unlock,
@@ -448,7 +429,6 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
 };
 
 const struct gfs2_glock_operations gfs2_quota_glops = {
-	.go_demote_ok = quota_go_demote_ok,
 	.go_type = LM_TYPE_QUOTA,
 };
 
@@ -456,3 +436,15 @@ const struct gfs2_glock_operations gfs2_journal_glops = {
 	.go_type = LM_TYPE_JOURNAL,
 };
 
+const struct gfs2_glock_operations *gfs2_glops_list[] = {
+	[LM_TYPE_META] = &gfs2_meta_glops,
+	[LM_TYPE_INODE] = &gfs2_inode_glops,
+	[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
+	[LM_TYPE_NONDISK] = &gfs2_trans_glops,
+	[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
+	[LM_TYPE_FLOCK] = &gfs2_flock_glops,
+	[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
+	[LM_TYPE_QUOTA] = &gfs2_quota_glops,
+	[LM_TYPE_JOURNAL] = &gfs2_journal_glops,
+};
+
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index a1d9b5b024e6..b3aa2e3210fd 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -21,5 +21,6 @@ extern const struct gfs2_glock_operations gfs2_flock_glops;
 extern const struct gfs2_glock_operations gfs2_nondisk_glops;
 extern const struct gfs2_glock_operations gfs2_quota_glops;
 extern const struct gfs2_glock_operations gfs2_journal_glops;
+extern const struct gfs2_glock_operations *gfs2_glops_list[];
 
 #endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 608849d00021..399d1b978049 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,8 @@
 
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+#include <linux/dlm.h>
+#include <linux/buffer_head.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -26,6 +28,7 @@ struct gfs2_trans;
 struct gfs2_ail;
 struct gfs2_jdesc;
 struct gfs2_sbd;
+struct lm_lockops;
 
 typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
 
@@ -121,6 +124,28 @@ struct gfs2_bufdata {
 	struct list_head bd_ail_gl_list;
 };
 
+/*
+ * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
+ * prefix of lock_dlm_ gets awkward.
+ */
+
+#define GDLM_STRNAME_BYTES	25
+#define GDLM_LVB_SIZE		32
+
+enum {
+	DFL_BLOCK_LOCKS		= 0,
+};
+
+struct lm_lockname {
+	u64 ln_number;
+	unsigned int ln_type;
+};
+
+#define lm_name_equal(name1, name2) \
+        (((name1)->ln_number == (name2)->ln_number) && \
+         ((name1)->ln_type == (name2)->ln_type))
+
+
 struct gfs2_glock_operations {
 	void (*go_xmote_th) (struct gfs2_glock *gl);
 	int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
@@ -162,6 +187,8 @@ enum {
 	GLF_LFLUSH			= 7,
 	GLF_INVALIDATE_IN_PROGRESS	= 8,
 	GLF_REPLY_PENDING		= 9,
+	GLF_INITIAL			= 10,
+	GLF_FROZEN			= 11,
 };
 
 struct gfs2_glock {
@@ -176,16 +203,15 @@ struct gfs2_glock {
 	unsigned int gl_target;
 	unsigned int gl_reply;
 	unsigned int gl_hash;
+	unsigned int gl_req;
 	unsigned int gl_demote_state; /* state requested by remote node */
 	unsigned long gl_demote_time; /* time of first demote request */
 	struct list_head gl_holders;
 
 	const struct gfs2_glock_operations *gl_ops;
-	void *gl_lock;
-	char *gl_lvb;
-	atomic_t gl_lvb_count;
-
-	unsigned long gl_stamp;
+	char gl_strname[GDLM_STRNAME_BYTES];
+	struct dlm_lksb gl_lksb;
+	char gl_lvb[32];
 	unsigned long gl_tchange;
 	void *gl_object;
 
@@ -283,7 +309,9 @@ enum {
 
 struct gfs2_quota_data {
 	struct list_head qd_list;
-	unsigned int qd_count;
+	struct list_head qd_reclaim;
+
+	atomic_t qd_count;
 
 	u32 qd_id;
 	unsigned long qd_flags;		/* QDF_... */
@@ -303,7 +331,6 @@ struct gfs2_quota_data {
 
 	u64 qd_sync_gen;
 	unsigned long qd_last_warn;
-	unsigned long qd_last_touched;
 };
 
 struct gfs2_trans {
@@ -390,7 +417,7 @@ struct gfs2_args {
 	unsigned int ar_suiddir:1;		/* suiddir support */
 	unsigned int ar_data:2;			/* ordered/writeback */
 	unsigned int ar_meta:1;			/* mount metafs */
-	unsigned int ar_num_glockd;		/* Number of glockd threads */
+	unsigned int ar_discard:1;		/* discard requests */
 };
 
 struct gfs2_tune {
@@ -406,7 +433,6 @@ struct gfs2_tune {
 	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
 	unsigned int gt_quota_scale_num; /* Numerator */
 	unsigned int gt_quota_scale_den; /* Denominator */
-	unsigned int gt_quota_cache_secs;
 	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
@@ -445,6 +471,31 @@ struct gfs2_sb_host {
 
 	char sb_lockproto[GFS2_LOCKNAME_LEN];
 	char sb_locktable[GFS2_LOCKNAME_LEN];
+	u8 sb_uuid[16];
+};
+
+/*
+ * lm_mount() return values
+ *
+ * ls_jid - the journal ID this node should use
+ * ls_first - this node is the first to mount the file system
+ * ls_lockspace - lock module's context for this file system
+ * ls_ops - lock module's functions
+ */
+
+struct lm_lockstruct {
+	u32 ls_id;
+	unsigned int ls_jid;
+	unsigned int ls_first;
+	unsigned int ls_first_done;
+	unsigned int ls_nodir;
+	const struct lm_lockops *ls_ops;
+	unsigned long ls_flags;
+	dlm_lockspace_t *ls_dlm;
+
+	int ls_recover_jid;
+	int ls_recover_jid_done;
+	int ls_recover_jid_status;
 };
 
 struct gfs2_sbd {
@@ -520,7 +571,6 @@ struct gfs2_sbd {
 	spinlock_t sd_jindex_spin;
 	struct mutex sd_jindex_mutex;
 	unsigned int sd_journals;
-	unsigned long sd_jindex_refresh_time;
 
 	struct gfs2_jdesc *sd_jdesc;
 	struct gfs2_holder sd_journal_gh;
@@ -540,7 +590,6 @@ struct gfs2_sbd {
 
 	struct list_head sd_quota_list;
 	atomic_t sd_quota_count;
-	spinlock_t sd_quota_spin;
 	struct mutex sd_quota_mutex;
 	wait_queue_head_t sd_quota_wait;
 	struct list_head sd_trunc_list;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3b87c188da41..7b277d449155 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -16,7 +16,6 @@
 #include <linux/sort.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/security.h>
 #include <linux/time.h>
 
@@ -137,16 +136,16 @@ void gfs2_set_iop(struct inode *inode)
 
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
-		if (sdp->sd_args.ar_localflocks)
-			inode->i_fop = &gfs2_file_fops_nolock;
+		if (gfs2_localflocks(sdp))
+			inode->i_fop = gfs2_file_fops_nolock;
 		else
-			inode->i_fop = &gfs2_file_fops;
+			inode->i_fop = gfs2_file_fops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
-		if (sdp->sd_args.ar_localflocks)
-			inode->i_fop = &gfs2_dir_fops_nolock;
+		if (gfs2_localflocks(sdp))
+			inode->i_fop = gfs2_dir_fops_nolock;
 		else
-			inode->i_fop = &gfs2_dir_fops;
+			inode->i_fop = gfs2_dir_fops;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d5329364cdff..dca4fee3078b 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,12 +101,26 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-extern const struct file_operations gfs2_file_fops_nolock;
-extern const struct file_operations gfs2_dir_fops_nolock;
+extern const struct file_operations *gfs2_file_fops_nolock;
+extern const struct file_operations *gfs2_dir_fops_nolock;
 
 extern void gfs2_set_inode_flags(struct inode *inode);
+ 
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+extern const struct file_operations *gfs2_file_fops;
+extern const struct file_operations *gfs2_dir_fops;
+static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
+{
+	return sdp->sd_args.ar_localflocks;
+}
+#else /* Single node only */
+#define gfs2_file_fops NULL
+#define gfs2_dir_fops NULL
+static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
+{
+	return 1;
+}
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
 
 #endif /* __INODE_DOT_H__ */
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
new file mode 100644
index 000000000000..46df988323bc
--- /dev/null
+++ b/fs/gfs2/lock_dlm.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/fs.h>
+#include <linux/dlm.h>
+#include <linux/types.h>
+#include <linux/gfs2_ondisk.h>
+
+#include "incore.h"
+#include "glock.h"
+#include "util.h"
+
+
+static void gdlm_ast(void *arg)
+{
+	struct gfs2_glock *gl = arg;
+	unsigned ret = gl->gl_state;
+
+	BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
+
+	if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID)
+		memset(gl->gl_lvb, 0, GDLM_LVB_SIZE);
+
+	switch (gl->gl_lksb.sb_status) {
+	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
+		kmem_cache_free(gfs2_glock_cachep, gl);
+		return;
+	case -DLM_ECANCEL: /* Cancel while getting lock */
+		ret |= LM_OUT_CANCELED;
+		goto out;
+	case -EAGAIN: /* Try lock fails */
+		goto out;
+	case -EINVAL: /* Invalid */
+	case -ENOMEM: /* Out of memory */
+		ret |= LM_OUT_ERROR;
+		goto out;
+	case 0: /* Success */
+		break;
+	default: /* Something unexpected */
+		BUG();
+	}
+
+	ret = gl->gl_req;
+	if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
+		if (gl->gl_req == LM_ST_SHARED)
+			ret = LM_ST_DEFERRED;
+		else if (gl->gl_req == LM_ST_DEFERRED)
+			ret = LM_ST_SHARED;
+		else
+			BUG();
+	}
+
+	set_bit(GLF_INITIAL, &gl->gl_flags);
+	gfs2_glock_complete(gl, ret);
+	return;
+out:
+	if (!test_bit(GLF_INITIAL, &gl->gl_flags))
+		gl->gl_lksb.sb_lkid = 0;
+	gfs2_glock_complete(gl, ret);
+}
+
+static void gdlm_bast(void *arg, int mode)
+{
+	struct gfs2_glock *gl = arg;
+
+	switch (mode) {
+	case DLM_LOCK_EX:
+		gfs2_glock_cb(gl, LM_ST_UNLOCKED);
+		break;
+	case DLM_LOCK_CW:
+		gfs2_glock_cb(gl, LM_ST_DEFERRED);
+		break;
+	case DLM_LOCK_PR:
+		gfs2_glock_cb(gl, LM_ST_SHARED);
+		break;
+	default:
+		printk(KERN_ERR "unknown bast mode %d", mode);
+		BUG();
+	}
+}
+
+/* convert gfs lock-state to dlm lock-mode */
+
+static int make_mode(const unsigned int lmstate)
+{
+	switch (lmstate) {
+	case LM_ST_UNLOCKED:
+		return DLM_LOCK_NL;
+	case LM_ST_EXCLUSIVE:
+		return DLM_LOCK_EX;
+	case LM_ST_DEFERRED:
+		return DLM_LOCK_CW;
+	case LM_ST_SHARED:
+		return DLM_LOCK_PR;
+	}
+	printk(KERN_ERR "unknown LM state %d", lmstate);
+	BUG();
+	return -1;
+}
+
+static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
+		      const int req)
+{
+	u32 lkf = 0;
+
+	if (gfs_flags & LM_FLAG_TRY)
+		lkf |= DLM_LKF_NOQUEUE;
+
+	if (gfs_flags & LM_FLAG_TRY_1CB) {
+		lkf |= DLM_LKF_NOQUEUE;
+		lkf |= DLM_LKF_NOQUEUEBAST;
+	}
+
+	if (gfs_flags & LM_FLAG_PRIORITY) {
+		lkf |= DLM_LKF_NOORDER;
+		lkf |= DLM_LKF_HEADQUE;
+	}
+
+	if (gfs_flags & LM_FLAG_ANY) {
+		if (req == DLM_LOCK_PR)
+			lkf |= DLM_LKF_ALTCW;
+		else if (req == DLM_LOCK_CW)
+			lkf |= DLM_LKF_ALTPR;
+		else
+			BUG();
+	}
+
+	if (lkid != 0) 
+		lkf |= DLM_LKF_CONVERT;
+
+	lkf |= DLM_LKF_VALBLK;
+
+	return lkf;
+}
+
+static unsigned int gdlm_lock(struct gfs2_glock *gl,
+			      unsigned int req_state, unsigned int flags)
+{
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	int error;
+	int req;
+	u32 lkf;
+
+	gl->gl_req = req_state;
+	req = make_mode(req_state);
+	lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
+
+	/*
+	 * Submit the actual lock request.
+	 */
+
+	error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+			 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+	if (error == -EAGAIN)
+		return 0;
+	if (error)
+		return LM_OUT_ERROR;
+	return LM_OUT_ASYNC;
+}
+
+static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr)
+{
+	struct gfs2_glock *gl = ptr;
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	int error;
+
+	if (gl->gl_lksb.sb_lkid == 0) {
+		kmem_cache_free(cachep, gl);
+		return;
+	}
+
+	error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
+			   NULL, gl);
+	if (error) {
+		printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n",
+		       gl->gl_name.ln_type,
+		       (unsigned long long)gl->gl_name.ln_number, error);
+		return;
+	}
+}
+
+static void gdlm_cancel(struct gfs2_glock *gl)
+{
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
+}
+
+static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int error;
+
+	if (fsname == NULL) {
+		fs_info(sdp, "no fsname found\n");
+		return -EINVAL;
+	}
+
+	error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
+				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
+				  (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
+				  GDLM_LVB_SIZE);
+	if (error)
+		printk(KERN_ERR "dlm_new_lockspace error %d", error);
+
+	return error;
+}
+
+static void gdlm_unmount(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	if (ls->ls_dlm) {
+		dlm_release_lockspace(ls->ls_dlm, 2);
+		ls->ls_dlm = NULL;
+	}
+}
+
+static const match_table_t dlm_tokens = {
+	{ Opt_jid, "jid=%d"},
+	{ Opt_id, "id=%d"},
+	{ Opt_first, "first=%d"},
+	{ Opt_nodir, "nodir=%d"},
+	{ Opt_err, NULL },
+};
+
+const struct lm_lockops gfs2_dlm_ops = {
+	.lm_proto_name = "lock_dlm",
+	.lm_mount = gdlm_mount,
+	.lm_unmount = gdlm_unmount,
+	.lm_put_lock = gdlm_put_lock,
+	.lm_lock = gdlm_lock,
+	.lm_cancel = gdlm_cancel,
+	.lm_tokens = &dlm_tokens,
+};
+
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
deleted file mode 100644
index 523243a13a21..000000000000
--- a/fs/gfs2/locking.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/kmod.h>
-#include <linux/fs.h>
-#include <linux/delay.h>
-#include <linux/lm_interface.h>
-
-struct lmh_wrapper {
-	struct list_head lw_list;
-	const struct lm_lockops *lw_ops;
-};
-
-static int nolock_mount(char *table_name, char *host_data,
-			lm_callback_t cb, void *cb_data,
-			unsigned int min_lvb_size, int flags,
-			struct lm_lockstruct *lockstruct,
-			struct kobject *fskobj);
-
-/* List of registered low-level locking protocols.  A file system selects one
-   of them by name at mount time, e.g. lock_nolock, lock_dlm. */
-
-static const struct lm_lockops nolock_ops = {
-	.lm_proto_name = "lock_nolock",
-	.lm_mount = nolock_mount,
-};
-
-static struct lmh_wrapper nolock_proto  = {
-	.lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
-	.lw_ops = &nolock_ops,
-};
-
-static LIST_HEAD(lmh_list);
-static DEFINE_MUTEX(lmh_lock);
-
-static int nolock_mount(char *table_name, char *host_data,
-			lm_callback_t cb, void *cb_data,
-			unsigned int min_lvb_size, int flags,
-			struct lm_lockstruct *lockstruct,
-			struct kobject *fskobj)
-{
-	char *c;
-	unsigned int jid;
-
-	c = strstr(host_data, "jid=");
-	if (!c)
-		jid = 0;
-	else {
-		c += 4;
-		sscanf(c, "%u", &jid);
-	}
-
-	lockstruct->ls_jid = jid;
-	lockstruct->ls_first = 1;
-	lockstruct->ls_lvb_size = min_lvb_size;
-	lockstruct->ls_ops = &nolock_ops;
-	lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-
-	return 0;
-}
-
-/**
- * gfs2_register_lockproto - Register a low-level locking protocol
- * @proto: the protocol definition
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-int gfs2_register_lockproto(const struct lm_lockops *proto)
-{
-	struct lmh_wrapper *lw;
-
-	mutex_lock(&lmh_lock);
-
-	list_for_each_entry(lw, &lmh_list, lw_list) {
-		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
-			mutex_unlock(&lmh_lock);
-			printk(KERN_INFO "GFS2: protocol %s already exists\n",
-			       proto->lm_proto_name);
-			return -EEXIST;
-		}
-	}
-
-	lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
-	if (!lw) {
-		mutex_unlock(&lmh_lock);
-		return -ENOMEM;
-	}
-
-	lw->lw_ops = proto;
-	list_add(&lw->lw_list, &lmh_list);
-
-	mutex_unlock(&lmh_lock);
-
-	return 0;
-}
-
-/**
- * gfs2_unregister_lockproto - Unregister a low-level locking protocol
- * @proto: the protocol definition
- *
- */
-
-void gfs2_unregister_lockproto(const struct lm_lockops *proto)
-{
-	struct lmh_wrapper *lw;
-
-	mutex_lock(&lmh_lock);
-
-	list_for_each_entry(lw, &lmh_list, lw_list) {
-		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
-			list_del(&lw->lw_list);
-			mutex_unlock(&lmh_lock);
-			kfree(lw);
-			return;
-		}
-	}
-
-	mutex_unlock(&lmh_lock);
-
-	printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
-	       proto->lm_proto_name);
-}
-
-/**
- * gfs2_mount_lockproto - Mount a lock protocol
- * @proto_name - the name of the protocol
- * @table_name - the name of the lock space
- * @host_data - data specific to this host
- * @cb - the callback to the code using the lock module
- * @sdp - The GFS2 superblock
- * @min_lvb_size - the mininum LVB size that the caller can deal with
- * @flags - LM_MFLAG_*
- * @lockstruct - a structure returned describing the mount
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
-			 lm_callback_t cb, void *cb_data,
-			 unsigned int min_lvb_size, int flags,
-			 struct lm_lockstruct *lockstruct,
-			 struct kobject *fskobj)
-{
-	struct lmh_wrapper *lw = NULL;
-	int try = 0;
-	int error, found;
-
-
-retry:
-	mutex_lock(&lmh_lock);
-
-	if (list_empty(&nolock_proto.lw_list))
-		list_add(&nolock_proto.lw_list, &lmh_list);
-
-	found = 0;
-	list_for_each_entry(lw, &lmh_list, lw_list) {
-		if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
-			found = 1;
-			break;
-		}
-	}
-
-	if (!found) {
-		if (!try && capable(CAP_SYS_MODULE)) {
-			try = 1;
-			mutex_unlock(&lmh_lock);
-			request_module(proto_name);
-			goto retry;
-		}
-		printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
-		error = -ENOENT;
-		goto out;
-	}
-
-	if (lw->lw_ops->lm_owner &&
-	    !try_module_get(lw->lw_ops->lm_owner)) {
-		try = 0;
-		mutex_unlock(&lmh_lock);
-		msleep(1000);
-		goto retry;
-	}
-
-	error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
-				     min_lvb_size, flags, lockstruct, fskobj);
-	if (error)
-		module_put(lw->lw_ops->lm_owner);
-out:
-	mutex_unlock(&lmh_lock);
-	return error;
-}
-
-void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
-{
-	mutex_lock(&lmh_lock);
-	if (lockstruct->ls_ops->lm_unmount)
-		lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
-	if (lockstruct->ls_ops->lm_owner)
-		module_put(lockstruct->ls_ops->lm_owner);
-	mutex_unlock(&lmh_lock);
-}
-
-/**
- * gfs2_withdraw_lockproto - abnormally unmount a lock module
- * @lockstruct: the lockstruct passed into mount
- *
- */
-
-void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
-{
-	mutex_lock(&lmh_lock);
-	lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
-	if (lockstruct->ls_ops->lm_owner)
-		module_put(lockstruct->ls_ops->lm_owner);
-	mutex_unlock(&lmh_lock);
-}
-
-EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
-EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
-
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
deleted file mode 100644
index 2609bb6cd013..000000000000
--- a/fs/gfs2/locking/dlm/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
-lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o
-
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
deleted file mode 100644
index 2482c9047505..000000000000
--- a/fs/gfs2/locking/dlm/lock.c
+++ /dev/null
@@ -1,708 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include "lock_dlm.h"
-
-static char junk_lvb[GDLM_LVB_SIZE];
-
-
-/* convert dlm lock-mode to gfs lock-state */
-
-static s16 gdlm_make_lmstate(s16 dlmmode)
-{
-	switch (dlmmode) {
-	case DLM_LOCK_IV:
-	case DLM_LOCK_NL:
-		return LM_ST_UNLOCKED;
-	case DLM_LOCK_EX:
-		return LM_ST_EXCLUSIVE;
-	case DLM_LOCK_CW:
-		return LM_ST_DEFERRED;
-	case DLM_LOCK_PR:
-		return LM_ST_SHARED;
-	}
-	gdlm_assert(0, "unknown DLM mode %d", dlmmode);
-	return -1;
-}
-
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
-   thread gets to it. */
-
-static void queue_submit(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-
-	spin_lock(&ls->async_lock);
-	list_add_tail(&lp->delay_list, &ls->submit);
-	spin_unlock(&ls->async_lock);
-	wake_up(&ls->thread_wait);
-}
-
-static void wake_up_ast(struct gdlm_lock *lp)
-{
-	clear_bit(LFL_AST_WAIT, &lp->flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-
-static void gdlm_delete_lp(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-
-	spin_lock(&ls->async_lock);
-	if (!list_empty(&lp->delay_list))
-		list_del_init(&lp->delay_list);
-	ls->all_locks_count--;
-	spin_unlock(&ls->async_lock);
-
-	kfree(lp);
-}
-
-static void gdlm_queue_delayed(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-
-	spin_lock(&ls->async_lock);
-	list_add_tail(&lp->delay_list, &ls->delayed);
-	spin_unlock(&ls->async_lock);
-}
-
-static void process_complete(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-	struct lm_async_cb acb;
-
-	memset(&acb, 0, sizeof(acb));
-
-	if (lp->lksb.sb_status == -DLM_ECANCEL) {
-		log_info("complete dlm cancel %x,%llx flags %lx",
-		 	 lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number,
-			 lp->flags);
-
-		lp->req = lp->cur;
-		acb.lc_ret |= LM_OUT_CANCELED;
-		if (lp->cur == DLM_LOCK_IV)
-			lp->lksb.sb_lkid = 0;
-		goto out;
-	}
-
-	if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
-		if (lp->lksb.sb_status != -DLM_EUNLOCK) {
-			log_info("unlock sb_status %d %x,%llx flags %lx",
-				 lp->lksb.sb_status, lp->lockname.ln_type,
-				 (unsigned long long)lp->lockname.ln_number,
-				 lp->flags);
-			return;
-		}
-
-		lp->cur = DLM_LOCK_IV;
-		lp->req = DLM_LOCK_IV;
-		lp->lksb.sb_lkid = 0;
-
-		if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
-			gdlm_delete_lp(lp);
-			return;
-		}
-		goto out;
-	}
-
-	if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
-		memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-
-	if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
-		if (lp->req == DLM_LOCK_PR)
-			lp->req = DLM_LOCK_CW;
-		else if (lp->req == DLM_LOCK_CW)
-			lp->req = DLM_LOCK_PR;
-	}
-
-	/*
-	 * A canceled lock request.  The lock was just taken off the delayed
-	 * list and was never even submitted to dlm.
-	 */
-
-	if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
-		log_info("complete internal cancel %x,%llx",
-		 	 lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number);
-		lp->req = lp->cur;
-		acb.lc_ret |= LM_OUT_CANCELED;
-		goto out;
-	}
-
-	/*
-	 * An error occured.
-	 */
-
-	if (lp->lksb.sb_status) {
-		/* a "normal" error */
-		if ((lp->lksb.sb_status == -EAGAIN) &&
-		    (lp->lkf & DLM_LKF_NOQUEUE)) {
-			lp->req = lp->cur;
-			if (lp->cur == DLM_LOCK_IV)
-				lp->lksb.sb_lkid = 0;
-			goto out;
-		}
-
-		/* this could only happen with cancels I think */
-		log_info("ast sb_status %d %x,%llx flags %lx",
-			 lp->lksb.sb_status, lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number,
-			 lp->flags);
-		return;
-	}
-
-	/*
-	 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
-	 */
-
-	if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
-		wake_up_ast(lp);
-		return;
-	}
-
-	/*
-	 * A lock has been demoted to NL because it initially completed during
-	 * BLOCK_LOCKS.  Now it must be requested in the originally requested
-	 * mode.
-	 */
-
-	if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
-		gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
-			    lp->lockname.ln_type,
-			    (unsigned long long)lp->lockname.ln_number);
-		gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
-			    lp->lockname.ln_type,
-			    (unsigned long long)lp->lockname.ln_number);
-
-		lp->cur = DLM_LOCK_NL;
-		lp->req = lp->prev_req;
-		lp->prev_req = DLM_LOCK_IV;
-		lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
-		set_bit(LFL_NOCACHE, &lp->flags);
-
-		if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-		    !test_bit(LFL_NOBLOCK, &lp->flags))
-			gdlm_queue_delayed(lp);
-		else
-			queue_submit(lp);
-		return;
-	}
-
-	/*
-	 * A request is granted during dlm recovery.  It may be granted
-	 * because the locks of a failed node were cleared.  In that case,
-	 * there may be inconsistent data beneath this lock and we must wait
-	 * for recovery to complete to use it.  When gfs recovery is done this
-	 * granted lock will be converted to NL and then reacquired in this
-	 * granted state.
-	 */
-
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-	    !test_bit(LFL_NOBLOCK, &lp->flags) &&
-	    lp->req != DLM_LOCK_NL) {
-
-		lp->cur = lp->req;
-		lp->prev_req = lp->req;
-		lp->req = DLM_LOCK_NL;
-		lp->lkf |= DLM_LKF_CONVERT;
-		lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
-		log_debug("rereq %x,%llx id %x %d,%d",
-			  lp->lockname.ln_type,
-			  (unsigned long long)lp->lockname.ln_number,
-			  lp->lksb.sb_lkid, lp->cur, lp->req);
-
-		set_bit(LFL_REREQUEST, &lp->flags);
-		queue_submit(lp);
-		return;
-	}
-
-	/*
-	 * DLM demoted the lock to NL before it was granted so GFS must be
-	 * told it cannot cache data for this lock.
-	 */
-
-	if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
-		set_bit(LFL_NOCACHE, &lp->flags);
-
-out:
-	/*
-	 * This is an internal lock_dlm lock
-	 */
-
-	if (test_bit(LFL_INLOCK, &lp->flags)) {
-		clear_bit(LFL_NOBLOCK, &lp->flags);
-		lp->cur = lp->req;
-		wake_up_ast(lp);
-		return;
-	}
-
-	/*
-	 * Normal completion of a lock request.  Tell GFS it now has the lock.
-	 */
-
-	clear_bit(LFL_NOBLOCK, &lp->flags);
-	lp->cur = lp->req;
-
-	acb.lc_name = lp->lockname;
-	acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-
-	ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-
-static void gdlm_ast(void *astarg)
-{
-	struct gdlm_lock *lp = astarg;
-	clear_bit(LFL_ACTIVE, &lp->flags);
-	process_complete(lp);
-}
-
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
-	struct gdlm_ls *ls = lp->ls;
-	unsigned int cb = 0;
-
-	switch (gdlm_make_lmstate(bast_mode)) {
-	case LM_ST_EXCLUSIVE:
-		cb = LM_CB_NEED_E;
-		break;
-	case LM_ST_DEFERRED:
-		cb = LM_CB_NEED_D;
-		break;
-	case LM_ST_SHARED:
-		cb = LM_CB_NEED_S;
-		break;
-	default:
-		gdlm_assert(0, "unknown bast mode %u", bast_mode);
-	}
-
-	ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-
-
-static void gdlm_bast(void *astarg, int mode)
-{
-	struct gdlm_lock *lp = astarg;
-
-	if (!mode) {
-		printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
-			lp->lockname.ln_type,
-			(unsigned long long)lp->lockname.ln_number);
-		return;
-	}
-
-	process_blocking(lp, mode);
-}
-
-/* convert gfs lock-state to dlm lock-mode */
-
-static s16 make_mode(s16 lmstate)
-{
-	switch (lmstate) {
-	case LM_ST_UNLOCKED:
-		return DLM_LOCK_NL;
-	case LM_ST_EXCLUSIVE:
-		return DLM_LOCK_EX;
-	case LM_ST_DEFERRED:
-		return DLM_LOCK_CW;
-	case LM_ST_SHARED:
-		return DLM_LOCK_PR;
-	}
-	gdlm_assert(0, "unknown LM state %d", lmstate);
-	return -1;
-}
-
-
-/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
-   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
-
-static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
-{
-	s16 cur = make_mode(cur_state);
-	if (lp->cur != DLM_LOCK_IV)
-		gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
-}
-
-static inline unsigned int make_flags(struct gdlm_lock *lp,
-				      unsigned int gfs_flags,
-				      s16 cur, s16 req)
-{
-	unsigned int lkf = 0;
-
-	if (gfs_flags & LM_FLAG_TRY)
-		lkf |= DLM_LKF_NOQUEUE;
-
-	if (gfs_flags & LM_FLAG_TRY_1CB) {
-		lkf |= DLM_LKF_NOQUEUE;
-		lkf |= DLM_LKF_NOQUEUEBAST;
-	}
-
-	if (gfs_flags & LM_FLAG_PRIORITY) {
-		lkf |= DLM_LKF_NOORDER;
-		lkf |= DLM_LKF_HEADQUE;
-	}
-
-	if (gfs_flags & LM_FLAG_ANY) {
-		if (req == DLM_LOCK_PR)
-			lkf |= DLM_LKF_ALTCW;
-		else if (req == DLM_LOCK_CW)
-			lkf |= DLM_LKF_ALTPR;
-	}
-
-	if (lp->lksb.sb_lkid != 0) {
-		lkf |= DLM_LKF_CONVERT;
-	}
-
-	if (lp->lvb)
-		lkf |= DLM_LKF_VALBLK;
-
-	return lkf;
-}
-
-/* make_strname - convert GFS lock numbers to a string */
-
-static inline void make_strname(const struct lm_lockname *lockname,
-				struct gdlm_strname *str)
-{
-	sprintf(str->name, "%8x%16llx", lockname->ln_type,
-		(unsigned long long)lockname->ln_number);
-	str->namelen = GDLM_STRNAME_BYTES;
-}
-
-static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
-			  struct gdlm_lock **lpp)
-{
-	struct gdlm_lock *lp;
-
-	lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
-	if (!lp)
-		return -ENOMEM;
-
-	lp->lockname = *name;
-	make_strname(name, &lp->strname);
-	lp->ls = ls;
-	lp->cur = DLM_LOCK_IV;
-	INIT_LIST_HEAD(&lp->delay_list);
-
-	spin_lock(&ls->async_lock);
-	ls->all_locks_count++;
-	spin_unlock(&ls->async_lock);
-
-	*lpp = lp;
-	return 0;
-}
-
-int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
-		  void **lockp)
-{
-	struct gdlm_lock *lp;
-	int error;
-
-	error = gdlm_create_lp(lockspace, name, &lp);
-
-	*lockp = lp;
-	return error;
-}
-
-void gdlm_put_lock(void *lock)
-{
-	gdlm_delete_lp(lock);
-}
-
-unsigned int gdlm_do_lock(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-	int error, bast = 1;
-
-	/*
-	 * When recovery is in progress, delay lock requests for submission
-	 * once recovery is done.  Requests for recovery (NOEXP) and unlocks
-	 * can pass.
-	 */
-
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-	    !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
-		gdlm_queue_delayed(lp);
-		return LM_OUT_ASYNC;
-	}
-
-	/*
-	 * Submit the actual lock request.
-	 */
-
-	if (test_bit(LFL_NOBAST, &lp->flags))
-		bast = 0;
-
-	set_bit(LFL_ACTIVE, &lp->flags);
-
-	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
-		  (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
-		  lp->cur, lp->req, lp->lkf);
-
-	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
-			 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
-			 lp, bast ? gdlm_bast : NULL);
-
-	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
-		lp->lksb.sb_status = -EAGAIN;
-		gdlm_ast(lp);
-		error = 0;
-	}
-
-	if (error) {
-		log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
-			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
-			  (unsigned long long)lp->lockname.ln_number, error,
-			  lp->cur, lp->req, lp->lkf, lp->flags);
-		return LM_OUT_ERROR;
-	}
-	return LM_OUT_ASYNC;
-}
-
-static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-	unsigned int lkf = 0;
-	int error;
-
-	set_bit(LFL_DLM_UNLOCK, &lp->flags);
-	set_bit(LFL_ACTIVE, &lp->flags);
-
-	if (lp->lvb)
-		lkf = DLM_LKF_VALBLK;
-
-	log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
-		  (unsigned long long)lp->lockname.ln_number,
-		  lp->lksb.sb_lkid, lp->cur, lkf);
-
-	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
-
-	if (error) {
-		log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
-			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
-			  (unsigned long long)lp->lockname.ln_number, error,
-			  lp->cur, lp->req, lp->lkf, lp->flags);
-		return LM_OUT_ERROR;
-	}
-	return LM_OUT_ASYNC;
-}
-
-unsigned int gdlm_lock(void *lock, unsigned int cur_state,
-		       unsigned int req_state, unsigned int flags)
-{
-	struct gdlm_lock *lp = lock;
-
-	if (req_state == LM_ST_UNLOCKED)
-		return gdlm_unlock(lock, cur_state);
-
-	if (req_state == LM_ST_UNLOCKED)
-		return gdlm_unlock(lock, cur_state);
-
-	clear_bit(LFL_DLM_CANCEL, &lp->flags);
-	if (flags & LM_FLAG_NOEXP)
-		set_bit(LFL_NOBLOCK, &lp->flags);
-
-	check_cur_state(lp, cur_state);
-	lp->req = make_mode(req_state);
-	lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
-
-	return gdlm_do_lock(lp);
-}
-
-unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
-{
-	struct gdlm_lock *lp = lock;
-
-	clear_bit(LFL_DLM_CANCEL, &lp->flags);
-	if (lp->cur == DLM_LOCK_IV)
-		return 0;
-	return gdlm_do_unlock(lp);
-}
-
-void gdlm_cancel(void *lock)
-{
-	struct gdlm_lock *lp = lock;
-	struct gdlm_ls *ls = lp->ls;
-	int error, delay_list = 0;
-
-	if (test_bit(LFL_DLM_CANCEL, &lp->flags))
-		return;
-
-	log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
-		 (unsigned long long)lp->lockname.ln_number, lp->flags);
-
-	spin_lock(&ls->async_lock);
-	if (!list_empty(&lp->delay_list)) {
-		list_del_init(&lp->delay_list);
-		delay_list = 1;
-	}
-	spin_unlock(&ls->async_lock);
-
-	if (delay_list) {
-		set_bit(LFL_CANCEL, &lp->flags);
-		set_bit(LFL_ACTIVE, &lp->flags);
-		gdlm_ast(lp);
-		return;
-	}
-
-	if (!test_bit(LFL_ACTIVE, &lp->flags) ||
-	    test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
-		log_info("gdlm_cancel skip %x,%llx flags %lx",
-		 	 lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number, lp->flags);
-		return;
-	}
-
-	/* the lock is blocked in the dlm */
-
-	set_bit(LFL_DLM_CANCEL, &lp->flags);
-	set_bit(LFL_ACTIVE, &lp->flags);
-
-	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
-			   NULL, lp);
-
-	log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
-		 lp->lockname.ln_type,
-		 (unsigned long long)lp->lockname.ln_number, lp->flags);
-
-	if (error == -EBUSY)
-		clear_bit(LFL_DLM_CANCEL, &lp->flags);
-}
-
-static int gdlm_add_lvb(struct gdlm_lock *lp)
-{
-	char *lvb;
-
-	lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
-	if (!lvb)
-		return -ENOMEM;
-
-	lp->lksb.sb_lvbptr = lvb;
-	lp->lvb = lvb;
-	return 0;
-}
-
-static void gdlm_del_lvb(struct gdlm_lock *lp)
-{
-	kfree(lp->lvb);
-	lp->lvb = NULL;
-	lp->lksb.sb_lvbptr = NULL;
-}
-
-static int gdlm_ast_wait(void *word)
-{
-	schedule();
-	return 0;
-}
-
-/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
-   the completion) because gfs won't call hold_lvb() during a callback (from
-   the context of a lock_dlm thread). */
-
-static int hold_null_lock(struct gdlm_lock *lp)
-{
-	struct gdlm_lock *lpn = NULL;
-	int error;
-
-	if (lp->hold_null) {
-		printk(KERN_INFO "lock_dlm: lvb already held\n");
-		return 0;
-	}
-
-	error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
-	if (error)
-		goto out;
-
-	lpn->lksb.sb_lvbptr = junk_lvb;
-	lpn->lvb = junk_lvb;
-
-	lpn->req = DLM_LOCK_NL;
-	lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
-	set_bit(LFL_NOBAST, &lpn->flags);
-	set_bit(LFL_INLOCK, &lpn->flags);
-	set_bit(LFL_AST_WAIT, &lpn->flags);
-
-	gdlm_do_lock(lpn);
-	wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE);
-	error = lpn->lksb.sb_status;
-	if (error) {
-		printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
-		       error);
-		gdlm_delete_lp(lpn);
-		lpn = NULL;
-	}
-out:
-	lp->hold_null = lpn;
-	return error;
-}
-
-/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
-   the completion) because gfs may call unhold_lvb() during a callback (from
-   the context of a lock_dlm thread) which could cause a deadlock since the
-   other lock_dlm thread could be engaged in recovery. */
-
-static void unhold_null_lock(struct gdlm_lock *lp)
-{
-	struct gdlm_lock *lpn = lp->hold_null;
-
-	gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
-		    (unsigned long long)lp->lockname.ln_number);
-	lpn->lksb.sb_lvbptr = NULL;
-	lpn->lvb = NULL;
-	set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
-	gdlm_do_unlock(lpn);
-	lp->hold_null = NULL;
-}
-
-/* Acquire a NL lock because gfs requires the value block to remain
-   intact on the resource while the lvb is "held" even if it's holding no locks
-   on the resource. */
-
-int gdlm_hold_lvb(void *lock, char **lvbp)
-{
-	struct gdlm_lock *lp = lock;
-	int error;
-
-	error = gdlm_add_lvb(lp);
-	if (error)
-		return error;
-
-	*lvbp = lp->lvb;
-
-	error = hold_null_lock(lp);
-	if (error)
-		gdlm_del_lvb(lp);
-
-	return error;
-}
-
-void gdlm_unhold_lvb(void *lock, char *lvb)
-{
-	struct gdlm_lock *lp = lock;
-
-	unhold_null_lock(lp);
-	gdlm_del_lvb(lp);
-}
-
-void gdlm_submit_delayed(struct gdlm_ls *ls)
-{
-	struct gdlm_lock *lp, *safe;
-
-	spin_lock(&ls->async_lock);
-	list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
-		list_del_init(&lp->delay_list);
-		list_add_tail(&lp->delay_list, &ls->submit);
-	}
-	spin_unlock(&ls->async_lock);
-	wake_up(&ls->thread_wait);
-}
-
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
deleted file mode 100644
index 3c98e7c6f93b..000000000000
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef LOCK_DLM_DOT_H
-#define LOCK_DLM_DOT_H
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/list.h>
-#include <linux/socket.h>
-#include <linux/delay.h>
-#include <linux/kthread.h>
-#include <linux/kobject.h>
-#include <linux/fcntl.h>
-#include <linux/wait.h>
-#include <net/sock.h>
-
-#include <linux/dlm.h>
-#include <linux/dlm_plock.h>
-#include <linux/lm_interface.h>
-
-/*
- * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
- * prefix of lock_dlm_ gets awkward.  Externally, GFS refers to this module
- * as "lock_dlm".
- */
-
-#define GDLM_STRNAME_BYTES	24
-#define GDLM_LVB_SIZE		32
-#define GDLM_DROP_COUNT		0
-#define GDLM_DROP_PERIOD	60
-#define GDLM_NAME_LEN		128
-
-/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
-   We sprintf these numbers into a 24 byte string of hex values to make them
-   human-readable (to make debugging simpler.) */
-
-struct gdlm_strname {
-	unsigned char		name[GDLM_STRNAME_BYTES];
-	unsigned short		namelen;
-};
-
-enum {
-	DFL_BLOCK_LOCKS		= 0,
-	DFL_SPECTATOR		= 1,
-	DFL_WITHDRAW		= 2,
-};
-
-struct gdlm_ls {
-	u32		id;
-	int			jid;
-	int			first;
-	int			first_done;
-	unsigned long		flags;
-	struct kobject		kobj;
-	char			clustername[GDLM_NAME_LEN];
-	char			fsname[GDLM_NAME_LEN];
-	int			fsflags;
-	dlm_lockspace_t		*dlm_lockspace;
-	lm_callback_t		fscb;
-	struct gfs2_sbd		*sdp;
-	int			recover_jid;
-	int			recover_jid_done;
-	int			recover_jid_status;
-	spinlock_t		async_lock;
-	struct list_head	delayed;
-	struct list_head	submit;
-	u32		all_locks_count;
-	wait_queue_head_t	wait_control;
-	struct task_struct	*thread;
-	wait_queue_head_t	thread_wait;
-};
-
-enum {
-	LFL_NOBLOCK		= 0,
-	LFL_NOCACHE		= 1,
-	LFL_DLM_UNLOCK		= 2,
-	LFL_DLM_CANCEL		= 3,
-	LFL_SYNC_LVB		= 4,
-	LFL_FORCE_PROMOTE	= 5,
-	LFL_REREQUEST		= 6,
-	LFL_ACTIVE		= 7,
-	LFL_INLOCK		= 8,
-	LFL_CANCEL		= 9,
-	LFL_NOBAST		= 10,
-	LFL_HEADQUE		= 11,
-	LFL_UNLOCK_DELETE	= 12,
-	LFL_AST_WAIT		= 13,
-};
-
-struct gdlm_lock {
-	struct gdlm_ls		*ls;
-	struct lm_lockname	lockname;
-	struct gdlm_strname	strname;
-	char			*lvb;
-	struct dlm_lksb		lksb;
-
-	s16			cur;
-	s16			req;
-	s16			prev_req;
-	u32			lkf;		/* dlm flags DLM_LKF_ */
-	unsigned long		flags;		/* lock_dlm flags LFL_ */
-
-	struct list_head	delay_list;	/* delayed */
-	struct gdlm_lock	*hold_null;	/* NL lock for hold_lvb */
-};
-
-#define gdlm_assert(assertion, fmt, args...)                                  \
-do {                                                                          \
-	if (unlikely(!(assertion))) {                                         \
-		printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
-				  "lock_dlm:  " fmt "\n",                     \
-				  #assertion, ##args);                        \
-		BUG();                                                        \
-	}                                                                     \
-} while (0)
-
-#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
-#define log_info(fmt, arg...)  log_print(KERN_INFO , fmt , ## arg)
-#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
-#ifdef LOCK_DLM_LOG_DEBUG
-#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
-#else
-#define log_debug(fmt, arg...)
-#endif
-
-/* sysfs.c */
-
-int gdlm_sysfs_init(void);
-void gdlm_sysfs_exit(void);
-int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
-void gdlm_kobject_release(struct gdlm_ls *);
-
-/* thread.c */
-
-int gdlm_init_threads(struct gdlm_ls *);
-void gdlm_release_threads(struct gdlm_ls *);
-
-/* lock.c */
-
-void gdlm_submit_delayed(struct gdlm_ls *);
-unsigned int gdlm_do_lock(struct gdlm_lock *);
-
-int gdlm_get_lock(void *, struct lm_lockname *, void **);
-void gdlm_put_lock(void *);
-unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
-unsigned int gdlm_unlock(void *, unsigned int);
-void gdlm_cancel(void *);
-int gdlm_hold_lvb(void *, char **);
-void gdlm_unhold_lvb(void *, char *);
-
-/* mount.c */
-
-extern const struct lm_lockops gdlm_ops;
-
-#endif
-
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
deleted file mode 100644
index b9a03a7ff801..000000000000
--- a/fs/gfs2/locking/dlm/main.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/init.h>
-
-#include "lock_dlm.h"
-
-static int __init init_lock_dlm(void)
-{
-	int error;
-
-	error = gfs2_register_lockproto(&gdlm_ops);
-	if (error) {
-		printk(KERN_WARNING "lock_dlm:  can't register protocol: %d\n",
-		       error);
-		return error;
-	}
-
-	error = gdlm_sysfs_init();
-	if (error) {
-		gfs2_unregister_lockproto(&gdlm_ops);
-		return error;
-	}
-
-	printk(KERN_INFO
-	       "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
-	return 0;
-}
-
-static void __exit exit_lock_dlm(void)
-{
-	gdlm_sysfs_exit();
-	gfs2_unregister_lockproto(&gdlm_ops);
-}
-
-module_init(init_lock_dlm);
-module_exit(exit_lock_dlm);
-
-MODULE_DESCRIPTION("GFS DLM Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
deleted file mode 100644
index 1aa7eb6a0226..000000000000
--- a/fs/gfs2/locking/dlm/mount.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include "lock_dlm.h"
-
-const struct lm_lockops gdlm_ops;
-
-
-static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
-				 int flags, char *table_name)
-{
-	struct gdlm_ls *ls;
-	char buf[256], *p;
-
-	ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
-	if (!ls)
-		return NULL;
-
-	ls->fscb = cb;
-	ls->sdp = sdp;
-	ls->fsflags = flags;
-	spin_lock_init(&ls->async_lock);
-	INIT_LIST_HEAD(&ls->delayed);
-	INIT_LIST_HEAD(&ls->submit);
-	init_waitqueue_head(&ls->thread_wait);
-	init_waitqueue_head(&ls->wait_control);
-	ls->jid = -1;
-
-	strncpy(buf, table_name, 256);
-	buf[255] = '\0';
-
-	p = strchr(buf, ':');
-	if (!p) {
-		log_info("invalid table_name \"%s\"", table_name);
-		kfree(ls);
-		return NULL;
-	}
-	*p = '\0';
-	p++;
-
-	strncpy(ls->clustername, buf, GDLM_NAME_LEN);
-	strncpy(ls->fsname, p, GDLM_NAME_LEN);
-
-	return ls;
-}
-
-static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
-{
-	char data[256];
-	char *options, *x, *y;
-	int error = 0;
-
-	memset(data, 0, 256);
-	strncpy(data, data_arg, 255);
-
-	if (!strlen(data)) {
-		log_error("no mount options, (u)mount helpers not installed");
-		return -EINVAL;
-	}
-
-	for (options = data; (x = strsep(&options, ":")); ) {
-		if (!*x)
-			continue;
-
-		y = strchr(x, '=');
-		if (y)
-			*y++ = 0;
-
-		if (!strcmp(x, "jid")) {
-			if (!y) {
-				log_error("need argument to jid");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", &ls->jid);
-
-		} else if (!strcmp(x, "first")) {
-			if (!y) {
-				log_error("need argument to first");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", &ls->first);
-
-		} else if (!strcmp(x, "id")) {
-			if (!y) {
-				log_error("need argument to id");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", &ls->id);
-
-		} else if (!strcmp(x, "nodir")) {
-			if (!y) {
-				log_error("need argument to nodir");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", nodir);
-
-		} else {
-			log_error("unkonwn option: %s", x);
-			error = -EINVAL;
-			break;
-		}
-	}
-
-	return error;
-}
-
-static int gdlm_mount(char *table_name, char *host_data,
-			lm_callback_t cb, void *cb_data,
-			unsigned int min_lvb_size, int flags,
-			struct lm_lockstruct *lockstruct,
-			struct kobject *fskobj)
-{
-	struct gdlm_ls *ls;
-	int error = -ENOMEM, nodir = 0;
-
-	if (min_lvb_size > GDLM_LVB_SIZE)
-		goto out;
-
-	ls = init_gdlm(cb, cb_data, flags, table_name);
-	if (!ls)
-		goto out;
-
-	error = make_args(ls, host_data, &nodir);
-	if (error)
-		goto out;
-
-	error = gdlm_init_threads(ls);
-	if (error)
-		goto out_free;
-
-	error = gdlm_kobject_setup(ls, fskobj);
-	if (error)
-		goto out_thread;
-
-	error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
-				  &ls->dlm_lockspace,
-				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
-				  (nodir ? DLM_LSFL_NODIR : 0),
-				  GDLM_LVB_SIZE);
-	if (error) {
-		log_error("dlm_new_lockspace error %d", error);
-		goto out_kobj;
-	}
-
-	lockstruct->ls_jid = ls->jid;
-	lockstruct->ls_first = ls->first;
-	lockstruct->ls_lockspace = ls;
-	lockstruct->ls_ops = &gdlm_ops;
-	lockstruct->ls_flags = 0;
-	lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
-	return 0;
-
-out_kobj:
-	gdlm_kobject_release(ls);
-out_thread:
-	gdlm_release_threads(ls);
-out_free:
-	kfree(ls);
-out:
-	return error;
-}
-
-static void gdlm_unmount(void *lockspace)
-{
-	struct gdlm_ls *ls = lockspace;
-
-	log_debug("unmount flags %lx", ls->flags);
-
-	/* FIXME: serialize unmount and withdraw in case they
-	   happen at once.  Also, if unmount follows withdraw,
-	   wait for withdraw to finish. */
-
-	if (test_bit(DFL_WITHDRAW, &ls->flags))
-		goto out;
-
-	gdlm_kobject_release(ls);
-	dlm_release_lockspace(ls->dlm_lockspace, 2);
-	gdlm_release_threads(ls);
-	BUG_ON(ls->all_locks_count);
-out:
-	kfree(ls);
-}
-
-static void gdlm_recovery_done(void *lockspace, unsigned int jid,
-                               unsigned int message)
-{
-	char env_jid[20];
-	char env_status[20];
-	char *envp[] = { env_jid, env_status, NULL };
-	struct gdlm_ls *ls = lockspace;
-	ls->recover_jid_done = jid;
-	ls->recover_jid_status = message;
-	sprintf(env_jid, "JID=%d", jid);
-	sprintf(env_status, "RECOVERY=%s",
-		message == LM_RD_SUCCESS ? "Done" : "Failed");
-	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
-}
-
-static void gdlm_others_may_mount(void *lockspace)
-{
-	char *message = "FIRSTMOUNT=Done";
-	char *envp[] = { message, NULL };
-	struct gdlm_ls *ls = lockspace;
-	ls->first_done = 1;
-	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
-}
-
-/* Userspace gets the offline uevent, blocks new gfs locks on
-   other mounters, and lets us know (sets WITHDRAW flag).  Then,
-   userspace leaves the mount group while we leave the lockspace. */
-
-static void gdlm_withdraw(void *lockspace)
-{
-	struct gdlm_ls *ls = lockspace;
-
-	kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
-
-	wait_event_interruptible(ls->wait_control,
-				 test_bit(DFL_WITHDRAW, &ls->flags));
-
-	dlm_release_lockspace(ls->dlm_lockspace, 2);
-	gdlm_release_threads(ls);
-	gdlm_kobject_release(ls);
-}
-
-static int gdlm_plock(void *lockspace, struct lm_lockname *name,
-	       struct file *file, int cmd, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl);
-}
-
-static int gdlm_punlock(void *lockspace, struct lm_lockname *name,
-		 struct file *file, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl);
-}
-
-static int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
-		   struct file *file, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl);
-}
-
-const struct lm_lockops gdlm_ops = {
-	.lm_proto_name = "lock_dlm",
-	.lm_mount = gdlm_mount,
-	.lm_others_may_mount = gdlm_others_may_mount,
-	.lm_unmount = gdlm_unmount,
-	.lm_withdraw = gdlm_withdraw,
-	.lm_get_lock = gdlm_get_lock,
-	.lm_put_lock = gdlm_put_lock,
-	.lm_lock = gdlm_lock,
-	.lm_unlock = gdlm_unlock,
-	.lm_plock = gdlm_plock,
-	.lm_punlock = gdlm_punlock,
-	.lm_plock_get = gdlm_plock_get,
-	.lm_cancel = gdlm_cancel,
-	.lm_hold_lvb = gdlm_hold_lvb,
-	.lm_unhold_lvb = gdlm_unhold_lvb,
-	.lm_recovery_done = gdlm_recovery_done,
-	.lm_owner = THIS_MODULE,
-};
-
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
deleted file mode 100644
index 9b7edcf7bd49..000000000000
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/ctype.h>
-#include <linux/stat.h>
-
-#include "lock_dlm.h"
-
-static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
-}
-
-static ssize_t block_show(struct gdlm_ls *ls, char *buf)
-{
-	ssize_t ret;
-	int val = 0;
-
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
-		val = 1;
-	ret = sprintf(buf, "%d\n", val);
-	return ret;
-}
-
-static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-	ssize_t ret = len;
-	int val;
-
-	val = simple_strtol(buf, NULL, 0);
-
-	if (val == 1)
-		set_bit(DFL_BLOCK_LOCKS, &ls->flags);
-	else if (val == 0) {
-		clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
-		gdlm_submit_delayed(ls);
-	} else {
-		ret = -EINVAL;
-	}
-	return ret;
-}
-
-static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
-{
-	ssize_t ret;
-	int val = 0;
-
-	if (test_bit(DFL_WITHDRAW, &ls->flags))
-		val = 1;
-	ret = sprintf(buf, "%d\n", val);
-	return ret;
-}
-
-static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-	ssize_t ret = len;
-	int val;
-
-	val = simple_strtol(buf, NULL, 0);
-
-	if (val == 1)
-		set_bit(DFL_WITHDRAW, &ls->flags);
-	else
-		ret = -EINVAL;
-	wake_up(&ls->wait_control);
-	return ret;
-}
-
-static ssize_t id_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%u\n", ls->id);
-}
-
-static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->jid);
-}
-
-static ssize_t first_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->first);
-}
-
-static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->first_done);
-}
-
-static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->recover_jid);
-}
-
-static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-	ls->recover_jid = simple_strtol(buf, NULL, 0);
-	ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
-	return len;
-}
-
-static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->recover_jid_done);
-}
-
-static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->recover_jid_status);
-}
-
-struct gdlm_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gdlm_ls *, char *);
-	ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
-};
-
-#define GDLM_ATTR(_name,_mode,_show,_store) \
-static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
-
-GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
-GDLM_ATTR(block,          0644, block_show,          block_store);
-GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
-GDLM_ATTR(id,             0444, id_show,             NULL);
-GDLM_ATTR(jid,            0444, jid_show,            NULL);
-GDLM_ATTR(first,          0444, first_show,          NULL);
-GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
-GDLM_ATTR(recover,        0644, recover_show,        recover_store);
-GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-
-static struct attribute *gdlm_attrs[] = {
-	&gdlm_attr_proto_name.attr,
-	&gdlm_attr_block.attr,
-	&gdlm_attr_withdraw.attr,
-	&gdlm_attr_id.attr,
-	&gdlm_attr_jid.attr,
-	&gdlm_attr_first.attr,
-	&gdlm_attr_first_done.attr,
-	&gdlm_attr_recover.attr,
-	&gdlm_attr_recover_done.attr,
-	&gdlm_attr_recover_status.attr,
-	NULL,
-};
-
-static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
-			      char *buf)
-{
-	struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
-	struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
-	return a->show ? a->show(ls, buf) : 0;
-}
-
-static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
-			       const char *buf, size_t len)
-{
-	struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
-	struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
-	return a->store ? a->store(ls, buf, len) : len;
-}
-
-static struct sysfs_ops gdlm_attr_ops = {
-	.show  = gdlm_attr_show,
-	.store = gdlm_attr_store,
-};
-
-static struct kobj_type gdlm_ktype = {
-	.default_attrs = gdlm_attrs,
-	.sysfs_ops     = &gdlm_attr_ops,
-};
-
-static struct kset *gdlm_kset;
-
-int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
-{
-	int error;
-
-	ls->kobj.kset = gdlm_kset;
-	error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
-				     "lock_module");
-	if (error)
-		log_error("can't register kobj %d", error);
-	kobject_uevent(&ls->kobj, KOBJ_ADD);
-
-	return error;
-}
-
-void gdlm_kobject_release(struct gdlm_ls *ls)
-{
-	kobject_put(&ls->kobj);
-}
-
-static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
-		       struct kobj_uevent_env *env)
-{
-        struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
-        add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
-        add_uevent_var(env, "LOCKPROTO=lock_dlm");
-        return 0;
-}
-
-static struct kset_uevent_ops gdlm_uevent_ops = {
-	.uevent = gdlm_uevent,
-};
-
-
-int gdlm_sysfs_init(void)
-{
-	gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
-	if (!gdlm_kset) {
-		printk(KERN_WARNING "%s: can not create kset\n", __func__);
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-void gdlm_sysfs_exit(void)
-{
-	kset_unregister(gdlm_kset);
-}
-
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
deleted file mode 100644
index 38823efd698c..000000000000
--- a/fs/gfs2/locking/dlm/thread.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include "lock_dlm.h"
-
-static inline int no_work(struct gdlm_ls *ls)
-{
-	int ret;
-
-	spin_lock(&ls->async_lock);
-	ret = list_empty(&ls->submit);
-	spin_unlock(&ls->async_lock);
-
-	return ret;
-}
-
-static int gdlm_thread(void *data)
-{
-	struct gdlm_ls *ls = (struct gdlm_ls *) data;
-	struct gdlm_lock *lp = NULL;
-
-	while (!kthread_should_stop()) {
-		wait_event_interruptible(ls->thread_wait,
-				!no_work(ls) || kthread_should_stop());
-
-		spin_lock(&ls->async_lock);
-
-		if (!list_empty(&ls->submit)) {
-			lp = list_entry(ls->submit.next, struct gdlm_lock,
-					delay_list);
-			list_del_init(&lp->delay_list);
-			spin_unlock(&ls->async_lock);
-			gdlm_do_lock(lp);
-			spin_lock(&ls->async_lock);
-		}
-		spin_unlock(&ls->async_lock);
-	}
-
-	return 0;
-}
-
-int gdlm_init_threads(struct gdlm_ls *ls)
-{
-	struct task_struct *p;
-	int error;
-
-	p = kthread_run(gdlm_thread, ls, "lock_dlm");
-	error = IS_ERR(p);
-	if (error) {
-		log_error("can't start lock_dlm thread %d", error);
-		return error;
-	}
-	ls->thread = p;
-
-	return 0;
-}
-
-void gdlm_release_threads(struct gdlm_ls *ls)
-{
-	kthread_stop(ls->thread);
-}
-
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ad305854bdc6..98918a756410 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -14,7 +14,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4390f6f4047d..80e4f5f898bb 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,7 +13,6 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7cacfde32194..a6892ed0840a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/atomic.h>
 
 #include "gfs2.h"
@@ -23,6 +22,12 @@
 #include "sys.h"
 #include "util.h"
 #include "glock.h"
+#include "quota.h"
+
+static struct shrinker qd_shrinker = {
+	.shrink = gfs2_shrink_qd_memory,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static void gfs2_init_inode_once(void *foo)
 {
@@ -41,8 +46,6 @@ static void gfs2_init_glock_once(void *foo)
 	INIT_HLIST_NODE(&gl->gl_list);
 	spin_lock_init(&gl->gl_spin);
 	INIT_LIST_HEAD(&gl->gl_holders);
-	gl->gl_lvb = NULL;
-	atomic_set(&gl->gl_lvb_count, 0);
 	INIT_LIST_HEAD(&gl->gl_lru);
 	INIT_LIST_HEAD(&gl->gl_ail_list);
 	atomic_set(&gl->gl_ail_count, 0);
@@ -100,6 +103,8 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_quotad_cachep)
 		goto fail;
 
+	register_shrinker(&qd_shrinker);
+
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
 		goto fail;
@@ -117,6 +122,7 @@ static int __init init_gfs2_fs(void)
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
 fail:
+	unregister_shrinker(&qd_shrinker);
 	gfs2_glock_exit();
 
 	if (gfs2_quotad_cachep)
@@ -145,6 +151,7 @@ fail:
 
 static void __exit exit_gfs2_fs(void)
 {
+	unregister_shrinker(&qd_shrinker);
 	gfs2_glock_exit();
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 09853620c951..8d6f13256b26 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -19,7 +19,6 @@
 #include <linux/delay.h>
 #include <linux/bio.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -90,27 +89,6 @@ void gfs2_aspace_put(struct inode *aspace)
 }
 
 /**
- * gfs2_meta_inval - Invalidate all buffers associated with a glock
- * @gl: the glock
- *
- */
-
-void gfs2_meta_inval(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct inode *aspace = gl->gl_aspace;
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
-
-	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
-
-	atomic_inc(&aspace->i_writecount);
-	truncate_inode_pages(mapping, 0);
-	atomic_dec(&aspace->i_writecount);
-
-	gfs2_assert_withdraw(sdp, !mapping->nrpages);
-}
-
-/**
  * gfs2_meta_sync - Sync all buffers associated with a glock
  * @gl: The glock
  *
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b1a5f3674d43..de270c2f9b63 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -40,7 +40,6 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
 struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
 void gfs2_aspace_put(struct inode *aspace);
 
-void gfs2_meta_inval(struct gfs2_glock *gl);
 void gfs2_meta_sync(struct gfs2_glock *gl);
 
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 3cb0a44ba023..f7e8527a21e0 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -12,12 +12,11 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/parser.h>
 
 #include "gfs2.h"
 #include "incore.h"
-#include "mount.h"
+#include "super.h"
 #include "sys.h"
 #include "util.h"
 
@@ -37,11 +36,15 @@ enum {
 	Opt_quota_off,
 	Opt_quota_account,
 	Opt_quota_on,
+	Opt_quota,
+	Opt_noquota,
 	Opt_suiddir,
 	Opt_nosuiddir,
 	Opt_data_writeback,
 	Opt_data_ordered,
 	Opt_meta,
+	Opt_discard,
+	Opt_nodiscard,
 	Opt_err,
 };
 
@@ -61,11 +64,15 @@ static const match_table_t tokens = {
 	{Opt_quota_off, "quota=off"},
 	{Opt_quota_account, "quota=account"},
 	{Opt_quota_on, "quota=on"},
+	{Opt_quota, "quota"},
+	{Opt_noquota, "noquota"},
 	{Opt_suiddir, "suiddir"},
 	{Opt_nosuiddir, "nosuiddir"},
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_meta, "meta"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL}
 };
 
@@ -77,101 +84,46 @@ static const match_table_t tokens = {
  * Return: errno
  */
 
-int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
+int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 {
-	struct gfs2_args *args = &sdp->sd_args;
-	char *data = data_arg;
-	char *options, *o, *v;
-	int error = 0;
-
-	if (!remount) {
-		/*  Set some defaults  */
-		args->ar_quota = GFS2_QUOTA_DEFAULT;
-		args->ar_data = GFS2_DATA_DEFAULT;
-	}
+	char *o;
+	int token;
+	substring_t tmp[MAX_OPT_ARGS];
 
 	/* Split the options into tokens with the "," character and
 	   process them */
 
-	for (options = data; (o = strsep(&options, ",")); ) {
-		int token;
-		substring_t tmp[MAX_OPT_ARGS];
-
-		if (!*o)
+	while (1) {
+		o = strsep(&options, ",");
+		if (o == NULL)
+			break;
+		if (*o == '\0')
 			continue;
 
 		token = match_token(o, tokens, tmp);
 		switch (token) {
 		case Opt_lockproto:
-			v = match_strdup(&tmp[0]);
-			if (!v) {
-				fs_info(sdp, "no memory for lockproto\n");
-				error = -ENOMEM;
-				goto out_error;
-			}
-
-			if (remount && strcmp(v, args->ar_lockproto)) {
-				kfree(v);
-				goto cant_remount;
-			}
-			
-			strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
-			args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
-			kfree(v);
+			match_strlcpy(args->ar_lockproto, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
 			break;
 		case Opt_locktable:
-			v = match_strdup(&tmp[0]);
-			if (!v) {
-				fs_info(sdp, "no memory for locktable\n");
-				error = -ENOMEM;
-				goto out_error;
-			}
-
-			if (remount && strcmp(v, args->ar_locktable)) {
-				kfree(v);
-				goto cant_remount;
-			}
-
-			strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
-			args->ar_locktable[GFS2_LOCKNAME_LEN - 1]  = 0;
-			kfree(v);
+			match_strlcpy(args->ar_locktable, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
 			break;
 		case Opt_hostdata:
-			v = match_strdup(&tmp[0]);
-			if (!v) {
-				fs_info(sdp, "no memory for hostdata\n");
-				error = -ENOMEM;
-				goto out_error;
-			}
-
-			if (remount && strcmp(v, args->ar_hostdata)) {
-				kfree(v);
-				goto cant_remount;
-			}
-
-			strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
-			args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
-			kfree(v);
+			match_strlcpy(args->ar_hostdata, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
 			break;
 		case Opt_spectator:
-			if (remount && !args->ar_spectator)
-				goto cant_remount;
 			args->ar_spectator = 1;
-			sdp->sd_vfs->s_flags |= MS_RDONLY;
 			break;
 		case Opt_ignore_local_fs:
-			if (remount && !args->ar_ignore_local_fs)
-				goto cant_remount;
 			args->ar_ignore_local_fs = 1;
 			break;
 		case Opt_localflocks:
-			if (remount && !args->ar_localflocks)
-				goto cant_remount;
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			if (remount && !args->ar_localcaching)
-				goto cant_remount;
 			args->ar_localcaching = 1;
 			break;
 		case Opt_debug:
@@ -181,25 +133,23 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			if (remount && !args->ar_upgrade)
-				goto cant_remount;
 			args->ar_upgrade = 1;
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
-			sdp->sd_vfs->s_flags |= MS_POSIXACL;
 			break;
 		case Opt_noacl:
 			args->ar_posix_acl = 0;
-			sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
 			break;
 		case Opt_quota_off:
+		case Opt_noquota:
 			args->ar_quota = GFS2_QUOTA_OFF;
 			break;
 		case Opt_quota_account:
 			args->ar_quota = GFS2_QUOTA_ACCOUNT;
 			break;
 		case Opt_quota_on:
+		case Opt_quota:
 			args->ar_quota = GFS2_QUOTA_ON;
 			break;
 		case Opt_suiddir:
@@ -215,29 +165,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 			args->ar_data = GFS2_DATA_ORDERED;
 			break;
 		case Opt_meta:
-			if (remount && args->ar_meta != 1)
-				goto cant_remount;
 			args->ar_meta = 1;
 			break;
+		case Opt_discard:
+			args->ar_discard = 1;
+			break;
+		case Opt_nodiscard:
+			args->ar_discard = 0;
+			break;
 		case Opt_err:
 		default:
-			fs_info(sdp, "unknown option: %s\n", o);
-			error = -EINVAL;
-			goto out_error;
+			fs_info(sdp, "invalid mount option: %s\n", o);
+			return -EINVAL;
 		}
 	}
 
-out_error:
-	if (error)
-		fs_info(sdp, "invalid mount option(s)\n");
-
-	if (data != data_arg)
-		kfree(data);
-
-	return error;
-
-cant_remount:
-	fs_info(sdp, "can't remount with option %s\n", o);
-	return -EINVAL;
+	return 0;
 }
 
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
deleted file mode 100644
index 401288acfdf3..000000000000
--- a/fs/gfs2/mount.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __MOUNT_DOT_H__
-#define __MOUNT_DOT_H__
-
-struct gfs2_sbd;
-
-int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
-
-#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 4ddab67867eb..a6dde1751e17 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -19,7 +19,6 @@
 #include <linux/writeback.h>
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/backing-dev.h>
 
 #include "gfs2.h"
@@ -442,6 +441,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	 */
 	if (unlikely(page->index)) {
 		zero_user(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
 		return 0;
 	}
 
@@ -1096,6 +1096,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.releasepage = gfs2_releasepage,
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations gfs2_ordered_aops = {
@@ -1111,6 +1112,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.releasepage = gfs2_releasepage,
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations gfs2_jdata_aops = {
@@ -1125,6 +1127,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.bmap = gfs2_bmap,
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
+	.is_partially_uptodate = block_is_partially_uptodate,
 };
 
 void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index c2ad36330ca3..022c66cd5606 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -108,7 +107,7 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
 	return 0;
 }
 
-struct dentry_operations gfs2_dops = {
+const struct dentry_operations gfs2_dops = {
 	.d_revalidate = gfs2_drevalidate,
 	.d_hash = gfs2_dhash,
 };
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 7fdeb14ddd1a..9200ef221716 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -14,7 +14,6 @@
 #include <linux/exportfs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 93fe41b67f97..70b9b8548945 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -20,9 +20,10 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/writeback.h>
 #include <asm/uaccess.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -336,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
  * blocks allocated on disk to back that page.
  */
 
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -354,7 +356,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	if (ret)
 		goto out;
 
+	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
+
 	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
 	if (ret || !alloc_required)
 		goto out_unlock;
@@ -409,6 +413,8 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
@@ -560,57 +566,24 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
 	return ret;
 }
 
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+
 /**
  * gfs2_setlease - acquire/release a file lease
  * @file: the file pointer
  * @arg: lease type
  * @fl: file lock
  *
+ * We don't currently have a way to enforce a lease across the whole
+ * cluster; until we do, disable leases (by just returning -EINVAL),
+ * unless the administrator has requested purely local locking.
+ *
  * Returns: errno
  */
 
 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
-
-	/*
-	 * We don't currently have a way to enforce a lease across the whole
-	 * cluster; until we do, disable leases (by just returning -EINVAL),
-	 * unless the administrator has requested purely local locking.
-	 */
-	if (!sdp->sd_args.ar_localflocks)
-		return -EINVAL;
-	return generic_setlease(file, arg, fl);
-}
-
-static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		      struct file *file, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
-				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-	return error;
-}
-
-static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		  struct file *file, int cmd, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_plock(
-				sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
-	return error;
-}
-
-static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		    struct file *file, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_punlock(
-				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-	return error;
+	return -EINVAL;
 }
 
 /**
@@ -626,9 +599,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 {
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
-	struct lm_lockname name =
-		{ .ln_number = ip->i_no_addr,
-		  .ln_type = LM_TYPE_PLOCK };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
@@ -640,12 +611,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 		cmd = F_SETLK;
 		fl->fl_type = F_UNLCK;
 	}
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
 	if (IS_GETLK(cmd))
-		return gfs2_lm_plock_get(sdp, &name, file, fl);
+		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
 	else if (fl->fl_type == F_UNLCK)
-		return gfs2_lm_punlock(sdp, &name, file, fl);
+		return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
 	else
-		return gfs2_lm_plock(sdp, &name, file, cmd, fl);
+		return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
 }
 
 static int do_flock(struct file *file, int cmd, struct file_lock *fl)
@@ -732,7 +705,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	}
 }
 
-const struct file_operations gfs2_file_fops = {
+const struct file_operations *gfs2_file_fops = &(const struct file_operations){
 	.llseek		= gfs2_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
@@ -750,7 +723,7 @@ const struct file_operations gfs2_file_fops = {
 	.setlease	= gfs2_setlease,
 };
 
-const struct file_operations gfs2_dir_fops = {
+const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
 	.readdir	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
@@ -760,7 +733,9 @@ const struct file_operations gfs2_dir_fops = {
 	.flock		= gfs2_flock,
 };
 
-const struct file_operations gfs2_file_fops_nolock = {
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
+
+const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){
 	.llseek		= gfs2_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
@@ -773,10 +748,10 @@ const struct file_operations gfs2_file_fops_nolock = {
 	.fsync		= gfs2_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
-	.setlease	= gfs2_setlease,
+	.setlease	= generic_setlease,
 };
 
-const struct file_operations gfs2_dir_fops_nolock = {
+const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){
 	.readdir	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index f91eebdde581..51883b3ad89c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -25,7 +24,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "mount.h"
 #include "recovery.h"
 #include "rgrp.h"
 #include "super.h"
@@ -64,7 +62,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
 	gt->gt_quota_scale_den = 1;
-	gt->gt_quota_cache_secs = 300;
 	gt->gt_quota_quantum = 60;
 	gt->gt_new_files_jdata = 0;
 	gt->gt_max_readahead = 1 << 18;
@@ -100,7 +97,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	mutex_init(&sdp->sd_jindex_mutex);
 
 	INIT_LIST_HEAD(&sdp->sd_quota_list);
-	spin_lock_init(&sdp->sd_quota_spin);
 	mutex_init(&sdp->sd_quota_mutex);
 	init_waitqueue_head(&sdp->sd_quota_wait);
 	INIT_LIST_HEAD(&sdp->sd_trunc_list);
@@ -238,6 +234,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
 
 	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
 	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+	memcpy(sb->sb_uuid, str->sb_uuid, 16);
 }
 
 /**
@@ -299,15 +296,15 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	__free_page(page);
 	return 0;
 }
+
 /**
  * gfs2_read_sb - Read super block
  * @sdp: The GFS2 superblock
- * @gl: the glock for the superblock (assumed to be held)
  * @silent: Don't print message if mount fails
  *
  */
 
-static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 {
 	u32 hash_blocks, ind_blocks, leaf_blocks;
 	u32 tmp_blocks;
@@ -527,7 +524,7 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
 		return ret;
 	}
 
-	ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+	ret = gfs2_read_sb(sdp, silent);
 	if (ret) {
 		fs_err(sdp, "can't read superblock: %d\n", ret);
 		goto out;
@@ -630,13 +627,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 	return rc;
 }
 
-static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
 {
-	if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
-		return;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
-					sdp->sd_lockstruct.ls_lockspace);
+	char *message = "FIRSTMOUNT=Done";
+	char *envp[] = { message, NULL };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ls->ls_first_done = 1;
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
 /**
@@ -796,7 +793,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			}
 		}
 
-		gfs2_lm_others_may_mount(sdp);
+		gfs2_others_may_mount(sdp);
 	} else if (!sdp->sd_args.ar_spectator) {
 		error = gfs2_recover_journal(sdp->sd_jdesc);
 		if (error) {
@@ -1005,7 +1002,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 		goto fail_quotad;
 
 	sdp->sd_log_flush_time = jiffies;
-	sdp->sd_jindex_refresh_time = jiffies;
 
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 	error = IS_ERR(p);
@@ -1033,6 +1029,17 @@ fail:
 	return error;
 }
 
+static const match_table_t nolock_tokens = {
+	{ Opt_jid, "jid=%d\n", },
+	{ Opt_err, NULL },
+};
+
+static const struct lm_lockops nolock_ops = {
+	.lm_proto_name = "lock_nolock",
+	.lm_put_lock = kmem_cache_free,
+	.lm_tokens = &nolock_tokens,
+};
+
 /**
  * gfs2_lm_mount - mount a locking protocol
  * @sdp: the filesystem
@@ -1044,31 +1051,73 @@ fail:
 
 static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 {
-	char *proto = sdp->sd_proto_name;
-	char *table = sdp->sd_table_name;
-	int flags = LM_MFLAG_CONV_NODROP;
-	int error;
+	const struct lm_lockops *lm;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	struct gfs2_args *args = &sdp->sd_args;
+	const char *proto = sdp->sd_proto_name;
+	const char *table = sdp->sd_table_name;
+	const char *fsname;
+	char *o, *options;
+	int ret;
 
-	if (sdp->sd_args.ar_spectator)
-		flags |= LM_MFLAG_SPECTATOR;
+	if (!strcmp("lock_nolock", proto)) {
+		lm = &nolock_ops;
+		sdp->sd_args.ar_localflocks = 1;
+		sdp->sd_args.ar_localcaching = 1;
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+	} else if (!strcmp("lock_dlm", proto)) {
+		lm = &gfs2_dlm_ops;
+#endif
+	} else {
+		printk(KERN_INFO "GFS2: can't find protocol %s\n", proto);
+		return -ENOENT;
+	}
 
 	fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
 
-	error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
-				     gfs2_glock_cb, sdp,
-				     GFS2_MIN_LVB_SIZE, flags,
-				     &sdp->sd_lockstruct, &sdp->sd_kobj);
-	if (error) {
-		fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
-			proto, table, sdp->sd_args.ar_hostdata);
-		goto out;
-	}
+	ls->ls_ops = lm;
+	ls->ls_first = 1;
+	ls->ls_id = 0;
 
-	if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
-	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
-				  GFS2_MIN_LVB_SIZE)) {
-		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
-		goto out;
+	for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
+		substring_t tmp[MAX_OPT_ARGS];
+		int token, option;
+
+		if (!o || !*o)
+			continue;
+
+		token = match_token(o, *lm->lm_tokens, tmp);
+		switch (token) {
+		case Opt_jid:
+			ret = match_int(&tmp[0], &option);
+			if (ret || option < 0) 
+				goto hostdata_error;
+			ls->ls_jid = option;
+			break;
+		case Opt_id:
+			ret = match_int(&tmp[0], &option);
+			if (ret)
+				goto hostdata_error;
+			ls->ls_id = option;
+			break;
+		case Opt_first:
+			ret = match_int(&tmp[0], &option);
+			if (ret || (option != 0 && option != 1))
+				goto hostdata_error;
+			ls->ls_first = option;
+			break;
+		case Opt_nodir:
+			ret = match_int(&tmp[0], &option);
+			if (ret || (option != 0 && option != 1))
+				goto hostdata_error;
+			ls->ls_nodir = option;
+			break;
+		case Opt_err:
+		default:
+hostdata_error:
+			fs_info(sdp, "unknown hostdata (%s)\n", o);
+			return -EINVAL;
+		}
 	}
 
 	if (sdp->sd_args.ar_spectator)
@@ -1077,22 +1126,25 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
 			 sdp->sd_lockstruct.ls_jid);
 
-	fs_info(sdp, "Joined cluster. Now mounting FS...\n");
-
-	if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
-	    !sdp->sd_args.ar_ignore_local_fs) {
-		sdp->sd_args.ar_localflocks = 1;
-		sdp->sd_args.ar_localcaching = 1;
+	fsname = strchr(table, ':');
+	if (fsname)
+		fsname++;
+	if (lm->lm_mount == NULL) {
+		fs_info(sdp, "Now mounting FS...\n");
+		return 0;
 	}
-
-out:
-	return error;
+	ret = lm->lm_mount(sdp, fsname);
+	if (ret == 0)
+		fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+	return ret;
 }
 
 void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 {
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+	const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+	    lm->lm_unmount)
+		lm->lm_unmount(sdp);
 }
 
 /**
@@ -1116,12 +1168,20 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 	}
 
-	error = gfs2_mount_args(sdp, (char *)data, 0);
+	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
+	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
+
+	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
 	if (error) {
 		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
 		goto fail;
 	}
 
+	if (sdp->sd_args.ar_spectator)
+                sb->s_flags |= MS_RDONLY;
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_export_op = &gfs2_export_ops;
@@ -1199,6 +1259,8 @@ fail_sb:
 		dput(sdp->sd_root_dir);
 	if (sdp->sd_master_dir)
 		dput(sdp->sd_master_dir);
+	if (sb->s_root)
+		dput(sb->s_root);
 	sb->s_root = NULL;
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 49877546beb9..abd5429ae285 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,7 +18,6 @@
 #include <linux/posix_acl.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/fiemap.h>
 #include <asm/uaccess.h>
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 320323d03479..458019569dcb 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -19,7 +19,6 @@
 #include <linux/delay.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/time.h>
 
 #include "gfs2.h"
@@ -27,7 +26,6 @@
 #include "glock.h"
 #include "inode.h"
 #include "log.h"
-#include "mount.h"
 #include "quota.h"
 #include "recovery.h"
 #include "rgrp.h"
@@ -40,6 +38,8 @@
 #include "bmap.h"
 #include "meta_io.h"
 
+#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
+
 /**
  * gfs2_write_inode - Make sure the inode is stable on the disk
  * @inode: The inode
@@ -435,25 +435,45 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
 	int error;
 
-	error = gfs2_mount_args(sdp, data, 1);
+	error = gfs2_mount_args(sdp, &args, data);
 	if (error)
 		return error;
 
+	/* Not allowed to change locking details */
+	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
+	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
+	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
+		return -EINVAL;
+
+	/* Some flags must not be changed */
+	if (args_neq(&args, &sdp->sd_args, spectator) ||
+	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
+	    args_neq(&args, &sdp->sd_args, localflocks) ||
+	    args_neq(&args, &sdp->sd_args, localcaching) ||
+	    args_neq(&args, &sdp->sd_args, meta))
+		return -EINVAL;
+
 	if (sdp->sd_args.ar_spectator)
 		*flags |= MS_RDONLY;
-	else {
-		if (*flags & MS_RDONLY) {
-			if (!(sb->s_flags & MS_RDONLY))
-				error = gfs2_make_fs_ro(sdp);
-		} else if (!(*flags & MS_RDONLY) &&
-			   (sb->s_flags & MS_RDONLY)) {
+
+	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
+		if (*flags & MS_RDONLY)
+			error = gfs2_make_fs_ro(sdp);
+		else
 			error = gfs2_make_fs_rw(sdp);
-		}
+		if (error)
+			return error;
 	}
 
-	return error;
+	sdp->sd_args = args;
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+	else
+		sb->s_flags &= ~MS_POSIXACL;
+	return 0;
 }
 
 /**
@@ -588,6 +608,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		}
 		seq_printf(s, ",data=%s", state);
 	}
+	if (args->ar_discard)
+		seq_printf(s, ",discard");
 
 	return 0;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index b08d09696b3e..8d53f66b5bcc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -45,7 +45,6 @@
 #include <linux/fs.h>
 #include <linux/bio.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
@@ -80,6 +79,51 @@ struct gfs2_quota_change_host {
 	u32 qc_id;
 };
 
+static LIST_HEAD(qd_lru_list);
+static atomic_t qd_lru_count = ATOMIC_INIT(0);
+static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
+
+int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+{
+	struct gfs2_quota_data *qd;
+	struct gfs2_sbd *sdp;
+
+	if (nr == 0)
+		goto out;
+
+	if (!(gfp_mask & __GFP_FS))
+		return -1;
+
+	spin_lock(&qd_lru_lock);
+	while (nr && !list_empty(&qd_lru_list)) {
+		qd = list_entry(qd_lru_list.next,
+				struct gfs2_quota_data, qd_reclaim);
+		sdp = qd->qd_gl->gl_sbd;
+
+		/* Free from the filesystem-specific list */
+		list_del(&qd->qd_list);
+
+		gfs2_assert_warn(sdp, !qd->qd_change);
+		gfs2_assert_warn(sdp, !qd->qd_slot_count);
+		gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+		gfs2_glock_put(qd->qd_gl);
+		atomic_dec(&sdp->sd_quota_count);
+
+		/* Delete it from the common reclaim list */
+		list_del_init(&qd->qd_reclaim);
+		atomic_dec(&qd_lru_count);
+		spin_unlock(&qd_lru_lock);
+		kmem_cache_free(gfs2_quotad_cachep, qd);
+		spin_lock(&qd_lru_lock);
+		nr--;
+	}
+	spin_unlock(&qd_lru_lock);
+
+out:
+	return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
+}
+
 static u64 qd2offset(struct gfs2_quota_data *qd)
 {
 	u64 offset;
@@ -100,22 +144,18 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 	if (!qd)
 		return -ENOMEM;
 
-	qd->qd_count = 1;
+	atomic_set(&qd->qd_count, 1);
 	qd->qd_id = id;
 	if (user)
 		set_bit(QDF_USER, &qd->qd_flags);
 	qd->qd_slot = -1;
+	INIT_LIST_HEAD(&qd->qd_reclaim);
 
 	error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
 			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
 	if (error)
 		goto fail;
 
-	error = gfs2_lvb_hold(qd->qd_gl);
-	gfs2_glock_put(qd->qd_gl);
-	if (error)
-		goto fail;
-
 	*qdp = qd;
 
 	return 0;
@@ -135,11 +175,17 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 
 	for (;;) {
 		found = 0;
-		spin_lock(&sdp->sd_quota_spin);
+		spin_lock(&qd_lru_lock);
 		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
 			if (qd->qd_id == id &&
 			    !test_bit(QDF_USER, &qd->qd_flags) == !user) {
-				qd->qd_count++;
+				if (!atomic_read(&qd->qd_count) &&
+				    !list_empty(&qd->qd_reclaim)) {
+					/* Remove it from reclaim list */
+					list_del_init(&qd->qd_reclaim);
+					atomic_dec(&qd_lru_count);
+				}
+				atomic_inc(&qd->qd_count);
 				found = 1;
 				break;
 			}
@@ -155,11 +201,11 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 			new_qd = NULL;
 		}
 
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 
 		if (qd || !create) {
 			if (new_qd) {
-				gfs2_lvb_unhold(new_qd->qd_gl);
+				gfs2_glock_put(new_qd->qd_gl);
 				kmem_cache_free(gfs2_quotad_cachep, new_qd);
 			}
 			*qdp = qd;
@@ -175,21 +221,18 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 static void qd_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-
-	spin_lock(&sdp->sd_quota_spin);
-	gfs2_assert(sdp, qd->qd_count);
-	qd->qd_count++;
-	spin_unlock(&sdp->sd_quota_spin);
+	gfs2_assert(sdp, atomic_read(&qd->qd_count));
+	atomic_inc(&qd->qd_count);
 }
 
 static void qd_put(struct gfs2_quota_data *qd)
 {
-	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-	spin_lock(&sdp->sd_quota_spin);
-	gfs2_assert(sdp, qd->qd_count);
-	if (!--qd->qd_count)
-		qd->qd_last_touched = jiffies;
-	spin_unlock(&sdp->sd_quota_spin);
+	if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) {
+		/* Add to the reclaim list */
+		list_add_tail(&qd->qd_reclaim, &qd_lru_list);
+		atomic_inc(&qd_lru_count);
+		spin_unlock(&qd_lru_lock);
+	}
 }
 
 static int slot_get(struct gfs2_quota_data *qd)
@@ -198,10 +241,10 @@ static int slot_get(struct gfs2_quota_data *qd)
 	unsigned int c, o = 0, b;
 	unsigned char byte = 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 
 	if (qd->qd_slot_count++) {
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 		return 0;
 	}
 
@@ -225,13 +268,13 @@ found:
 
 	sdp->sd_quota_bitmap[c][o] |= 1 << b;
 
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	return 0;
 
 fail:
 	qd->qd_slot_count--;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 	return -ENOSPC;
 }
 
@@ -239,23 +282,23 @@ static void slot_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 }
 
 static void slot_put(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	if (!--qd->qd_slot_count) {
 		gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
 		qd->qd_slot = -1;
 	}
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 }
 
 static int bh_get(struct gfs2_quota_data *qd)
@@ -330,7 +373,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	if (sdp->sd_vfs->s_flags & MS_RDONLY)
 		return 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 
 	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
 		if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
@@ -341,8 +384,8 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 		list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
 
 		set_bit(QDF_LOCKED, &qd->qd_flags);
-		gfs2_assert_warn(sdp, qd->qd_count);
-		qd->qd_count++;
+		gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
+		atomic_inc(&qd->qd_count);
 		qd->qd_change_sync = qd->qd_change;
 		gfs2_assert_warn(sdp, qd->qd_slot_count);
 		qd->qd_slot_count++;
@@ -354,7 +397,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	if (!found)
 		qd = NULL;
 
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	if (qd) {
 		gfs2_assert_warn(sdp, qd->qd_change_sync);
@@ -379,24 +422,24 @@ static int qd_trylock(struct gfs2_quota_data *qd)
 	if (sdp->sd_vfs->s_flags & MS_RDONLY)
 		return 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 
 	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
 	    !test_bit(QDF_CHANGE, &qd->qd_flags)) {
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 		return 0;
 	}
 
 	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
 
 	set_bit(QDF_LOCKED, &qd->qd_flags);
-	gfs2_assert_warn(sdp, qd->qd_count);
-	qd->qd_count++;
+	gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
+	atomic_inc(&qd->qd_count);
 	qd->qd_change_sync = qd->qd_change;
 	gfs2_assert_warn(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
 
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	gfs2_assert_warn(sdp, qd->qd_change_sync);
 	if (bh_get(qd)) {
@@ -556,9 +599,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 	x = be64_to_cpu(qc->qc_change) + change;
 	qc->qc_change = cpu_to_be64(x);
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	qd->qd_change = x;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	if (!x) {
 		gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
@@ -802,8 +845,8 @@ restart:
 		loff_t pos;
 		gfs2_glock_dq_uninit(q_gh);
 		error = gfs2_glock_nq_init(qd->qd_gl,
-					  LM_ST_EXCLUSIVE, GL_NOCACHE,
-					  q_gh);
+					   LM_ST_EXCLUSIVE, GL_NOCACHE,
+					   q_gh);
 		if (error)
 			return error;
 
@@ -820,7 +863,6 @@ restart:
 
 		gfs2_glock_dq_uninit(&i_gh);
 
-
 		gfs2_quota_in(&q, buf);
 		qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
 		qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
@@ -890,9 +932,9 @@ static int need_sync(struct gfs2_quota_data *qd)
 	if (!qd->qd_qb.qb_limit)
 		return 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	value = qd->qd_change;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	spin_lock(&gt->gt_spin);
 	num = gt->gt_quota_scale_num;
@@ -985,9 +1027,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 			continue;
 
 		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
-		spin_lock(&sdp->sd_quota_spin);
+		spin_lock(&qd_lru_lock);
 		value += qd->qd_change;
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 
 		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
 			print_message(qd, "exceeded");
@@ -1171,13 +1213,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 			qd->qd_change = qc.qc_change;
 			qd->qd_slot = slot;
 			qd->qd_slot_count = 1;
-			qd->qd_last_touched = jiffies;
 
-			spin_lock(&sdp->sd_quota_spin);
+			spin_lock(&qd_lru_lock);
 			gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
 			list_add(&qd->qd_list, &sdp->sd_quota_list);
 			atomic_inc(&sdp->sd_quota_count);
-			spin_unlock(&sdp->sd_quota_spin);
+			spin_unlock(&qd_lru_lock);
 
 			found++;
 		}
@@ -1197,73 +1238,48 @@ fail:
 	return error;
 }
 
-static void gfs2_quota_scan(struct gfs2_sbd *sdp)
-{
-	struct gfs2_quota_data *qd, *safe;
-	LIST_HEAD(dead);
-
-	spin_lock(&sdp->sd_quota_spin);
-	list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
-		if (!qd->qd_count &&
-		    time_after_eq(jiffies, qd->qd_last_touched +
-			        gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
-			list_move(&qd->qd_list, &dead);
-			gfs2_assert_warn(sdp,
-					 atomic_read(&sdp->sd_quota_count) > 0);
-			atomic_dec(&sdp->sd_quota_count);
-		}
-	}
-	spin_unlock(&sdp->sd_quota_spin);
-
-	while (!list_empty(&dead)) {
-		qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
-		list_del(&qd->qd_list);
-
-		gfs2_assert_warn(sdp, !qd->qd_change);
-		gfs2_assert_warn(sdp, !qd->qd_slot_count);
-		gfs2_assert_warn(sdp, !qd->qd_bh_count);
-
-		gfs2_lvb_unhold(qd->qd_gl);
-		kmem_cache_free(gfs2_quotad_cachep, qd);
-	}
-}
-
 void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 {
 	struct list_head *head = &sdp->sd_quota_list;
 	struct gfs2_quota_data *qd;
 	unsigned int x;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	while (!list_empty(head)) {
 		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
 
-		if (qd->qd_count > 1 ||
-		    (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+		if (atomic_read(&qd->qd_count) > 1 ||
+		    (atomic_read(&qd->qd_count) &&
+		     !test_bit(QDF_CHANGE, &qd->qd_flags))) {
 			list_move(&qd->qd_list, head);
-			spin_unlock(&sdp->sd_quota_spin);
+			spin_unlock(&qd_lru_lock);
 			schedule();
-			spin_lock(&sdp->sd_quota_spin);
+			spin_lock(&qd_lru_lock);
 			continue;
 		}
 
 		list_del(&qd->qd_list);
+		/* Also remove if this qd exists in the reclaim list */
+		if (!list_empty(&qd->qd_reclaim)) {
+			list_del_init(&qd->qd_reclaim);
+			atomic_dec(&qd_lru_count);
+		}
 		atomic_dec(&sdp->sd_quota_count);
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 
-		if (!qd->qd_count) {
+		if (!atomic_read(&qd->qd_count)) {
 			gfs2_assert_warn(sdp, !qd->qd_change);
 			gfs2_assert_warn(sdp, !qd->qd_slot_count);
 		} else
 			gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
 
-		gfs2_lvb_unhold(qd->qd_gl);
+		gfs2_glock_put(qd->qd_gl);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
 
-		spin_lock(&sdp->sd_quota_spin);
+		spin_lock(&qd_lru_lock);
 	}
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
 
@@ -1341,9 +1357,6 @@ int gfs2_quotad(void *data)
 		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
 				   &quotad_timeo, &tune->gt_quota_quantum);
 
-		/* FIXME: This should be turned into a shrinker */
-		gfs2_quota_scan(sdp);
-
 		/* Check for & recover partially truncated inodes */
 		quotad_check_trunc_list(sdp);
 
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index cec9032be97d..0fa5fa63d0e8 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -49,4 +49,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
+extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index efd09c3d2b26..247e8f7d6b3d 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
@@ -427,20 +426,23 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 }
 
 
-static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
-				  unsigned int message)
+static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+                               unsigned int message)
 {
-	if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
-		return;
-
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_recovery_done(
-			sdp->sd_lockstruct.ls_lockspace, jid, message);
+	char env_jid[20];
+	char env_status[20];
+	char *envp[] = { env_jid, env_status, NULL };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+        ls->ls_recover_jid_done = jid;
+        ls->ls_recover_jid_status = message;
+	sprintf(env_jid, "JID=%d", jid);
+	sprintf(env_status, "RECOVERY=%s",
+		message == LM_RD_SUCCESS ? "Done" : "Failed");
+        kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
-
 /**
- * gfs2_recover_journal - recovery a given journal
+ * gfs2_recover_journal - recover a given journal
  * @jd: the struct gfs2_jdesc describing the journal
  *
  * Acquire the journal's lock, check to see if the journal is clean, and
@@ -561,7 +563,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
 		gfs2_glock_dq_uninit(&ji_gh);
 
-	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
+	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
 
 	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
 		gfs2_glock_dq_uninit(&j_gh);
@@ -581,7 +583,7 @@ fail_gunlock_j:
 	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
 
 fail:
-	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
 	return error;
 }
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8b01c635d925..f03d024038ea 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -13,8 +13,8 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/prefetch.h>
+#include <linux/blkdev.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -132,81 +132,90 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
 }
 
 /**
+ * gfs2_bit_search
+ * @ptr: Pointer to bitmap data
+ * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
+ * @state: The state we are searching for
+ *
+ * We xor the bitmap data with a patter which is the bitwise opposite
+ * of what we are looking for, this gives rise to a pattern of ones
+ * wherever there is a match. Since we have two bits per entry, we
+ * take this pattern, shift it down by one place and then and it with
+ * the original. All the even bit positions (0,2,4, etc) then represent
+ * successful matches, so we mask with 0x55555..... to remove the unwanted
+ * odd bit positions.
+ *
+ * This allows searching of a whole u64 at once (32 blocks) with a
+ * single test (on 64 bit arches).
+ */
+
+static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
+{
+	u64 tmp;
+	static const u64 search[] = {
+		[0] = 0xffffffffffffffffULL,
+		[1] = 0xaaaaaaaaaaaaaaaaULL,
+		[2] = 0x5555555555555555ULL,
+		[3] = 0x0000000000000000ULL,
+	};
+	tmp = le64_to_cpu(*ptr) ^ search[state];
+	tmp &= (tmp >> 1);
+	tmp &= mask;
+	return tmp;
+}
+
+/**
  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
  *       a block in a given allocation state.
  * @buffer: the buffer that holds the bitmaps
- * @buflen: the length (in bytes) of the buffer
+ * @len: the length (in bytes) of the buffer
  * @goal: start search at this block's bit-pair (within @buffer)
- * @old_state: GFS2_BLKST_XXX the state of the block we're looking for.
+ * @state: GFS2_BLKST_XXX the state of the block we're looking for.
  *
  * Scope of @goal and returned block number is only within this bitmap buffer,
  * not entire rgrp or filesystem.  @buffer will be offset from the actual
- * beginning of a bitmap block buffer, skipping any header structures.
+ * beginning of a bitmap block buffer, skipping any header structures, but
+ * headers are always a multiple of 64 bits long so that the buffer is
+ * always aligned to a 64 bit boundary.
+ *
+ * The size of the buffer is in bytes, but is it assumed that it is
+ * always ok to to read a complete multiple of 64 bits at the end
+ * of the block in case the end is no aligned to a natural boundary.
  *
  * Return: the block number (bitmap buffer scope) that was found
  */
 
-static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
-		       u8 old_state)
+static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
+		       u32 goal, u8 state)
 {
-	const u8 *byte, *start, *end;
-	int bit, startbit;
-	u32 g1, g2, misaligned;
-	unsigned long *plong;
-	unsigned long lskipval;
-
-	lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
-	g1 = (goal / GFS2_NBBY);
-	start = buffer + g1;
-	byte = start;
-        end = buffer + buflen;
-	g2 = ALIGN(g1, sizeof(unsigned long));
-	plong = (unsigned long *)(buffer + g2);
-	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	misaligned = g2 - g1;
-	if (!misaligned)
-		goto ulong_aligned;
-/* parse the bitmap a byte at a time */
-misaligned:
-	while (byte < end) {
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
-			return goal +
-				(((byte - start) * GFS2_NBBY) +
-				 ((bit - startbit) >> 1));
-		}
-		bit += GFS2_BIT_SIZE;
-		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
-			bit = 0;
-			byte++;
-			misaligned--;
-			if (!misaligned) {
-				plong = (unsigned long *)byte;
-				goto ulong_aligned;
-			}
-		}
-	}
-	return BFITNOENT;
-
-/* parse the bitmap a unsigned long at a time */
-ulong_aligned:
-	/* Stop at "end - 1" or else prefetch can go past the end and segfault.
-	   We could "if" it but we'd lose some of the performance gained.
-	   This way will only slow down searching the very last 4/8 bytes
-	   depending on architecture.  I've experimented with several ways
-	   of writing this section such as using an else before the goto
-	   but this one seems to be the fastest. */
-	while ((unsigned char *)plong < end - sizeof(unsigned long)) {
-		prefetch(plong + 1);
-		if (((*plong) & LBITMASK) != lskipval)
-			break;
-		plong++;
-	}
-	if ((unsigned char *)plong < end) {
-		byte = (const u8 *)plong;
-		misaligned += sizeof(unsigned long) - 1;
-		goto misaligned;
+	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
+	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
+	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
+	u64 tmp;
+	u64 mask = 0x5555555555555555ULL;
+	u32 bit;
+
+	BUG_ON(state > 3);
+
+	/* Mask off bits we don't care about at the start of the search */
+	mask <<= spoint;
+	tmp = gfs2_bit_search(ptr, mask, state);
+	ptr++;
+	while(tmp == 0 && ptr < end) {
+		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
+		ptr++;
 	}
-	return BFITNOENT;
+	/* Mask off any bits which are more than len bytes from the start */
+	if (ptr == end && (len & (sizeof(u64) - 1)))
+		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
+	/* Didn't find anything, so return */
+	if (tmp == 0)
+		return BFITNOENT;
+	ptr--;
+	bit = fls64(tmp);
+	bit--;		/* fls64 always adds one to the bit count */
+	bit /= 2;	/* two bits per entry in the bitmap */
+	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
 }
 
 /**
@@ -831,6 +840,58 @@ void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
 	spin_unlock(&sdp->sd_rindex_spin);
 }
 
+static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+				    const struct gfs2_bitmap *bi)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct block_device *bdev = sb->s_bdev;
+	const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
+					   bdev_hardsect_size(sb->s_bdev);
+	u64 blk;
+	sector_t start = 0;
+	sector_t nr_sects = 0;
+	int rv;
+	unsigned int x;
+
+	for (x = 0; x < bi->bi_len; x++) {
+		const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x;
+		const u8 *clone = bi->bi_clone + bi->bi_offset + x;
+		u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
+		diff &= 0x55;
+		if (diff == 0)
+			continue;
+		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
+		blk *= sects_per_blk; /* convert to sectors */
+		while(diff) {
+			if (diff & 1) {
+				if (nr_sects == 0)
+					goto start_new_extent;
+				if ((start + nr_sects) != blk) {
+					rv = blkdev_issue_discard(bdev, start,
+							    nr_sects, GFP_NOFS);
+					if (rv)
+						goto fail;
+					nr_sects = 0;
+start_new_extent:
+					start = blk;
+				}
+				nr_sects += sects_per_blk;
+			}
+			diff >>= 2;
+			blk += sects_per_blk;
+		}
+	}
+	if (nr_sects) {
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+		if (rv)
+			goto fail;
+	}
+	return;
+fail:
+	fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
+	sdp->sd_args.ar_discard = 0;
+}
+
 void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -841,6 +902,8 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
 		if (!bi->bi_clone)
 			continue;
+		if (sdp->sd_args.ar_discard)
+			gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
 		memcpy(bi->bi_clone + bi->bi_offset,
 		       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
 	}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 141b781f2fcc..601913e0a482 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -15,7 +15,6 @@
 #include <linux/crc32.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -339,7 +338,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
 				    struct gfs2_holder *t_gh)
 {
 	struct gfs2_inode *ip;
-	struct gfs2_holder ji_gh;
 	struct gfs2_jdesc *jd;
 	struct lfcc *lfcc;
 	LIST_HEAD(list);
@@ -387,7 +385,6 @@ out:
 		gfs2_glock_dq_uninit(&lfcc->gh);
 		kfree(lfcc);
 	}
-	gfs2_glock_dq_uninit(&ji_gh);
 	return error;
 }
 
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index f6b8b00ad881..b56413e3e40d 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -14,7 +14,7 @@
 #include <linux/dcache.h>
 #include "incore.h"
 
-void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
 
 static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 {
@@ -27,27 +27,29 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 
 void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
-struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
 
-int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
-			      struct gfs2_inode **ipp);
+extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
-int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+				     struct gfs2_inode **ipp);
 
-int gfs2_statfs_init(struct gfs2_sbd *sdp);
-void gfs2_statfs_change(struct gfs2_sbd *sdp,
-			s64 total, s64 free, s64 dinodes);
-int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
 
-int gfs2_freeze_fs(struct gfs2_sbd *sdp);
-void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
+extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+			       s64 dinodes);
+extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+
+extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
+extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
 
 extern struct file_system_type gfs2_fs_type;
 extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
-extern struct dentry_operations gfs2_dops;
+extern const struct dentry_operations gfs2_dops;
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 26c1fa777a95..7655f5025fec 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -14,9 +14,8 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
+#include <linux/gfs2_ondisk.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -25,6 +24,7 @@
 #include "glock.h"
 #include "quota.h"
 #include "util.h"
+#include "glops.h"
 
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
@@ -37,6 +37,30 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
 }
 
+static int gfs2_uuid_valid(const u8 *uuid)
+{
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		if (uuid[i])
+			return 1;
+	}
+	return 0;
+}
+
+static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	const u8 *uuid = sdp->sd_sb.sb_uuid;
+	buf[0] = '\0';
+	if (!gfs2_uuid_valid(uuid))
+		return 0;
+	return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-"
+			"%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
+			uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
+			uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11],
+			uuid[12], uuid[13], uuid[14], uuid[15]);
+}
+
 static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
 {
 	unsigned int count;
@@ -148,6 +172,46 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
 	return len;
 }
 
+static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct gfs2_glock *gl;
+	const struct gfs2_glock_operations *glops;
+	unsigned int glmode;
+	unsigned int gltype;
+	unsigned long long glnum;
+	char mode[16];
+	int rv;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,
+		    mode);
+	if (rv != 3)
+		return -EINVAL;
+
+	if (strcmp(mode, "EX") == 0)
+		glmode = LM_ST_UNLOCKED;
+	else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0))
+		glmode = LM_ST_DEFERRED;
+	else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0))
+		glmode = LM_ST_SHARED;
+	else
+		return -EINVAL;
+
+	if (gltype > LM_TYPE_JOURNAL)
+		return -EINVAL;
+	glops = gfs2_glops_list[gltype];
+	if (glops == NULL)
+		return -EINVAL;
+	rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
+	if (rv)
+		return rv;
+	gfs2_glock_cb(gl, glmode);
+	gfs2_glock_put(gl);
+	return len;
+}
+
 struct gfs2_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct gfs2_sbd *, char *);
@@ -159,22 +223,26 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
 
 GFS2_ATTR(id,                  0444, id_show,       NULL);
 GFS2_ATTR(fsname,              0444, fsname_show,   NULL);
+GFS2_ATTR(uuid,                0444, uuid_show,     NULL);
 GFS2_ATTR(freeze,              0644, freeze_show,   freeze_store);
 GFS2_ATTR(withdraw,            0644, withdraw_show, withdraw_store);
 GFS2_ATTR(statfs_sync,         0200, NULL,          statfs_sync_store);
 GFS2_ATTR(quota_sync,          0200, NULL,          quota_sync_store);
 GFS2_ATTR(quota_refresh_user,  0200, NULL,          quota_refresh_user_store);
 GFS2_ATTR(quota_refresh_group, 0200, NULL,          quota_refresh_group_store);
+GFS2_ATTR(demote_rq,           0200, NULL,	    demote_rq_store);
 
 static struct attribute *gfs2_attrs[] = {
 	&gfs2_attr_id.attr,
 	&gfs2_attr_fsname.attr,
+	&gfs2_attr_uuid.attr,
 	&gfs2_attr_freeze.attr,
 	&gfs2_attr_withdraw.attr,
 	&gfs2_attr_statfs_sync.attr,
 	&gfs2_attr_quota_sync.attr,
 	&gfs2_attr_quota_refresh_user.attr,
 	&gfs2_attr_quota_refresh_group.attr,
+	&gfs2_attr_demote_rq.attr,
 	NULL,
 };
 
@@ -224,14 +292,145 @@ static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
 
 LOCKSTRUCT_ATTR(jid,      "%u\n");
 LOCKSTRUCT_ATTR(first,    "%u\n");
-LOCKSTRUCT_ATTR(lvb_size, "%u\n");
-LOCKSTRUCT_ATTR(flags,    "%d\n");
 
 static struct attribute *lockstruct_attrs[] = {
 	&lockstruct_attr_jid.attr,
 	&lockstruct_attr_first.attr,
-	&lockstruct_attr_lvb_size.attr,
-	&lockstruct_attr_flags.attr,
+	NULL,
+};
+
+/*
+ * lock_module. Originally from lock_dlm
+ */
+
+static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf)
+{
+	const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops;
+	return sprintf(buf, "%s\n", ops->lm_proto_name);
+}
+
+static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ssize_t ret;
+	int val = 0;
+
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
+		val = 1;
+	ret = sprintf(buf, "%d\n", val);
+	return ret;
+}
+
+static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ssize_t ret = len;
+	int val;
+
+	val = simple_strtol(buf, NULL, 0);
+
+	if (val == 1)
+		set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+	else if (val == 0) {
+		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+		smp_mb__after_clear_bit();
+		gfs2_glock_thaw(sdp);
+	} else {
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%u\n", ls->ls_id);
+}
+
+static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_first);
+}
+
+static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_first_done);
+}
+
+static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid);
+}
+
+static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_jid != jid)
+			continue;
+		jd->jd_dirty = 1;
+		break;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
+	gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
+	if (sdp->sd_recoverd_process)
+		wake_up_process(sdp->sd_recoverd_process);
+	return len;
+}
+
+static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid_done);
+}
+
+static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
+}
+
+struct gdlm_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *sdp, char *);
+	ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
+};
+
+#define GDLM_ATTR(_name,_mode,_show,_store) \
+static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+
+GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
+GDLM_ATTR(block,          0644, block_show,          block_store);
+GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
+GDLM_ATTR(id,             0444, lkid_show,           NULL);
+GDLM_ATTR(first,          0444, lkfirst_show,        NULL);
+GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
+GDLM_ATTR(recover,        0644, recover_show,        recover_store);
+GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+
+static struct attribute *lock_module_attrs[] = {
+	&gdlm_attr_proto_name.attr,
+	&gdlm_attr_block.attr,
+	&gdlm_attr_withdraw.attr,
+	&gdlm_attr_id.attr,
+	&lockstruct_attr_jid.attr,
+	&gdlm_attr_first.attr,
+	&gdlm_attr_first_done.attr,
+	&gdlm_attr_recover.attr,
+	&gdlm_attr_recover_done.attr,
+	&gdlm_attr_recover_status.attr,
 	NULL,
 };
 
@@ -373,7 +572,6 @@ TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
-TUNE_ATTR(quota_cache_secs, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
@@ -389,7 +587,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
 	&tune_attr_quota_simul_sync.attr,
-	&tune_attr_quota_cache_secs.attr,
 	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_recoverd_secs.attr,
@@ -414,6 +611,11 @@ static struct attribute_group tune_group = {
 	.attrs = tune_attrs,
 };
 
+static struct attribute_group lock_module_group = {
+	.name = "lock_module",
+	.attrs = lock_module_attrs,
+};
+
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 {
 	int error;
@@ -436,9 +638,15 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_args;
 
+	error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
+	if (error)
+		goto fail_tune;
+
 	kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
 	return 0;
 
+fail_tune:
+	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_args:
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 fail_lockstruct:
@@ -455,15 +663,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
+	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
 
+
 static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 		       struct kobj_uevent_env *env)
 {
 	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	const u8 *uuid = sdp->sd_sb.sb_uuid;
+
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	if (gfs2_uuid_valid(uuid)) {
+		add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
+			       "%02X%02X-%02X%02X%02X%02X%02X%02X",
+			       uuid[0], uuid[1], uuid[2], uuid[3], uuid[4],
+			       uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
+			       uuid[10], uuid[11], uuid[12], uuid[13],
+			       uuid[14], uuid[15]);
+	}
 	return 0;
 }
 
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index f677b8a83f0c..053752d4b27f 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -12,9 +12,8 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/gfs2_ondisk.h>
 #include <linux/kallsyms.h>
-#include <linux/lm_interface.h>
+#include <linux/gfs2_ondisk.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -88,9 +87,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 
 	if (!tr->tr_touched) {
 		gfs2_log_release(sdp, tr->tr_reserved);
-		gfs2_glock_dq(&tr->tr_t_gh);
-		gfs2_holder_uninit(&tr->tr_t_gh);
-		kfree(tr);
+		if (tr->tr_t_gh.gh_gl) {
+			gfs2_glock_dq(&tr->tr_t_gh);
+			gfs2_holder_uninit(&tr->tr_t_gh);
+			kfree(tr);
+		}
 		return;
 	}
 
@@ -106,9 +107,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 	}
 
 	gfs2_log_commit(sdp, tr);
-        gfs2_glock_dq(&tr->tr_t_gh);
-        gfs2_holder_uninit(&tr->tr_t_gh);
-        kfree(tr);
+	if (tr->tr_t_gh.gh_gl) {
+		gfs2_glock_dq(&tr->tr_t_gh);
+		gfs2_holder_uninit(&tr->tr_t_gh);
+		kfree(tr);
+	}
 
 	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
 		gfs2_log_flush(sdp, NULL);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 374f50e95496..9d12b1118ba0 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/crc32.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -35,6 +34,8 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
 
 int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 {
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	const struct lm_lockops *lm = ls->ls_ops;
 	va_list args;
 
 	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
@@ -47,8 +48,12 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 	fs_err(sdp, "about to withdraw this file system\n");
 	BUG_ON(sdp->sd_args.ar_debug);
 
-	fs_err(sdp, "telling LM to withdraw\n");
-	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+	kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+
+	if (lm->lm_unmount) {
+		fs_err(sdp, "telling LM to unmount\n");
+		lm->lm_unmount(sdp);
+	}
 	fs_err(sdp, "withdrawn\n");
 	dump_stack();
 
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 9955232fdf8c..052387e11671 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,7 +213,7 @@ extern void hfs_mdb_put(struct super_block *);
 extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 
 /* string.c */
-extern struct dentry_operations hfs_dentry_operations;
+extern const struct dentry_operations hfs_dentry_operations;
 
 extern int hfs_hash_dentry(struct dentry *, struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b7..a36bb749926d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
 static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = HFS_SB(sb)->fs_ablocks;
 	buf->f_ffree = HFS_SB(sb)->free_ablocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFS_NAMELEN;
 
 	return 0;
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 5bf89ec01cd4..7478f5c219aa 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -31,7 +31,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
 	return 1;
 }
 
-struct dentry_operations hfs_dentry_operations =
+const struct dentry_operations hfs_dentry_operations =
 {
 	.d_revalidate	= hfs_revalidate_dentry,
 	.d_hash		= hfs_hash_dentry,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index f027a905225f..5c10d803d9df 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -327,7 +327,7 @@ void hfsplus_file_truncate(struct inode *);
 /* inode.c */
 extern const struct address_space_operations hfsplus_aops;
 extern const struct address_space_operations hfsplus_btree_aops;
-extern struct dentry_operations hfsplus_dentry_operations;
+extern const struct dentry_operations hfsplus_dentry_operations;
 
 void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
 void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f105ee9e1cc4..1bcf597c0562 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -137,7 +137,7 @@ const struct address_space_operations hfsplus_aops = {
 	.writepages	= hfsplus_writepages,
 };
 
-struct dentry_operations hfsplus_dentry_operations = {
+const struct dentry_operations hfsplus_dentry_operations = {
 	.d_hash       = hfsplus_hash_dentry,
 	.d_compare    = hfsplus_compare_dentry,
 };
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdfa..3fcbb0e1f6fc 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 
 	opts->creator = HFSPLUS_DEF_CR_TYPE;
 	opts->type = HFSPLUS_DEF_CR_TYPE;
-	opts->umask = current->fs->umask;
+	opts->umask = current_umask();
 	opts->uid = current_uid();
 	opts->gid = current_gid();
 	opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8e..f2a64020f42e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
 	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
 
 	return 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5c538e0ec14b..fe02ad4740e7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -31,12 +31,12 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 
 #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
 
-int hostfs_d_delete(struct dentry *dentry)
+static int hostfs_d_delete(struct dentry *dentry)
 {
 	return 1;
 }
 
-struct dentry_operations hostfs_dentry_ops = {
+static const struct dentry_operations hostfs_dentry_ops = {
 	.d_delete		= hostfs_d_delete,
 };
 
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 08319126b2af..940d6d150bee 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -49,7 +49,7 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
 	return 0;
 }
 
-static struct dentry_operations hpfs_dentry_operations = {
+static const struct dentry_operations hpfs_dentry_operations = {
 	.d_hash		= hpfs_hash_dentry,
 	.d_compare	= hpfs_compare_dentry,
 };
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c4..fecf402d7b8a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	lock_kernel();
 
 	/*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = sbi->sb_n_free;
 	buf->f_files = sbi->sb_dirband_size / 4;
 	buf->f_ffree = sbi->sb_n_free_dnodes;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = 254;
 
 	unlock_kernel();
@@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	uid = current_uid();
 	gid = current_gid();
-	umask = current->fs->umask;
+	umask = current_umask();
 	lowercase = 0;
 	conv = CONV_BINARY;
 	eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f52024..a5089a6dd67a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
 			       "errno = %d\n", err);
 			return err;
 		}
-		count = hppfs_read_file(hppfs->host_fd, buf, count);
+		err = hppfs_read_file(hppfs->host_fd, buf, count);
+		if (err < 0) {
+			printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
+			return err;
+		}
+		count = err;
 		if (count > 0)
 			*ppos += count;
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a687..23a3c76711e0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -943,14 +943,13 @@ static struct vfsmount *hugetlbfs_vfsmount;
 
 static int can_do_hugetlb_shm(void)
 {
-	return likely(capable(CAP_IPC_LOCK) ||
-			in_group_p(sysctl_hugetlb_shm_group) ||
-			can_do_mlock());
+	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
+	int unlock_shm = 0;
 	struct file *file;
 	struct inode *inode;
 	struct dentry *dentry, *root;
@@ -960,11 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
-	if (!can_do_hugetlb_shm())
-		return ERR_PTR(-EPERM);
-
-	if (!user_shm_lock(size, user))
-		return ERR_PTR(-ENOMEM);
+	if (!can_do_hugetlb_shm()) {
+		if (user_shm_lock(size, user)) {
+			unlock_shm = 1;
+			WARN_ONCE(1,
+			  "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+		} else
+			return ERR_PTR(-EPERM);
+	}
 
 	root = hugetlbfs_vfsmount->mnt_root;
 	quick_string.name = name;
@@ -1004,7 +1006,8 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	user_shm_unlock(size, user);
+	if (unlock_shm)
+		user_shm_unlock(size, user);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 913ab2d9a5d1..d06d6d268de9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
@@ -147,13 +148,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_cdev = NULL;
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
-	if (security_inode_alloc(inode)) {
-		if (inode->i_sb->s_op->destroy_inode)
-			inode->i_sb->s_op->destroy_inode(inode);
-		else
-			kmem_cache_free(inode_cachep, (inode));
-		return NULL;
-	}
+
+	if (security_inode_alloc(inode))
+		goto out_free_inode;
+
+	/* allocate and initialize an i_integrity */
+	if (ima_inode_alloc(inode))
+		goto out_free_security;
 
 	spin_lock_init(&inode->i_lock);
 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -189,6 +190,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_mapping = mapping;
 
 	return inode;
+
+out_free_security:
+	security_inode_free(inode);
+out_free_inode:
+	if (inode->i_sb->s_op->destroy_inode)
+		inode->i_sb->s_op->destroy_inode(inode);
+	else
+		kmem_cache_free(inode_cachep, (inode));
+	return NULL;
 }
 EXPORT_SYMBOL(inode_init_always);
 
@@ -284,7 +294,7 @@ void clear_inode(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	if (inode->i_sb->s_op->clear_inode)
 		inode->i_sb->s_op->clear_inode(inode);
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -356,9 +366,12 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 		if (tmp == head)
 			break;
 		inode = list_entry(tmp, struct inode, i_sb_list);
+		if (inode->i_state & I_NEW)
+			continue;
 		invalidate_inode_buffers(inode);
 		if (!atomic_read(&inode->i_count)) {
 			list_move(&inode->i_list, dispose);
+			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
 			count++;
 			continue;
@@ -460,6 +473,7 @@ static void prune_icache(int nr_to_scan)
 				continue;
 		}
 		list_move(&inode->i_list, &freeable);
+		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
 		nr_pruned++;
 	}
@@ -656,6 +670,7 @@ void unlock_new_inode(struct inode *inode)
 	 * just created it (so there can be no old holders
 	 * that haven't tested I_LOCK).
 	 */
+	WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
 	inode->i_state &= ~(I_LOCK|I_NEW);
 	wake_up_inode(inode);
 }
@@ -1145,6 +1160,7 @@ void generic_delete_inode(struct inode *inode)
 
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
@@ -1154,7 +1170,7 @@ void generic_delete_inode(struct inode *inode)
 	if (op->delete_inode) {
 		void (*delete)(struct inode *) = op->delete_inode;
 		if (!is_bad_inode(inode))
-			DQUOT_INIT(inode);
+			vfs_dq_init(inode);
 		/* Filesystems implementing their own
 		 * s_op->delete_inode are required to call
 		 * truncate_inode_pages and clear_inode()
@@ -1186,16 +1202,19 @@ static void generic_forget_inode(struct inode *inode)
 			spin_unlock(&inode_lock);
 			return;
 		}
+		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode_lock);
 		write_inode_now(inode, 1);
 		spin_lock(&inode_lock);
+		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
 		inodes_stat.nr_unused--;
 		hlist_del_init(&inode->i_hash);
 	}
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
@@ -1283,6 +1302,40 @@ sector_t bmap(struct inode * inode, sector_t block)
 }
 EXPORT_SYMBOL(bmap);
 
+/*
+ * With relative atime, only update atime if the previous atime is
+ * earlier than either the ctime or mtime or if at least a day has
+ * passed since the last atime update.
+ */
+static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
+			     struct timespec now)
+{
+
+	if (!(mnt->mnt_flags & MNT_RELATIME))
+		return 1;
+	/*
+	 * Is mtime younger than atime? If yes, update atime:
+	 */
+	if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+		return 1;
+	/*
+	 * Is ctime younger than atime? If yes, update atime:
+	 */
+	if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
+		return 1;
+
+	/*
+	 * Is the previous atime value older than a day? If yes,
+	 * update atime:
+	 */
+	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+		return 1;
+	/*
+	 * Good, we can skip the atime update:
+	 */
+	return 0;
+}
+
 /**
  *	touch_atime	-	update the access time
  *	@mnt: mount the inode is accessed on
@@ -1310,17 +1363,12 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 		goto out;
 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
 		goto out;
-	if (mnt->mnt_flags & MNT_RELATIME) {
-		/*
-		 * With relative atime, only update atime if the previous
-		 * atime is earlier than either the ctime or mtime.
-		 */
-		if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
-		    timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
-			goto out;
-	}
 
 	now = current_fs_time(inode->i_sb);
+
+	if (!relatime_need_update(mnt, inode, now))
+		goto out;
+
 	if (timespec_equal(&inode->i_atime, &now))
 		goto out;
 
diff --git a/fs/internal.h b/fs/internal.h
index 0d8ac497b3d5..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct linux_binprm;
+struct path;
 
 /*
  * block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *);
+extern int check_unsafe_exec(struct linux_binprm *);
 
 /*
  * namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 240ec63984cb..ac2d47e43926 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -404,10 +404,12 @@ static int ioctl_fionbio(struct file *filp, int __user *argp)
 	if (O_NONBLOCK != O_NDELAY)
 		flag |= O_NDELAY;
 #endif
+	spin_lock(&filp->f_lock);
 	if (on)
 		filp->f_flags |= flag;
 	else
 		filp->f_flags &= ~flag;
+	spin_unlock(&filp->f_lock);
 	return error;
 }
 
@@ -425,18 +427,12 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 	/* Did FASYNC state change ? */
 	if ((flag ^ filp->f_flags) & FASYNC) {
 		if (filp->f_op && filp->f_op->fasync)
+			/* fasync() adjusts filp->f_flags */
 			error = filp->f_op->fasync(fd, filp, on);
 		else
 			error = -ENOTTY;
 	}
-	if (error)
-		return error;
-
-	if (on)
-		filp->f_flags |= FASYNC;
-	else
-		filp->f_flags &= ~FASYNC;
-	return error;
+	return error < 0 ? error : 0;
 }
 
 static int ioctl_fsfreeze(struct file *filp)
@@ -499,17 +495,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		break;
 
 	case FIONBIO:
-		/* BKL needed to avoid races tweaking f_flags */
-		lock_kernel();
 		error = ioctl_fionbio(filp, argp);
-		unlock_kernel();
 		break;
 
 	case FIOASYNC:
-		/* BKL needed to avoid races tweaking f_flags */
-		lock_kernel();
 		error = ioctl_fioasync(fd, filp, argp);
-		unlock_kernel();
 		break;
 
 	case FIOQSIZE:
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6147ec3643a0..b4cbe9603c7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -114,7 +114,7 @@ static const struct super_operations isofs_sops = {
 };
 
 
-static struct dentry_operations isofs_dentry_ops[] = {
+static const struct dentry_operations isofs_dentry_ops[] = {
 	{
 		.d_hash		= isofs_hash,
 		.d_compare	= isofs_dentry_cmp,
@@ -923,6 +923,7 @@ out_freesbi:
 static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = 0;
 	buf->f_files = ISOFS_SB(sb)->s_ninodes;
 	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3fbffb1ea714..a8e8513a78a9 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
 	return (ret == -EIO);
 }
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+				   int write_op)
 {
 	int i;
 
 	for (i = 0; i < bufs; i++) {
 		wbuf[i]->b_end_io = end_buffer_write_sync;
 		/* We use-up our safety reference in submit_bh() */
-		submit_bh(WRITE, wbuf[i]);
+		submit_bh(write_op, wbuf[i]);
 	}
 }
 
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
  *  Submit all the data buffers to disk
  */
 static int journal_submit_data_buffers(journal_t *journal,
-				transaction_t *commit_transaction)
+				       transaction_t *commit_transaction,
+				       int write_op)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
 				BUFFER_TRACE(bh, "needs blocking lock");
 				spin_unlock(&journal->j_list_lock);
 				/* Write out all data to prevent deadlocks */
-				journal_do_submit_data(wbuf, bufs);
+				journal_do_submit_data(wbuf, bufs, write_op);
 				bufs = 0;
 				lock_buffer(bh);
 				spin_lock(&journal->j_list_lock);
@@ -256,7 +259,7 @@ write_out_data:
 			jbd_unlock_bh_state(bh);
 			if (bufs == journal->j_wbufsize) {
 				spin_unlock(&journal->j_list_lock);
-				journal_do_submit_data(wbuf, bufs);
+				journal_do_submit_data(wbuf, bufs, write_op);
 				bufs = 0;
 				goto write_out_data;
 			}
@@ -286,7 +289,7 @@ write_out_data:
 		}
 	}
 	spin_unlock(&journal->j_list_lock);
-	journal_do_submit_data(wbuf, bufs);
+	journal_do_submit_data(wbuf, bufs, write_op);
 
 	return err;
 }
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
+	int write_op = WRITE;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -347,6 +351,13 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
+	if (commit_transaction->t_synchronous_commit)
+		write_op = WRITE_SYNC_PLUG;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -431,7 +442,8 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-	err = journal_submit_data_buffers(journal, commit_transaction);
+	err = journal_submit_data_buffers(journal, commit_transaction,
+					  write_op);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -660,7 +672,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(WRITE, bh);
+				submit_bh(write_op, bh);
 			}
 			cond_resched();
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812afa..737f7246a4b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
 		return NULL;
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return NULL;
 	lock_buffer(bh);
 	memset(bh->b_data, 0, journal->j_blocksize);
 	set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		journal = NULL;
-		goto out;
+		goto out_err;
 	}
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	journal->j_maxlen = len;
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
 	return journal;
+out_err:
+	kfree(journal);
+	return NULL;
 }
 
 /**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
 
 	return journal;
+out_err:
+	kfree(journal);
+	return NULL;
 }
 
 /*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a117431277..ed886e6db399 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
 		}
 	}
 
+	if (handle->h_sync)
+		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44c..073c8c3df7cd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -190,7 +190,7 @@ retry:
 		set_buffer_uptodate(bh);
 		bh->b_end_io = journal_end_buffer_io_sync;
 
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 		if (ret) {
 			unlock_buffer(bh);
 			return ret;
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
+	int write_op = WRITE;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -401,6 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
+	if (commit_transaction->t_synchronous_commit)
+		write_op = WRITE_SYNC_PLUG;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +688,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(WRITE, bh);
+				submit_bh(write_op, bh);
 			}
 			cond_resched();
 			stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb343008eded..58144102bf25 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
  */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 
 /*
  * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
  */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
-		ret = __jbd2_log_start_commit(journal, tid);
-		if (ret && ptid)
+		__jbd2_log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
 			*ptid = tid;
-	} else if (journal->j_committing_transaction && ptid) {
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
 		/*
 		 * If ext3_write_super() recently started a commit, then we
 		 * have to wait for completion of that transaction
 		 */
-		*ptid = journal->j_committing_transaction->t_tid;
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
 	spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff2625765..bbe6d592d8b3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
  *			need do nothing.
  * RevokeValid set, Revoked set:
  *			buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment noone else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
  */
 
 #ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
  * the second time we would still have a pending revoke to cancel.  So,
  * do not trust the Revoked bit on buffers unless RevokeValid is also
  * set.
- *
- * The caller must have the journal locked.
  */
 int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
 /*
  * Write revoke records to the journal for all entries in the current
  * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
  */
-
 void jbd2_journal_write_revoke_records(journal_t *journal,
 				  transaction_t *transaction)
 {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7d..996ffda06bf3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
 		}
 	}
 
+	if (handle->h_sync)
+		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
@@ -2129,26 +2131,46 @@ done:
 }
 
 /*
- * This function must be called when inode is journaled in ordered mode
- * before truncation happens. It starts writeout of truncated part in
- * case it is in the committing transaction so that we stand to ordered
- * mode consistency guarantees.
+ * File truncate and transaction commit interact with each other in a
+ * non-trivial way.  If a transaction writing data block A is
+ * committing, we cannot discard the data by truncate until we have
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
  */
-int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+					struct jbd2_inode *jinode,
 					loff_t new_size)
 {
-	journal_t *journal;
-	transaction_t *commit_trans;
+	transaction_t *inode_trans, *commit_trans;
 	int ret = 0;
 
-	if (!inode->i_transaction && !inode->i_next_transaction)
+	/* This is a quick check to avoid locking if not necessary */
+	if (!jinode->i_transaction)
 		goto out;
-	journal = inode->i_transaction->t_journal;
+	/* Locks are here just to force reading of recent values, it is
+	 * enough that the transaction was not committing before we started
+	 * a transaction adding the inode to orphan list */
 	spin_lock(&journal->j_state_lock);
 	commit_trans = journal->j_committing_transaction;
 	spin_unlock(&journal->j_state_lock);
-	if (inode->i_transaction == commit_trans) {
-		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+	spin_lock(&journal->j_list_lock);
+	inode_trans = jinode->i_transaction;
+	spin_unlock(&journal->j_list_lock);
+	if (inode_trans == commit_trans) {
+		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
 			new_size, LLONG_MAX);
 		if (ret)
 			jbd2_journal_abort(journal, ret);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1b..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
 	size_t s;
 
 	size -= sizeof(struct jffs2_acl_header);
-	s = size - 4 * sizeof(struct jffs2_acl_entry_short);
-	if (s < 0) {
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
 		if (size % sizeof(struct jffs2_acl_entry_short))
 			return -1;
 		return size / sizeof(struct jffs2_acl_entry_short);
 	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
 		if (s % sizeof(struct jffs2_acl_entry))
 			return -1;
 		return s / sizeof(struct jffs2_acl_entry) + 4;
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 		return PTR_ERR(acl);
 
 	if (!acl) {
-		*i_mode &= ~current->fs->umask;
+		*i_mode &= ~current_umask();
 	} else {
 		if (S_ISDIR(*i_mode))
 			jffs2_iset_acl(inode, &f->i_acl_default, acl);
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3cceef4ad2b7..e9580104b6ba 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c)
 			spin_unlock(&c->erase_completion_lock);
 			
 
-		/* This thread is purely an optimisation. But if it runs when
-		   other things could be running, it actually makes things a
-		   lot worse. Use yield() and put it at the back of the runqueue
-		   every time. Especially during boot, pulling an inode in
-		   with read_inode() is much preferable to having the GC thread
-		   get there first. */
-		yield();
+		/* Problem - immediately after bootup, the GCD spends a lot
+		 * of time in places like jffs2_kill_fragtree(); so much so
+		 * that userspace processes (like gdm and X) are starved
+		 * despite plenty of cond_resched()s and renicing.  Yield()
+		 * doesn't help, either (presumably because userspace and GCD
+		 * are generally competing for a higher latency resource -
+		 * disk).
+		 * This forces the GCD to slow the hell down.   Pulling an
+		 * inode in with read_inode() is much preferable to having
+		 * the GC thread get there first. */
+		schedule_timeout_interruptible(msecs_to_jiffies(50));
 
 		/* Put_super will send a SIGKILL and then wait on the sem.
 		 */
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 {
 	struct jffs2_xattr_datum *xd;
-	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", xd);
 
-	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
 	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
 struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 {
 	struct jffs2_xattr_ref *ref;
-	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", ref);
 
-	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
 	ref->node = (void *)ref;
 	return ref;
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 6ca08ad887c0..1fc1e92356ee 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 				struct jffs2_tmp_dnode_info *tn)
 {
 	uint32_t fn_end = tn->fn->ofs + tn->fn->size;
-	struct jffs2_tmp_dnode_info *this;
+	struct jffs2_tmp_dnode_info *this, *ptn;
 
 	dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
 
@@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 	if (this) {
 		/* If the node is coincident with another at a lower address,
 		   back up until the other node is found. It may be relevant */
-		while (this->overlapped)
-			this = tn_prev(this);
-
-		/* First node should never be marked overlapped */
-		BUG_ON(!this);
+		while (this->overlapped) {
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
+		}
 		dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
 	}
 
@@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 			}
 			if (!this->overlapped)
 				break;
-			this = tn_prev(this);
+
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
 		}
 	}
 
@@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
 		eat_last(&rii->tn_root, &last->rb);
 		ver_insert(&ver_root, last);
 
-		if (unlikely(last->overlapped))
-			continue;
+		if (unlikely(last->overlapped)) {
+			if (pen)
+				continue;
+			/*
+			 * We killed a node which set the overlapped
+			 * flags during the scan. Fix it up.
+			 */
+			last->overlapped = 0;
+		}
 
 		/* Now we have a bunch of nodes in reverse version
 		   order, in the tree at ver_root. Most of the time,
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 9ff619a6f9cc..57cef19951db 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -1,6 +1,7 @@
 config JFS_FS
 	tristate "JFS filesystem support"
 	select NLS
+	select CRC32
 	help
 	  This is a port of IBM's Journaled Filesystem .  More information is
 	  available in the file <file:Documentation/filesystems/jfs.txt>.
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d3e5c33665de..06ca1b8d2054 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
 cleanup:
 		posix_acl_release(acl);
 	} else
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 
 	JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
 			       inode->i_mode;
@@ -233,7 +233,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		if (DQUOT_TRANSFER(inode, iattr))
+		if (vfs_dq_transfer(inode, iattr))
 			return -EDQUOT;
 	}
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b00ee9f05a06..b2ae190a77ba 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -158,9 +158,9 @@ void jfs_delete_inode(struct inode *inode)
 		/*
 		 * Free the inode from the quota allocation.
 		 */
-		DQUOT_INIT(inode);
-		DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
+		vfs_dq_init(inode);
+		vfs_dq_free_inode(inode);
+		vfs_dq_drop(inode);
 	}
 
 	clear_inode(inode);
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 6a73de84bcef..dd824d9b0b1a 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -90,7 +90,6 @@ void jfs_proc_init(void)
 
 	if (!(base = proc_mkdir("fs/jfs", NULL)))
 		return;
-	base->owner = THIS_MODULE;
 
 	for (i = 0; i < NPROCENT; i++)
 		proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 4dcc05819998..925871e9887b 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 		 * It's time to move the inline table to an external
 		 * page and begin to build the xtree
 		 */
-		if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage))
+		if (vfs_dq_alloc_block(ip, sbi->nbperpage))
 			goto clean_up;
 		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
-			DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+			vfs_dq_free_block(ip, sbi->nbperpage);
 			goto clean_up;
 		}
 
@@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 			memcpy(&jfs_ip->i_dirtable, temp_table,
 			       sizeof (temp_table));
 			dbFree(ip, xaddr, sbi->nbperpage);
-			DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+			vfs_dq_free_block(ip, sbi->nbperpage);
 			goto clean_up;
 		}
 		ip->i_size = PSIZE;
@@ -1027,7 +1027,7 @@ static int dtSplitUp(tid_t tid,
 			n = xlen;
 
 		/* Allocate blocks to quota. */
-		if (DQUOT_ALLOC_BLOCK(ip, n)) {
+		if (vfs_dq_alloc_block(ip, n)) {
 			rc = -EDQUOT;
 			goto extendOut;
 		}
@@ -1308,7 +1308,7 @@ static int dtSplitUp(tid_t tid,
 
 	/* Rollback quota allocation */
 	if (rc && quota_allocation)
-		DQUOT_FREE_BLOCK(ip, quota_allocation);
+		vfs_dq_free_block(ip, quota_allocation);
 
       dtSplitUp_Exit:
 
@@ -1369,7 +1369,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
 		return -EIO;
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		release_metapage(rmp);
 		return -EDQUOT;
 	}
@@ -1916,7 +1916,7 @@ static int dtSplitRoot(tid_t tid,
 	rp = rmp->data;
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		release_metapage(rmp);
 		return -EDQUOT;
 	}
@@ -2287,7 +2287,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
 	xlen = lengthPXD(&fp->header.self);
 
 	/* Free quota allocation. */
-	DQUOT_FREE_BLOCK(ip, xlen);
+	vfs_dq_free_block(ip, xlen);
 
 	/* free/invalidate its buffer page */
 	discard_metapage(fmp);
@@ -2363,7 +2363,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
 				xlen = lengthPXD(&p->header.self);
 
 				/* Free quota allocation */
-				DQUOT_FREE_BLOCK(ip, xlen);
+				vfs_dq_free_block(ip, xlen);
 
 				/* free/invalidate its buffer page */
 				discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 7ae1e3281de9..bbbd5f202e37 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,7 +141,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
 	}
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
+	if (vfs_dq_alloc_block(ip, nxlen)) {
 		dbFree(ip, nxaddr, (s64) nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
 		return -EDQUOT;
@@ -164,7 +164,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
 	 */
 	if (rc) {
 		dbFree(ip, nxaddr, nxlen);
-		DQUOT_FREE_BLOCK(ip, nxlen);
+		vfs_dq_free_block(ip, nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
 		return (rc);
 	}
@@ -256,7 +256,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		goto exit;
 
 	/* Allocat blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
+	if (vfs_dq_alloc_block(ip, nxlen)) {
 		dbFree(ip, nxaddr, (s64) nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
 		return -EDQUOT;
@@ -297,7 +297,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		/* extend the extent */
 		if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
 			dbFree(ip, xaddr + xlen, delta);
-			DQUOT_FREE_BLOCK(ip, nxlen);
+			vfs_dq_free_block(ip, nxlen);
 			goto exit;
 		}
 	} else {
@@ -308,7 +308,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		 */
 		if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
 			dbFree(ip, nxaddr, nxlen);
-			DQUOT_FREE_BLOCK(ip, nxlen);
+			vfs_dq_free_block(ip, nxlen);
 			goto exit;
 		}
 	}
@@ -362,11 +362,12 @@ exit:
 int extHint(struct inode *ip, s64 offset, xad_t * xp)
 {
 	struct super_block *sb = ip->i_sb;
-	struct xadlist xadl;
-	struct lxdlist lxdl;
-	lxd_t lxd;
+	int nbperpage = JFS_SBI(sb)->nbperpage;
 	s64 prev;
-	int rc, nbperpage = JFS_SBI(sb)->nbperpage;
+	int rc = 0;
+	s64 xaddr;
+	int xlen;
+	int xflag;
 
 	/* init the hint as "no hint provided" */
 	XADaddress(xp, 0);
@@ -376,46 +377,30 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
 	 */
 	prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage;
 
-	/* if the offsets in the first page of the file,
-	 * no hint provided.
+	/* if the offset is in the first page of the file, no hint provided.
 	 */
 	if (prev < 0)
-		return (0);
-
-	/* prepare to lookup the previous page's extent info */
-	lxdl.maxnlxd = 1;
-	lxdl.nlxd = 1;
-	lxdl.lxd = &lxd;
-	LXDoffset(&lxd, prev)
-	LXDlength(&lxd, nbperpage);
-
-	xadl.maxnxad = 1;
-	xadl.nxad = 0;
-	xadl.xad = xp;
-
-	/* perform the lookup */
-	if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
-		return (rc);
-
-	/* check if no extent exists for the previous page.
-	 * this is possible for sparse files.
-	 */
-	if (xadl.nxad == 0) {
-//		assert(ISSPARSE(ip));
-		return (0);
-	}
+		goto out;
 
-	/* only preserve the abnr flag within the xad flags
-	 * of the returned hint.
-	 */
-	xp->flag &= XAD_NOTRECORDED;
+	rc = xtLookup(ip, prev, nbperpage, &xflag, &xaddr, &xlen, 0);
 
-	if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) {
-		jfs_error(ip->i_sb, "extHint: corrupt xtree");
-		return -EIO;
-	}
+	if ((rc == 0) && xlen) {
+		if (xlen != nbperpage) {
+			jfs_error(ip->i_sb, "extHint: corrupt xtree");
+			rc = -EIO;
+		}
+		XADaddress(xp, xaddr);
+		XADlength(xp, xlen);
+		/*
+		 * only preserve the abnr flag within the xad flags
+		 * of the returned hint.
+		 */
+		xp->flag  = xflag & XAD_NOTRECORDED;
+	} else
+		rc = 0;
 
-	return (0);
+out:
+	return (rc);
 }
 
 
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0f94381ca6d0..346057218edc 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -57,12 +57,6 @@
 #include "jfs_debug.h"
 
 /*
- * __mark_inode_dirty expects inodes to be hashed.  Since we don't want
- * special inodes in the fileset inode space, we make them appear hashed,
- * but do not put on any lists.
- */
-
-/*
  * imap locks
  */
 /* iag free list lock */
@@ -497,7 +491,9 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
 	release_metapage(mp);
 
 	/*
-	 * that will look hashed, but won't be on any list; hlist_del()
+	 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+	 * want special inodes in the fileset inode space, we make them
+	 * appear hashed, but do not put on any lists.  hlist_del()
 	 * will work fine and require no locking.
 	 */
 	ip->i_hash.pprev = &ip->i_hash.next;
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index d4d142c2edd4..dc0e02159ac9 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,7 +116,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	/*
 	 * Allocate inode to quota.
 	 */
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		rc = -EDQUOT;
 		goto fail_drop;
 	}
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	return inode;
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 fail_unlock:
 	inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index adb2fafcc544..1eff7db34d63 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -47,5 +47,5 @@ extern const struct file_operations jfs_dir_operations;
 extern const struct inode_operations jfs_file_inode_operations;
 extern const struct file_operations jfs_file_operations;
 extern const struct inode_operations jfs_symlink_inode_operations;
-extern struct dentry_operations jfs_ci_dentry_operations;
+extern const struct dentry_operations jfs_ci_dentry_operations;
 #endif				/* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index c350057087dd..07b6c5dfb4b6 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -369,6 +369,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 	unsigned long bio_bytes = 0;
 	unsigned long bio_offset = 0;
 	int offset;
+	int bad_blocks = 0;
 
 	page_start = (sector_t)page->index <<
 		     (PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -394,6 +395,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 		}
 
 		clear_bit(META_dirty, &mp->flag);
+		set_bit(META_io, &mp->flag);
 		block_offset = offset >> inode->i_blkbits;
 		lblock = page_start + block_offset;
 		if (bio) {
@@ -402,7 +404,6 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 				len = min(xlen, blocks_per_mp);
 				xlen -= len;
 				bio_bytes += len << inode->i_blkbits;
-				set_bit(META_io, &mp->flag);
 				continue;
 			}
 			/* Not contiguous */
@@ -424,12 +425,14 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 		xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
 		pblock = metapage_get_blocks(inode, lblock, &xlen);
 		if (!pblock) {
-			/* Need better error handling */
 			printk(KERN_ERR "JFS: metapage_get_blocks failed\n");
-			dec_io(page, last_write_complete);
+			/*
+			 * We already called inc_io(), but can't cancel it
+			 * with dec_io() until we're done with the page
+			 */
+			bad_blocks++;
 			continue;
 		}
-		set_bit(META_io, &mp->flag);
 		len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
 
 		bio = bio_alloc(GFP_NOFS, 1);
@@ -459,6 +462,9 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 
 	unlock_page(page);
 
+	if (bad_blocks)
+		goto err_out;
+
 	if (nr_underway == 0)
 		end_page_writeback(page);
 
@@ -474,7 +480,9 @@ skip:
 	bio_put(bio);
 	unlock_page(page);
 	dec_io(page, last_write_complete);
-
+err_out:
+	while (bad_blocks--)
+		dec_io(page, last_write_complete);
 	return -EIO;
 }
 
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 649f9817accd..43ea3713c083 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -58,35 +58,6 @@ struct timestruc_t {
 #define	ONES		0xffffffffu	/* all bit on		*/
 
 /*
- *	logical xd (lxd)
- */
-typedef struct {
-	unsigned len:24;
-	unsigned off1:8;
-	u32 off2;
-} lxd_t;
-
-/* lxd_t field construction */
-#define	LXDlength(lxd, length32)	( (lxd)->len = length32 )
-#define	LXDoffset(lxd, offset64)\
-{\
-	(lxd)->off1 = ((s64)offset64) >> 32;\
-	(lxd)->off2 = (offset64) & 0xffffffff;\
-}
-
-/* lxd_t field extraction */
-#define	lengthLXD(lxd)	( (lxd)->len )
-#define	offsetLXD(lxd)\
-	( ((s64)((lxd)->off1)) << 32 | (lxd)->off2 )
-
-/* lxd list */
-struct lxdlist {
-	s16 maxnlxd;
-	s16 nlxd;
-	lxd_t *lxd;
-};
-
-/*
  *	physical xd (pxd)
  */
 typedef struct {
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index ae3acafb447b..d654a6458648 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -164,11 +164,8 @@ int xtLookup(struct inode *ip, s64 lstart,
 		/* is lookup offset beyond eof ? */
 		size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
 		    JFS_SBI(ip->i_sb)->l2bsize;
-		if (lstart >= size) {
-			jfs_err("xtLookup: lstart (0x%lx) >= size (0x%lx)",
-				(ulong) lstart, (ulong) size);
+		if (lstart >= size)
 			return 0;
-		}
 	}
 
 	/*
@@ -220,264 +217,6 @@ int xtLookup(struct inode *ip, s64 lstart,
 	return rc;
 }
 
-
-/*
- *	xtLookupList()
- *
- * function: map a single logical extent into a list of physical extent;
- *
- * parameter:
- *	struct inode	*ip,
- *	struct lxdlist	*lxdlist,	lxd list (in)
- *	struct xadlist	*xadlist,	xad list (in/out)
- *	int		flag)
- *
- * coverage of lxd by xad under assumption of
- * . lxd's are ordered and disjoint.
- * . xad's are ordered and disjoint.
- *
- * return:
- *	0:	success
- *
- * note: a page being written (even a single byte) is backed fully,
- *	except the last page which is only backed with blocks
- *	required to cover the last byte;
- *	the extent backing a page is fully contained within an xad;
- */
-int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
-		 struct xadlist * xadlist, int flag)
-{
-	int rc = 0;
-	struct btstack btstack;
-	int cmp;
-	s64 bn;
-	struct metapage *mp;
-	xtpage_t *p;
-	int index;
-	lxd_t *lxd;
-	xad_t *xad, *pxd;
-	s64 size, lstart, lend, xstart, xend, pstart;
-	s64 llen, xlen, plen;
-	s64 xaddr, paddr;
-	int nlxd, npxd, maxnpxd;
-
-	npxd = xadlist->nxad = 0;
-	maxnpxd = xadlist->maxnxad;
-	pxd = xadlist->xad;
-
-	nlxd = lxdlist->nlxd;
-	lxd = lxdlist->lxd;
-
-	lstart = offsetLXD(lxd);
-	llen = lengthLXD(lxd);
-	lend = lstart + llen;
-
-	size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
-	    JFS_SBI(ip->i_sb)->l2bsize;
-
-	/*
-	 * search for the xad entry covering the logical extent
-	 */
-      search:
-	if (lstart >= size)
-		return 0;
-
-	if ((rc = xtSearch(ip, lstart, NULL, &cmp, &btstack, 0)))
-		return rc;
-
-	/*
-	 *	compute the physical extent covering logical extent
-	 *
-	 * N.B. search may have failed (e.g., hole in sparse file),
-	 * and returned the index of the next entry.
-	 */
-//map:
-	/* retrieve search result */
-	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
-
-	/* is xad on the next sibling page ? */
-	if (index == le16_to_cpu(p->header.nextindex)) {
-		if (p->header.flag & BT_ROOT)
-			goto mapend;
-
-		if ((bn = le64_to_cpu(p->header.next)) == 0)
-			goto mapend;
-
-		XT_PUTPAGE(mp);
-
-		/* get next sibling page */
-		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
-		if (rc)
-			return rc;
-
-		index = XTENTRYSTART;
-	}
-
-	xad = &p->xad[index];
-
-	/*
-	 * is lxd covered by xad ?
-	 */
-      compare:
-	xstart = offsetXAD(xad);
-	xlen = lengthXAD(xad);
-	xend = xstart + xlen;
-	xaddr = addressXAD(xad);
-
-      compare1:
-	if (xstart < lstart)
-		goto compare2;
-
-	/* (lstart <= xstart) */
-
-	/* lxd is NOT covered by xad */
-	if (lend <= xstart) {
-		/*
-		 * get next lxd
-		 */
-		if (--nlxd == 0)
-			goto mapend;
-		lxd++;
-
-		lstart = offsetLXD(lxd);
-		llen = lengthLXD(lxd);
-		lend = lstart + llen;
-		if (lstart >= size)
-			goto mapend;
-
-		/* compare with the current xad */
-		goto compare1;
-	}
-	/* lxd is covered by xad */
-	else {			/* (xstart < lend) */
-
-		/* initialize new pxd */
-		pstart = xstart;
-		plen = min(lend - xstart, xlen);
-		paddr = xaddr;
-
-		goto cover;
-	}
-
-	/* (xstart < lstart) */
-      compare2:
-	/* lxd is covered by xad */
-	if (lstart < xend) {
-		/* initialize new pxd */
-		pstart = lstart;
-		plen = min(xend - lstart, llen);
-		paddr = xaddr + (lstart - xstart);
-
-		goto cover;
-	}
-	/* lxd is NOT covered by xad */
-	else {			/* (xend <= lstart) */
-
-		/*
-		 * get next xad
-		 *
-		 * linear search next xad covering lxd on
-		 * the current xad page, and then tree search
-		 */
-		if (index == le16_to_cpu(p->header.nextindex) - 1) {
-			if (p->header.flag & BT_ROOT)
-				goto mapend;
-
-			XT_PUTPAGE(mp);
-			goto search;
-		} else {
-			index++;
-			xad++;
-
-			/* compare with new xad */
-			goto compare;
-		}
-	}
-
-	/*
-	 * lxd is covered by xad and a new pxd has been initialized
-	 * (lstart <= xstart < lend) or (xstart < lstart < xend)
-	 */
-      cover:
-	/* finalize pxd corresponding to current xad */
-	XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr);
-
-	if (++npxd >= maxnpxd)
-		goto mapend;
-	pxd++;
-
-	/*
-	 * lxd is fully covered by xad
-	 */
-	if (lend <= xend) {
-		/*
-		 * get next lxd
-		 */
-		if (--nlxd == 0)
-			goto mapend;
-		lxd++;
-
-		lstart = offsetLXD(lxd);
-		llen = lengthLXD(lxd);
-		lend = lstart + llen;
-		if (lstart >= size)
-			goto mapend;
-
-		/*
-		 * test for old xad covering new lxd
-		 * (old xstart < new lstart)
-		 */
-		goto compare2;
-	}
-	/*
-	 * lxd is partially covered by xad
-	 */
-	else {			/* (xend < lend) */
-
-		/*
-		 * get next xad
-		 *
-		 * linear search next xad covering lxd on
-		 * the current xad page, and then next xad page search
-		 */
-		if (index == le16_to_cpu(p->header.nextindex) - 1) {
-			if (p->header.flag & BT_ROOT)
-				goto mapend;
-
-			if ((bn = le64_to_cpu(p->header.next)) == 0)
-				goto mapend;
-
-			XT_PUTPAGE(mp);
-
-			/* get next sibling page */
-			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
-			if (rc)
-				return rc;
-
-			index = XTENTRYSTART;
-			xad = &p->xad[index];
-		} else {
-			index++;
-			xad++;
-		}
-
-		/*
-		 * test for new xad covering old lxd
-		 * (old lstart < new xstart)
-		 */
-		goto compare;
-	}
-
-      mapend:
-	xadlist->nxad = npxd;
-
-//out:
-	XT_PUTPAGE(mp);
-
-	return rc;
-}
-
-
 /*
  *	xtSearch()
  *
@@ -846,10 +585,10 @@ int xtInsert(tid_t tid,		/* transaction id */
 			hint = addressXAD(xad) + lengthXAD(xad) - 1;
 		} else
 			hint = 0;
-		if ((rc = DQUOT_ALLOC_BLOCK(ip, xlen)))
+		if ((rc = vfs_dq_alloc_block(ip, xlen)))
 			goto out;
 		if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
-			DQUOT_FREE_BLOCK(ip, xlen);
+			vfs_dq_free_block(ip, xlen);
 			goto out;
 		}
 	}
@@ -878,7 +617,7 @@ int xtInsert(tid_t tid,		/* transaction id */
 			/* undo data extent allocation */
 			if (*xaddrp == 0) {
 				dbFree(ip, xaddr, (s64) xlen);
-				DQUOT_FREE_BLOCK(ip, xlen);
+				vfs_dq_free_block(ip, xlen);
 			}
 			return rc;
 		}
@@ -1246,7 +985,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
 	rbn = addressPXD(pxd);
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		rc = -EDQUOT;
 		goto clean_up;
 	}
@@ -1456,7 +1195,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
 
 	/* Rollback quota allocation. */
 	if (quota_allocation)
-		DQUOT_FREE_BLOCK(ip, quota_allocation);
+		vfs_dq_free_block(ip, quota_allocation);
 
 	return (rc);
 }
@@ -1513,7 +1252,7 @@ xtSplitRoot(tid_t tid,
 		return -EIO;
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		release_metapage(rmp);
 		return -EDQUOT;
 	}
@@ -3941,7 +3680,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
 		ip->i_size = newsize;
 
 	/* update quota allocation to reflect freed blocks */
-	DQUOT_FREE_BLOCK(ip, nfreed);
+	vfs_dq_free_block(ip, nfreed);
 
 	/*
 	 * free tlock of invalidated pages
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 70815c8a3d6a..08c0c749b986 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -110,8 +110,6 @@ typedef union {
  */
 extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
 		    int *pflag, s64 * paddr, int *plen, int flag);
-extern int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
-			struct xadlist * xadlist, int flag);
 extern void xtInitRoot(tid_t tid, struct inode *ip);
 extern int xtInsert(tid_t tid, struct inode *ip,
 		    int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index b4de56b851e4..514ee2edb92a 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -35,7 +35,7 @@
 /*
  * forward references
  */
-struct dentry_operations jfs_ci_dentry_operations;
+const struct dentry_operations jfs_ci_dentry_operations;
 
 static s64 commitZeroLink(tid_t, struct inode *);
 
@@ -356,7 +356,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 	jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
 
 	/* Init inode for quota operations. */
-	DQUOT_INIT(ip);
+	vfs_dq_init(ip);
 
 	/* directory must be empty to be removed */
 	if (!dtEmpty(ip)) {
@@ -483,7 +483,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
 	jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
 
 	/* Init inode for quota operations. */
-	DQUOT_INIT(ip);
+	vfs_dq_init(ip);
 
 	if ((rc = get_UCSname(&dname, dentry)))
 		goto out;
@@ -1136,7 +1136,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	} else if (new_ip) {
 		IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
 		/* Init inode for quota operations. */
-		DQUOT_INIT(new_ip);
+		vfs_dq_init(new_ip);
 	}
 
 	/*
@@ -1595,7 +1595,7 @@ out:
 	return result;
 }
 
-struct dentry_operations jfs_ci_dentry_operations =
+const struct dentry_operations jfs_ci_dentry_operations =
 {
 	.d_hash = jfs_ci_hash,
 	.d_compare = jfs_ci_compare,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b37d1f78b854..6f21adf9479a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -29,6 +29,7 @@
 #include <linux/posix_acl.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
+#include <linux/crc32.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 
@@ -168,6 +169,9 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = maxinodes;
 	buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) -
 				    atomic_read(&imap->im_numfree));
+	buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2);
+	buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2,
+					sizeof(sbi->uuid)/2);
 
 	buf->f_namelen = JFS_NAME_MAX;
 	return 0;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 9b7f2cdaae0a..61dfa8173ebc 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
 	nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
 
 	/* Allocate new blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, nblocks)) {
+	if (vfs_dq_alloc_block(ip, nblocks)) {
 		return -EDQUOT;
 	}
 
 	rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
 	if (rc) {
 		/*Rollback quota allocation. */
-		DQUOT_FREE_BLOCK(ip, nblocks);
+		vfs_dq_free_block(ip, nblocks);
 		return rc;
 	}
 
@@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
 
       failed:
 	/* Rollback quota allocation. */
-	DQUOT_FREE_BLOCK(ip, nblocks);
+	vfs_dq_free_block(ip, nblocks);
 
 	dbFree(ip, blkno, nblocks);
 	return rc;
@@ -538,7 +538,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
 
 	if (blocks_needed > current_blocks) {
 		/* Allocate new blocks to quota. */
-		if (DQUOT_ALLOC_BLOCK(inode, blocks_needed))
+		if (vfs_dq_alloc_block(inode, blocks_needed))
 			return -EDQUOT;
 
 		quota_allocation = blocks_needed;
@@ -602,7 +602,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
       clean_up:
 	/* Rollback quota allocation */
 	if (quota_allocation)
-		DQUOT_FREE_BLOCK(inode, quota_allocation);
+		vfs_dq_free_block(inode, quota_allocation);
 
 	return (rc);
 }
@@ -677,7 +677,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
 
 	/* If old blocks exist, they must be removed from quota allocation. */
 	if (old_blocks)
-		DQUOT_FREE_BLOCK(inode, old_blocks);
+		vfs_dq_free_block(inode, old_blocks);
 
 	inode->i_ctime = CURRENT_TIME;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 49b44099dabb..cd223190c4e9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -44,7 +44,7 @@ static int simple_delete_dentry(struct dentry *dentry)
  */
 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	static struct dentry_operations simple_dentry_operations = {
+	static const struct dentry_operations simple_dentry_operations = {
 		.d_delete = simple_delete_dentry,
 	};
 
@@ -242,7 +242,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 
 Enomem:
 	up_write(&s->s_umount);
@@ -574,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
  * possibly a read which collects the result - which is stored in a
  * file-local buffer.
  */
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+	/*
+	 * The barrier ensures that ar->size will really remain zero until
+	 * ar->data is ready for reading.
+	 */
+	smp_mb();
+	ar->size = n;
+}
+
 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
 {
 	struct simple_transaction_argresp *ar;
@@ -819,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
+EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
 EXPORT_SYMBOL(simple_transaction_read);
 EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac827..6d5d4a4169e5 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 
+#include <asm/unaligned.h>
+
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
 #define NSM_PROGRAM		100024
 #define NSM_VERSION		1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
 {
 	u64 *p = (u64 *)&nsm->sm_priv.data;
 	struct timespec ts;
+	s64 ns;
 
 	ktime_get_ts(&ts);
-	*p++ = timespec_to_ns(&ts);
-	*p = (unsigned long)nsm;
+	ns = timespec_to_ns(&ts);
+	put_unaligned(ns, p);
+	put_unaligned((unsigned long)nsm, p + 1);
 }
 
 static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b5853..abf83881f68a 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst		*nlmsvc_rqst;
 unsigned long			nlmsvc_timeout;
 
 /*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
-	defined(CONFIG_SUNRPC_REGISTER_V4)
-static const sa_family_t	nlmsvc_family = AF_INET6;
-#else	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-static const sa_family_t	nlmsvc_family = AF_INET;
-#endif	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-
-/*
  * These can be set at insmod time (useful for NFS as root filesystem),
  * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
  */
@@ -204,19 +193,30 @@ lockd(void *vrqstp)
 	return 0;
 }
 
-static int create_lockd_listener(struct svc_serv *serv, char *name,
-				 unsigned short port)
+static int create_lockd_listener(struct svc_serv *serv, const char *name,
+				 const int family, const unsigned short port)
 {
 	struct svc_xprt *xprt;
 
-	xprt = svc_find_xprt(serv, name, 0, 0);
+	xprt = svc_find_xprt(serv, name, family, 0);
 	if (xprt == NULL)
-		return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
-
+		return svc_create_xprt(serv, name, family, port,
+						SVC_SOCK_DEFAULTS);
 	svc_xprt_put(xprt);
 	return 0;
 }
 
+static int create_lockd_family(struct svc_serv *serv, const int family)
+{
+	int err;
+
+	err = create_lockd_listener(serv, "udp", family, nlm_udpport);
+	if (err < 0)
+		return err;
+
+	return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
+}
+
 /*
  * Ensure there are active UDP and TCP listeners for lockd.
  *
@@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
 	static int warned;
 	int err;
 
-	err = create_lockd_listener(serv, "udp", nlm_udpport);
+	err = create_lockd_family(serv, PF_INET);
 	if (err < 0)
 		goto out_err;
 
-	err = create_lockd_listener(serv, "tcp", nlm_tcpport);
-	if (err < 0)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	err = create_lockd_family(serv, PF_INET6);
+	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_err;
+#endif	/* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
 
 	warned = 0;
 	return 0;
@@ -274,7 +276,7 @@ int lockd_up(void)
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
 	error = -ENOMEM;
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		goto out;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
+			/*
+			 * If this is a blocking request for an
+			 * already pending lock request then we need
+			 * to put it back on lockd's block list
+			 */
+			if (wait)
+				break;
 			ret = nlm_lck_denied;
-			break;
+			goto out;
 		case FILE_LOCK_DEFERRED:
 			if (wait)
 				break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 	}
 
-	ret = nlm_lck_denied;
-	if (!wait)
-		goto out;
-
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index d1d1eb84679d..daad3c2740db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -3,7 +3,7 @@
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
- *  Copyright (C) 1996  Gertjan van Wingerde    (gertjan@cs.vu.nl)
+ *  Copyright (C) 1996  Gertjan van Wingerde
  *	Minix V2 fs support.
  *
  *  Modified for 680x0 by Andreas Schwab
@@ -321,15 +321,20 @@ out:
 
 static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct minix_sb_info *sbi = minix_sb(dentry->d_sb);
-	buf->f_type = dentry->d_sb->s_magic;
-	buf->f_bsize = dentry->d_sb->s_blocksize;
+	struct super_block *sb = dentry->d_sb;
+	struct minix_sb_info *sbi = minix_sb(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
 	buf->f_bfree = minix_count_free_blocks(sbi);
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = sbi->s_ninodes;
 	buf->f_ffree = minix_count_free_inodes(sbi);
 	buf->f_namelen = sbi->s_namelen;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
 	return 0;
 }
 
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae3..680ba60863ff 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-struct bio *mpage_bio_submit(int rw, struct bio *bio)
+static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
 	bio->bi_end_io = mpage_end_io_read;
 	if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
 	submit_bio(rw, bio);
 	return NULL;
 }
-EXPORT_SYMBOL(mpage_bio_submit);
 
 static struct bio *
 mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
  * just allocate full-size (16-page) BIOs.
  */
 
-int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+struct mpage_data {
+	struct bio *bio;
+	sector_t last_block_in_bio;
+	get_block_t *get_block;
+	unsigned use_writepage;
+};
+
+static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 		      void *data)
 {
 	struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
 	mpd->bio = bio;
 	return ret;
 }
-EXPORT_SYMBOL(__mpage_writepage);
 
 /**
  * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/namei.c b/fs/namei.c
index bbc15c237558..b8433ebfae05 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -24,6 +24,7 @@
 #include <linux/fsnotify.h>
 #include <linux/personality.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
@@ -31,6 +32,7 @@
 #include <linux/file.h>
 #include <linux/fcntl.h>
 #include <linux/device_cgroup.h>
+#include <linux/fs_struct.h>
 #include <asm/uaccess.h>
 
 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -850,6 +852,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		if (err == -EAGAIN)
 			err = inode_permission(nd->path.dentry->d_inode,
 					       MAY_EXEC);
+		if (!err)
+			err = ima_path_check(&nd->path, MAY_EXEC);
  		if (err)
 			break;
 
@@ -1470,7 +1474,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -1486,29 +1490,32 @@ int may_open(struct path *path, int acc_mode, int flag)
 	if (!inode)
 		return -ENOENT;
 
-	if (S_ISLNK(inode->i_mode))
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFLNK:
 		return -ELOOP;
-	
-	if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
-		return -EISDIR;
-
-	/*
-	 * FIFO's, sockets and device files are special: they don't
-	 * actually live on the filesystem itself, and as such you
-	 * can write to them even if the filesystem is read-only.
-	 */
-	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
-	    	flag &= ~O_TRUNC;
-	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
+	case S_IFDIR:
+		if (acc_mode & MAY_WRITE)
+			return -EISDIR;
+		break;
+	case S_IFBLK:
+	case S_IFCHR:
 		if (path->mnt->mnt_flags & MNT_NODEV)
 			return -EACCES;
-
+		/*FALLTHRU*/
+	case S_IFIFO:
+	case S_IFSOCK:
 		flag &= ~O_TRUNC;
+		break;
 	}
 
 	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
+
+	error = ima_path_check(path,
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+	if (error)
+		return error;
 	/*
 	 * An append-only file must be opened in append mode for writing.
 	 */
@@ -1544,7 +1551,7 @@ int may_open(struct path *path, int acc_mode, int flag)
 			error = security_path_truncate(path, 0,
 					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
-			DQUOT_INIT(inode);
+			vfs_dq_init(inode);
 
 			error = do_truncate(dentry, 0,
 					    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1555,7 +1562,7 @@ int may_open(struct path *path, int acc_mode, int flag)
 			return error;
 	} else
 		if (flag & FMODE_WRITE)
-			DQUOT_INIT(inode);
+			vfs_dq_init(inode);
 
 	return 0;
 }
@@ -1572,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
 	struct dentry *dir = nd->path.dentry;
 
 	if (!IS_POSIXACL(dir->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
 	if (error)
 		goto out_unlock;
@@ -1938,7 +1945,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	if (error)
 		return error;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -1983,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
 		goto out_unlock;
 	}
 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 	error = may_mknod(mode);
 	if (error)
 		goto out_dput;
@@ -2037,7 +2044,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		return error;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -2061,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
 		goto out_unlock;
 
 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
@@ -2123,7 +2130,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->rmdir)
 		return -EPERM;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 
 	mutex_lock(&dentry->d_inode->i_mutex);
 	dentry_unhash(dentry);
@@ -2210,7 +2217,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 
 	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
@@ -2321,7 +2328,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	if (error)
 		return error;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -2405,7 +2412,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return error;
 
 	mutex_lock(&inode->i_mutex);
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
@@ -2604,8 +2611,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (!old_dir->i_op->rename)
 		return -EPERM;
 
-	DQUOT_INIT(old_dir);
-	DQUOT_INIT(new_dir);
+	vfs_dq_init(old_dir);
+	vfs_dq_init(new_dir);
 
 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 
@@ -2891,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink);
 EXPORT_SYMBOL(vfs_unlink);
 EXPORT_SYMBOL(dentry_unhash);
 EXPORT_SYMBOL(generic_readlink);
-
-/* to be mentioned only in INIT_TASK */
-struct fs_struct init_fs = {
-	.count		= ATOMIC_INIT(1),
-	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
-	.umask		= 0022,
-};
diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4bfd18..c6f54e4c4290 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
 #include <linux/ramfs.h>
 #include <linux/log2.h>
 #include <linux/idr.h>
+#include <linux/fs_struct.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
@@ -397,11 +398,10 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	spin_unlock(&vfsmount_lock);
 }
 
-int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
 	mnt->mnt_root = dget(sb->s_root);
-	return 0;
 }
 
 EXPORT_SYMBOL(simple_set_mnt);
@@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt)
 	 */
 	for_each_possible_cpu(cpu) {
 		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		if (cpu_writer->mnt != mnt)
-			continue;
 		spin_lock(&cpu_writer->lock);
+		if (cpu_writer->mnt != mnt) {
+			spin_unlock(&cpu_writer->lock);
+			continue;
+		}
 		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
 		cpu_writer->count = 0;
 		/*
@@ -778,6 +780,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		{ MNT_NOATIME, ",noatime" },
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
+		{ MNT_STRICTATIME, ",strictatime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
@@ -1917,6 +1920,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (data_page)
 		((char *)data_page)[PAGE_SIZE - 1] = 0;
 
+	/* Default to relatime */
+	mnt_flags |= MNT_RELATIME;
+
 	/* Separate the per-mountpoint flags */
 	if (flags & MS_NOSUID)
 		mnt_flags |= MNT_NOSUID;
@@ -1928,13 +1934,14 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		mnt_flags |= MNT_NOATIME;
 	if (flags & MS_NODIRATIME)
 		mnt_flags |= MNT_NODIRATIME;
-	if (flags & MS_RELATIME)
-		mnt_flags |= MNT_RELATIME;
+	if (flags & MS_STRICTATIME)
+		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
-		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
+		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
+		   MS_STRICTATIME);
 
 	/* ... and get the mountpoint */
 	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
@@ -2087,66 +2094,6 @@ out1:
 }
 
 /*
- * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
- * It can block. Requires the big lock held.
- */
-void set_fs_root(struct fs_struct *fs, struct path *path)
-{
-	struct path old_root;
-
-	write_lock(&fs->lock);
-	old_root = fs->root;
-	fs->root = *path;
-	path_get(path);
-	write_unlock(&fs->lock);
-	if (old_root.dentry)
-		path_put(&old_root);
-}
-
-/*
- * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
- * It can block. Requires the big lock held.
- */
-void set_fs_pwd(struct fs_struct *fs, struct path *path)
-{
-	struct path old_pwd;
-
-	write_lock(&fs->lock);
-	old_pwd = fs->pwd;
-	fs->pwd = *path;
-	path_get(path);
-	write_unlock(&fs->lock);
-
-	if (old_pwd.dentry)
-		path_put(&old_pwd);
-}
-
-static void chroot_fs_refs(struct path *old_root, struct path *new_root)
-{
-	struct task_struct *g, *p;
-	struct fs_struct *fs;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		task_lock(p);
-		fs = p->fs;
-		if (fs) {
-			atomic_inc(&fs->count);
-			task_unlock(p);
-			if (fs->root.dentry == old_root->dentry
-			    && fs->root.mnt == old_root->mnt)
-				set_fs_root(fs, new_root);
-			if (fs->pwd.dentry == old_root->dentry
-			    && fs->pwd.mnt == old_root->mnt)
-				set_fs_pwd(fs, new_root);
-			put_fs_struct(fs);
-		} else
-			task_unlock(p);
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
-}
-
-/*
  * pivot_root Semantics:
  * Moves the root file system of the current process to the directory put_old,
  * makes new_root as the new root file system of the current process, and sets
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 07e9715b8658..9c590722d87e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -79,7 +79,7 @@ static int ncp_hash_dentry(struct dentry *, struct qstr *);
 static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *);
 static int ncp_delete_dentry(struct dentry *);
 
-static struct dentry_operations ncp_dentry_operations =
+static const struct dentry_operations ncp_dentry_operations =
 {
 	.d_revalidate	= ncp_lookup_validate,
 	.d_hash		= ncp_hash_dentry,
@@ -87,7 +87,7 @@ static struct dentry_operations ncp_dentry_operations =
 	.d_delete	= ncp_delete_dentry,
 };
 
-struct dentry_operations ncp_root_dentry_operations =
+const struct dentry_operations ncp_root_dentry_operations =
 {
 	.d_hash		= ncp_hash_dentry,
 	.d_compare	= ncp_compare_dentry,
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba2..e67f3ec07736 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
 	  <file:Documentation/filesystems/nfsroot.txt>.
 
 	  Most people say N here.
+
+config NFS_FSCACHE
+	bool "Provide NFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want NFS data to be cached locally on disc through
+	  the general filesystem cache manager
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a3..845159814de2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   callback.o callback_xdr.o callback_proc.o \
 			   nfs4namespace.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a1083..a886e692ddd0 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
 
 unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
+unsigned short nfs_callback_tcpport6;
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
 
-/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const sa_family_t	nfs_callback_family = AF_INET6;
-#else
-static const sa_family_t	nfs_callback_family = AF_INET;
-#endif
-
 static int param_set_port(const char *val, struct kernel_param *kp)
 {
 	char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
 	mutex_lock(&nfs_callback_mutex);
 	if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
 		goto out;
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
-				nfs_callback_family, NULL);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
 	ret = -ENOMEM;
 	if (!serv)
 		goto out_err;
 
-	ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
-			      SVC_SOCK_ANONYMOUS);
+	ret = svc_create_xprt(serv, "tcp", PF_INET,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_err;
 	nfs_callback_tcpport = ret;
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
-			nfs_callback_tcpport, nfs_callback_family);
+			nfs_callback_tcpport, PF_INET);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	ret = svc_create_xprt(serv, "tcp", PF_INET6,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret > 0) {
+		nfs_callback_tcpport6 = ret;
+		dprintk("NFS: Callback listener port = %u (af %u)\n",
+				nfs_callback_tcpport6, PF_INET6);
+	} else if (ret != -EAFNOSUPPORT)
+		goto out_err;
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
 
 	nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
 	if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff1..e110e286a262 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
 
 extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
+extern unsigned short nfs_callback_tcpport6;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9b728f3565a1..75c9cd2aa119 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_CLIENT
 
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	if (!IS_ERR(cred))
 		clp->cl_machine_cred = cred;
 
+	nfs_fscache_get_client_cookie(clp);
+
 	return clp;
 
 error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
 
 	nfs4_shutdown_client(clp);
 
+	nfs_fscache_release_client_cookie(clp);
+
 	/* -EIO all pending I/O */
 	if (!IS_ERR(clp->cl_rpcclient))
 		rpc_shutdown_client(clp->cl_rpcclient);
@@ -224,53 +229,110 @@ void nfs_put_client(struct nfs_client *clp)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
+/*
+ * Test if two ip6 socket addresses refer to the same socket by
+ * comparing relevant fields. The padding bytes specifically, are not
+ * compared. sin6_flowinfo is not compared because it only affects QoS
+ * and sin6_scope_id is only compared if the address is "link local"
+ * because "link local" addresses need only be unique to a specific
+ * link. Conversely, ordinary unicast addresses might have different
+ * sin6_scope_id.
+ *
+ * The caller should ensure both socket addresses are AF_INET6.
+ */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
-	switch (sa->sa_family) {
-		default:
-			return NULL;
-		case AF_INET6:
-			return &((const struct sockaddr_in6 *)sa)->sin6_addr;
-			break;
-		case AF_INET:
-			ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
-					addr_mapped);
-			return addr_mapped;
-	}
-}
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
 
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-		const struct sockaddr *sa2)
+	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    sin1->sin6_scope_id != sin2->sin6_scope_id)
+		return 0;
+
+	return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
+}
+#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
-	const struct in6_addr *addr1;
-	const struct in6_addr *addr2;
-	struct in6_addr addr1_mapped;
-	struct in6_addr addr2_mapped;
-
-	addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
-	if (likely(addr1 != NULL)) {
-		addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
-		if (likely(addr2 != NULL))
-			return ipv6_addr_equal(addr1, addr2);
-	}
 	return 0;
 }
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
-				 const struct sockaddr_in *sa2)
+#endif
+
+/*
+ * Test if two ip4 socket addresses refer to the same socket, by
+ * comparing relevant fields. The padding bytes specifically, are
+ * not compared.
+ *
+ * The caller should ensure both socket addresses are AF_INET.
+ */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
-	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
 }
 
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+		(sin1->sin6_port == sin2->sin6_port);
+}
+
+static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+		(sin1->sin_port == sin2->sin_port);
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
 static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-				 const struct sockaddr *sa2)
+				     const struct sockaddr *sa2)
 {
-	if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
+	if (sa1->sa_family != sa2->sa_family)
 		return 0;
-	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
-			(const struct sockaddr_in *)sa2);
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+	}
+	return 0;
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, including the port number.
+ */
+static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+			    const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
+		return 0;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_cmp_ip4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_cmp_ip6(sa1, sa2);
+	}
+	return 0;
 }
-#endif
 
 /*
  * Find a client by IP address and protocol version
@@ -344,8 +406,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
 static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
 {
 	struct nfs_client *clp;
+	const struct sockaddr *sap = data->addr;
 
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+	        const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
 		/* Don't match clients that failed to initialise properly */
 		if (clp->cl_cons_state < 0)
 			continue;
@@ -358,7 +422,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 			continue;
 
 		/* Match the full socket address */
-		if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)
+		if (!nfs_sockaddr_cmp(sap, clap))
 			continue;
 
 		atomic_inc(&clp->cl_count);
@@ -701,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
+	server->options = data->options;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1089,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->caps |= NFS_CAP_ATOMIC_OPEN;
+	server->options = data->options;
 
 	/* Get a client record */
 	error = nfs4_set_client(server,
@@ -1500,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 
 	/* display header on line 1 */
 	if (v == &nfs_volume_list) {
-		seq_puts(m, "NV SERVER   PORT DEV     FSID\n");
+		seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
 		return 0;
 	}
 	/* display one transport per line on subsequent lines */
@@ -1514,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 		 (unsigned long long) server->fsid.major,
 		 (unsigned long long) server->fsid.minor);
 
-	seq_printf(m, "v%u %s %s %-7s %-17s\n",
+	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
 		   clp->rpc_ops->version,
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
 		   dev,
-		   fsid);
+		   fsid,
+		   nfs_server_fscache_state(server));
 
 	return 0;
 }
@@ -1535,8 +1602,6 @@ int __init nfs_fs_proc_init(void)
 	if (!proc_fs_nfs)
 		goto error_0;
 
-	proc_fs_nfs->owner = THIS_MODULE;
-
 	/* a file of servers with which we're dealing */
 	p = proc_create("servers", S_IFREG|S_IRUGO,
 			proc_fs_nfs, &nfs_server_list_fops);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e35c8199f82f..370b190a09d1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -899,7 +899,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
-struct dentry_operations nfs_dentry_operations = {
+const struct dentry_operations nfs_dentry_operations = {
 	.d_revalidate	= nfs_lookup_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
@@ -967,7 +967,7 @@ out:
 #ifdef CONFIG_NFS_V4
 static int nfs_open_revalidate(struct dentry *, struct nameidata *);
 
-struct dentry_operations nfs4_dentry_operations = {
+const struct dentry_operations nfs4_dentry_operations = {
 	.d_revalidate	= nfs_open_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		} else if (atomic_read(&new_dentry->d_count) > 1)
 			/* dentry still busy? */
 			goto out;
-	} else
-		nfs_drop_nlink(new_inode);
+	}
 
 go_ahead:
 	/*
@@ -1638,10 +1637,8 @@ go_ahead:
 	}
 	nfs_inode_return_delegation(old_inode);
 
-	if (new_inode != NULL) {
+	if (new_inode != NULL)
 		nfs_inode_return_delegation(new_inode);
-		d_delete(new_dentry);
-	}
 
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 					   new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
 	if (rehash)
 		d_rehash(rehash);
 	if (!error) {
+		if (new_inode != NULL)
+			nfs_drop_nlink(new_inode);
 		d_move(old_dentry, new_dentry);
 		nfs_set_verifier(new_dentry,
 					nfs_save_change_attribute(new_dir));
@@ -1892,8 +1891,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 	cache.cred = cred;
 	cache.jiffies = jiffies;
 	status = NFS_PROTO(inode)->access(inode, &cache);
-	if (status != 0)
+	if (status != 0) {
+		if (status == -ESTALE) {
+			nfs_zap_caches(inode);
+			if (!S_ISDIR(inode->i_mode))
+				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+		}
 		return status;
+	}
 	nfs_access_add_cache(inode, &cache);
 out:
 	if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d2..3523b895eb4b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
@@ -64,11 +65,7 @@ const struct file_operations nfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= nfs_file_read,
 	.aio_write	= nfs_file_write,
-#ifdef CONFIG_MMU
 	.mmap		= nfs_file_mmap,
-#else
-	.mmap		= generic_file_mmap,
-#endif
 	.open		= nfs_file_open,
 	.flush		= nfs_file_flush,
 	.release	= nfs_file_release,
@@ -141,9 +138,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name);
 
-	/* Ensure that dirty pages are flushed out with the right creds */
-	if (filp->f_mode & FMODE_WRITE)
-		nfs_wb_all(dentry->d_inode);
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
 	return nfs_release(inode, filp);
 }
@@ -235,7 +229,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct dentry	*dentry = file->f_path.dentry;
 	struct inode	*inode = dentry->d_inode;
-	int		status;
 
 	dprintk("NFS: flush(%s/%s)\n",
 			dentry->d_parent->d_name.name,
@@ -245,11 +238,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 		return 0;
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 
-	/* Ensure that data+attribute caches are up to date after close() */
-	status = nfs_do_fsync(ctx, inode);
-	if (!status)
-		nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	return status;
+	/* Flush writes to the server and return any errors */
+	return nfs_do_fsync(ctx, inode);
 }
 
 static ssize_t
@@ -304,11 +294,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dprintk("NFS: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_mapping(inode, file->f_mapping);
+	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
+	 *       so we call that before revalidating the mapping
+	 */
+	status = generic_file_mmap(file, vma);
 	if (!status) {
 		vma->vm_ops = &nfs_file_vm_ops;
-		vma->vm_flags |= VM_CAN_NONLINEAR;
-		file_accessed(file);
+		status = nfs_revalidate_mapping(inode, file->f_mapping);
 	}
 	return status;
 }
@@ -354,6 +346,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
+	/*
+	 * Prevent starvation issues if someone is doing a consistency
+	 * sync-to-disk
+	 */
+	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		return ret;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 	return copied;
 }
 
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ *   page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
 	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_page_cancel(page->mapping->host, page);
+
+	nfs_fscache_invalidate_page(page, page->mapping->host);
 }
 
+/*
+ * Attempt to release the private state associated with a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return true (may release page) or false (may not)
+ */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
 	/* If PagePrivate() is set, then the page is not freeable */
-	return 0;
+	if (PagePrivate(page))
+		return 0;
+	return nfs_fscache_release_page(page, gfp);
 }
 
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ */
 static int nfs_launder_page(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct nfs_inode *nfsi = NFS_I(inode);
 
 	dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
 		inode->i_ino, (long long)page_offset(page));
 
+	nfs_fscache_wait_on_page_write(nfsi, page);
 	return nfs_wb_page(inode, page);
 }
 
@@ -451,8 +479,14 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 };
 
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ */
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
 	struct dentry *dentry = filp->f_path.dentry;
 	unsigned pagelen;
@@ -464,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		filp->f_mapping->host->i_ino,
 		(long long)page_offset(page));
 
+	/* make sure the cache has finished storing the page */
+	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+
 	lock_page(page);
 	mapping = page->mapping;
 	if (mapping != dentry->d_inode->i_mapping)
@@ -483,6 +520,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		ret = pagelen;
 out_unlock:
 	unlock_page(page);
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000000..5b1006480bc2
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
+/* NFS FS-Cache index structure definition
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+
+#include "internal.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY		NFSDBG_FSCACHE
+
+/*
+ * Define the NFS filesystem for FS-Cache.  Upon registration FS-Cache sticks
+ * the cookie for the top-level index object for NFS into here.  The top-level
+ * index can than have other cache objects inserted into it.
+ */
+struct fscache_netfs nfs_fscache_netfs = {
+	.name		= "nfs",
+	.version	= 0,
+};
+
+/*
+ * Register NFS for caching
+ */
+int nfs_fscache_register(void)
+{
+	return fscache_register_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Unregister NFS for caching
+ */
+void nfs_fscache_unregister(void)
+{
+	fscache_unregister_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Layout of the key for an NFS server cache object.
+ */
+struct nfs_server_key {
+	uint16_t	nfsversion;		/* NFS protocol version */
+	uint16_t	family;			/* address family */
+	uint16_t	port;			/* IP port */
+	union {
+		struct in_addr	ipv4_addr;	/* IPv4 address */
+		struct in6_addr ipv6_addr;	/* IPv6 address */
+	} addr[0];
+};
+
+/*
+ * Generate a key to describe a server in the main NFS index
+ * - We return the length of the key, or 0 if we can't generate one
+ */
+static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
+				   void *buffer, uint16_t bufmax)
+{
+	const struct nfs_client *clp = cookie_netfs_data;
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+	const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+	struct nfs_server_key *key = buffer;
+	uint16_t len = sizeof(struct nfs_server_key);
+
+	key->nfsversion = clp->rpc_ops->version;
+	key->family = clp->cl_addr.ss_family;
+
+	memset(key, 0, len);
+
+	switch (clp->cl_addr.ss_family) {
+	case AF_INET:
+		key->port = sin->sin_port;
+		key->addr[0].ipv4_addr = sin->sin_addr;
+		len += sizeof(key->addr[0].ipv4_addr);
+		break;
+
+	case AF_INET6:
+		key->port = sin6->sin6_port;
+		key->addr[0].ipv6_addr = sin6->sin6_addr;
+		len += sizeof(key->addr[0].ipv6_addr);
+		break;
+
+	default:
+		printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+		       clp->cl_addr.ss_family);
+		len = 0;
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * Define the server object for FS-Cache.  This is used to describe a server
+ * object to fscache_acquire_cookie().  It is keyed by the NFS protocol and
+ * server address parameters.
+ */
+const struct fscache_cookie_def nfs_fscache_server_index_def = {
+	.name		= "NFS.server",
+	.type 		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= nfs_server_get_key,
+};
+
+/*
+ * Generate a key to describe a superblock key in the main NFS index
+ */
+static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
+				  void *buffer, uint16_t bufmax)
+{
+	const struct nfs_fscache_key *key;
+	const struct nfs_server *nfss = cookie_netfs_data;
+	uint16_t len;
+
+	key = nfss->fscache_key;
+	len = sizeof(key->key) + key->key.uniq_len;
+	if (len > bufmax) {
+		len = 0;
+	} else {
+		memcpy(buffer, &key->key, sizeof(key->key));
+		memcpy(buffer + sizeof(key->key),
+		       key->key.uniquifier, key->key.uniq_len);
+	}
+
+	return len;
+}
+
+/*
+ * Define the superblock object for FS-Cache.  This is used to describe a
+ * superblock object to fscache_acquire_cookie().  It is keyed by all the NFS
+ * parameters that might cause a separate superblock.
+ */
+const struct fscache_cookie_def nfs_fscache_super_index_def = {
+	.name		= "NFS.super",
+	.type 		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= nfs_super_get_key,
+};
+
+/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode.  This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ */
+struct nfs_fscache_inode_auxdata {
+	struct timespec	mtime;
+	struct timespec	ctime;
+	loff_t		size;
+	u64		change_attr;
+};
+
+/*
+ * Generate a key to describe an NFS inode in an NFS server's index
+ */
+static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
+					  void *buffer, uint16_t bufmax)
+{
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+	uint16_t nsize;
+
+	/* use the inode's NFS filehandle as the key */
+	nsize = nfsi->fh.size;
+	memcpy(buffer, nfsi->fh.data, nsize);
+	return nsize;
+}
+
+/*
+ * Get certain file attributes from the netfs data
+ * - This function can be absent for an index
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
+				       uint64_t *size)
+{
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+
+	*size = nfsi->vfs_inode.i_size;
+}
+
+/*
+ * Get the auxiliary data from netfs data
+ * - This function can be absent if the index carries no state data
+ * - Should store the auxiliary data in the buffer
+ * - Should return the amount of amount stored
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
+					  void *buffer, uint16_t bufmax)
+{
+	struct nfs_fscache_inode_auxdata auxdata;
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.size = nfsi->vfs_inode.i_size;
+	auxdata.mtime = nfsi->vfs_inode.i_mtime;
+	auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+		auxdata.change_attr = nfsi->change_attr;
+
+	if (bufmax > sizeof(auxdata))
+		bufmax = sizeof(auxdata);
+
+	memcpy(buffer, &auxdata, bufmax);
+	return bufmax;
+}
+
+/*
+ * Consult the netfs about the state of an object
+ * - This function can be absent if the index carries no state data
+ * - The netfs data from the cookie being used as the target is
+ *   presented, as is the auxiliary data
+ */
+static
+enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
+						  const void *data,
+						  uint16_t datalen)
+{
+	struct nfs_fscache_inode_auxdata auxdata;
+	struct nfs_inode *nfsi = cookie_netfs_data;
+
+	if (datalen != sizeof(auxdata))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.size = nfsi->vfs_inode.i_size;
+	auxdata.mtime = nfsi->vfs_inode.i_mtime;
+	auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+		auxdata.change_attr = nfsi->change_attr;
+
+	if (memcmp(data, &auxdata, datalen) != 0)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Indication from FS-Cache that the cookie is no longer cached
+ * - This function is called when the backing store currently caching a cookie
+ *   is removed
+ * - The netfs should use this to clean up any markers indicating cached pages
+ * - This is mandatory for any object that may have data
+ */
+static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct nfs_inode *nfsi = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
+
+	for (;;) {
+		/* grab a bunch of pages to unmark */
+		nr_pages = pagevec_lookup(&pvec,
+					  nfsi->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/*
+ * Get an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ *   context.
+ * - The read context is passed back to NFS in the event that a data read on the
+ *   cache fails with EIO - in which case the server must be contacted to
+ *   retrieve the data, which requires the read context for security.
+ */
+static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
+{
+	get_nfs_open_context(context);
+}
+
+/*
+ * Release an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ *   context.
+ */
+static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
+{
+	if (context)
+		put_nfs_open_context(context);
+}
+
+/*
+ * Define the inode object for FS-Cache.  This is used to describe an inode
+ * object to fscache_acquire_cookie().  It is keyed by the NFS file handle for
+ * an inode.
+ *
+ * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
+ * held in the cache auxiliary data for the data storage object with those in
+ * the inode struct in memory.
+ */
+const struct fscache_cookie_def nfs_fscache_inode_object_def = {
+	.name		= "NFS.fh",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= nfs_fscache_inode_get_key,
+	.get_attr	= nfs_fscache_inode_get_attr,
+	.get_aux	= nfs_fscache_inode_get_aux,
+	.check_aux	= nfs_fscache_inode_check_aux,
+	.now_uncached	= nfs_fscache_inode_now_uncached,
+	.get_context	= nfs_fh_get_context,
+	.put_context	= nfs_fh_put_context,
+};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000000..379be678cb7e
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
+/* NFS filesystem cache interface
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY		NFSDBG_FSCACHE
+
+static struct rb_root nfs_fscache_keys = RB_ROOT;
+static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
+
+/*
+ * Get the per-client index cookie for an NFS client if the appropriate mount
+ * flag was set
+ * - We always try and get an index cookie for the client, but get filehandle
+ *   cookies on a per-superblock basis, depending on the mount flags
+ */
+void nfs_fscache_get_client_cookie(struct nfs_client *clp)
+{
+	/* create a cache index for looking up filehandles */
+	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
+					      &nfs_fscache_server_index_def,
+					      clp);
+	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
+		 clp, clp->fscache);
+}
+
+/*
+ * Dispose of a per-client cookie
+ */
+void nfs_fscache_release_client_cookie(struct nfs_client *clp)
+{
+	dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
+		 clp, clp->fscache);
+
+	fscache_relinquish_cookie(clp->fscache, 0);
+	clp->fscache = NULL;
+}
+
+/*
+ * Get the cache cookie for an NFS superblock.  We have to handle
+ * uniquification here because the cache doesn't do it for us.
+ */
+void nfs_fscache_get_super_cookie(struct super_block *sb,
+				  struct nfs_parsed_mount_data *data)
+{
+	struct nfs_fscache_key *key, *xkey;
+	struct nfs_server *nfss = NFS_SB(sb);
+	struct rb_node **p, *parent;
+	const char *uniq = data->fscache_uniq ?: "";
+	int diff, ulen;
+
+	ulen = strlen(uniq);
+	key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
+	if (!key)
+		return;
+
+	key->nfs_client = nfss->nfs_client;
+	key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
+	key->key.nfs_server.flags = nfss->flags;
+	key->key.nfs_server.rsize = nfss->rsize;
+	key->key.nfs_server.wsize = nfss->wsize;
+	key->key.nfs_server.acregmin = nfss->acregmin;
+	key->key.nfs_server.acregmax = nfss->acregmax;
+	key->key.nfs_server.acdirmin = nfss->acdirmin;
+	key->key.nfs_server.acdirmax = nfss->acdirmax;
+	key->key.nfs_server.fsid = nfss->fsid;
+	key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
+
+	key->key.uniq_len = ulen;
+	memcpy(key->key.uniquifier, uniq, ulen);
+
+	spin_lock(&nfs_fscache_keys_lock);
+	p = &nfs_fscache_keys.rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		xkey = rb_entry(parent, struct nfs_fscache_key, node);
+
+		if (key->nfs_client < xkey->nfs_client)
+			goto go_left;
+		if (key->nfs_client > xkey->nfs_client)
+			goto go_right;
+
+		diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
+		if (diff < 0)
+			goto go_left;
+		if (diff > 0)
+			goto go_right;
+
+		if (key->key.uniq_len == 0)
+			goto non_unique;
+		diff = memcmp(key->key.uniquifier,
+			      xkey->key.uniquifier,
+			      key->key.uniq_len);
+		if (diff < 0)
+			goto go_left;
+		if (diff > 0)
+			goto go_right;
+		goto non_unique;
+
+	go_left:
+		p = &(*p)->rb_left;
+		continue;
+	go_right:
+		p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&key->node, parent, p);
+	rb_insert_color(&key->node, &nfs_fscache_keys);
+	spin_unlock(&nfs_fscache_keys_lock);
+	nfss->fscache_key = key;
+
+	/* create a cache index for looking up filehandles */
+	nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
+					       &nfs_fscache_super_index_def,
+					       nfss);
+	dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
+		 nfss, nfss->fscache);
+	return;
+
+non_unique:
+	spin_unlock(&nfs_fscache_keys_lock);
+	kfree(key);
+	nfss->fscache_key = NULL;
+	nfss->fscache = NULL;
+	printk(KERN_WARNING "NFS:"
+	       " Cache request denied due to non-unique superblock keys\n");
+}
+
+/*
+ * release a per-superblock cookie
+ */
+void nfs_fscache_release_super_cookie(struct super_block *sb)
+{
+	struct nfs_server *nfss = NFS_SB(sb);
+
+	dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
+		 nfss, nfss->fscache);
+
+	fscache_relinquish_cookie(nfss->fscache, 0);
+	nfss->fscache = NULL;
+
+	if (nfss->fscache_key) {
+		spin_lock(&nfs_fscache_keys_lock);
+		rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
+		spin_unlock(&nfs_fscache_keys_lock);
+		kfree(nfss->fscache_key);
+		nfss->fscache_key = NULL;
+	}
+}
+
+/*
+ * Initialise the per-inode cache cookie pointer for an NFS inode.
+ */
+void nfs_fscache_init_inode_cookie(struct inode *inode)
+{
+	NFS_I(inode)->fscache = NULL;
+	if (S_ISREG(inode->i_mode))
+		set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+}
+
+/*
+ * Get the per-inode cache cookie for an NFS inode.
+ */
+static void nfs_fscache_enable_inode_cookie(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (nfsi->fscache || !NFS_FSCACHE(inode))
+		return;
+
+	if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
+		nfsi->fscache = fscache_acquire_cookie(
+			NFS_SB(sb)->fscache,
+			&nfs_fscache_inode_object_def,
+			nfsi);
+
+		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
+			 sb, nfsi, nfsi->fscache);
+	}
+}
+
+/*
+ * Release a per-inode cookie.
+ */
+void nfs_fscache_release_inode_cookie(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
+		 nfsi, nfsi->fscache);
+
+	fscache_relinquish_cookie(nfsi->fscache, 0);
+	nfsi->fscache = NULL;
+}
+
+/*
+ * Retire a per-inode cookie, destroying the data attached to it.
+ */
+void nfs_fscache_zap_inode_cookie(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
+		 nfsi, nfsi->fscache);
+
+	fscache_relinquish_cookie(nfsi->fscache, 1);
+	nfsi->fscache = NULL;
+}
+
+/*
+ * Turn off the cache with regard to a per-inode cookie if opened for writing,
+ * invalidating all the pages in the page cache relating to the associated
+ * inode to clear the per-page caching.
+ */
+static void nfs_fscache_disable_inode_cookie(struct inode *inode)
+{
+	clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+
+	if (NFS_I(inode)->fscache) {
+		dfprintk(FSCACHE,
+			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
+
+		/* Need to invalidate any mapped pages that were read in before
+		 * turning off the cache.
+		 */
+		if (inode->i_mapping && inode->i_mapping->nrpages)
+			invalidate_inode_pages2(inode->i_mapping);
+
+		nfs_fscache_zap_inode_cookie(inode);
+	}
+}
+
+/*
+ * wait_on_bit() sleep function for uninterruptible waiting
+ */
+static int nfs_fscache_wait_bit(void *flags)
+{
+	schedule();
+	return 0;
+}
+
+/*
+ * Lock against someone else trying to also acquire or relinquish a cookie
+ */
+static inline void nfs_fscache_inode_lock(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
+		wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
+			    nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Unlock cookie management lock
+ */
+static inline void nfs_fscache_inode_unlock(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	smp_mb__before_clear_bit();
+	clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
+}
+
+/*
+ * Decide if we should enable or disable local caching for this inode.
+ * - For now, with NFS, only regular files that are open read-only will be able
+ *   to use the cache.
+ * - May be invoked multiple times in parallel by parallel nfs_open() functions.
+ */
+void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+	if (NFS_FSCACHE(inode)) {
+		nfs_fscache_inode_lock(inode);
+		if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+			nfs_fscache_disable_inode_cookie(inode);
+		else
+			nfs_fscache_enable_inode_cookie(inode);
+		nfs_fscache_inode_unlock(inode);
+	}
+}
+
+/*
+ * Replace a per-inode cookie due to revalidation detecting a file having
+ * changed on the server.
+ */
+void nfs_fscache_reset_inode_cookie(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	struct fscache_cookie *old = nfsi->fscache;
+
+	nfs_fscache_inode_lock(inode);
+	if (nfsi->fscache) {
+		/* retire the current fscache cache and get a new one */
+		fscache_relinquish_cookie(nfsi->fscache, 1);
+
+		nfsi->fscache = fscache_acquire_cookie(
+			nfss->nfs_client->fscache,
+			&nfs_fscache_inode_object_def,
+			nfsi);
+
+		dfprintk(FSCACHE,
+			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
+			 nfss, nfsi, old, nfsi->fscache);
+	}
+	nfs_fscache_inode_unlock(inode);
+}
+
+/*
+ * Release the caching state associated with a page, if the page isn't busy
+ * interacting with the cache.
+ * - Returns true (can release page) or false (page busy).
+ */
+int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	struct nfs_inode *nfsi = NFS_I(page->mapping->host);
+	struct fscache_cookie *cookie = nfsi->fscache;
+
+	BUG_ON(!cookie);
+
+	if (fscache_check_page_write(cookie, page)) {
+		if (!(gfp & __GFP_WAIT))
+			return 0;
+		fscache_wait_on_page_write(cookie, page);
+	}
+
+	if (PageFsCache(page)) {
+		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
+			 cookie, page, nfsi);
+
+		fscache_uncache_page(cookie, page);
+		nfs_add_fscache_stats(page->mapping->host,
+				      NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+	}
+
+	return 1;
+}
+
+/*
+ * Release the caching state associated with a page if undergoing complete page
+ * invalidation.
+ */
+void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie = nfsi->fscache;
+
+	BUG_ON(!cookie);
+
+	dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
+		 cookie, page, nfsi);
+
+	fscache_wait_on_page_write(cookie, page);
+
+	BUG_ON(!PageLocked(page));
+	fscache_uncache_page(cookie, page);
+	nfs_add_fscache_stats(page->mapping->host,
+			      NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+}
+
+/*
+ * Handle completion of a page being read from the cache.
+ * - Called in process (keventd) context.
+ */
+static void nfs_readpage_from_fscache_complete(struct page *page,
+					       void *context,
+					       int error)
+{
+	dfprintk(FSCACHE,
+		 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
+		 page, context, error);
+
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error) {
+		SetPageUptodate(page);
+		unlock_page(page);
+	} else {
+		error = nfs_readpage_async(context, page->mapping->host, page);
+		if (error)
+			unlock_page(page);
+	}
+}
+
+/*
+ * Retrieve a page from fscache
+ */
+int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+				struct inode *inode, struct page *page)
+{
+	int ret;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
+		 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+
+	ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+					 page,
+					 nfs_readpage_from_fscache_complete,
+					 ctx,
+					 GFP_KERNEL);
+
+	switch (ret) {
+	case 0: /* read BIO submitted (page in fscache) */
+		dfprintk(FSCACHE,
+			 "NFS:    readpage_from_fscache: BIO submitted\n");
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
+		return ret;
+
+	case -ENOBUFS: /* inode not in cache */
+	case -ENODATA: /* page not in cache */
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+		dfprintk(FSCACHE,
+			 "NFS:    readpage_from_fscache %d\n", ret);
+		return 1;
+
+	default:
+		dfprintk(FSCACHE, "NFS:    readpage_from_fscache %d\n", ret);
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+	}
+	return ret;
+}
+
+/*
+ * Retrieve a set of pages from fscache
+ */
+int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+				 struct inode *inode,
+				 struct address_space *mapping,
+				 struct list_head *pages,
+				 unsigned *nr_pages)
+{
+	int ret, npages = *nr_pages;
+
+	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
+		 NFS_I(inode)->fscache, npages, inode);
+
+	ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+					  mapping, pages, nr_pages,
+					  nfs_readpage_from_fscache_complete,
+					  ctx,
+					  mapping_gfp_mask(mapping));
+	if (*nr_pages < npages)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
+				      npages);
+	if (*nr_pages > 0)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
+				      *nr_pages);
+
+	switch (ret) {
+	case 0: /* read submitted to the cache for all pages */
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: submitted\n");
+
+		return ret;
+
+	case -ENOBUFS: /* some pages aren't cached and can't be */
+	case -ENODATA: /* some pages aren't cached */
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
+		return 1;
+
+	default:
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: ret  %d\n", ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Store a newly fetched page in fscache
+ * - PG_fscache must be set on the page
+ */
+void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
+{
+	int ret;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
+		 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+
+	ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+	dfprintk(FSCACHE,
+		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
+		 page, page->index, page->flags, ret);
+
+	if (ret != 0) {
+		fscache_uncache_page(NFS_I(inode)->fscache, page);
+		nfs_add_fscache_stats(inode,
+				      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+	} else {
+		nfs_add_fscache_stats(inode,
+				      NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
+	}
+}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000000..6e809bb0ff08
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
+/* NFS filesystem cache interface definitions
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _NFS_FSCACHE_H
+#define _NFS_FSCACHE_H
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/fscache.h>
+
+#ifdef CONFIG_NFS_FSCACHE
+
+/*
+ * set of NFS FS-Cache objects that form a superblock key
+ */
+struct nfs_fscache_key {
+	struct rb_node		node;
+	struct nfs_client	*nfs_client;	/* the server */
+
+	/* the elements of the unique key - as used by nfs_compare_super() and
+	 * nfs_compare_mount_options() to distinguish superblocks */
+	struct {
+		struct {
+			unsigned long	s_flags;	/* various flags
+							 * (& NFS_MS_MASK) */
+		} super;
+
+		struct {
+			struct nfs_fsid fsid;
+			int		flags;
+			unsigned int	rsize;		/* read size */
+			unsigned int	wsize;		/* write size */
+			unsigned int	acregmin;	/* attr cache timeouts */
+			unsigned int	acregmax;
+			unsigned int	acdirmin;
+			unsigned int	acdirmax;
+		} nfs_server;
+
+		struct {
+			rpc_authflavor_t au_flavor;
+		} rpc_auth;
+
+		/* uniquifier - can be used if nfs_server.flags includes
+		 * NFS_MOUNT_UNSHARED  */
+		u8 uniq_len;
+		char uniquifier[0];
+	} key;
+};
+
+/*
+ * fscache-index.c
+ */
+extern struct fscache_netfs nfs_fscache_netfs;
+extern const struct fscache_cookie_def nfs_fscache_server_index_def;
+extern const struct fscache_cookie_def nfs_fscache_super_index_def;
+extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
+
+extern int nfs_fscache_register(void);
+extern void nfs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void nfs_fscache_get_client_cookie(struct nfs_client *);
+extern void nfs_fscache_release_client_cookie(struct nfs_client *);
+
+extern void nfs_fscache_get_super_cookie(struct super_block *,
+					 struct nfs_parsed_mount_data *);
+extern void nfs_fscache_release_super_cookie(struct super_block *);
+
+extern void nfs_fscache_init_inode_cookie(struct inode *);
+extern void nfs_fscache_release_inode_cookie(struct inode *);
+extern void nfs_fscache_zap_inode_cookie(struct inode *);
+extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void nfs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
+extern int nfs_fscache_release_page(struct page *, gfp_t);
+
+extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
+				       struct inode *, struct page *);
+extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
+					struct inode *, struct address_space *,
+					struct list_head *, unsigned *);
+extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
+
+/*
+ * wait for a page to complete writing to the cache
+ */
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+						  struct page *page)
+{
+	if (PageFsCache(page))
+		fscache_wait_on_page_write(nfsi->fscache, page);
+}
+
+/*
+ * release the caching state associated with a page if undergoing complete page
+ * invalidation
+ */
+static inline void nfs_fscache_invalidate_page(struct page *page,
+					       struct inode *inode)
+{
+	if (PageFsCache(page))
+		__nfs_fscache_invalidate_page(page, inode);
+}
+
+/*
+ * Retrieve a page from an inode data storage object.
+ */
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+					    struct inode *inode,
+					    struct page *page)
+{
+	if (NFS_I(inode)->fscache)
+		return __nfs_readpage_from_fscache(ctx, inode, page);
+	return -ENOBUFS;
+}
+
+/*
+ * Retrieve a set of pages from an inode data storage object.
+ */
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+					     struct inode *inode,
+					     struct address_space *mapping,
+					     struct list_head *pages,
+					     unsigned *nr_pages)
+{
+	if (NFS_I(inode)->fscache)
+		return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
+						    nr_pages);
+	return -ENOBUFS;
+}
+
+/*
+ * Store a page newly fetched from the server in an inode data storage object
+ * in the cache.
+ */
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+					   struct page *page,
+					   int sync)
+{
+	if (PageFsCache(page))
+		__nfs_readpage_to_fscache(inode, page, sync);
+}
+
+/*
+ * indicate the client caching state as readable text
+ */
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+	if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
+		return "yes";
+	return "no ";
+}
+
+
+#else /* CONFIG_NFS_FSCACHE */
+static inline int nfs_fscache_register(void) { return 0; }
+static inline void nfs_fscache_unregister(void) {}
+
+static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
+static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
+
+static inline void nfs_fscache_get_super_cookie(
+	struct super_block *sb,
+	struct nfs_parsed_mount_data *data)
+{
+}
+static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
+
+static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
+						struct file *filp) {}
+static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	return 1; /* True: may release page */
+}
+static inline void nfs_fscache_invalidate_page(struct page *page,
+					       struct inode *inode) {}
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+						  struct page *page) {}
+
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+					    struct inode *inode,
+					    struct page *page)
+{
+	return -ENOBUFS;
+}
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+					     struct inode *inode,
+					     struct address_space *mapping,
+					     struct list_head *pages,
+					     unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+					   struct page *page, int sync) {}
+
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+	return "no ";
+}
+
+#endif /* CONFIG_NFS_FSCACHE */
+#endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f29..46177cb87064 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " getroot encountered non-directory\n");
 		return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " lookupfh encountered non-directory\n");
 		return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171e..64f87194d390 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -66,6 +67,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 }
 
 /**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
+/**
  * nfs_compat_user_ino64 - returns the user-visible inode number
  * @fileid: 64-bit fileid
  *
@@ -109,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
 	BUG_ON(!list_empty(&NFS_I(inode)->open_files));
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
+	nfs_fscache_release_inode_cookie(inode);
 }
 
 /**
@@ -249,13 +263,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 	struct inode *inode = ERR_PTR(-ENOENT);
 	unsigned long hash;
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
 		goto out_no_inode;
-
-	if (!fattr->nlink) {
-		printk("NFS: Buggy server - nlink == 0!\n");
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
 		goto out_no_inode;
-	}
 
 	hash = nfs_fattr_to_ino_t(fattr);
 
@@ -291,7 +302,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
-			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+			if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+					&& !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
 				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
 					inode->i_op = &nfs_referral_inode_operations;
 				else
@@ -304,30 +316,49 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		else
 			init_special_inode(inode, inode->i_mode, fattr->rdev);
 
+		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+		memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+		nfsi->change_attr = 0;
+		inode->i_size = 0;
+		inode->i_nlink = 0;
+		inode->i_uid = -2;
+		inode->i_gid = -2;
+		inode->i_blocks = 0;
+		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+
 		nfsi->read_cache_jiffies = fattr->time_start;
 		nfsi->attr_gencount = fattr->gencount;
-		inode->i_atime = fattr->atime;
-		inode->i_mtime = fattr->mtime;
-		inode->i_ctime = fattr->ctime;
-		if (fattr->valid & NFS_ATTR_FATTR_V4)
+		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+			inode->i_atime = fattr->atime;
+		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			inode->i_mtime = fattr->mtime;
+		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			inode->i_ctime = fattr->ctime;
+		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			nfsi->change_attr = fattr->change_attr;
-		inode->i_size = nfs_size_to_loff_t(fattr->size);
-		inode->i_nlink = fattr->nlink;
-		inode->i_uid = fattr->uid;
-		inode->i_gid = fattr->gid;
-		if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			inode->i_size = nfs_size_to_loff_t(fattr->size);
+		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+			inode->i_nlink = fattr->nlink;
+		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+			inode->i_uid = fattr->uid;
+		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+			inode->i_gid = fattr->gid;
+		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+			inode->i_blocks = fattr->du.nfs2.blocks;
+		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 			/*
 			 * report the blocks in 512byte units
 			 */
 			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-		} else {
-			inode->i_blocks = fattr->du.nfs2.blocks;
 		}
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
-		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 		nfsi->access_cache = RB_ROOT;
 
+		nfs_fscache_init_inode_cookie(inode);
+
 		unlock_new_inode(inode);
 	} else
 		nfs_refresh_inode(inode, fattr);
@@ -514,6 +545,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	return err;
 }
 
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	struct inode *inode;
+	struct nfs_server *server;
+
+	if (!(ctx->mode & FMODE_WRITE))
+		return;
+	if (!is_sync)
+		return;
+	inode = ctx->path.dentry->d_inode;
+	if (!list_empty(&NFS_I(inode)->open_files))
+		return;
+	server = NFS_SERVER(inode);
+	if (server->flags & NFS_MOUNT_NOCTO)
+		return;
+	nfs_revalidate_inode(server, inode);
+}
+
 static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
 {
 	struct nfs_open_context *ctx;
@@ -540,24 +597,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 	return ctx;
 }
 
-static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
-	struct inode *inode;
-
-	if (ctx == NULL)
-		return;
+	struct inode *inode = ctx->path.dentry->d_inode;
 
-	inode = ctx->path.dentry->d_inode;
 	if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
 		return;
 	list_del(&ctx->list);
 	spin_unlock(&inode->i_lock);
-	if (ctx->state != NULL) {
-		if (wait)
-			nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
-		else
-			nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
-	}
+	NFS_PROTO(inode)->close_context(ctx, is_sync);
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	path_put(&ctx->path);
@@ -642,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 	ctx->mode = filp->f_mode;
 	nfs_file_set_open_context(filp, ctx);
 	put_nfs_open_context(ctx);
+	nfs_fscache_set_inode_cookie(inode, filp);
 	return 0;
 }
 
@@ -670,9 +719,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (NFS_STALE(inode))
 		goto out;
 
-	if (NFS_STALE(inode))
-		goto out;
-
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
 	if (status != 0) {
@@ -745,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 	spin_unlock(&inode->i_lock);
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
+	nfs_fscache_reset_inode_cookie(inode);
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
 			inode->i_sb->s_id, (long long)NFS_FILEID(inode));
 	return 0;
@@ -815,25 +862,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
-			nfsi->change_attr == fattr->pre_change_attr) {
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+			&& nfsi->change_attr == fattr->pre_change_attr) {
 		nfsi->change_attr = fattr->change_attr;
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	}
 	/* If we have atomic WCC data, we may update some attributes */
-	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
-		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
 			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
 			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 			if (S_ISDIR(inode->i_mode))
 				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-		}
-		if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
-		    nfsi->npages == 0)
-			i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 	}
+	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+			&& nfsi->npages == 0)
+			i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 }
 
 /**
@@ -853,35 +906,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 
 	/* Has the inode gone and changed behind our back? */
-	if (nfsi->fileid != fattr->fileid
-			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+		return -EIO;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		return -EIO;
-	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
 			nfsi->change_attr != fattr->change_attr)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
-	if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
-	cur_size = i_size_read(inode);
- 	new_isize = nfs_size_to_loff_t(fattr->size);
-	if (cur_size != new_isize && nfsi->npages == 0)
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		cur_size = i_size_read(inode);
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		if (cur_size != new_isize && nfsi->npages == 0)
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	}
 
 	/* Have any file permissions changed? */
-	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
-			|| inode->i_uid != fattr->uid
-			|| inode->i_gid != fattr->gid)
+	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
 		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
-	if (inode->i_nlink != fattr->nlink)
+	if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
 		invalid |= NFS_INO_INVALID_ATTR;
 
-	if (!timespec_equal(&inode->i_atime, &fattr->atime))
+	if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
 		invalid |= NFS_INO_INVALID_ATIME;
 
 	if (invalid != 0)
@@ -893,11 +950,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+	if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
+		return 0;
 	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
 }
 
 static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+		return 0;
 	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
 }
 
@@ -975,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	spin_lock(&inode->i_lock);
 	status = nfs_refresh_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
+
 	return status;
 }
 
@@ -1033,20 +1095,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 	/* Don't do a WCC update if these attributes are already stale */
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
 			!nfs_inode_attrs_need_update(inode, fattr)) {
-		fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+		fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+				| NFS_ATTR_FATTR_PRESIZE
+				| NFS_ATTR_FATTR_PREMTIME
+				| NFS_ATTR_FATTR_PRECTIME);
 		goto out_noforce;
 	}
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-			(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
 		fattr->pre_change_attr = NFS_I(inode)->change_attr;
-		fattr->valid |= NFS_ATTR_WCC_V4;
+		fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
 	}
-	if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
-			(fattr->valid & NFS_ATTR_WCC) == 0) {
+	if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
 		memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+		fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
 		memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+		fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
 		fattr->pre_size = i_size_read(inode);
-		fattr->valid |= NFS_ATTR_WCC;
+		fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
 	}
 out_noforce:
 	status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1078,18 +1151,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			__func__, inode->i_sb->s_id, inode->i_ino,
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if (nfsi->fileid != fattr->fileid)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
 		goto out_fileid;
 
 	/*
 	 * Make sure the inode's type hasn't changed.
 	 */
-	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		goto out_changed;
 
 	server = NFS_SERVER(inode);
 	/* Update the fsid? */
-	if (S_ISDIR(inode->i_mode) &&
+	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
 			!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
 			!test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
 		server->fsid = fattr->fsid;
@@ -1099,14 +1172,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 */
 	nfsi->read_cache_jiffies = fattr->time_start;
 
-	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
-			| NFS_INO_REVAL_PAGECACHE);
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
+	    nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+		    | NFS_INO_INVALID_ATIME
+		    | NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
 	/* More cache consistency checks */
-	if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+		if (nfsi->change_attr != fattr->change_attr) {
+			dprintk("NFS: change_attr change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			if (S_ISDIR(inode->i_mode))
+				nfs_force_lookup_revalidate(inode);
+			nfsi->change_attr = fattr->change_attr;
+		}
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
 		/* NFSv2/v3: Check if the mtime agrees */
 		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
 			dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1200,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 			if (S_ISDIR(inode->i_mode))
 				nfs_force_lookup_revalidate(inode);
+			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
 		/* If ctime has changed we should definitely clear access+acl caches */
-		if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-	} else if (nfsi->change_attr != fattr->change_attr) {
-		dprintk("NFS: change_attr change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		if (S_ISDIR(inode->i_mode))
-			nfs_force_lookup_revalidate(inode);
+			/* and probably clear data for a directory too as utimes can cause
+			 * havoc with our cache.
+			 */
+			if (S_ISDIR(inode->i_mode)) {
+				invalid |= NFS_INO_INVALID_DATA;
+				nfs_force_lookup_revalidate(inode);
+			}
+			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+		}
 	}
 
 	/* Check if our cached file size is stale */
- 	new_isize = nfs_size_to_loff_t(fattr->size);
-	cur_isize = i_size_read(inode);
-	if (new_isize != cur_isize) {
-		/* Do we perhaps have any outstanding writes, or has
-		 * the file grown beyond our last write? */
-		if (nfsi->npages == 0 || new_isize > cur_isize) {
-			i_size_write(inode, new_isize);
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		cur_isize = i_size_read(inode);
+		if (new_isize != cur_isize) {
+			/* Do we perhaps have any outstanding writes, or has
+			 * the file grown beyond our last write? */
+			if (nfsi->npages == 0 || new_isize > cur_isize) {
+				i_size_write(inode, new_isize);
+				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+			}
+			dprintk("NFS: isize change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
 		}
-		dprintk("NFS: isize change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
 	}
 
 
-	memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-	memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-	memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
-	nfsi->change_attr = fattr->change_attr;
-
-	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
-	    inode->i_uid != fattr->uid ||
-	    inode->i_gid != fattr->gid)
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
 
-	if (inode->i_nlink != fattr->nlink)
-		invalid |= NFS_INO_INVALID_ATTR;
+	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_mode = fattr->mode;
+		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+		if (inode->i_uid != fattr->uid) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_uid = fattr->uid;
+		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+		if (inode->i_gid != fattr->gid) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_gid = fattr->gid;
+		}
+	}
 
-	inode->i_mode = fattr->mode;
-	inode->i_nlink = fattr->nlink;
-	inode->i_uid = fattr->uid;
-	inode->i_gid = fattr->gid;
+	if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+		if (inode->i_nlink != fattr->nlink) {
+			invalid |= NFS_INO_INVALID_ATTR;
+			if (S_ISDIR(inode->i_mode))
+				invalid |= NFS_INO_INVALID_DATA;
+			inode->i_nlink = fattr->nlink;
+		}
+	}
 
-	if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 		/*
 		 * report the blocks in 512byte units
 		 */
 		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- 	} else {
- 		inode->i_blocks = fattr->du.nfs2.blocks;
  	}
+	if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+		inode->i_blocks = fattr->du.nfs2.blocks;
 
 	/* Update attrtimeo value if we're out of the unstable period */
 	if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1274,7 +1381,6 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-	nfsi->ncommit = 0;
 	nfsi->npages = 0;
 	atomic_set(&nfsi->silly_count, 1);
 	INIT_HLIST_HEAD(&nfsi->silly_list);
@@ -1337,6 +1443,10 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	err = nfs_fscache_register();
+	if (err < 0)
+		goto out7;
+
 	err = nfsiod_start();
 	if (err)
 		goto out6;
@@ -1389,6 +1499,8 @@ out4:
 out5:
 	nfsiod_stop();
 out6:
+	nfs_fscache_unregister();
+out7:
 	return err;
 }
 
@@ -1399,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_readpagecache();
 	nfs_destroy_inodecache();
 	nfs_destroy_nfspagecache();
+	nfs_fscache_unregister();
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608f..e4d6a8348adf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 
+#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
+
 struct nfs_string;
 
 /* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
 	int			acregmin, acregmax,
 				acdirmin, acdirmax;
 	int			namlen;
+	unsigned int		options;
 	unsigned int		bsize;
 	unsigned int		auth_flavor_len;
 	rpc_authflavor_t	auth_flavors[1];
 	char			*client_address;
+	char			*fscache_uniq;
 
 	struct {
 		struct sockaddr_storage	address;
@@ -152,6 +156,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+
 /* dir.c */
 extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 
@@ -165,6 +172,7 @@ extern void nfs_clear_inode(struct inode *);
 extern void nfs4_clear_inode(struct inode *);
 #endif
 void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(void *word);
 
 /* super.c */
 void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a36952810032..a2ab2529b5ca 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
 
 struct nfs_iostats {
 	unsigned long long	bytes[__NFSIOS_BYTESMAX];
+#ifdef CONFIG_NFS_FSCACHE
+	unsigned long long	fscache[__NFSIOS_FSCACHEMAX];
+#endif
 	unsigned long		events[__NFSIOS_COUNTSMAX];
 } ____cacheline_aligned;
 
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
 	nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
 }
 
+#ifdef CONFIG_NFS_FSCACHE
+static inline void nfs_add_fscache_stats(struct inode *inode,
+					 enum nfs_stat_fscachecounters stat,
+					 unsigned long addend)
+{
+	struct nfs_iostats *iostats;
+	int cpu;
+
+	cpu = get_cpu();
+	iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
+	iostats->fscache[stat] += addend;
+	put_cpu_no_resched();
+}
+#endif
+
 static inline struct nfs_iostats *nfs_alloc_iostats(void)
 {
 	return alloc_percpu(struct nfs_iostats);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d1519..c862c9340f9a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
 static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
-	u32 rdev;
-	fattr->type = (enum nfs_ftype) ntohl(*p++);
+	u32 rdev, type;
+	type = ntohl(*p++);
 	fattr->mode = ntohl(*p++);
 	fattr->nlink = ntohl(*p++);
 	fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
 	p = xdr_decode_time(p, &fattr->ctime);
-	fattr->valid |= NFS_ATTR_FATTR;
+	fattr->valid |= NFS_ATTR_FATTR_V2;
 	fattr->rdev = new_decode_dev(rdev);
-	if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) {
-		fattr->type = NFFIFO;
+	if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
 		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
 		fattr->rdev = 0;
 	}
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index cef62557c87d..6bbf0e6daad2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_fattr fattr;
-	struct page *pages[NFSACL_MAXPAGES] = { };
+	struct page *pages[NFSACL_MAXPAGES];
 	struct nfs3_setaclargs args = {
 		.inode = inode,
 		.mask = NFS_ACL,
@@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		.rpc_argp	= &args,
 		.rpc_resp	= &fattr,
 	};
-	int status, count;
+	int status;
 
 	status = -EOPNOTSUPP;
 	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	if (S_ISDIR(inode->i_mode)) {
 		args.mask |= NFS_DFACL;
 		args.acl_default = dfacl;
+		args.len = nfsacl_size(acl, dfacl);
+	} else
+		args.len = nfsacl_size(acl, NULL);
+
+	if (args.len > NFS_ACL_INLINE_BUFSIZE) {
+		unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
+
+		status = -ENOMEM;
+		do {
+			args.pages[args.npages] = alloc_page(GFP_KERNEL);
+			if (args.pages[args.npages] == NULL)
+				goto out_freepages;
+			args.npages++;
+		} while (args.npages < npages);
 	}
 
 	dprintk("NFS call setacl\n");
@@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	nfs_zap_acl_cache(inode);
 	dprintk("NFS reply setacl: %d\n", status);
 
-	/* pages may have been allocated at the xdr layer. */
-	for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
-		__free_page(args.pages[count]);
-
 	switch (status) {
 		case 0:
 			status = nfs_refresh_inode(inode, &fattr);
@@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		case -ENOTSUPP:
 			status = -EOPNOTSUPP;
 	}
+out_freepages:
+	while (args.npages != 0) {
+		args.npages--;
+		__free_page(args.pages[args.npages]);
+	}
 out:
 	return status;
 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679e..d0cc5ce0edfe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		data->arg.create.verifier[1] = current->pid;
 	}
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	for (;;) {
 		status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 
 	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name,
 			MAJOR(rdev), MINOR(rdev));
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.commit_done	= nfs3_commit_done,
 	.lock		= nfs3_proc_lock,
 	.clear_acl_cache = nfs3_forget_cached_acls,
+	.close_context	= nfs_close_context,
 };
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 11cdddec1432..e6a1932c7110 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -82,26 +82,24 @@
 #define NFS3_commitres_sz	(1+NFS3_wcc_data_sz+2)
 
 #define ACL3_getaclargs_sz	(NFS3_fh_sz+1)
-#define ACL3_setaclargs_sz	(NFS3_fh_sz+1+2*(2+5*3))
-#define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+2*(2+5*3))
+#define ACL3_setaclargs_sz	(NFS3_fh_sz+1+ \
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+#define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+ \
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
 #define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
 
 /*
  * Map file type to S_IFMT bits
  */
-static struct {
-	unsigned int	mode;
-	unsigned int	nfs2type;
-} nfs_type2fmt[] = {
-      { 0,		NFNON	},
-      { S_IFREG,	NFREG	},
-      { S_IFDIR,	NFDIR	},
-      { S_IFBLK,	NFBLK	},
-      { S_IFCHR,	NFCHR	},
-      { S_IFLNK,	NFLNK	},
-      { S_IFSOCK,	NFSOCK	},
-      { S_IFIFO,	NFFIFO	},
-      { 0,		NFBAD	}
+static const umode_t nfs_type2fmt[] = {
+	[NF3BAD] = 0,
+	[NF3REG] = S_IFREG,
+	[NF3DIR] = S_IFDIR,
+	[NF3BLK] = S_IFBLK,
+	[NF3CHR] = S_IFCHR,
+	[NF3LNK] = S_IFLNK,
+	[NF3SOCK] = S_IFSOCK,
+	[NF3FIFO] = S_IFIFO,
 };
 
 /*
@@ -146,13 +144,12 @@ static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
 	unsigned int	type, major, minor;
-	int		fmode;
+	umode_t		fmode;
 
 	type = ntohl(*p++);
-	if (type >= NF3BAD)
-		type = NF3BAD;
-	fmode = nfs_type2fmt[type].mode;
-	fattr->type = nfs_type2fmt[type].nfs2type;
+	if (type > NF3FIFO)
+		type = NF3NON;
+	fmode = nfs_type2fmt[type];
 	fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
 	fattr->nlink = ntohl(*p++);
 	fattr->uid = ntohl(*p++);
@@ -175,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_time3(p, &fattr->ctime);
 
 	/* Update the mode bits */
-	fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
+	fattr->valid |= NFS_ATTR_FATTR_V3;
 	return p;
 }
 
@@ -231,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_hyper(p, &fattr->pre_size);
 	p = xdr_decode_time3(p, &fattr->pre_mtime);
 	p = xdr_decode_time3(p, &fattr->pre_ctime);
-	fattr->valid |= NFS_ATTR_WCC;
+	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+		| NFS_ATTR_FATTR_PREMTIME
+		| NFS_ATTR_FATTR_PRECTIME;
 	return p;
 }
 
@@ -703,28 +702,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
                    struct nfs3_setaclargs *args)
 {
 	struct xdr_buf *buf = &req->rq_snd_buf;
-	unsigned int base, len_in_head, len = nfsacl_size(
-		(args->mask & NFS_ACL)   ? args->acl_access  : NULL,
-		(args->mask & NFS_DFACL) ? args->acl_default : NULL);
-	int count, err;
+	unsigned int base;
+	int err;
 
 	p = xdr_encode_fhandle(p, NFS_FH(args->inode));
 	*p++ = htonl(args->mask);
-	base = (char *)p - (char *)buf->head->iov_base;
-	/* put as much of the acls into head as possible. */
-	len_in_head = min_t(unsigned int, buf->head->iov_len - base, len);
-	len -= len_in_head;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2));
-
-	for (count = 0; (count << PAGE_SHIFT) < len; count++) {
-		args->pages[count] = alloc_page(GFP_KERNEL);
-		if (!args->pages[count]) {
-			while (count)
-				__free_page(args->pages[--count]);
-			return -ENOMEM;
-		}
-	}
-	xdr_encode_pages(buf, args->pages, 0, len);
+	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	base = req->rq_slen;
+
+	if (args->npages != 0)
+		xdr_encode_pages(buf, args->pages, 0, args->len);
+	else
+		req->rq_slen += args->len;
 
 	err = nfsacl_encode(buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4e4d33204376..84345deab26f 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -179,7 +179,7 @@ struct nfs4_state_recovery_ops {
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 };
 
-extern struct dentry_operations nfs4_dentry_operations;
+extern const struct dentry_operations nfs4_dentry_operations;
 extern const struct inode_operations nfs4_dir_inode_operations;
 
 /* inode.c */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 30befc39b3c6..2a2a0a7143ad 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -21,7 +21,9 @@
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
 /*
- * Check if fs_root is valid
+ * Convert the NFSv4 pathname components into a standard posix path.
+ *
+ * Note that the resulting string will be placed at the end of the buffer
  */
 static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
 					 char *buffer, ssize_t buflen)
@@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 	char *mnt_path;
-	int page2len;
+	unsigned int maxbuflen;
 	unsigned int s;
 
 	mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
 	if (IS_ERR(mnt_path))
 		return mnt;
 	mountdata->mnt_path = mnt_path;
-	page2 += strlen(mnt_path) + 1;
-	page2len = PAGE_SIZE - strlen(mnt_path) - 1;
+	maxbuflen = mnt_path - 1 - page2;
 
 	for (s = 0; s < location->nservers; s++) {
 		const struct nfs4_string *buf = &location->servers[s];
 		struct sockaddr_storage addr;
 
-		if (buf->len <= 0 || buf->len >= PAGE_SIZE)
+		if (buf->len <= 0 || buf->len >= maxbuflen)
 			continue;
 
 		mountdata->addr = (struct sockaddr *)&addr;
@@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 			continue;
 		nfs_set_port(mountdata->addr, NFS_PORT);
 
-		strncpy(page2, buf->data, page2len);
-		page2[page2len] = '\0';
+		memcpy(page2, buf->data, buf->len);
+		page2[buf->len] = '\0';
 		mountdata->hostname = page2;
 
 		snprintf(page, PAGE_SIZE, "%s:%s",
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d9..a4d242680299 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start, KM_USER0);
 }
 
-static int nfs4_wait_bit_killable(void *word)
-{
-	if (fatal_signal_pending(current))
-		return -ERESTARTSYS;
-	schedule();
-	return 0;
-}
-
 static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 	might_sleep();
 
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
-			nfs4_wait_bit_killable, TASK_KILLABLE);
+			nfs_wait_bit_killable, TASK_KILLABLE);
 	return res;
 }
 
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	if (calldata->arg.seqid == NULL)
 		goto out_free_calldata;
 	calldata->arg.fmode = 0;
-	calldata->arg.bitmask = server->attr_bitmask;
+	calldata->arg.bitmask = server->cache_consistency_bitmask;
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
@@ -1509,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		attr.ia_mode = nd->intent.open.create_mode;
 		attr.ia_valid = ATTR_MODE;
 		if (!IS_POSIXACL(dir))
-			attr.ia_mode &= ~current->fs->umask;
+			attr.ia_mode &= ~current_umask();
 	} else {
 		attr.ia_valid = 0;
 		BUG_ON(nd->intent.open.flags & O_CREAT);
@@ -1580,6 +1572,15 @@ out_drop:
 	return 0;
 }
 
+void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	if (ctx->state == NULL)
+		return;
+	if (is_sync)
+		nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
+	else
+		nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
+}
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			server->caps |= NFS_CAP_HARDLINKS;
 		if (res.has_symlinks != 0)
 			server->caps |= NFS_CAP_SYMLINKS;
+		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->acl_bitmask = res.acl_bitmask;
 	}
 	return status;
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	struct nfs_removeargs *args = msg->rpc_argp;
 	struct nfs_removeres *res = msg->rpc_resp;
 
-	args->bitmask = server->attr_bitmask;
+	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.pages = &page,
 		.pgbase = 0,
 		.count = count,
-		.bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+		.bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
 	};
 	struct nfs4_readdir_res res;
 	struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	data->res.server = server;
 	data->timestamp   = jiffies;
 
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 	
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	data->res.server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
@@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 	return len;
 }
 
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+	if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
+		(fattr->valid & NFS_ATTR_FATTR_FSID) &&
+		(fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+		return;
+
+	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+		NFS_ATTR_FATTR_NLINK;
+	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	fattr->nlink = 2;
+}
+
 int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page)
 {
@@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 	fs_locations->server = server;
 	fs_locations->nlocations = 0;
 	status = rpc_call_sync(server->client, &msg, 0);
+	nfs_fixup_referral_attributes(&fs_locations->fattr);
 	dprintk("%s: returned status = %d\n", __func__, status);
 	return status;
 }
@@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.commit_done	= nfs4_commit_done,
 	.lock		= nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
+	.close_context  = nfs4_close_context,
 };
 
 /*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966f..0298e909559f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
 
 static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 {
-	int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
-			nfs_callback_tcpport, cred);
+	unsigned short port;
+	int status;
+
+	port = nfs_callback_tcpport;
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nfs_callback_tcpport6;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
 	if (status == 0)
 		status = nfs4_proc_setclientid_confirm(clp, cred);
 	if (status == 0)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a9..1690f0e44b91 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
 				 decode_lookup_maxsz + \
 				 decode_fs_locations_maxsz)
 
-static struct {
-	unsigned int	mode;
-	unsigned int	nfs2type;
-} nfs_type2fmt[] = {
-	{ 0,		NFNON	     },
-	{ S_IFREG,	NFREG	     },
-	{ S_IFDIR,	NFDIR	     },
-	{ S_IFBLK,	NFBLK	     },
-	{ S_IFCHR,	NFCHR	     },
-	{ S_IFLNK,	NFLNK	     },
-	{ S_IFSOCK,	NFSOCK	     },
-	{ S_IFIFO,	NFFIFO	     },
-	{ 0,		NFNON	     },
-	{ 0,		NFNON	     },
+static const umode_t nfs_type2fmt[] = {
+	[NF4BAD] = 0,
+	[NF4REG] = S_IFREG,
+	[NF4DIR] = S_IFDIR,
+	[NF4BLK] = S_IFBLK,
+	[NF4CHR] = S_IFCHR,
+	[NF4LNK] = S_IFLNK,
+	[NF4SOCK] = S_IFSOCK,
+	[NF4FIFO] = S_IFIFO,
+	[NF4ATTRDIR] = 0,
+	[NF4NAMEDATTR] = 0,
 };
 
 struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*type = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 			return -EIO;
 		}
 		bitmap[0] &= ~FATTR4_WORD0_TYPE;
+		ret = NFS_ATTR_FATTR_TYPE;
 	}
-	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type);
-	return 0;
+	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
+	return ret;
 }
 
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*change = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		READ_BUF(8);
 		READ64(*change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+		ret = NFS_ATTR_FATTR_CHANGE;
 	}
 	dprintk("%s: change attribute=%Lu\n", __func__,
 			(unsigned long long)*change);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*size = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		READ_BUF(8);
 		READ64(*size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
+		ret = NFS_ATTR_FATTR_SIZE;
 	}
 	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
 	__be32 *p;
+	int ret = 0;
 
 	fsid->major = 0;
 	fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		READ64(fsid->major);
 		READ64(fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
+		ret = NFS_ATTR_FATTR_FSID;
 	}
 	dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
 			(unsigned long long)fsid->major,
 			(unsigned long long)fsid->minor);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*fileid = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		READ_BUF(8);
 		READ64(*fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
+		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*fileid = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		READ_BUF(8);
 		READ64(*fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
 			res->nlocations++;
 	}
+	if (res->nlocations != 0)
+		status = NFS_ATTR_FATTR_V4_REFERRAL;
 out:
 	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
 	return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	return status;
 }
 
-static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
 {
+	uint32_t tmp;
 	__be32 *p;
+	int ret = 0;
 
 	*mode = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
 		READ_BUF(4);
-		READ32(*mode);
-		*mode &= ~S_IFMT;
+		READ32(tmp);
+		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
+		ret = NFS_ATTR_FATTR_MODE;
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*nlink = 1;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		READ_BUF(4);
 		READ32(*nlink);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+		ret = NFS_ATTR_FATTR_NLINK;
 	}
 	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
 {
 	uint32_t len;
 	__be32 *p;
+	int ret = 0;
 
 	*uid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ32(len);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0)
+			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+				ret = NFS_ATTR_FATTR_OWNER;
+			else
 				dprintk("%s: nfs_map_name_to_uid failed!\n",
 						__func__);
 		} else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		bitmap[1] &= ~FATTR4_WORD1_OWNER;
 	}
 	dprintk("%s: uid=%d\n", __func__, (int)*uid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
 {
 	uint32_t len;
 	__be32 *p;
+	int ret = 0;
 
 	*gid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ32(len);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0)
+			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+				ret = NFS_ATTR_FATTR_GROUP;
+			else
 				dprintk("%s: nfs_map_group_to_gid failed!\n",
 						__func__);
 		} else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
 	}
 	dprintk("%s: gid=%d\n", __func__, (int)*gid);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
 {
 	uint32_t major = 0, minor = 0;
 	__be32 *p;
+	int ret = 0;
 
 	*rdev = MKDEV(0,0);
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
 		bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+		ret = NFS_ATTR_FATTR_RDEV;
 	}
 	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*used = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		READ_BUF(8);
 		READ64(*used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
 	dprintk("%s: space used=%Lu\n", __func__,
 			(unsigned long long)*used);
-	return 0;
+	return ret;
 }
 
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
 		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_ATIME;
 		bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
 	}
 	dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
 		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_CTIME;
 		bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
 	}
 	dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
 		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_MTIME;
 		bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
 	}
 	dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	uint32_t attrlen,
 		 bitmap[2] = {0},
 		 type;
-	int status, fmode = 0;
+	int status;
+	umode_t fmode = 0;
 	uint64_t fileid;
 
-	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-		goto xdr_error;
-	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+	status = decode_op_hdr(xdr, OP_GETATTR);
+	if (status < 0)
 		goto xdr_error;
 
-	fattr->bitmap[0] = bitmap[0];
-	fattr->bitmap[1] = bitmap[1];
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
 
-	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
 		goto xdr_error;
 
 
-	if ((status = decode_attr_type(xdr, bitmap, &type)) != 0)
+	status = decode_attr_type(xdr, bitmap, &type);
+	if (status < 0)
 		goto xdr_error;
-	fattr->type = nfs_type2fmt[type].nfs2type;
-	fmode = nfs_type2fmt[type].mode;
+	fattr->mode = 0;
+	if (status != 0) {
+		fattr->mode |= nfs_type2fmt[type];
+		fattr->valid |= status;
+	}
 
-	if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0)
+	status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_size(xdr, bitmap, &fattr->size);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+	fattr->valid |= status;
+
+	status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
 						struct nfs4_fs_locations,
-						fattr))) != 0)
+						fattr));
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_mode(xdr, bitmap, &fmode);
+	if (status < 0)
 		goto xdr_error;
-	fattr->mode |= fmode;
-	if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
+	if (status != 0) {
+		fattr->mode |= fmode;
+		fattr->valid |= status;
+	}
+
+	status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+	if (status < 0)
 		goto xdr_error;
-	if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+	fattr->valid |= status;
+
+	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
+	if (status < 0)
 		goto xdr_error;
-	if (fattr->fileid == 0 && fileid != 0)
+	if (status != 0 && !(fattr->valid & status)) {
 		fattr->fileid = fileid;
-	if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
-		fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
+		fattr->valid |= status;
+	}
+
+	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
 	status = decode_setattr(&xdr, res);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
-	if (status == NFS4ERR_DELAY)
-		status = 0;
+	decode_getfattr(&xdr, res->fattr, res->server);
 out:
 	return status;
 }
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70a..e2975939126a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
 	kref_put(&req->wb_kref, nfs_free_request);
 }
 
-static int nfs_wait_bit_killable(void *word)
-{
-	int ret = 0;
-
-	if (fatal_signal_pending(current))
-		ret = -ERESTARTSYS;
-	else
-		schedule();
-	return ret;
-}
-
 /**
  * nfs_wait_on_request - Wait for a request to complete.
  * @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7c..7be72d90d49d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.commit_setup	= nfs_proc_commit_setup,
 	.lock		= nfs_proc_lock,
 	.lock_check_bounds = nfs_lock_check_bounds,
+	.close_context	= nfs_close_context,
 };
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7fa..4ace3c50a8eb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
 
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 	}
 }
 
-static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
-		struct page *page)
+int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+		       struct page *page)
 {
 	LIST_HEAD(one_request);
 	struct nfs_page	*new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
 static void nfs_readpage_release(struct nfs_page *req)
 {
+	struct inode *d_inode = req->wb_context->path.dentry->d_inode;
+
+	if (PageUptodate(req->wb_page))
+		nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+
 	unlock_page(req->wb_page);
 
 	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
 	} else
 		ctx = get_nfs_open_context(nfs_file_open_context(file));
 
+	if (!IS_SYNC(inode)) {
+		error = nfs_readpage_from_fscache(ctx, inode, page);
+		if (error == 0)
+			goto out;
+	}
+
 	error = nfs_readpage_async(ctx, inode, page);
 
+out:
 	put_nfs_open_context(ctx);
 	return error;
 out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 			return -EBADF;
 	} else
 		desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+
+	/* attempt to read as many of the pages as possible from the cache
+	 * - this returns -ENOBUFS immediately if the cookie is negative
+	 */
+	ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
+					 pages, &nr_pages);
+	if (ret == 0)
+		goto read_complete; /* all pages were read */
+
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	nfs_pageio_complete(&pgio);
 	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+read_complete:
 	put_nfs_open_context(desc.ctx);
 out:
 	return ret;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786dc..6717200923fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -76,6 +77,7 @@ enum {
 	Opt_rdirplus, Opt_nordirplus,
 	Opt_sharecache, Opt_nosharecache,
 	Opt_resvport, Opt_noresvport,
+	Opt_fscache, Opt_nofscache,
 
 	/* Mount options that take integer arguments */
 	Opt_port,
@@ -93,6 +95,7 @@ enum {
 	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 	Opt_lookupcache,
+	Opt_fscache_uniq,
 
 	/* Special mount options */
 	Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_nosharecache, "nosharecache" },
 	{ Opt_resvport, "resvport" },
 	{ Opt_noresvport, "noresvport" },
+	{ Opt_fscache, "fsc" },
+	{ Opt_fscache_uniq, "fsc=%s" },
+	{ Opt_nofscache, "nofsc" },
 
 	{ Opt_port, "port=%u" },
 	{ Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	if (clp->rpc_ops->version == 4)
 		seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
 #endif
+	if (nfss->options & NFS_OPTION_FSCACHE)
+		seq_printf(m, ",fsc");
 }
 
 /*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 			totals.events[i] += stats->events[i];
 		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 			totals.bytes[i] += stats->bytes[i];
+#ifdef CONFIG_NFS_FSCACHE
+		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+			totals.fscache[i] += stats->fscache[i];
+#endif
 
 		preempt_enable();
 	}
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, "\n\tbytes:\t");
 	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
 		seq_printf(m, "%Lu ", totals.bytes[i]);
+#ifdef CONFIG_NFS_FSCACHE
+	if (nfss->options & NFS_OPTION_FSCACHE) {
+		seq_printf(m, "\n\tfsc:\t");
+		for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+			seq_printf(m, "%Lu ", totals.bytes[i]);
+	}
+#endif
 	seq_printf(m, "\n");
 
 	rpc_print_iostats(m, nfss->client);
@@ -1018,6 +1037,7 @@ static int nfs_parse_mount_options(char *raw,
 		case Opt_rdma:
 			mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
 			mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+			xprt_load_transport(p);
 			break;
 		case Opt_acl:
 			mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1043,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw,
 		case Opt_noresvport:
 			mnt->flags |= NFS_MOUNT_NORESVPORT;
 			break;
+		case Opt_fscache:
+			mnt->options |= NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_nofscache:
+			mnt->options &= ~NFS_OPTION_FSCACHE;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = NULL;
+			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (!string)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * options that take numeric values
@@ -1190,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
-			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
@@ -1205,12 +1242,14 @@ static int nfs_parse_mount_options(char *raw,
 				/* vector side protocols to TCP */
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+				xprt_load_transport(string);
 				break;
 			default:
 				errors++;
 				dfprintk(MOUNT, "NFS:   unrecognized "
 						"transport protocol\n");
 			}
+			kfree(string);
 			break;
 		case Opt_mountproto:
 			string = match_strdup(args);
@@ -1868,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb,
  	nfs_initialise_sb(sb);
 }
 
-#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
-
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
 {
 	const struct nfs_server *a = s->s_fs_info;
@@ -2034,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_fill_super(s, data);
+		nfs_fscache_get_super_cookie(s, data);
 	}
 
 	mntroot = nfs_get_root(s, mntfh);
@@ -2054,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 out:
 	kfree(data->nfs_server.hostname);
 	kfree(data->mount_server.hostname);
+	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	kfree(mntfh);
@@ -2081,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s)
 
 	bdi_unregister(&server->backing_dev_info);
 	kill_anon_super(s);
+	nfs_fscache_release_super_cookie(s);
 	nfs_free_server(server);
 }
 
@@ -2388,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
+		nfs_fscache_get_super_cookie(s, data);
 	}
 
 	mntroot = nfs4_get_root(s, mntfh);
@@ -2409,6 +2450,7 @@ out:
 	kfree(data->client_address);
 	kfree(data->nfs_server.export_path);
 	kfree(data->nfs_server.hostname);
+	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	kfree(mntfh);
@@ -2435,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb)
 	kill_anon_super(sb);
 
 	nfs4_renewd_prepare_shutdown(server);
+	nfs_fscache_release_super_cookie(sb);
 	nfs_free_server(server);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc1..e560a78995a3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
+	unsigned long *bitlock = &NFS_I(inode)->flags;
 	struct nfs_pageio_descriptor pgio;
 	int err;
 
+	/* Stop dirtying of new pages while we sync */
+	err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (err)
+		goto out_err;
+
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
 	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
+
+	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_FLUSHING);
+
 	if (err < 0)
-		return err;
-	if (pgio.pg_error < 0)
-		return pgio.pg_error;
+		goto out_err;
+	err = pgio.pg_error;
+	if (err < 0)
+		goto out_err;
 	return 0;
+out_err:
+	return err;
 }
 
 /*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
-	nfsi->ncommit++;
 	set_bit(PG_CLEAN, &(req)->wb_flags);
 	radix_tree_tag_set(&nfsi->nfs_page_tree,
 			req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int
+nfs_need_commit(struct nfs_inode *nfsi)
+{
+	return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+}
+
 /*
  * nfs_scan_commit - Scan an inode for commit requests
  * @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int res = 0;
 
-	if (nfsi->ncommit != 0) {
-		res = nfs_scan_list(nfsi, dst, idx_start, npages,
-				NFS_PAGE_TAG_COMMIT);
-		nfsi->ncommit -= res;
-	}
-	return res;
+	if (!nfs_need_commit(nfsi))
+		return 0;
+
+	return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
 }
 #else
+static inline int nfs_need_commit(struct nfs_inode *nfsi)
+{
+	return 0;
+}
+
 static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->args.stable  = NFS_UNSTABLE;
 	if (how & FLUSH_STABLE) {
 		data->args.stable = NFS_DATA_SYNC;
-		if (!NFS_I(inode)->ncommit)
+		if (!nfs_need_commit(NFS_I(inode)))
 			data->args.stable = NFS_FILE_SYNC;
 	}
 
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
 {
 	struct writeback_control wbc = {
 		.bdi = mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_NONE,
+		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
 		.for_writepages = 1,
 	};
-	int ret;
 
-	ret = __nfs_write_mapping(mapping, &wbc, how);
-	if (ret < 0)
-		return ret;
-	wbc.sync_mode = WB_SYNC_ALL;
 	return __nfs_write_mapping(mapping, &wbc, how);
 }
 
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
 config NFSD
 	tristate "NFS server support"
 	depends on INET
+	depends on FILE_LOCKING
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
 #include <linux/unistd.h>
 #include <linux/slab.h>
 #include <linux/major.h>
+#include <linux/magic.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 					 struct nfsd3_writeres  *resp)
 {
 	__be32	nfserr;
+	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE(3)    %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
 				   rqstp->rq_vec, argp->vlen,
-				   argp->len,
+				   &cnt,
 				   &resp->committed);
-	resp->count = argp->count;
+	resp->count = cnt;
 	RETURN_STATUS(nfserr);
 }
 
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 		struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
 
 		/* Note that we don't care for remote fs's here */
-		if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) {
+		if (sb->s_magic == MSDOS_SUPER_MAGIC) {
 			resp->f_properties = NFS3_FSF_BILLYBOY;
 		}
 		resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
 			resp->p_link_max = EXT2_LINK_MAX;
 			resp->p_name_max = EXT2_NAME_LEN;
 			break;
-		case 0x4d44:	/* MSDOS_SUPER_MAGIC */
+		case MSDOS_SUPER_MAGIC:
 			resp->p_case_insensitive = 1;
 			resp->p_case_preserving  = 0;
 			break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
 encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 {
 	__be32 *p;
-	int len = cb_rec->cbr_fhlen;
+	int len = cb_rec->cbr_fh.fh_size;
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
 	WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
-	WRITEMEM(cb_rec->cbr_fhval, len);
+	WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
 	return 0;
 }
 
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cb_set an atomic? */
 
-static int do_probe_callback(void *data)
+static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
 {
-	struct nfs4_client *clp = data;
 	struct sockaddr_in	addr;
 	struct nfs4_callback    *cb = &clp->cl_callback;
 	struct rpc_timeout	timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name    = clp->cl_principal,
 	};
-	struct rpc_message msg = {
-		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp       = clp,
-	};
 	struct rpc_clnt *client;
-	int status;
 
-	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
-		status = nfserr_cb_path_down;
-		goto out_err;
-	}
+	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+		return ERR_PTR(-EINVAL);
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
 
 	/* Create RPC client */
 	client = rpc_create(&args);
+	if (IS_ERR(client))
+		dprintk("NFSD: couldn't create callback client: %ld\n",
+			PTR_ERR(client));
+	return client;
+
+}
+
+static int do_probe_callback(void *data)
+{
+	struct nfs4_client *clp = data;
+	struct nfs4_callback    *cb = &clp->cl_callback;
+	struct rpc_message msg = {
+		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+		.rpc_argp       = clp,
+	};
+	struct rpc_clnt *client;
+	int status;
+
+	client = setup_callback_client(clp);
 	if (IS_ERR(client)) {
-		dprintk("NFSD: couldn't create callback client\n");
 		status = PTR_ERR(client);
+		dprintk("NFSD: couldn't create callback client: %d\n",
+								status);
 		goto out_err;
 	}
 
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
 out_release_client:
 	rpc_shutdown_client(client);
 out_err:
-	dprintk("NFSD: warning: no callback path to client %.*s\n",
-		(int)clp->cl_name.len, clp->cl_name.data);
+	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
+		(int)clp->cl_name.len, clp->cl_name.data, status);
 	put_nfs4_client(clp);
-	return status;
+	return 0;
 }
 
 /*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 
 /*
  * called with dp->dl_count inc'ed.
- * nfs4_lock_state() may or may not have been called.
  */
 void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	open->op_truncate = 0;
 
 	if (open->op_create) {
+		/* FIXME: check session persistence and pnfs flags.
+		 * The nfsv4.1 spec requires the following semantics:
+		 *
+		 * Persistent   | pNFS   | Server REQUIRED | Client Allowed
+		 * Reply Cache  | server |                 |
+		 * -------------+--------+-----------------+--------------------
+		 * no           | no     | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 *              |        |                 | (SHOULD)
+		 *              |        | and EXCLUSIVE4  | or EXCLUSIVE4
+		 *              |        |                 | (SHOULD NOT)
+		 * no           | yes    | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 * yes          | no     | GUARDED4        | GUARDED4
+		 * yes          | yes    | GUARDED4        | GUARDED4
+		 */
+
 		/*
 		 * Note: create modes (UNCHECKED,GUARDED...) are the same
 		 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 					(u32 *)open->op_verf.data,
 					&open->op_truncate, &created);
 
-		/* If we ever decide to use different attrs to store the
-		 * verifier in nfsd_create_v3, then we'll need to change this
+		/*
+		 * Following rfc 3530 14.2.16, use the returned bitmask
+		 * to indicate which attributes we used to store the
+		 * verifier:
 		 */
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
-			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
 						FATTR4_WORD1_TIME_MODIFY);
 	} else {
 		status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 		goto out;
 
 	set_change_info(&open->op_cinfo, current_fh);
-
-	/* set reply cache */
 	fh_dup2(current_fh, &resfh);
-	open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-			&resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
 
+	/* set reply cache */
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&resfh.fh_handle);
 	if (!created)
 		status = do_open_permission(rqstp, current_fh, open,
 					    NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
 	/* set replay cache */
-	open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-		&current_fh->fh_handle.fh_base,
-		current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&current_fh->fh_handle);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	return status;
 }
 
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+	struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)session->se_sessionid.data;
+
+	clid->cl_boot = sid->clientid.cl_boot;
+	clid->cl_id = sid->clientid.cl_id;
+}
 
 static __be32
 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	   struct nfsd4_open *open)
 {
 	__be32 status;
+	struct nfsd4_compoundres *resp;
+
 	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
 		open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
 		return nfserr_inval;
 
+	if (nfsd4_has_session(cstate))
+		copy_clientid(&open->op_clientid, cstate->session);
+
 	nfs4_lock_state();
 
 	/* check seqid for replay. set nfs4_owner */
-	status = nfsd4_process_open1(open);
+	resp = rqstp->rq_resp;
+	status = nfsd4_process_open1(&resp->cstate, open);
 	if (status == nfserr_replay_me) {
 		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
 		fh_put(&cstate->current_fh);
-		cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
-		memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
-				rp->rp_openfh_len);
+		fh_copy_shallow(&cstate->current_fh.fh_handle,
+				&rp->rp_openfh);
 		status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 		if (status)
 			dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-			status = nfserr_inval;
-			if (open->op_create)
-				goto out;
-			/* fall through */
 		case NFS4_OPEN_CLAIM_NULL:
 			/*
 			 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	getattr->ga_fhp = &cstate->current_fh;
 	return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check stateid */
-	if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-				&read->rd_stateid,
-				CHECK_FH | RD_STATE, &read->rd_filp))) {
+	if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
+						 RD_STATE, &read->rd_filp))) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
 	}
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
 	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		nfs4_lock_state();
-		status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-			&setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
+		status = nfs4_preprocess_stateid_op(cstate,
+			&setattr->sa_stateid, WR_STATE, NULL);
 		nfs4_unlock_state();
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file *filp = NULL;
 	u32 *p;
 	__be32 status = nfs_ok;
+	unsigned long cnt;
 
 	/* no need to check permission - this will be done in nfsd_write() */
 
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid,
-					CHECK_FH | WR_STATE, &filp);
+	status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
 	if (filp)
 		get_file(filp);
 	nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 	}
 
-	write->wr_bytes_written = write->wr_buflen;
+	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;
 	p = (u32 *)write->wr_verifier.data;
 	*p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status =  nfsd_write(rqstp, &cstate->current_fh, filp,
 			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-			     write->wr_buflen, &write->wr_how_written);
+			     &cnt, &write->wr_how_written);
 	if (filp)
 		fput(filp);
 
+	write->wr_bytes_written = cnt;
+
 	if (status == nfserr_symlink)
 		status = nfserr_inval;
 	return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
-	if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0)
-	    || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
+	    || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
+	    || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
 		return nfserr_attrnotsupp;
 	if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
 	    || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out_kfree;
 
-	p = buf + 3;
+	/* skip bitmap */
+	p = buf + 1 + ntohl(buf[0]);
 	status = nfserr_not_same;
 	if (ntohl(*p++) != verify->ve_attrlen)
 		goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 		nfsdstats.nfs4_opcount[opnum]++;
 }
 
-static void cstate_free(struct nfsd4_compound_state *cstate)
-{
-	if (cstate == NULL)
-		return;
-	fh_put(&cstate->current_fh);
-	fh_put(&cstate->save_fh);
-	BUG_ON(cstate->replay_owner);
-	kfree(cstate);
-}
-
-static struct nfsd4_compound_state *cstate_alloc(void)
-{
-	struct nfsd4_compound_state *cstate;
-
-	cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
-	if (cstate == NULL)
-		return NULL;
-	fh_init(&cstate->current_fh, NFS4_FHSIZE);
-	fh_init(&cstate->save_fh, NFS4_FHSIZE);
-	cstate->replay_owner = NULL;
-	return cstate;
-}
-
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
+enum nfsd4_op_flags {
+	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
+	ALLOWED_ON_ABSENT_FS = 2 << 0,	/* ops processed on absent fs */
+	ALLOWED_AS_FIRST_OP = 3 << 0,	/* ops reqired first in compound */
+};
 
 struct nfsd4_operation {
 	nfsd4op_func op_func;
 	u32 op_flags;
-/* Most ops require a valid current filehandle; a few don't: */
-#define ALLOWED_WITHOUT_FH 1
-/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
-#define ALLOWED_ON_ABSENT_FS 2
 	char *op_name;
 };
 
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
+ * This is a replay of a compound for which no cache entry pages
+ * were used. Encode the sequence operation, and if cachethis is FALSE
+ * encode the uncache rep error on the next operation.
+ */
+static __be32
+nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
+			 struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_op *op;
+
+	dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
+		resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
+
+	/* Encode the replayed sequence operation */
+	BUG_ON(resp->opcnt != 1);
+	op = &args->ops[resp->opcnt - 1];
+	nfsd4_encode_operation(resp, op);
+
+	/*return nfserr_retry_uncached_rep in next operation. */
+	if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
+		op = &args->ops[resp->opcnt++];
+		op->status = nfserr_retry_uncached_rep;
+		nfsd4_encode_operation(resp, op);
+	}
+	return op->status;
+}
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules.
+ *
+ * TODO:
+ * - enforce NFS4ERR_NOT_ONLY_OP,
+ * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ */
+static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+{
+	if (args->minorversion && args->opcnt > 0) {
+		struct nfsd4_op *op = &args->ops[0];
+		return (op->status == nfserr_op_illegal) ||
+		       (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
+	}
+	return true;
+}
+
+/*
  * COMPOUND call.
  */
 static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 {
 	struct nfsd4_op	*op;
 	struct nfsd4_operation *opdesc;
-	struct nfsd4_compound_state *cstate = NULL;
+	struct nfsd4_compound_state *cstate = &resp->cstate;
 	int		slack_bytes;
 	__be32		status;
 
 	resp->xbuf = &rqstp->rq_res;
-	resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+	resp->p = rqstp->rq_res.head[0].iov_base +
+						rqstp->rq_res.head[0].iov_len;
 	resp->tagp = resp->p;
 	/* reserve space for: taglen, tag, and opcnt */
 	resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->tag = args->tag;
 	resp->opcnt = 0;
 	resp->rqstp = rqstp;
+	resp->cstate.minorversion = args->minorversion;
+	resp->cstate.replay_owner = NULL;
+	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
+	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+	/* Use the deferral mechanism only for NFSv4.0 compounds */
+	rqstp->rq_usedeferral = (args->minorversion == 0);
 
 	/*
 	 * According to RFC3010, this takes precedence over all other errors.
 	 */
 	status = nfserr_minor_vers_mismatch;
-	if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	status = nfserr_resource;
-	cstate = cstate_alloc();
-	if (cstate == NULL)
-		goto out;
+	if (!nfs41_op_ordering_ok(args)) {
+		op = &args->ops[0];
+		op->status = nfserr_sequence_pos;
+		goto encode_op;
+	}
 
 	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
 			resp->opcnt, args->opcnt, op->opnum,
 			nfsd4_op_name(op->opnum));
-
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 			BUG_ON(op->status == nfs_ok);
 
 encode_op:
+		/* Only from SEQUENCE or CREATE_SESSION */
+		if (resp->cstate.status == nfserr_replay_cache) {
+			dprintk("%s NFS4.1 replay from cache\n", __func__);
+			if (nfsd4_not_cached(resp))
+				status = nfsd4_enc_uncached_replay(args, resp);
+			else
+				status = op->status;
+			goto out;
+		}
 		if (op->status == nfserr_replay_me) {
 			op->replay = &cstate->replay_owner->so_replay;
 			nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
 
 		nfsd4_increment_op_stats(op->opnum);
 	}
+	if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
+		dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
+		status = nfserr_jukebox;
+	}
 
-	cstate_free(cstate);
+	resp->cstate.status = status;
+	fh_put(&resp->cstate.current_fh);
+	fh_put(&resp->cstate.save_fh);
+	BUG_ON(resp->cstate.replay_owner);
 out:
 	nfsd4_release_compoundargs(args);
+	/* Reset deferral mechanism for RPC deferrals */
+	rqstp->rq_usedeferral = 1;
 	dprintk("nfsv4 compound returned %d\n", ntohl(status));
 	return status;
 }
 
-static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
+static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
 		.op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_name = "OP_PUTFH",
 	},
 	[OP_PUTPUBFH] = {
-		/* unsupported, just for future reference: */
+		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_PUTPUBFH",
 	},
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_RELEASE_LOCKOWNER",
 	},
+
+	/* NFSv4.1 operations */
+	[OP_EXCHANGE_ID] = {
+		.op_func = (nfsd4op_func)nfsd4_exchange_id,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_EXCHANGE_ID",
+	},
+	[OP_CREATE_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_create_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_CREATE_SESSION",
+	},
+	[OP_DESTROY_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_destroy_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_DESTROY_SESSION",
+	},
+	[OP_SEQUENCE] = {
+		.op_func = (nfsd4op_func)nfsd4_sequence,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_SEQUENCE",
+	},
 };
 
 static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..3444c0052a87 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
 
 typedef int (recdir_func)(struct dentry *, struct dentry *);
 
-struct dentry_list {
-	struct dentry *dentry;
+struct name_list {
+	char name[HEXDIR_LEN];
 	struct list_head list;
 };
 
-struct dentry_list_arg {
-	struct list_head dentries;
-	struct dentry *parent;
-};
-
 static int
-nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
+nfsd4_build_namelist(void *arg, const char *name, int namlen,
 		loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct dentry_list_arg *dla = arg;
-	struct list_head *dentries = &dla->dentries;
-	struct dentry *parent = dla->parent;
-	struct dentry *dentry;
-	struct dentry_list *child;
+	struct list_head *names = arg;
+	struct name_list *entry;
 
-	if (name && isdotent(name, namlen))
+	if (namlen != HEXDIR_LEN - 1)
 		return 0;
-	dentry = lookup_one_len(name, parent, namlen);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	child = kmalloc(sizeof(*child), GFP_KERNEL);
-	if (child == NULL)
+	entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
+	if (entry == NULL)
 		return -ENOMEM;
-	child->dentry = dentry;
-	list_add(&child->list, dentries);
+	memcpy(entry->name, name, HEXDIR_LEN - 1);
+	entry->name[HEXDIR_LEN - 1] = '\0';
+	list_add(&entry->list, names);
 	return 0;
 }
 
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 {
 	const struct cred *original_cred;
 	struct file *filp;
-	struct dentry_list_arg dla = {
-		.parent = dir,
-	};
-	struct list_head *dentries = &dla.dentries;
-	struct dentry_list *child;
+	LIST_HEAD(names);
+	struct name_list *entry;
+	struct dentry *dentry;
 	int status;
 
 	if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
-	INIT_LIST_HEAD(dentries);
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
-	INIT_LIST_HEAD(dentries);
-	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
+	status = vfs_readdir(filp, nfsd4_build_namelist, &names);
 	fput(filp);
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		status = f(dir, child->dentry);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+
+		dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+		if (IS_ERR(dentry)) {
+			status = PTR_ERR(dentry);
+			goto out;
+		}
+		status = f(dir, dentry);
+		dput(dentry);
 		if (status)
 			goto out;
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 out:
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 	nfs4_reset_creds(original_cred);
 	return status;
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
 {
 	int status;
 
-	if (nfs4_has_reclaimed_state(child->d_name.name))
+	/* note: we currently use this path only for minorversion 0 */
+	if (nfs4_has_reclaimed_state(child->d_name.name, false))
 		return 0;
 
 	status = nfsd4_clear_clid_dir(parent, child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..c65a27b76a9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
 static u32 nfs4_init;
 static stateid_t zerostateid;             /* bits all 0 */
 static stateid_t onestateid;              /* bits all 1 */
+static u64 current_sessionid = 1;
 
 #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
 #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid;              /* bits all 1 */
 /* forward declarations */
 static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 static void nfs4_set_recdir(char *recdir);
 
-/* Locking:
- *
- * client_mutex:
- * 	protects clientid_hashtbl[], clientstr_hashtbl[],
- * 	unconfstr_hashtbl[], uncofid_hashtbl[].
- */
+/* Locking: */
+
+/* Currently used for almost all code touching nfsv4 state: */
 static DEFINE_MUTEX(client_mutex);
 
+/*
+ * Currently used for the del_recall_lru and file hash table.  In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(recall_lock);
+
 static struct kmem_cache *stateowner_slab = NULL;
 static struct kmem_cache *file_slab = NULL;
 static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
 	return x;
 }
 
-/* forward declarations */
-static void release_stateowner(struct nfs4_stateowner *sop);
-static void release_stateid(struct nfs4_stateid *stp, int flags);
-
-/*
- * Delegation state
- */
-
-/* recall_lock protects the del_recall_lru */
-static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
-static void
-free_nfs4_file(struct kref *kref)
-{
-	struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
-	list_del(&fp->fi_hash);
-	iput(fp->fi_inode);
-	kmem_cache_free(file_slab, fp);
-}
-
 static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
-	kref_put(&fi->fi_ref, free_nfs4_file);
+	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+		list_del(&fi->fi_hash);
+		spin_unlock(&recall_lock);
+		iput(fi->fi_inode);
+		kmem_cache_free(file_slab, fi);
+	}
 }
 
 static inline void
 get_nfs4_file(struct nfs4_file *fi)
 {
-	kref_get(&fi->fi_ref);
+	atomic_inc(&fi->fi_ref);
 }
 
 static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
-	dp->dl_fhlen = current_fh->fh_handle.fh_size;
-	memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
-		        current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head	unconf_id_hashtbl[CLIENT_HASH_SIZE];
 static struct list_head client_lru;
 static struct list_head close_lru;
 
+static void unhash_generic_stateid(struct nfs4_stateid *stp)
+{
+	list_del(&stp->st_hash);
+	list_del(&stp->st_perfile);
+	list_del(&stp->st_perstateowner);
+}
+
+static void free_generic_stateid(struct nfs4_stateid *stp)
+{
+	put_nfs4_file(stp->st_file);
+	kmem_cache_free(stateid_slab, stp);
+}
+
+static void release_lock_stateid(struct nfs4_stateid *stp)
+{
+	unhash_generic_stateid(stp);
+	locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
+	free_generic_stateid(stp);
+}
+
+static void unhash_lockowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_stateid *stp;
+
+	list_del(&sop->so_idhash);
+	list_del(&sop->so_strhash);
+	list_del(&sop->so_perstateid);
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_first_entry(&sop->so_stateids,
+				struct nfs4_stateid, st_perstateowner);
+		release_lock_stateid(stp);
+	}
+}
+
+static void release_lockowner(struct nfs4_stateowner *sop)
+{
+	unhash_lockowner(sop);
+	nfs4_put_stateowner(sop);
+}
+
+static void
+release_stateid_lockowners(struct nfs4_stateid *open_stp)
+{
+	struct nfs4_stateowner *lock_sop;
+
+	while (!list_empty(&open_stp->st_lockowners)) {
+		lock_sop = list_entry(open_stp->st_lockowners.next,
+				struct nfs4_stateowner, so_perstateid);
+		/* list_del(&open_stp->st_lockowners);  */
+		BUG_ON(lock_sop->so_is_open_owner);
+		release_lockowner(lock_sop);
+	}
+}
+
+static void release_open_stateid(struct nfs4_stateid *stp)
+{
+	unhash_generic_stateid(stp);
+	release_stateid_lockowners(stp);
+	nfsd_close(stp->st_vfs_file);
+	free_generic_stateid(stp);
+}
+
+static void unhash_openowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_stateid *stp;
+
+	list_del(&sop->so_idhash);
+	list_del(&sop->so_strhash);
+	list_del(&sop->so_perclient);
+	list_del(&sop->so_perstateid); /* XXX: necessary? */
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_first_entry(&sop->so_stateids,
+				struct nfs4_stateid, st_perstateowner);
+		release_open_stateid(stp);
+	}
+}
+
+static void release_openowner(struct nfs4_stateowner *sop)
+{
+	unhash_openowner(sop);
+	list_del(&sop->so_close_lru);
+	nfs4_put_stateowner(sop);
+}
+
+static DEFINE_SPINLOCK(sessionid_lock);
+#define SESSION_HASH_SIZE	512
+static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
+
+static inline int
+hash_sessionid(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
+
+	return sid->sequence % SESSION_HASH_SIZE;
+}
+
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+	u32 *ptr = (u32 *)(&sessionid->data[0]);
+	dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+
+static void
+gen_sessionid(struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_sessionid *sid;
+
+	sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
+	sid->clientid = clp->cl_clientid;
+	sid->sequence = current_sessionid++;
+	sid->reserved = 0;
+}
+
+/*
+ * Give the client the number of slots it requests bound by
+ * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
+ *
+ * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
+ * should (up to a point) re-negotiate active sessions and reduce their
+ * slot usage to make rooom for new connections. For now we just fail the
+ * create session.
+ */
+static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
+{
+	int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+
+	spin_lock(&nfsd_serv->sv_lock);
+	if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
+		np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
+	nfsd_serv->sv_drc_pages_used += np;
+	spin_unlock(&nfsd_serv->sv_lock);
+
+	if (np <= 0) {
+		status = nfserr_resource;
+		fchan->maxreqs = 0;
+	} else
+		fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
+
+	return status;
+}
+
+/*
+ * fchan holds the client values on input, and the server values on output
+ */
+static int init_forechannel_attrs(struct svc_rqst *rqstp,
+				    struct nfsd4_session *session,
+				    struct nfsd4_channel_attrs *fchan)
+{
+	int status = 0;
+	__u32   maxcount = svc_max_payload(rqstp);
+
+	/* headerpadsz set to zero in encode routine */
+
+	/* Use the client's max request and max response size if possible */
+	if (fchan->maxreq_sz > maxcount)
+		fchan->maxreq_sz = maxcount;
+	session->se_fmaxreq_sz = fchan->maxreq_sz;
+
+	if (fchan->maxresp_sz > maxcount)
+		fchan->maxresp_sz = maxcount;
+	session->se_fmaxresp_sz = fchan->maxresp_sz;
+
+	/* Set the max response cached size our default which is
+	 * a multiple of PAGE_SIZE and small */
+	session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+	fchan->maxresp_cached = session->se_fmaxresp_cached;
+
+	/* Use the client's maxops if possible */
+	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+	session->se_fmaxops = fchan->maxops;
+
+	/* try to use the client requested number of slots */
+	if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+
+	/* FIXME: Error means no more DRC pages so the server should
+	 * recover pages from existing sessions. For now fail session
+	 * creation.
+	 */
+	status = set_forechannel_maxreqs(fchan);
+
+	session->se_fnumslots = fchan->maxreqs;
+	return status;
+}
+
+static int
+alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
+		   struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new, tmp;
+	int idx, status = nfserr_resource, slotsize;
+
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* FIXME: For now, we just accept the client back channel attributes. */
+	status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+	if (status)
+		goto out;
+
+	/* allocate struct nfsd4_session and slot table in one piece */
+	slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
+	if (!new)
+		goto out;
+
+	memcpy(new, &tmp, sizeof(*new));
+
+	new->se_client = clp;
+	gen_sessionid(new);
+	idx = hash_sessionid(&new->se_sessionid);
+	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+
+	new->se_flags = cses->flags;
+	kref_init(&new->se_ref);
+	spin_lock(&sessionid_lock);
+	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&sessionid_lock);
+
+	status = nfs_ok;
+out:
+	return status;
+}
+
+/* caller must hold sessionid_lock */
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_session *elem;
+	int idx;
+
+	dump_sessionid(__func__, sessionid);
+	idx = hash_sessionid(sessionid);
+	dprintk("%s: idx is %d\n", __func__, idx);
+	/* Search in the appropriate list */
+	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+		dump_sessionid("list traversal", &elem->se_sessionid);
+		if (!memcmp(elem->se_sessionid.data, sessionid->data,
+			    NFS4_MAX_SESSIONID_LEN)) {
+			return elem;
+		}
+	}
+
+	dprintk("%s: session not found\n", __func__);
+	return NULL;
+}
+
+/* caller must hold sessionid_lock */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+	list_del(&ses->se_hash);
+	list_del(&ses->se_perclnt);
+}
+
+static void
+release_session(struct nfsd4_session *ses)
+{
+	spin_lock(&sessionid_lock);
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+	nfsd4_put_session(ses);
+}
+
+static void nfsd4_release_respages(struct page **respages, short resused);
+
+void
+free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int i;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	for (i = 0; i < ses->se_fnumslots; i++) {
+		struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
+		nfsd4_release_respages(e->ce_respages, e->ce_resused);
+	}
+	kfree(ses->se_slots);
+	kfree(ses);
+}
+
 static inline void
 renew_client(struct nfs4_client *clp)
 {
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
 {
 	if (clid->cl_boot == boot_time)
 		return 0;
-	dprintk("NFSD stale clientid (%08x/%08x)\n", 
-			clid->cl_boot, clid->cl_id);
+	dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
+		clid->cl_boot, clid->cl_id, boot_time);
 	return 1;
 }
 
@@ -376,6 +649,8 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
+	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
+			     clp->cl_slot.sl_cache_entry.ce_resused);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
 	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
-		release_stateowner(sop);
+		release_openowner(sop);
+	}
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session  *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				 se_perclnt);
+		release_session(ses);
 	}
 	put_nfs4_client(clp);
 }
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	return clp;
 }
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
 	return NULL;
 }
 
+/*
+ * Return 1 iff clp's clientid establishment method matches the use_exchange_id
+ * parameter. Matching is based on the fact the at least one of the
+ * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
+ *
+ * FIXME: we need to unify the clientid namespaces for nfsv4.x
+ * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
+ * and SET_CLIENTID{,_CONFIRM}
+ */
+static inline int
+match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+{
+	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
+	return use_exchange_id == has_exchange_flags;
+}
+
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval,
+			     bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
 }
 
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+			       bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
@@ -685,6 +987,534 @@ out_err:
 	return;
 }
 
+void
+nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+
+	resp->cstate.statp = statp;
+}
+
+/*
+ * Dereference the result pages.
+ */
+static void
+nfsd4_release_respages(struct page **respages, short resused)
+{
+	int i;
+
+	dprintk("--> %s\n", __func__);
+	for (i = 0; i < resused; i++) {
+		if (!respages[i])
+			continue;
+		put_page(respages[i]);
+		respages[i] = NULL;
+	}
+}
+
+static void
+nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		topages[i] = frompages[i];
+		if (!topages[i])
+			continue;
+		get_page(topages[i]);
+	}
+}
+
+/*
+ * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
+ * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
+ * length of the XDR response is less than se_fmaxresp_cached
+ * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a
+ * of the reply (e.g. readdir).
+ *
+ * Store the base and length of the rq_req.head[0] page
+ * of the NFSv4.1 data, just past the rpc header.
+ */
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+	struct nfsd4_op *op = &args->ops[resp->opcnt];
+	struct kvec *resv = &rqstp->rq_res.head[0];
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/* Don't cache a failed OP_SEQUENCE. */
+	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
+		return;
+
+	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
+	entry->ce_opcnt = resp->opcnt;
+	entry->ce_status = resp->cstate.status;
+
+	/*
+	 * Don't need a page to cache just the sequence operation - the slot
+	 * does this for us!
+	 */
+
+	if (nfsd4_not_cached(resp)) {
+		entry->ce_resused = 0;
+		entry->ce_rpchdrlen = 0;
+		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
+			resp->cstate.slot->sl_cache_entry.ce_cachethis);
+		return;
+	}
+	entry->ce_resused = rqstp->rq_resused;
+	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
+		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
+	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
+			 entry->ce_resused);
+	entry->ce_datav.iov_base = resp->cstate.statp;
+	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]));
+	/* Current request rpc header length*/
+	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]);
+}
+
+/*
+ * We keep the rpc header, but take the nfs reply from the replycache.
+ */
+static int
+nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
+			struct nfsd4_cache_entry *entry)
+{
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct kvec *resv = &resp->rqstp->rq_res.head[0];
+	int len;
+
+	/* Current request rpc header length*/
+	len = (char *)resp->cstate.statp -
+			(char *)page_address(rqstp->rq_respages[0]);
+	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
+		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
+			entry->ce_datav.iov_len);
+		return 0;
+	}
+	/* copy the cached reply nfsd data past the current rpc header */
+	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
+		entry->ce_datav.iov_len);
+	resv->iov_len = len + entry->ce_datav.iov_len;
+	return 1;
+}
+
+/*
+ * Keep the first page of the replay. Copy the NFSv4.1 data from the first
+ * cached page.  Replace any futher replay pages from the cache.
+ */
+__be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+			 struct nfsd4_sequence *seq)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	__be32 status;
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/*
+	 * If this is just the sequence operation, we did not keep
+	 * a page in the cache entry because we can just use the
+	 * slot info stored in struct nfsd4_sequence that was checked
+	 * against the slot in nfsd4_sequence().
+	 *
+	 * This occurs when seq->cachethis is FALSE, or when the client
+	 * session inactivity timer fires and a solo sequence operation
+	 * is sent (lease renewal).
+	 */
+	if (seq && nfsd4_not_cached(resp)) {
+		seq->maxslots = resp->cstate.session->se_fnumslots;
+		return nfs_ok;
+	}
+
+	if (!nfsd41_copy_replay_data(resp, entry)) {
+		/*
+		 * Not enough room to use the replay rpc header, send the
+		 * cached header. Release all the allocated result pages.
+		 */
+		svc_free_res_pages(resp->rqstp);
+		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
+			entry->ce_resused);
+	} else {
+		/* Release all but the first allocated result page */
+
+		resp->rqstp->rq_resused--;
+		svc_free_res_pages(resp->rqstp);
+
+		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
+				 &entry->ce_respages[1],
+				 entry->ce_resused - 1);
+	}
+
+	resp->rqstp->rq_resused = entry->ce_resused;
+	resp->opcnt = entry->ce_opcnt;
+	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
+	status = entry->ce_status;
+
+	return status;
+}
+
+/*
+ * Set the exchange_id flags returned by the server.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+	/* pNFS is not supported */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+
+	/* Referrals are supported, Migration is not. */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+	/* set the wire flags to return to client. */
+	clid->flags = new->cl_exchange_flags;
+}
+
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp,
+		  struct nfsd4_compound_state *cstate,
+		  struct nfsd4_exchange_id *exid)
+{
+	struct nfs4_client *unconf, *conf, *new;
+	int status;
+	unsigned int		strhashval;
+	char			dname[HEXDIR_LEN];
+	nfs4_verifier		verf = exid->verifier;
+	u32			ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+
+	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+		" ip_addr=%u flags %x, spa_how %d\n",
+		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
+		ip_addr, exid->flags, exid->spa_how);
+
+	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+		return nfserr_inval;
+
+	/* Currently only support SP4_NONE */
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
+	default:
+		BUG();				/* checked by xdr code */
+	case SP4_MACH_CRED:
+		return nfserr_serverfault;	/* no excuse :-/ */
+	}
+
+	status = nfs4_make_rec_clidname(dname, &exid->clname);
+
+	if (status)
+		goto error;
+
+	strhashval = clientstr_hashval(dname);
+
+	nfs4_lock_state();
+	status = nfs_ok;
+
+	conf = find_confirmed_client_by_str(dname, strhashval, true);
+	if (conf) {
+		if (!same_verf(&verf, &conf->cl_verifier)) {
+			/* 18.35.4 case 8 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_not_same;
+				goto out;
+			}
+			/* Client reboot: destroy old state */
+			expire_client(conf);
+			goto out_new;
+		}
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			/* 18.35.4 case 9 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_perm;
+				goto out;
+			}
+			expire_client(conf);
+			goto out_new;
+		}
+		if (ip_addr != conf->cl_addr &&
+		    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
+			/* Client collision. 18.35.4 case 3 */
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+		/*
+		 * Set bit when the owner id and verifier map to an already
+		 * confirmed client id (18.35.3).
+		 */
+		exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+
+		/*
+		 * Falling into 18.35.4 case 2, possible router replay.
+		 * Leave confirmed record intact and return same result.
+		 */
+		copy_verf(conf, &verf);
+		new = conf;
+		goto out_copy;
+	} else {
+		/* 18.35.4 case 7 */
+		if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+			status = nfserr_noent;
+			goto out;
+		}
+	}
+
+	unconf  = find_unconfirmed_client_by_str(dname, strhashval, true);
+	if (unconf) {
+		/*
+		 * Possible retry or client restart.  Per 18.35.4 case 4,
+		 * a new unconfirmed record should be generated regardless
+		 * of whether any properties have changed.
+		 */
+		expire_client(unconf);
+	}
+
+out_new:
+	/* Normal case */
+	new = create_client(exid->clname, dname);
+	if (new == NULL) {
+		status = nfserr_resource;
+		goto out;
+	}
+
+	copy_verf(new, &verf);
+	copy_cred(&new->cl_cred, &rqstp->rq_cred);
+	new->cl_addr = ip_addr;
+	gen_clid(new);
+	gen_confirm(new);
+	add_to_unconfirmed(new, strhashval);
+out_copy:
+	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
+	exid->clientid.cl_id = new->cl_clientid.cl_id;
+
+	new->cl_slot.sl_seqid = 0;
+	exid->seqid = 1;
+	nfsd4_set_ex_flags(new, exid);
+
+	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+		new->cl_slot.sl_seqid, new->cl_exchange_flags);
+	status = nfs_ok;
+
+out:
+	nfs4_unlock_state();
+error:
+	dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
+	return status;
+}
+
+static int
+check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+{
+	dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
+		slot->sl_seqid);
+
+	/* The slot is in use, and no response has been sent. */
+	if (slot->sl_inuse) {
+		if (seqid == slot->sl_seqid)
+			return nfserr_jukebox;
+		else
+			return nfserr_seq_misordered;
+	}
+	/* Normal */
+	if (likely(seqid == slot->sl_seqid + 1))
+		return nfs_ok;
+	/* Replay */
+	if (seqid == slot->sl_seqid)
+		return nfserr_replay_cache;
+	/* Wraparound */
+	if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+		return nfs_ok;
+	/* Misordered replay or misordered new request */
+	return nfserr_seq_misordered;
+}
+
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_create_session *cr_ses)
+{
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *conf, *unconf;
+	struct nfsd4_slot *slot = NULL;
+	int status = 0;
+
+	nfs4_lock_state();
+	unconf = find_unconfirmed_client(&cr_ses->clientid);
+	conf = find_confirmed_client(&cr_ses->clientid);
+
+	if (conf) {
+		slot = &conf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status == nfserr_replay_cache) {
+			dprintk("Got a create_session replay! seqid= %d\n",
+				slot->sl_seqid);
+			cstate->slot = slot;
+			cstate->status = status;
+			/* Return the cached reply status */
+			status = nfsd4_replay_cache_entry(resp, NULL);
+			goto out;
+		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+			status = nfserr_seq_misordered;
+			dprintk("Sequence misordered!\n");
+			dprintk("Expected seqid= %d but got seqid= %d\n",
+				slot->sl_seqid, cr_ses->seqid);
+			goto out;
+		}
+		conf->cl_slot.sl_seqid++;
+	} else if (unconf) {
+		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+		    (ip_addr != unconf->cl_addr)) {
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+
+		slot = &unconf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status) {
+			/* an unconfirmed replay returns misordered */
+			status = nfserr_seq_misordered;
+			goto out;
+		}
+
+		slot->sl_seqid++; /* from 0 to 1 */
+		move_to_confirmed(unconf);
+
+		/*
+		 * We do not support RDMA or persistent sessions
+		 */
+		cr_ses->flags &= ~SESSION4_PERSIST;
+		cr_ses->flags &= ~SESSION4_RDMA;
+
+		conf = unconf;
+	} else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+
+	status = alloc_init_session(rqstp, conf, cr_ses);
+	if (status)
+		goto out;
+
+	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+	cr_ses->seqid = slot->sl_seqid;
+
+	slot->sl_inuse = true;
+	cstate->slot = slot;
+	/* Ensure a page is used for the cache */
+	slot->sl_cache_entry.ce_cachethis = 1;
+out:
+	nfs4_unlock_state();
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_destroy_session(struct svc_rqst *r,
+		      struct nfsd4_compound_state *cstate,
+		      struct nfsd4_destroy_session *sessionid)
+{
+	struct nfsd4_session *ses;
+	u32 status = nfserr_badsession;
+
+	/* Notes:
+	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
+	 * - Should we return nfserr_back_chan_busy if waiting for
+	 *   callbacks on to-be-destroyed session?
+	 * - Do we need to clear any callback info from previous session?
+	 */
+
+	dump_sessionid(__func__, &sessionid->sessionid);
+	spin_lock(&sessionid_lock);
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+	if (!ses) {
+		spin_unlock(&sessionid_lock);
+		goto out;
+	}
+
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+
+	/* wait for callbacks */
+	shutdown_callback_client(ses->se_client);
+	nfsd4_put_session(ses);
+	status = nfs_ok;
+out:
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp,
+	       struct nfsd4_compound_state *cstate,
+	       struct nfsd4_sequence *seq)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_session *session;
+	struct nfsd4_slot *slot;
+	int status;
+
+	if (resp->opcnt != 1)
+		return nfserr_sequence_pos;
+
+	spin_lock(&sessionid_lock);
+	status = nfserr_badsession;
+	session = find_in_sessionid_hashtbl(&seq->sessionid);
+	if (!session)
+		goto out;
+
+	status = nfserr_badslot;
+	if (seq->slotid >= session->se_fnumslots)
+		goto out;
+
+	slot = &session->se_slots[seq->slotid];
+	dprintk("%s: slotid %d\n", __func__, seq->slotid);
+
+	status = check_slot_seqid(seq->seqid, slot);
+	if (status == nfserr_replay_cache) {
+		cstate->slot = slot;
+		cstate->session = session;
+		/* Return the cached reply status and set cstate->status
+		 * for nfsd4_svc_encode_compoundres processing */
+		status = nfsd4_replay_cache_entry(resp, seq);
+		cstate->status = nfserr_replay_cache;
+		goto replay_cache;
+	}
+	if (status)
+		goto out;
+
+	/* Success! bump slot seqid */
+	slot->sl_inuse = true;
+	slot->sl_seqid = seq->seqid;
+	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
+	/* Always set the cache entry cachethis for solo sequence */
+	if (nfsd4_is_solo_sequence(resp))
+		slot->sl_cache_entry.ce_cachethis = 1;
+
+	cstate->slot = slot;
+	cstate->session = session;
+
+replay_cache:
+	/* Renew the clientid on success and on replay.
+	 * Hold a session reference until done processing the compound:
+	 * nfsd4_put_session called only if the cstate slot is set.
+	 */
+	renew_client(session->se_client);
+	nfsd4_get_session(session);
+out:
+	spin_unlock(&sessionid_lock);
+	dprintk("%s: return %d\n", __func__, ntohl(status));
+	return status;
+}
+
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	strhashval = clientstr_hashval(dname);
 
 	nfs4_lock_state();
-	conf = find_confirmed_client_by_str(dname, strhashval);
+	conf = find_confirmed_client_by_str(dname, strhashval, false);
 	if (conf) {
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
-		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
-				|| conf->cl_addr != sin->sin_addr.s_addr) {
-			dprintk("NFSD: setclientid: string in use by clientat %pI4\n",
-				&conf->cl_addr);
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			dprintk("NFSD: setclientid: string in use by client"
+				" at %pI4\n", &conf->cl_addr);
 			goto out;
 		}
 	}
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * has a description of SETCLIENTID request processing consisting
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
-	unconf = find_unconfirmed_client_by_str(dname, strhashval);
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
 	status = nfserr_resource;
 	if (!conf) {
 		/*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			unsigned int hash =
 				clientstr_hashval(unconf->cl_recdir);
 			conf = find_confirmed_client_by_str(unconf->cl_recdir,
-									hash);
+							    hash, false);
 			if (conf) {
 				nfsd4_remove_clid_dir(conf);
 				expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
 
 	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
 	if (fp) {
-		kref_init(&fp->fi_ref);
+		atomic_set(&fp->fi_ref, 1);
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
+		spin_lock(&recall_lock);
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	return sop;
 }
 
-static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
-{
-	struct nfs4_stateowner *lock_sop;
-
-	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners);  */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_stateowner(lock_sop);
-	}
-}
-
-static void
-unhash_stateowner(struct nfs4_stateowner *sop)
-{
-	struct nfs4_stateid *stp;
-
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	if (sop->so_is_open_owner)
-		list_del(&sop->so_perclient);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_entry(sop->so_stateids.next,
-			struct nfs4_stateid, st_perstateowner);
-		if (sop->so_is_open_owner)
-			release_stateid(stp, OPEN_STATE);
-		else
-			release_stateid(stp, LOCK_STATE);
-	}
-}
-
-static void
-release_stateowner(struct nfs4_stateowner *sop)
-{
-	unhash_stateowner(sop);
-	list_del(&sop->so_close_lru);
-	nfs4_put_stateowner(sop);
-}
-
 static inline void
 init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
 	struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
+		  &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 	stp->st_openstp = NULL;
 }
 
 static void
-release_stateid(struct nfs4_stateid *stp, int flags)
-{
-	struct file *filp = stp->st_vfs_file;
-
-	list_del(&stp->st_hash);
-	list_del(&stp->st_perfile);
-	list_del(&stp->st_perstateowner);
-	if (flags & OPEN_STATE) {
-		release_stateid_lockowners(stp);
-		stp->st_vfs_file = NULL;
-		nfsd_close(filp);
-	} else if (flags & LOCK_STATE)
-		locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
-	put_nfs4_file(stp->st_file);
-	kmem_cache_free(stateid_slab, stp);
-}
-
-static void
 move_to_close_lru(struct nfs4_stateowner *sop)
 {
 	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
 	unsigned int hashval = file_hashval(ino);
 	struct nfs4_file *fp;
 
+	spin_lock(&recall_lock);
 	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
 		if (fp->fi_inode == ino) {
 			get_nfs4_file(fp);
+			spin_unlock(&recall_lock);
 			return fp;
 		}
 	}
+	spin_unlock(&recall_lock);
 	return NULL;
 }
 
-static inline int access_valid(u32 x)
+static inline int access_valid(u32 x, u32 minorversion)
 {
-	if (x < NFS4_SHARE_ACCESS_READ)
+	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
 		return 0;
-	if (x > NFS4_SHARE_ACCESS_BOTH)
+	if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
+		return 0;
+	x &= ~NFS4_SHARE_ACCESS_MASK;
+	if (minorversion && x) {
+		if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
+			return 0;
+		if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
+			return 0;
+		x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
+	}
+	if (x)
 		return 0;
 	return 1;
 }
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
 
 
 __be32
-nfsd4_process_open1(struct nfsd4_open *open)
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+		    struct nfsd4_open *open)
 {
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
 			return nfserr_expired;
 		goto renew;
 	}
+	/* When sessions are used, skip open sequenceid processing */
+	if (nfsd4_has_session(cstate))
+		goto renew;
 	if (!sop->so_confirmed) {
 		/* Replace unconfirmed owners without checking for replay. */
 		clp = sop->so_client;
-		release_stateowner(sop);
+		release_openowner(sop);
 		open->op_stateowner = NULL;
 		goto renew;
 	}
@@ -1709,6 +2498,7 @@ out:
 __be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	__be32 status;
 
 	status = nfserr_inval;
-	if (!access_valid(open->op_share_access)
+	if (!access_valid(open->op_share_access, resp->cstate.minorversion)
 			|| !deny_valid(open->op_share_deny))
 		goto out;
 	/*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		init_stateid(stp, fp, open);
 		status = nfsd4_truncate(rqstp, current_fh, open);
 		if (status) {
-			release_stateid(stp, OPEN_STATE);
+			release_open_stateid(stp);
 			goto out;
 		}
+		if (nfsd4_has_session(&resp->cstate))
+			update_stateid(&stp->st_stateid);
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
+	if (nfsd4_has_session(&resp->cstate))
+		open->op_stateowner->so_confirmed = 1;
+
 	/*
 	* Attempt to hand out a delegation. No error return, because the
 	* OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
 	* To finish the open response, we just need to set the rflags.
 	*/
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!open->op_stateowner->so_confirmed)
+	if (!open->op_stateowner->so_confirmed &&
+	    !nfsd4_has_session(&resp->cstate))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
 
 	return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
 		}
 		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
 			sop->so_id);
-		release_stateowner(sop);
+		release_openowner(sop);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
 static inline __be32
 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 {
-	/* Trying to call delegreturn with a special stateid? Yuch: */
-	if (!(flags & (RD_STATE | WR_STATE)))
-		return nfserr_bad_stateid;
-	else if (ONE_STATEID(stateid) && (flags & RD_STATE))
+	if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
 	else if (locks_in_grace()) {
 		/* Answer in remaining cases depends on existance of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
  * that are not able to provide mandatory locking.
  */
 static inline int
-io_during_grace_disallowed(struct inode *inode, int flags)
+grace_disallows_io(struct inode *inode)
 {
-	return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
-		&& mandatory_lock(inode);
+	return locks_in_grace() && mandatory_lock(inode);
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref)
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
 {
+	/*
+	 * When sessions are used the stateid generation number is ignored
+	 * when it is zero.
+	 */
+	if ((flags & HAS_SESSION) && in->si_generation == 0)
+		goto out;
+
 	/* If the client sends us a stateid from the future, it's buggy: */
 	if (in->si_generation > ref->si_generation)
 		return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
 	 */
 	if (in->si_generation < ref->si_generation)
 		return nfserr_old_stateid;
+out:
 	return nfs_ok;
 }
 
+static int is_delegation_stateid(stateid_t *stateid)
+{
+	return stateid->si_fileid == 0;
+}
+
 /*
 * Checks for stateid operations
 */
 __be32
-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+			   stateid_t *stateid, int flags, struct file **filpp)
 {
 	struct nfs4_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
-	stateid_t *stidp;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	__be32 status;
 
-	dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
-		stateid->si_boot, stateid->si_stateownerid, 
-		stateid->si_fileid, stateid->si_generation); 
 	if (filpp)
 		*filpp = NULL;
 
-	if (io_during_grace_disallowed(ino, flags))
+	if (grace_disallows_io(ino))
 		return nfserr_grace;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return check_special_stateids(current_fh, stateid, flags);
 
-	/* STALE STATEID */
 	status = nfserr_stale_stateid;
 	if (STALE_STATEID(stateid)) 
 		goto out;
 
-	/* BAD STATEID */
 	status = nfserr_bad_stateid;
-	if (!stateid->si_fileid) { /* delegation stateid */
-		if(!(dp = find_delegation_stateid(ino, stateid))) {
-			dprintk("NFSD: delegation stateid not found\n");
+	if (is_delegation_stateid(stateid)) {
+		dp = find_delegation_stateid(ino, stateid);
+		if (!dp)
 			goto out;
-		}
-		stidp = &dp->dl_stateid;
+		status = check_stateid_generation(stateid, &dp->dl_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_delegmode(dp, flags);
+		if (status)
+			goto out;
+		renew_client(dp->dl_client);
+		if (filpp)
+			*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
-		if (!(stp = find_stateid(stateid, flags))) {
-			dprintk("NFSD: open or lock stateid not found\n");
+		stp = find_stateid(stateid, flags);
+		if (!stp)
 			goto out;
-		}
-		if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
+		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
 			goto out;
-		stidp = &stp->st_stateid;
-	}
-	status = check_stateid_generation(stateid, stidp);
-	if (status)
-		goto out;
-	if (stp) {
-		if ((status = nfs4_check_openmode(stp,flags)))
+		status = check_stateid_generation(stateid, &stp->st_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_openmode(stp, flags);
+		if (status)
 			goto out;
 		renew_client(stp->st_stateowner->so_client);
 		if (filpp)
 			*filpp = stp->st_vfs_file;
-	} else {
-		if ((status = nfs4_check_delegmode(dp, flags)))
-			goto out;
-		renew_client(dp->dl_client);
-		if (flags & DELEG_RET)
-			unhash_delegation(dp);
-		if (filpp)
-			*filpp = dp->dl_vfs_file;
 	}
 	status = nfs_ok;
 out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
  * Checks for sequence id mutating operations. 
  */
 static __be32
-nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+			 stateid_t *stateid, int flags,
+			 struct nfs4_stateowner **sopp,
+			 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
 {
 	struct nfs4_stateid *stp;
 	struct nfs4_stateowner *sop;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
 
 	dprintk("NFSD: preprocess_seqid_op: seqid=%d " 
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 
 	if (STALE_STATEID(stateid))
 		return nfserr_stale_stateid;
+
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	/*
 	* We return BAD_STATEID if filehandle doesn't match stateid, 
 	* the confirmed flag is incorrecly set, or the generation 
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		if (lock->lk_is_new) {
 			if (!sop->so_is_open_owner)
 				return nfserr_bad_stateid;
-			if (!same_clid(&clp->cl_clientid, lockclid))
-			       return nfserr_bad_stateid;
+			if (!(flags & HAS_SESSION) &&
+			    !same_clid(&clp->cl_clientid, lockclid))
+				return nfserr_bad_stateid;
 			/* stp is the open stateid */
 			status = nfs4_check_openmode(stp, lkflg);
 			if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	*  For the moment, we ignore the possibility of 
 	*  generation number wraparound.
 	*/
-	if (seqid != sop->so_seqid)
+	if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
 		goto check_replay;
 
 	if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 				" confirmed yet!\n");
 		return nfserr_bad_stateid;
 	}
-	status = check_stateid_generation(stateid, &stp->st_stateid);
+	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
 	if (status)
 		return status;
 	renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					oc->oc_seqid, &oc->oc_req_stateid,
 					CONFIRM | OPEN_STATE,
 					&oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
-	if (!access_valid(od->od_share_access)
+	if (!access_valid(od->od_share_access, cstate->minorversion)
 			|| !deny_valid(od->od_share_deny))
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					od->od_seqid,
 					&od->od_stateid, 
 					OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check close_lru for replay */
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					close->cl_seqid,
 					&close->cl_stateid, 
 					OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
 
 	/* release_stateid() calls nfsd_close() if needed */
-	release_stateid(stp, OPEN_STATE);
+	release_open_stateid(stp);
 
 	/* place unused nfs4_stateowners on so_close_lru list to be
 	 * released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_delegreturn *dr)
 {
+	struct nfs4_delegation *dp;
+	stateid_t *stateid = &dr->dr_stateid;
+	struct inode *inode;
 	__be32 status;
+	int flags = 0;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
-		goto out;
+		return status;
+	inode = cstate->current_fh.fh_dentry->d_inode;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-					    &dr->dr_stateid, DELEG_RET, NULL);
-	nfs4_unlock_state();
+	status = nfserr_bad_stateid;
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		goto out;
+	status = nfserr_stale_stateid;
+	if (STALE_STATEID(stateid))
+		goto out;
+	status = nfserr_bad_stateid;
+	if (!is_delegation_stateid(stateid))
+		goto out;
+	dp = find_delegation_stateid(inode, stateid);
+	if (!dp)
+		goto out;
+	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+	if (status)
+		goto out;
+	renew_client(dp->dl_client);
+
+	unhash_delegation(dp);
 out:
+	nfs4_unlock_state();
+
 	return status;
 }
 
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfs4_file *fp;
 		
 		status = nfserr_stale_clientid;
-		if (STALE_CLIENTID(&lock->lk_new_clientid))
+		if (!nfsd4_has_session(cstate) &&
+		    STALE_CLIENTID(&lock->lk_new_clientid))
 			goto out;
 
 		/* validate and update open stateid and open seqid */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				        lock->lk_new_open_seqid,
 		                        &lock->lk_new_open_stateid,
 					OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 	} else {
 		/* lock (lock owner + lock stateid) already exists */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid, 
 				       &lock->lk_old_lock_stateid, 
 				       LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 out:
 	if (status && lock->lk_is_new && lock_sop)
-		release_stateowner(lock_sop);
+		release_lockowner(lock_sop);
 	if (lock->lk_replay_owner) {
 		nfs4_get_stateowner(lock->lk_replay_owner);
 		cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 
 	status = nfserr_stale_clientid;
-	if (STALE_CLIENTID(&lockt->lt_clientid))
+	if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
 		goto out;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 									        
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					locku->lu_seqid, 
 					&locku->lu_stateid, 
 					LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		/* unhash_stateowner deletes so_perclient only
 		 * for openowners. */
 		list_del(&sop->so_perclient);
-		release_stateowner(sop);
+		release_lockowner(sop);
 	}
 out:
 	nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
 }
 
 int
-nfs4_has_reclaimed_state(const char *name)
+nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
 {
 	unsigned int strhashval = clientstr_hashval(name);
 	struct nfs4_client *clp;
 
-	clp = find_confirmed_client_by_str(name, strhashval);
+	clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
 	return clp ? 1 : 0;
 }
 
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
 		INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
 		INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
 	}
+	for (i = 0; i < SESSION_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&sessionid_hashtbl[i]);
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f65953be39c0..b820c311931c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/vfs.h>
+#include <linux/utsname.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 	return p;
 }
 
+static int zero_clientid(clientid_t *clid)
+{
+	return (clid->cl_boot == 0) && (clid->cl_id == 0);
+}
+
 static int
 defer_free(struct nfsd4_compoundargs *argp,
 		void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 
 	bmval[0] = 0;
 	bmval[1] = 0;
+	bmval[2] = 0;
 
 	READ_BUF(4);
 	READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 		READ32(bmval[0]);
 	if (bmlen > 1)
 		READ32(bmval[1]);
+	if (bmlen > 2)
+		READ32(bmval[2]);
 
 	DECODE_TAIL;
 }
 
+static u32 nfsd_attrmask[] = {
+	NFSD_WRITEABLE_ATTRS_WORD0,
+	NFSD_WRITEABLE_ATTRS_WORD1,
+	NFSD_WRITEABLE_ATTRS_WORD2
+};
+
+static u32 nfsd41_ex_attrmask[] = {
+	NFSD_SUPPATTR_EXCLCREAT_WORD0,
+	NFSD_SUPPATTR_EXCLCREAT_WORD1,
+	NFSD_SUPPATTR_EXCLCREAT_WORD2
+};
+
 static __be32
-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
-    struct nfs4_acl **acl)
+nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
+		   struct iattr *iattr, struct nfs4_acl **acl)
 {
 	int expected_len, len = 0;
 	u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 	 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
 	 * read-only attributes return ERR_INVAL.
 	 */
-	if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
+	    (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
+	    (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
 		return nfserr_attrnotsupp;
-	if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1))
+	if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
+	    (bmval[2] & ~writable[2]))
 		return nfserr_inval;
 
 	READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 			goto xdr_error;
 		}
 	}
+	BUG_ON(bmval[2]);	/* no such writeable attr supported yet */
 	if (len != expected_len)
 		goto xdr_error;
 
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 	if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
 		return status;
 
-	if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl)))
+	status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
+				    &create->cr_iattr, &create->cr_acl);
+	if (status)
 		goto out;
 
 	DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
 	READ_BUF(lockt->lt_owner.len);
 	READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
 
+	if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
+		return nfserr_inval;
 	DECODE_TAIL;
 }
 
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		switch (open->op_createmode) {
 		case NFS4_CREATE_UNCHECKED:
 		case NFS4_CREATE_GUARDED:
-			if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl)))
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				nfsd_attrmask, &open->op_iattr, &open->op_acl);
+			if (status)
 				goto out;
 			break;
 		case NFS4_CREATE_EXCLUSIVE:
 			READ_BUF(8);
 			COPYMEM(open->op_verf.data, 8);
 			break;
+		case NFS4_CREATE_EXCLUSIVE4_1:
+			if (argp->minorversion < 1)
+				goto xdr_error;
+			READ_BUF(8);
+			COPYMEM(open->op_verf.data, 8);
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				nfsd41_ex_attrmask, &open->op_iattr,
+				&open->op_acl);
+			if (status)
+				goto out;
+			break;
 		default:
 			goto xdr_error;
 		}
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
 	status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
 	if (status)
 		return status;
-	return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+	return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
 				  &setattr->sa_iattr, &setattr->sa_acl);
 }
 
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
 	READ_BUF(rlockowner->rl_owner.len);
 	READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
 
+	if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
+		return nfserr_inval;
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+			 struct nfsd4_exchange_id *exid)
+{
+	int dummy;
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_VERIFIER_SIZE);
+	COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
+
+	READ_BUF(4);
+	READ32(exid->clname.len);
+
+	READ_BUF(exid->clname.len);
+	SAVEMEM(exid->clname.data, exid->clname.len);
+
+	READ_BUF(4);
+	READ32(exid->flags);
+
+	/* Ignore state_protect4_a */
+	READ_BUF(4);
+	READ32(exid->spa_how);
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* spo_must_allow */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+		break;
+	case SP4_SSV:
+		/* ssp_ops */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* ssp_hash_algs<> */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* ssp_encr_algs<> */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* ssp_window and ssp_num_gss_handles */
+		READ_BUF(8);
+		READ32(dummy);
+		READ32(dummy);
+		break;
+	default:
+		goto xdr_error;
+	}
+
+	/* Ignore Implementation ID */
+	READ_BUF(4);    /* nfs_impl_id4 array length */
+	READ32(dummy);
+
+	if (dummy > 1)
+		goto xdr_error;
+
+	if (dummy == 1) {
+		/* nii_domain */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_name */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_date */
+		READ_BUF(12);
+		p += 3;
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_create_session *sess)
+{
+	DECODE_HEAD;
+
+	u32 dummy;
+	char *machine_name;
+	int i;
+	int nr_secflavs;
+
+	READ_BUF(16);
+	COPYMEM(&sess->clientid, 8);
+	READ32(sess->seqid);
+	READ32(sess->flags);
+
+	/* Fore channel attrs */
+	READ_BUF(28);
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->fore_channel.maxreq_sz);
+	READ32(sess->fore_channel.maxresp_sz);
+	READ32(sess->fore_channel.maxresp_cached);
+	READ32(sess->fore_channel.maxops);
+	READ32(sess->fore_channel.maxreqs);
+	READ32(sess->fore_channel.nr_rdma_attrs);
+	if (sess->fore_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->fore_channel.rdma_attrs);
+	} else if (sess->fore_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many fore channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	/* Back channel attrs */
+	READ_BUF(28);
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->back_channel.maxreq_sz);
+	READ32(sess->back_channel.maxresp_sz);
+	READ32(sess->back_channel.maxresp_cached);
+	READ32(sess->back_channel.maxops);
+	READ32(sess->back_channel.maxreqs);
+	READ32(sess->back_channel.nr_rdma_attrs);
+	if (sess->back_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->back_channel.rdma_attrs);
+	} else if (sess->back_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many back channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	READ_BUF(8);
+	READ32(sess->callback_prog);
+
+	/* callback_sec_params4 */
+	READ32(nr_secflavs);
+	for (i = 0; i < nr_secflavs; ++i) {
+		READ_BUF(4);
+		READ32(dummy);
+		switch (dummy) {
+		case RPC_AUTH_NULL:
+			/* Nothing to read */
+			break;
+		case RPC_AUTH_UNIX:
+			READ_BUF(8);
+			/* stamp */
+			READ32(dummy);
+
+			/* machine name */
+			READ32(dummy);
+			READ_BUF(dummy);
+			SAVEMEM(machine_name, dummy);
+
+			/* uid, gid */
+			READ_BUF(8);
+			READ32(sess->uid);
+			READ32(sess->gid);
+
+			/* more gids */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy * 4);
+			for (i = 0; i < dummy; ++i)
+				READ32(dummy);
+			break;
+		case RPC_AUTH_GSS:
+			dprintk("RPC_AUTH_GSS callback secflavor "
+				"not supported!\n");
+			READ_BUF(8);
+			/* gcbp_service */
+			READ32(dummy);
+			/* gcbp_handle_from_server */
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			/* gcbp_handle_from_client */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			break;
+		default:
+			dprintk("Illegal callback secflavor\n");
+			return nfserr_inval;
+		}
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
+			     struct nfsd4_destroy_session *destroy_session)
+{
+	DECODE_HEAD;
+	READ_BUF(NFS4_MAX_SESSIONID_LEN);
+	COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
+		      struct nfsd4_sequence *seq)
+{
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+	COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	READ32(seq->seqid);
+	READ32(seq->slotid);
+	READ32(seq->maxslots);
+	READ32(seq->cachethis);
+
 	DECODE_TAIL;
 }
 
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 static __be32
 nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
 {
-	return nfserr_opnotsupp;
+	return nfserr_notsupp;
 }
 
 typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_OPEN_CONFIRM]	= (nfsd4_dec)nfsd4_decode_open_confirm,
 	[OP_OPEN_DOWNGRADE]	= (nfsd4_dec)nfsd4_decode_open_downgrade,
 	[OP_PUTFH]		= (nfsd4_dec)nfsd4_decode_putfh,
-	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_noop,
 	[OP_PUTROOTFH]		= (nfsd4_dec)nfsd4_decode_noop,
 	[OP_READ]		= (nfsd4_dec)nfsd4_decode_read,
 	[OP_READDIR]		= (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_release_lockowner,
 };
 
+static nfsd4_dec nfsd41_dec_ops[] = {
+	[OP_ACCESS]		(nfsd4_dec)nfsd4_decode_access,
+	[OP_CLOSE]		(nfsd4_dec)nfsd4_decode_close,
+	[OP_COMMIT]		(nfsd4_dec)nfsd4_decode_commit,
+	[OP_CREATE]		(nfsd4_dec)nfsd4_decode_create,
+	[OP_DELEGPURGE]		(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DELEGRETURN]	(nfsd4_dec)nfsd4_decode_delegreturn,
+	[OP_GETATTR]		(nfsd4_dec)nfsd4_decode_getattr,
+	[OP_GETFH]		(nfsd4_dec)nfsd4_decode_noop,
+	[OP_LINK]		(nfsd4_dec)nfsd4_decode_link,
+	[OP_LOCK]		(nfsd4_dec)nfsd4_decode_lock,
+	[OP_LOCKT]		(nfsd4_dec)nfsd4_decode_lockt,
+	[OP_LOCKU]		(nfsd4_dec)nfsd4_decode_locku,
+	[OP_LOOKUP]		(nfsd4_dec)nfsd4_decode_lookup,
+	[OP_LOOKUPP]		(nfsd4_dec)nfsd4_decode_noop,
+	[OP_NVERIFY]		(nfsd4_dec)nfsd4_decode_verify,
+	[OP_OPEN]		(nfsd4_dec)nfsd4_decode_open,
+	[OP_OPENATTR]		(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_CONFIRM]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_DOWNGRADE]	(nfsd4_dec)nfsd4_decode_open_downgrade,
+	[OP_PUTFH]		(nfsd4_dec)nfsd4_decode_putfh,
+	[OP_PUTPUBFH]		(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_PUTROOTFH]		(nfsd4_dec)nfsd4_decode_noop,
+	[OP_READ]		(nfsd4_dec)nfsd4_decode_read,
+	[OP_READDIR]		(nfsd4_dec)nfsd4_decode_readdir,
+	[OP_READLINK]		(nfsd4_dec)nfsd4_decode_noop,
+	[OP_REMOVE]		(nfsd4_dec)nfsd4_decode_remove,
+	[OP_RENAME]		(nfsd4_dec)nfsd4_decode_rename,
+	[OP_RENEW]		(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RESTOREFH]		(nfsd4_dec)nfsd4_decode_noop,
+	[OP_SAVEFH]		(nfsd4_dec)nfsd4_decode_noop,
+	[OP_SECINFO]		(nfsd4_dec)nfsd4_decode_secinfo,
+	[OP_SETATTR]		(nfsd4_dec)nfsd4_decode_setattr,
+	[OP_SETCLIENTID]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_VERIFY]		(nfsd4_dec)nfsd4_decode_verify,
+	[OP_WRITE]		(nfsd4_dec)nfsd4_decode_write,
+	[OP_RELEASE_LOCKOWNER]	(nfsd4_dec)nfsd4_decode_notsupp,
+
+	/* new operations for NFSv4.1 */
+	[OP_BACKCHANNEL_CTL]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_EXCHANGE_ID]	(nfsd4_dec)nfsd4_decode_exchange_id,
+	[OP_CREATE_SESSION]	(nfsd4_dec)nfsd4_decode_create_session,
+	[OP_DESTROY_SESSION]	(nfsd4_dec)nfsd4_decode_destroy_session,
+	[OP_FREE_STATEID]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GET_DIR_DELEGATION]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GETDEVICEINFO]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GETDEVICELIST]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTCOMMIT]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTGET]		(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTRETURN]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SECINFO_NO_NAME]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SEQUENCE]		(nfsd4_dec)nfsd4_decode_sequence,
+	[OP_SET_SSV]		(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_TEST_STATEID]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_WANT_DELEGATION]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DESTROY_CLIENTID]	(nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE]	(nfsd4_dec)nfsd4_decode_notsupp,
+};
+
 struct nfsd4_minorversion_ops {
 	nfsd4_dec *decoders;
 	int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
 
 static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
 	[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
+	[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
 };
 
 static __be32
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 {
 	u32 bmval0 = bmval[0];
 	u32 bmval1 = bmval[1];
+	u32 bmval2 = bmval[2];
 	struct kstat stat;
 	struct svc_fh tempfh;
 	struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	int err;
 	int aclsupport = 0;
 	struct nfs4_acl *acl = NULL;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	u32 minorversion = resp->cstate.minorversion;
 
 	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
-	BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
-	BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1);
+	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
+	BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
+	BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
 
 	if (exp->ex_fslocs.migrated) {
+		BUG_ON(bmval[2]);
 		status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
 		if (status)
 			goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if ((buflen -= 16) < 0)
 		goto out_resource;
 
-	WRITE32(2);
-	WRITE32(bmval0);
-	WRITE32(bmval1);
+	if (unlikely(bmval2)) {
+		WRITE32(3);
+		WRITE32(bmval0);
+		WRITE32(bmval1);
+		WRITE32(bmval2);
+	} else if (likely(bmval1)) {
+		WRITE32(2);
+		WRITE32(bmval0);
+		WRITE32(bmval1);
+	} else {
+		WRITE32(1);
+		WRITE32(bmval0);
+	}
 	attrlenp = p++;                /* to be backfilled later */
 
 	if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
-		u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0;
+		u32 word0 = nfsd_suppattrs0(minorversion);
+		u32 word1 = nfsd_suppattrs1(minorversion);
+		u32 word2 = nfsd_suppattrs2(minorversion);
+
 		if ((buflen -= 12) < 0)
 			goto out_resource;
 		if (!aclsupport)
 			word0 &= ~FATTR4_WORD0_ACL;
 		if (!exp->ex_fslocs.locations)
 			word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
-		WRITE32(2);
-		WRITE32(word0);
-		WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
+		if (!word2) {
+			WRITE32(2);
+			WRITE32(word0);
+			WRITE32(word1);
+		} else {
+			WRITE32(3);
+			WRITE32(word0);
+			WRITE32(word1);
+			WRITE32(word2);
+		}
 	}
 	if (bmval0 & FATTR4_WORD0_TYPE) {
 		if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
 		}
 		WRITE64(stat.ino);
 	}
+	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+		WRITE32(3);
+		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
+		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
+	}
+
 	*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
 	*countp = p - buffer;
 	status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
 }
 
 static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
+			 struct nfsd4_exchange_id *exid)
+{
+	ENCODE_HEAD;
+	char *major_id;
+	char *server_scope;
+	int major_id_sz;
+	int server_scope_sz;
+	uint64_t minor_id = 0;
+
+	if (nfserr)
+		return nfserr;
+
+	major_id = utsname()->nodename;
+	major_id_sz = strlen(major_id);
+	server_scope = utsname()->nodename;
+	server_scope_sz = strlen(server_scope);
+
+	RESERVE_SPACE(
+		8 /* eir_clientid */ +
+		4 /* eir_sequenceid */ +
+		4 /* eir_flags */ +
+		4 /* spr_how (SP4_NONE) */ +
+		8 /* so_minor_id */ +
+		4 /* so_major_id.len */ +
+		(XDR_QUADLEN(major_id_sz) * 4) +
+		4 /* eir_server_scope.len */ +
+		(XDR_QUADLEN(server_scope_sz) * 4) +
+		4 /* eir_server_impl_id.count (0) */);
+
+	WRITEMEM(&exid->clientid, 8);
+	WRITE32(exid->seqid);
+	WRITE32(exid->flags);
+
+	/* state_protect4_r. Currently only support SP4_NONE */
+	BUG_ON(exid->spa_how != SP4_NONE);
+	WRITE32(exid->spa_how);
+
+	/* The server_owner struct */
+	WRITE64(minor_id);      /* Minor id */
+	/* major id */
+	WRITE32(major_id_sz);
+	WRITEMEM(major_id, major_id_sz);
+
+	/* Server scope */
+	WRITE32(server_scope_sz);
+	WRITEMEM(server_scope, server_scope_sz);
+
+	/* Implementation id */
+	WRITE32(0);	/* zero length nfs_impl_id4 array */
+	ADJUST_ARGS();
+	return 0;
+}
+
+static __be32
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
+			    struct nfsd4_create_session *sess)
+{
+	ENCODE_HEAD;
+
+	if (nfserr)
+		return nfserr;
+
+	RESERVE_SPACE(24);
+	WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(sess->seqid);
+	WRITE32(sess->flags);
+	ADJUST_ARGS();
+
+	RESERVE_SPACE(28);
+	WRITE32(0); /* headerpadsz */
+	WRITE32(sess->fore_channel.maxreq_sz);
+	WRITE32(sess->fore_channel.maxresp_sz);
+	WRITE32(sess->fore_channel.maxresp_cached);
+	WRITE32(sess->fore_channel.maxops);
+	WRITE32(sess->fore_channel.maxreqs);
+	WRITE32(sess->fore_channel.nr_rdma_attrs);
+	ADJUST_ARGS();
+
+	if (sess->fore_channel.nr_rdma_attrs) {
+		RESERVE_SPACE(4);
+		WRITE32(sess->fore_channel.rdma_attrs);
+		ADJUST_ARGS();
+	}
+
+	RESERVE_SPACE(28);
+	WRITE32(0); /* headerpadsz */
+	WRITE32(sess->back_channel.maxreq_sz);
+	WRITE32(sess->back_channel.maxresp_sz);
+	WRITE32(sess->back_channel.maxresp_cached);
+	WRITE32(sess->back_channel.maxops);
+	WRITE32(sess->back_channel.maxreqs);
+	WRITE32(sess->back_channel.nr_rdma_attrs);
+	ADJUST_ARGS();
+
+	if (sess->back_channel.nr_rdma_attrs) {
+		RESERVE_SPACE(4);
+		WRITE32(sess->back_channel.rdma_attrs);
+		ADJUST_ARGS();
+	}
+	return 0;
+}
+
+static __be32
+nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
+			     struct nfsd4_destroy_session *destroy_session)
+{
+	return nfserr;
+}
+
+__be32
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
+		      struct nfsd4_sequence *seq)
+{
+	ENCODE_HEAD;
+
+	if (nfserr)
+		return nfserr;
+
+	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
+	WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(seq->seqid);
+	WRITE32(seq->slotid);
+	WRITE32(seq->maxslots);
+	/*
+	 * FIXME: for now:
+	 *   target_maxslots = maxslots
+	 *   status_flags = 0
+	 */
+	WRITE32(seq->maxslots);
+	WRITE32(0);
+
+	ADJUST_ARGS();
+	return 0;
+}
+
+static __be32
 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
 {
 	return nfserr;
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
 
 typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
 
+/*
+ * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
+ * since we don't need to filter out obsolete ops as this is
+ * done in the decoding phase.
+ */
 static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_ACCESS]		= (nfsd4_enc)nfsd4_encode_access,
 	[OP_CLOSE]		= (nfsd4_enc)nfsd4_encode_close,
@@ -2596,6 +3109,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_LOOKUPP]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_NVERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OPEN]		= (nfsd4_enc)nfsd4_encode_open,
+	[OP_OPENATTR]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OPEN_CONFIRM]	= (nfsd4_enc)nfsd4_encode_open_confirm,
 	[OP_OPEN_DOWNGRADE]	= (nfsd4_enc)nfsd4_encode_open_downgrade,
 	[OP_PUTFH]		= (nfsd4_enc)nfsd4_encode_noop,
@@ -2616,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_VERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_WRITE]		= (nfsd4_enc)nfsd4_encode_write,
 	[OP_RELEASE_LOCKOWNER]	= (nfsd4_enc)nfsd4_encode_noop,
+
+	/* NFSv4.1 operations */
+	[OP_BACKCHANNEL_CTL]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+	[OP_EXCHANGE_ID]	= (nfsd4_enc)nfsd4_encode_exchange_id,
+	[OP_CREATE_SESSION]	= (nfsd4_enc)nfsd4_encode_create_session,
+	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_destroy_session,
+	[OP_FREE_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GET_DIR_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
+	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_TEST_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_WANT_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_DESTROY_CLIENTID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_enc)nfsd4_encode_noop,
 };
 
+/*
+ * Calculate the total amount of memory that the compound response has taken
+ * after encoding the current operation.
+ *
+ * pad: add on 8 bytes for the next operation's op_code and status so that
+ * there is room to cache a failure on the next operation.
+ *
+ * Compare this length to the session se_fmaxresp_cached.
+ *
+ * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
+ * will be at least a page and will therefore hold the xdr_buf head.
+ */
+static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
+{
+	int status = 0;
+	struct xdr_buf *xb = &resp->rqstp->rq_res;
+	struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
+	struct nfsd4_session *session = NULL;
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	u32 length, tlen = 0, pad = 8;
+
+	if (!nfsd4_has_session(&resp->cstate))
+		return status;
+
+	session = resp->cstate.session;
+	if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
+		return status;
+
+	if (resp->opcnt >= args->opcnt)
+		pad = 0; /* this is the last operation */
+
+	if (xb->page_len == 0) {
+		length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
+	} else {
+		if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
+			tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
+
+		length = xb->head[0].iov_len + xb->page_len + tlen + pad;
+	}
+	dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
+		length, xb->page_len, tlen, pad);
+
+	if (length <= session->se_fmaxresp_cached)
+		return status;
+	else
+		return nfserr_rep_too_big_to_cache;
+}
+
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
@@ -2634,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 	BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
 	       !nfsd4_enc_ops[op->opnum]);
 	op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
+	/* nfsd4_check_drc_limit guarantees enough room for error status */
+	if (!op->status && nfsd4_check_drc_limit(resp))
+		op->status = nfserr_rep_too_big_to_cache;
 status:
 	/*
 	 * Note: We write the status directly, instead of using WRITE32(),
@@ -2734,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 		iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
+	if (nfsd4_has_session(&resp->cstate)) {
+		if (resp->cstate.status == nfserr_replay_cache &&
+				!nfsd4_not_cached(resp)) {
+			iov->iov_len = resp->cstate.iovlen;
+		} else {
+			nfsd4_store_cache_entry(resp);
+			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+			resp->cstate.slot->sl_inuse = 0;
+		}
+		if (resp->cstate.session)
+			nfsd4_put_session(resp->cstate.session);
+	}
 	return 1;
 }
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce5..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
 	NFSD_FO_UnlockFS,
 	NFSD_Threads,
 	NFSD_Pool_Threads,
+	NFSD_Pool_Stats,
 	NFSD_Versions,
 	NFSD_Ports,
 	NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
 	.owner		= THIS_MODULE,
 };
 
+extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
+
+static struct file_operations pool_stats_operations = {
+	.open		= nfsd_pool_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+	.owner		= THIS_MODULE,
+};
+
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
@@ -781,8 +792,9 @@ out_free:
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
-	char *vers, sign;
+	char *vers, *minorp, sign;
 	int len, num;
+	unsigned minor;
 	ssize_t tlen = 0;
 	char *sep;
 
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 		do {
 			sign = *vers;
 			if (sign == '+' || sign == '-')
-				num = simple_strtol((vers+1), NULL, 0);
+				num = simple_strtol((vers+1), &minorp, 0);
 			else
-				num = simple_strtol(vers, NULL, 0);
+				num = simple_strtol(vers, &minorp, 0);
+			if (*minorp == '.') {
+				if (num < 4)
+					return -EINVAL;
+				minor = simple_strtoul(minorp+1, NULL, 0);
+				if (minor == 0)
+					return -EINVAL;
+				if (nfsd_minorversion(minor, sign == '-' ?
+						     NFSD_CLEAR : NFSD_SET) < 0)
+					return -EINVAL;
+				goto next;
+			}
 			switch(num) {
 			case 2:
 			case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 			default:
 				return -EINVAL;
 			}
+		next:
 			vers += len + 1;
 			tlen += len;
 		} while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 				       num);
 			sep = " ";
 		}
+	if (nfsd_vers(4, NFSD_AVAIL))
+		for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
+			len += sprintf(buf+len, " %c4.%u",
+					(nfsd_vers(4, NFSD_TEST) &&
+					 nfsd_minorversion(minor, NFSD_TEST)) ?
+						'+' : '-',
+					minor);
 	len += sprintf(buf+len, "\n");
 	return len;
 }
@@ -938,10 +969,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 		char transport[16];
 		int port;
 		if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+			if (port < 1 || port > 65535)
+				return -EINVAL;
 			err = nfsd_create_serv();
 			if (!err) {
 				err = svc_create_xprt(nfsd_serv,
-						      transport, port,
+						      transport, PF_INET, port,
 						      SVC_SOCK_ANONYMOUS);
 				if (err == -ENOENT)
 					/* Give a reasonable perror msg for
@@ -960,7 +993,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 		char transport[16];
 		int port;
 		if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
-			if (port == 0)
+			if (port < 1 || port > 65535)
 				return -EINVAL;
 			if (nfsd_serv) {
 				xprt = svc_find_xprt(nfsd_serv, transport,
@@ -1246,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 {
 	__be32	nfserr;
 	int	stable = 1;
+	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE    %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				   argp->offset,
 				   rqstp->rq_vec, argp->vlen,
-				   argp->len,
+			           &cnt,
 				   &stable);
 	return nfsd_return_attrs(nfserr, resp);
 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa8..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
 #include <linux/freezer.h>
 #include <linux/fs_struct.h>
 #include <linux/kthread.h>
+#include <linux/swap.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
 extern struct svc_program	nfsd_program;
 static int			nfsd(void *vrqstp);
 struct timeval			nfssvc_boot;
-static atomic_t			nfsd_busy;
-static unsigned long		nfsd_last_call;
-static DEFINE_SPINLOCK(nfsd_call_lock);
 
 /*
  * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program		nfsd_program = {
 
 };
 
+u32 nfsd_supported_minorversion;
+
 int nfsd_vers(int vers, enum vers_op change)
 {
 	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
 	}
 	return 0;
 }
+
+int nfsd_minorversion(u32 minorversion, enum vers_op change)
+{
+	if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+		return -1;
+	switch(change) {
+	case NFSD_SET:
+		nfsd_supported_minorversion = minorversion;
+		break;
+	case NFSD_CLEAR:
+		if (minorversion == 0)
+			return -1;
+		nfsd_supported_minorversion = minorversion - 1;
+		break;
+	case NFSD_TEST:
+		return minorversion <= nfsd_supported_minorversion;
+	case NFSD_AVAIL:
+		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
+	}
+	return 0;
+}
+
 /*
  * Maximum number of nfsd processes
  */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
 	}
 }
 
+/*
+ * Each session guarantees a negotiated per slot memory cache for replies
+ * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated
+ * NFSv4.1 server might want to use more memory for a DRC than a machine
+ * with mutiple services.
+ *
+ * Impose a hard limit on the number of pages for the DRC which varies
+ * according to the machines free pages. This is of course only a default.
+ *
+ * For now this is a #defined shift which could be under admin control
+ * in the future.
+ */
+static void set_max_drc(void)
+{
+	/* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
+	#define NFSD_DRC_SIZE_SHIFT	7
+	nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
+						>> NFSD_DRC_SIZE_SHIFT;
+	nfsd_serv->sv_drc_pages_used = 0;
+	dprintk("%s svc_drc_max_pages %u\n", __func__,
+		nfsd_serv->sv_drc_max_pages);
+}
 
 int nfsd_create_serv(void)
 {
@@ -227,12 +271,12 @@ int nfsd_create_serv(void)
 			nfsd_max_blksize /= 2;
 	}
 
-	atomic_set(&nfsd_busy, 0);
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-				      AF_INET,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
 		err = -ENOMEM;
+	else
+		set_max_drc();
 
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
@@ -244,7 +288,7 @@ static int nfsd_init_socks(int port)
 	if (!list_empty(&nfsd_serv->sv_permsocks))
 		return 0;
 
-	error = svc_create_xprt(nfsd_serv, "udp", port,
+	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
 					SVC_SOCK_DEFAULTS);
 	if (error < 0)
 		return error;
@@ -253,7 +297,7 @@ static int nfsd_init_socks(int port)
 	if (error < 0)
 		return error;
 
-	error = svc_create_xprt(nfsd_serv, "tcp", port,
+	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
 					SVC_SOCK_DEFAULTS);
 	if (error < 0)
 		return error;
@@ -376,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
 	return error;
 }
 
-static inline void
-update_thread_usage(int busy_threads)
-{
-	unsigned long prev_call;
-	unsigned long diff;
-	int decile;
-
-	spin_lock(&nfsd_call_lock);
-	prev_call = nfsd_last_call;
-	nfsd_last_call = jiffies;
-	decile = busy_threads*10/nfsdstats.th_cnt;
-	if (decile>0 && decile <= 10) {
-		diff = nfsd_last_call - prev_call;
-		if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
-			nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
-		if (decile == 10)
-			nfsdstats.th_fullcnt++;
-	}
-	spin_unlock(&nfsd_call_lock);
-}
 
 /*
  * This is the NFS server kernel thread
@@ -404,7 +428,6 @@ static int
 nfsd(void *vrqstp)
 {
 	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
-	struct fs_struct *fsp;
 	int err, preverr = 0;
 
 	/* Lock module and set up kernel thread */
@@ -413,13 +436,11 @@ nfsd(void *vrqstp)
 	/* At this point, the thread shares current->fs
 	 * with the init process. We need to create files with a
 	 * umask of 0 instead of init's umask. */
-	fsp = copy_fs_struct(current->fs);
-	if (!fsp) {
+	if (unshare_fs_struct() < 0) {
 		printk("Unable to start nfsd thread: out of memory\n");
 		goto out;
 	}
-	exit_fs(current);
-	current->fs = fsp;
+
 	current->fs->umask = 0;
 
 	/*
@@ -464,8 +485,6 @@ nfsd(void *vrqstp)
 			continue;
 		}
 
-		update_thread_usage(atomic_read(&nfsd_busy));
-		atomic_inc(&nfsd_busy);
 
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
@@ -474,8 +493,6 @@ nfsd(void *vrqstp)
 
 		/* Unlock export hash tables */
 		exp_readunlock();
-		update_thread_usage(atomic_read(&nfsd_busy));
-		atomic_dec(&nfsd_busy);
 	}
 
 	/* Clear signals before calling svc_exit_thread() */
@@ -543,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 		+ rqstp->rq_res.head[0].iov_len;
 	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
 
+	/* NFSv4.1 DRC requires statp */
+	if (rqstp->rq_vers == 4)
+		nfsd4_set_statp(rqstp, statp);
+
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 	nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -574,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 	nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
 	return 1;
 }
+
+int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+{
+	if (nfsd_serv == NULL)
+		return -ENODEV;
+	return svc_pool_stats_open(nfsd_serv, file);
+}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6e50aaa56ca2..ab93fcfef254 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -356,7 +356,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			put_write_access(inode);
 			goto out_nfserr;
 		}
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 	}
 
 	/* sanitize the mode change */
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	}
 
 	/* Revoke setuid/setgid on chown */
-	if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
-	    ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) {
+	if (!S_ISDIR(inode->i_mode) &&
+	    (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
+	     ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
 		iap->ia_valid |= ATTR_KILL_PRIV;
 		if (iap->ia_valid & ATTR_MODE) {
 			/* we're setting mode too, just clear the s*id bits */
@@ -723,7 +724,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 		else
 			flags = O_WRONLY|O_LARGEFILE;
 
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 	}
 	*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
 			    flags, cred);
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
 static __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
-	   			unsigned long cnt, int *stablep)
+				unsigned long *cnt, int *stablep)
 {
 	struct svc_export	*exp;
 	struct dentry		*dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	err = nfserr_perm;
 
 	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-		(!lock_may_write(file->f_path.dentry->d_inode, offset, cnt)))
+		(!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
 		goto out;
 #endif
 
@@ -998,15 +999,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	if (!EX_ISSYNC(exp))
 		stable = 0;
-	if (stable && !EX_WGATHER(exp))
+	if (stable && !EX_WGATHER(exp)) {
+		spin_lock(&file->f_lock);
 		file->f_flags |= O_SYNC;
+		spin_unlock(&file->f_lock);
+	}
 
 	/* Write the data. */
 	oldfs = get_fs(); set_fs(KERNEL_DS);
 	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
 	set_fs(oldfs);
 	if (host_err >= 0) {
-		nfsdstats.io_write += cnt;
+		nfsdstats.io_write += host_err;
 		fsnotify_modify(file->f_path.dentry);
 	}
 
@@ -1051,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	}
 
 	dprintk("nfsd: write complete host_err=%d\n", host_err);
-	if (host_err >= 0)
+	if (host_err >= 0) {
 		err = 0;
-	else 
+		*cnt = host_err;
+	} else
 		err = nfserrno(host_err);
 out:
 	return err;
@@ -1095,7 +1100,7 @@ out:
  */
 __be32
 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
-		loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
+		loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
 		int *stablep)
 {
 	__be32			err = 0;
@@ -1176,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
 	return 0;
 }
 
+/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
+ * setting size to 0 may fail for some specific file systems by the permission
+ * checking which requires WRITE permission but the mode is 000.
+ * we ignore the resizing(to 0) on the just new created file, since the size is
+ * 0 after file created.
+ *
+ * call this only after vfs_create() is called.
+ * */
+static void
+nfsd_check_ignore_resizing(struct iattr *iap)
+{
+	if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+		iap->ia_valid &= ~ATTR_SIZE;
+}
+
 /*
  * Create a file (regular, directory, device, fifo); UNIX sockets 
  * not yet implemented.
@@ -1271,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	switch (type) {
 	case S_IFREG:
 		host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+		if (!host_err)
+			nfsd_check_ignore_resizing(iap);
 		break;
 	case S_IFDIR:
 		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1424,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		/* setattr will sync the child (or not) */
 	}
 
+	nfsd_check_ignore_resizing(iap);
+
 	if (createmode == NFS3_CREATE_EXCLUSIVE) {
 		/* Cram the verifier into atime/mtime */
 		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..220c13f0d73d 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
 	int ret;
 
 	do {
-		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
 			return -ENOSPC;
 		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
 	} while (ret == -EAGAIN);
@@ -380,6 +380,14 @@ void inotify_unmount_inodes(struct list_head *list)
 		struct list_head *watches;
 
 		/*
+		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+		 * I_WILL_FREE, or I_NEW which is fine because by that point
+		 * the inode cannot have any associated watches.
+		 */
+		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+			continue;
+
+		/*
 		 * If i_count is zero, the inode cannot have any watches and
 		 * doing an __iget/iput with MS_ACTIVE clear would actually
 		 * evict all inodes with zero i_count from icache which is
@@ -388,14 +396,6 @@ void inotify_unmount_inodes(struct list_head *list)
 		if (!atomic_read(&inode->i_count))
 			continue;
 
-		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
-		 * I_WILL_FREE which is fine because by that point the inode
-		 * cannot have any associated watches.
-		 */
-		if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
-			continue;
-
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
 		/* In case inotify_remove_watch_locked() drops a reference. */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd4..5a9e34475e37 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
 /**
  * The little endian Unicode string $I30 as a global constant.
  */
-ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
-		const_cpu_to_le16('3'),	const_cpu_to_le16('0'), 0 };
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+		cpu_to_le16('3'),	cpu_to_le16('0'), 0 };
 
 /**
  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0a..82c5085559c6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 				goto em_put_err_out;
 			next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
 					le16_to_cpu(al_entry->length));
-			if (le32_to_cpu(al_entry->type) >
-					const_le32_to_cpu(AT_DATA))
+			if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
 				goto em_put_err_out;
 			if (AT_DATA != al_entry->type)
 				continue;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328eceb..50931b1ce4b9 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
 
 #include "types.h"
 
-/*
- * Constant endianness conversion defines.
- */
-#define const_le16_to_cpu(x)	__constant_le16_to_cpu(x)
-#define const_le32_to_cpu(x)	__constant_le32_to_cpu(x)
-#define const_le64_to_cpu(x)	__constant_le64_to_cpu(x)
-
-#define const_cpu_to_le16(x)	__constant_cpu_to_le16(x)
-#define const_cpu_to_le32(x)	__constant_cpu_to_le32(x)
-#define const_cpu_to_le64(x)	__constant_cpu_to_le64(x)
-
 /* The NTFS oem_id "NTFS    " */
-#define magicNTFS	const_cpu_to_le64(0x202020205346544eULL)
+#define magicNTFS	cpu_to_le64(0x202020205346544eULL)
 
 /*
  * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
  */
 enum {
 	/* Found in $MFT/$DATA. */
-	magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
-	magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
-	magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
 
 	/* Found in $LogFile/$DATA. */
-	magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
-	magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
 
 	/* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
 
 	/* Found in all ntfs record containing records. */
-	magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
 						       transfer was detected. */
 	/*
 	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
 	 * thus not initialized.  Page must be initialized before using it.
 	 */
-	magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
 };
 
 typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
  * information about the mft record in which they are present.
  */
 enum {
-	MFT_RECORD_IN_USE	= const_cpu_to_le16(0x0001),
-	MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+	MFT_RECORD_IN_USE	= cpu_to_le16(0x0001),
+	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
 } __attribute__ ((__packed__));
 
 typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
  * Note: The _LE versions will return a CPU endian formatted value!
  */
 #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU)
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
 
 typedef u64 MFT_REF;
 typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
  * a revealing choice of symbol I do not know what is... (-;
  */
 enum {
-	AT_UNUSED			= const_cpu_to_le32(         0),
-	AT_STANDARD_INFORMATION		= const_cpu_to_le32(      0x10),
-	AT_ATTRIBUTE_LIST		= const_cpu_to_le32(      0x20),
-	AT_FILE_NAME			= const_cpu_to_le32(      0x30),
-	AT_OBJECT_ID			= const_cpu_to_le32(      0x40),
-	AT_SECURITY_DESCRIPTOR		= const_cpu_to_le32(      0x50),
-	AT_VOLUME_NAME			= const_cpu_to_le32(      0x60),
-	AT_VOLUME_INFORMATION		= const_cpu_to_le32(      0x70),
-	AT_DATA				= const_cpu_to_le32(      0x80),
-	AT_INDEX_ROOT			= const_cpu_to_le32(      0x90),
-	AT_INDEX_ALLOCATION		= const_cpu_to_le32(      0xa0),
-	AT_BITMAP			= const_cpu_to_le32(      0xb0),
-	AT_REPARSE_POINT		= const_cpu_to_le32(      0xc0),
-	AT_EA_INFORMATION		= const_cpu_to_le32(      0xd0),
-	AT_EA				= const_cpu_to_le32(      0xe0),
-	AT_PROPERTY_SET			= const_cpu_to_le32(      0xf0),
-	AT_LOGGED_UTILITY_STREAM	= const_cpu_to_le32(     0x100),
-	AT_FIRST_USER_DEFINED_ATTRIBUTE	= const_cpu_to_le32(    0x1000),
-	AT_END				= const_cpu_to_le32(0xffffffff)
+	AT_UNUSED			= cpu_to_le32(         0),
+	AT_STANDARD_INFORMATION		= cpu_to_le32(      0x10),
+	AT_ATTRIBUTE_LIST		= cpu_to_le32(      0x20),
+	AT_FILE_NAME			= cpu_to_le32(      0x30),
+	AT_OBJECT_ID			= cpu_to_le32(      0x40),
+	AT_SECURITY_DESCRIPTOR		= cpu_to_le32(      0x50),
+	AT_VOLUME_NAME			= cpu_to_le32(      0x60),
+	AT_VOLUME_INFORMATION		= cpu_to_le32(      0x70),
+	AT_DATA				= cpu_to_le32(      0x80),
+	AT_INDEX_ROOT			= cpu_to_le32(      0x90),
+	AT_INDEX_ALLOCATION		= cpu_to_le32(      0xa0),
+	AT_BITMAP			= cpu_to_le32(      0xb0),
+	AT_REPARSE_POINT		= cpu_to_le32(      0xc0),
+	AT_EA_INFORMATION		= cpu_to_le32(      0xd0),
+	AT_EA				= cpu_to_le32(      0xe0),
+	AT_PROPERTY_SET			= cpu_to_le32(      0xf0),
+	AT_LOGGED_UTILITY_STREAM	= cpu_to_le32(     0x100),
+	AT_FIRST_USER_DEFINED_ATTRIBUTE	= cpu_to_le32(    0x1000),
+	AT_END				= cpu_to_le32(0xffffffff)
 };
 
 typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
  *	equal then the second le32 values would be compared, etc.
  */
 enum {
-	COLLATION_BINARY		= const_cpu_to_le32(0x00),
-	COLLATION_FILE_NAME		= const_cpu_to_le32(0x01),
-	COLLATION_UNICODE_STRING	= const_cpu_to_le32(0x02),
-	COLLATION_NTOFS_ULONG		= const_cpu_to_le32(0x10),
-	COLLATION_NTOFS_SID		= const_cpu_to_le32(0x11),
-	COLLATION_NTOFS_SECURITY_HASH	= const_cpu_to_le32(0x12),
-	COLLATION_NTOFS_ULONGS		= const_cpu_to_le32(0x13),
+	COLLATION_BINARY		= cpu_to_le32(0x00),
+	COLLATION_FILE_NAME		= cpu_to_le32(0x01),
+	COLLATION_UNICODE_STRING	= cpu_to_le32(0x02),
+	COLLATION_NTOFS_ULONG		= cpu_to_le32(0x10),
+	COLLATION_NTOFS_SID		= cpu_to_le32(0x11),
+	COLLATION_NTOFS_SECURITY_HASH	= cpu_to_le32(0x12),
+	COLLATION_NTOFS_ULONGS		= cpu_to_le32(0x13),
 };
 
 typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
  * NT4.
  */
 enum {
-	ATTR_DEF_INDEXABLE	= const_cpu_to_le32(0x02), /* Attribute can be
+	ATTR_DEF_INDEXABLE	= cpu_to_le32(0x02), /* Attribute can be
 					indexed. */
-	ATTR_DEF_MULTIPLE	= const_cpu_to_le32(0x04), /* Attribute type
+	ATTR_DEF_MULTIPLE	= cpu_to_le32(0x04), /* Attribute type
 					can be present multiple times in the
 					mft records of an inode. */
-	ATTR_DEF_NOT_ZERO	= const_cpu_to_le32(0x08), /* Attribute value
+	ATTR_DEF_NOT_ZERO	= cpu_to_le32(0x08), /* Attribute value
 					must contain at least one non-zero
 					byte. */
-	ATTR_DEF_INDEXED_UNIQUE	= const_cpu_to_le32(0x10), /* Attribute must be
+	ATTR_DEF_INDEXED_UNIQUE	= cpu_to_le32(0x10), /* Attribute must be
 					indexed and the attribute value must be
 					unique for the attribute type in all of
 					the mft records of an inode. */
-	ATTR_DEF_NAMED_UNIQUE	= const_cpu_to_le32(0x20), /* Attribute must be
+	ATTR_DEF_NAMED_UNIQUE	= cpu_to_le32(0x20), /* Attribute must be
 					named and the name must be unique for
 					the attribute type in all of the mft
 					records of an inode. */
-	ATTR_DEF_RESIDENT	= const_cpu_to_le32(0x40), /* Attribute must be
+	ATTR_DEF_RESIDENT	= cpu_to_le32(0x40), /* Attribute must be
 					resident. */
-	ATTR_DEF_ALWAYS_LOG	= const_cpu_to_le32(0x80), /* Always log
+	ATTR_DEF_ALWAYS_LOG	= cpu_to_le32(0x80), /* Always log
 					modifications to this attribute,
 					regardless of whether it is resident or
 					non-resident.  Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
  * Attribute flags (16-bit).
  */
 enum {
-	ATTR_IS_COMPRESSED    = const_cpu_to_le16(0x0001),
-	ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+	ATTR_IS_COMPRESSED    = cpu_to_le16(0x0001),
+	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
 							      mask.  Also, first
 							      illegal value. */
-	ATTR_IS_ENCRYPTED     = const_cpu_to_le16(0x4000),
-	ATTR_IS_SPARSE	      = const_cpu_to_le16(0x8000),
+	ATTR_IS_ENCRYPTED     = cpu_to_le16(0x4000),
+	ATTR_IS_SPARSE	      = cpu_to_le16(0x8000),
 } __attribute__ ((__packed__));
 
 typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
  * flags appear in all of the above.
  */
 enum {
-	FILE_ATTR_READONLY		= const_cpu_to_le32(0x00000001),
-	FILE_ATTR_HIDDEN		= const_cpu_to_le32(0x00000002),
-	FILE_ATTR_SYSTEM		= const_cpu_to_le32(0x00000004),
-	/* Old DOS volid. Unused in NT.	= const_cpu_to_le32(0x00000008), */
+	FILE_ATTR_READONLY		= cpu_to_le32(0x00000001),
+	FILE_ATTR_HIDDEN		= cpu_to_le32(0x00000002),
+	FILE_ATTR_SYSTEM		= cpu_to_le32(0x00000004),
+	/* Old DOS volid. Unused in NT.	= cpu_to_le32(0x00000008), */
 
-	FILE_ATTR_DIRECTORY		= const_cpu_to_le32(0x00000010),
+	FILE_ATTR_DIRECTORY		= cpu_to_le32(0x00000010),
 	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
 	   reserved for the DOS SUBDIRECTORY flag. */
-	FILE_ATTR_ARCHIVE		= const_cpu_to_le32(0x00000020),
-	FILE_ATTR_DEVICE		= const_cpu_to_le32(0x00000040),
-	FILE_ATTR_NORMAL		= const_cpu_to_le32(0x00000080),
+	FILE_ATTR_ARCHIVE		= cpu_to_le32(0x00000020),
+	FILE_ATTR_DEVICE		= cpu_to_le32(0x00000040),
+	FILE_ATTR_NORMAL		= cpu_to_le32(0x00000080),
 
-	FILE_ATTR_TEMPORARY		= const_cpu_to_le32(0x00000100),
-	FILE_ATTR_SPARSE_FILE		= const_cpu_to_le32(0x00000200),
-	FILE_ATTR_REPARSE_POINT		= const_cpu_to_le32(0x00000400),
-	FILE_ATTR_COMPRESSED		= const_cpu_to_le32(0x00000800),
+	FILE_ATTR_TEMPORARY		= cpu_to_le32(0x00000100),
+	FILE_ATTR_SPARSE_FILE		= cpu_to_le32(0x00000200),
+	FILE_ATTR_REPARSE_POINT		= cpu_to_le32(0x00000400),
+	FILE_ATTR_COMPRESSED		= cpu_to_le32(0x00000800),
 
-	FILE_ATTR_OFFLINE		= const_cpu_to_le32(0x00001000),
-	FILE_ATTR_NOT_CONTENT_INDEXED	= const_cpu_to_le32(0x00002000),
-	FILE_ATTR_ENCRYPTED		= const_cpu_to_le32(0x00004000),
+	FILE_ATTR_OFFLINE		= cpu_to_le32(0x00001000),
+	FILE_ATTR_NOT_CONTENT_INDEXED	= cpu_to_le32(0x00002000),
+	FILE_ATTR_ENCRYPTED		= cpu_to_le32(0x00004000),
 
-	FILE_ATTR_VALID_FLAGS		= const_cpu_to_le32(0x00007fb7),
+	FILE_ATTR_VALID_FLAGS		= cpu_to_le32(0x00007fb7),
 	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
 	   FILE_ATTR_DEVICE and preserves everything else.  This mask is used
 	   to obtain all flags that are valid for reading. */
-	FILE_ATTR_VALID_SET_FLAGS	= const_cpu_to_le32(0x000031a7),
+	FILE_ATTR_VALID_SET_FLAGS	= cpu_to_le32(0x000031a7),
 	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
 	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
 	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
@@ -846,11 +835,11 @@ enum {
 	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
 	 * attribute of an mft record.
 	 */
-	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= const_cpu_to_le32(0x10000000),
+	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= cpu_to_le32(0x10000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this is a directory or not, i.e. whether it has
 	   an index root attribute or not. */
-	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= const_cpu_to_le32(0x20000000),
+	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= cpu_to_le32(0x20000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this file has a view index present (eg. object id
 	   index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
 	/* Specific rights for files and directories are as follows: */
 
 	/* Right to read data from the file. (FILE) */
-	FILE_READ_DATA			= const_cpu_to_le32(0x00000001),
+	FILE_READ_DATA			= cpu_to_le32(0x00000001),
 	/* Right to list contents of a directory. (DIRECTORY) */
-	FILE_LIST_DIRECTORY		= const_cpu_to_le32(0x00000001),
+	FILE_LIST_DIRECTORY		= cpu_to_le32(0x00000001),
 
 	/* Right to write data to the file. (FILE) */
-	FILE_WRITE_DATA			= const_cpu_to_le32(0x00000002),
+	FILE_WRITE_DATA			= cpu_to_le32(0x00000002),
 	/* Right to create a file in the directory. (DIRECTORY) */
-	FILE_ADD_FILE			= const_cpu_to_le32(0x00000002),
+	FILE_ADD_FILE			= cpu_to_le32(0x00000002),
 
 	/* Right to append data to the file. (FILE) */
-	FILE_APPEND_DATA		= const_cpu_to_le32(0x00000004),
+	FILE_APPEND_DATA		= cpu_to_le32(0x00000004),
 	/* Right to create a subdirectory. (DIRECTORY) */
-	FILE_ADD_SUBDIRECTORY		= const_cpu_to_le32(0x00000004),
+	FILE_ADD_SUBDIRECTORY		= cpu_to_le32(0x00000004),
 
 	/* Right to read extended attributes. (FILE/DIRECTORY) */
-	FILE_READ_EA			= const_cpu_to_le32(0x00000008),
+	FILE_READ_EA			= cpu_to_le32(0x00000008),
 
 	/* Right to write extended attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_EA			= const_cpu_to_le32(0x00000010),
+	FILE_WRITE_EA			= cpu_to_le32(0x00000010),
 
 	/* Right to execute a file. (FILE) */
-	FILE_EXECUTE			= const_cpu_to_le32(0x00000020),
+	FILE_EXECUTE			= cpu_to_le32(0x00000020),
 	/* Right to traverse the directory. (DIRECTORY) */
-	FILE_TRAVERSE			= const_cpu_to_le32(0x00000020),
+	FILE_TRAVERSE			= cpu_to_le32(0x00000020),
 
 	/*
 	 * Right to delete a directory and all the files it contains (its
 	 * children), even if the files are read-only. (DIRECTORY)
 	 */
-	FILE_DELETE_CHILD		= const_cpu_to_le32(0x00000040),
+	FILE_DELETE_CHILD		= cpu_to_le32(0x00000040),
 
 	/* Right to read file attributes. (FILE/DIRECTORY) */
-	FILE_READ_ATTRIBUTES		= const_cpu_to_le32(0x00000080),
+	FILE_READ_ATTRIBUTES		= cpu_to_le32(0x00000080),
 
 	/* Right to change file attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_ATTRIBUTES		= const_cpu_to_le32(0x00000100),
+	FILE_WRITE_ATTRIBUTES		= cpu_to_le32(0x00000100),
 
 	/*
 	 * The standard rights (bits 16 to 23).  These are independent of the
@@ -1489,27 +1478,27 @@ enum {
 	 */
 
 	/* Right to delete the object. */
-	DELETE				= const_cpu_to_le32(0x00010000),
+	DELETE				= cpu_to_le32(0x00010000),
 
 	/*
 	 * Right to read the information in the object's security descriptor,
 	 * not including the information in the SACL, i.e. right to read the
 	 * security descriptor and owner.
 	 */
-	READ_CONTROL			= const_cpu_to_le32(0x00020000),
+	READ_CONTROL			= cpu_to_le32(0x00020000),
 
 	/* Right to modify the DACL in the object's security descriptor. */
-	WRITE_DAC			= const_cpu_to_le32(0x00040000),
+	WRITE_DAC			= cpu_to_le32(0x00040000),
 
 	/* Right to change the owner in the object's security descriptor. */
-	WRITE_OWNER			= const_cpu_to_le32(0x00080000),
+	WRITE_OWNER			= cpu_to_le32(0x00080000),
 
 	/*
 	 * Right to use the object for synchronization.  Enables a process to
 	 * wait until the object is in the signalled state.  Some object types
 	 * do not support this access right.
 	 */
-	SYNCHRONIZE			= const_cpu_to_le32(0x00100000),
+	SYNCHRONIZE			= cpu_to_le32(0x00100000),
 
 	/*
 	 * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
 	 */
 
 	/* These are currently defined to READ_CONTROL. */
-	STANDARD_RIGHTS_READ		= const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_WRITE		= const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_EXECUTE		= const_cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_READ		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_WRITE		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_EXECUTE		= cpu_to_le32(0x00020000),
 
 	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-	STANDARD_RIGHTS_REQUIRED	= const_cpu_to_le32(0x000f0000),
+	STANDARD_RIGHTS_REQUIRED	= cpu_to_le32(0x000f0000),
 
 	/*
 	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
 	 * SYNCHRONIZE access.
 	 */
-	STANDARD_RIGHTS_ALL		= const_cpu_to_le32(0x001f0000),
+	STANDARD_RIGHTS_ALL		= cpu_to_le32(0x001f0000),
 
 	/*
 	 * The access system ACL and maximum allowed access types (bits 24 to
 	 * 25, bits 26 to 27 are reserved).
 	 */
-	ACCESS_SYSTEM_SECURITY		= const_cpu_to_le32(0x01000000),
-	MAXIMUM_ALLOWED			= const_cpu_to_le32(0x02000000),
+	ACCESS_SYSTEM_SECURITY		= cpu_to_le32(0x01000000),
+	MAXIMUM_ALLOWED			= cpu_to_le32(0x02000000),
 
 	/*
 	 * The generic rights (bits 28 to 31).  These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
 	 */
 
 	/* Read, write, and execute access. */
-	GENERIC_ALL			= const_cpu_to_le32(0x10000000),
+	GENERIC_ALL			= cpu_to_le32(0x10000000),
 
 	/* Execute access. */
-	GENERIC_EXECUTE			= const_cpu_to_le32(0x20000000),
+	GENERIC_EXECUTE			= cpu_to_le32(0x20000000),
 
 	/*
 	 * Write access.  For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
 	 * For directories, the mapping has the same numerical value.  See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_WRITE			= const_cpu_to_le32(0x40000000),
+	GENERIC_WRITE			= cpu_to_le32(0x40000000),
 
 	/*
 	 * Read access.  For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
 	 * For directories, the mapping has the same numberical value.  See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_READ			= const_cpu_to_le32(0x80000000),
+	GENERIC_READ			= cpu_to_le32(0x80000000),
 };
 
 typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
  * The object ACE flags (32-bit).
  */
 enum {
-	ACE_OBJECT_TYPE_PRESENT			= const_cpu_to_le32(1),
-	ACE_INHERITED_OBJECT_TYPE_PRESENT	= const_cpu_to_le32(2),
+	ACE_OBJECT_TYPE_PRESENT			= cpu_to_le32(1),
+	ACE_INHERITED_OBJECT_TYPE_PRESENT	= cpu_to_le32(2),
 };
 
 typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
  *	expressed as offsets from the beginning of the security descriptor.
  */
 enum {
-	SE_OWNER_DEFAULTED		= const_cpu_to_le16(0x0001),
-	SE_GROUP_DEFAULTED		= const_cpu_to_le16(0x0002),
-	SE_DACL_PRESENT			= const_cpu_to_le16(0x0004),
-	SE_DACL_DEFAULTED		= const_cpu_to_le16(0x0008),
-
-	SE_SACL_PRESENT			= const_cpu_to_le16(0x0010),
-	SE_SACL_DEFAULTED		= const_cpu_to_le16(0x0020),
-
-	SE_DACL_AUTO_INHERIT_REQ	= const_cpu_to_le16(0x0100),
-	SE_SACL_AUTO_INHERIT_REQ	= const_cpu_to_le16(0x0200),
-	SE_DACL_AUTO_INHERITED		= const_cpu_to_le16(0x0400),
-	SE_SACL_AUTO_INHERITED		= const_cpu_to_le16(0x0800),
-
-	SE_DACL_PROTECTED		= const_cpu_to_le16(0x1000),
-	SE_SACL_PROTECTED		= const_cpu_to_le16(0x2000),
-	SE_RM_CONTROL_VALID		= const_cpu_to_le16(0x4000),
-	SE_SELF_RELATIVE		= const_cpu_to_le16(0x8000)
+	SE_OWNER_DEFAULTED		= cpu_to_le16(0x0001),
+	SE_GROUP_DEFAULTED		= cpu_to_le16(0x0002),
+	SE_DACL_PRESENT			= cpu_to_le16(0x0004),
+	SE_DACL_DEFAULTED		= cpu_to_le16(0x0008),
+
+	SE_SACL_PRESENT			= cpu_to_le16(0x0010),
+	SE_SACL_DEFAULTED		= cpu_to_le16(0x0020),
+
+	SE_DACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0100),
+	SE_SACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0200),
+	SE_DACL_AUTO_INHERITED		= cpu_to_le16(0x0400),
+	SE_SACL_AUTO_INHERITED		= cpu_to_le16(0x0800),
+
+	SE_DACL_PROTECTED		= cpu_to_le16(0x1000),
+	SE_SACL_PROTECTED		= cpu_to_le16(0x2000),
+	SE_RM_CONTROL_VALID		= cpu_to_le16(0x4000),
+	SE_SELF_RELATIVE		= cpu_to_le16(0x8000)
 } __attribute__ ((__packed__));
 
 typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
  * Possible flags for the volume (16-bit).
  */
 enum {
-	VOLUME_IS_DIRTY			= const_cpu_to_le16(0x0001),
-	VOLUME_RESIZE_LOG_FILE		= const_cpu_to_le16(0x0002),
-	VOLUME_UPGRADE_ON_MOUNT		= const_cpu_to_le16(0x0004),
-	VOLUME_MOUNTED_ON_NT4		= const_cpu_to_le16(0x0008),
+	VOLUME_IS_DIRTY			= cpu_to_le16(0x0001),
+	VOLUME_RESIZE_LOG_FILE		= cpu_to_le16(0x0002),
+	VOLUME_UPGRADE_ON_MOUNT		= cpu_to_le16(0x0004),
+	VOLUME_MOUNTED_ON_NT4		= cpu_to_le16(0x0008),
 
-	VOLUME_DELETE_USN_UNDERWAY	= const_cpu_to_le16(0x0010),
-	VOLUME_REPAIR_OBJECT_ID		= const_cpu_to_le16(0x0020),
+	VOLUME_DELETE_USN_UNDERWAY	= cpu_to_le16(0x0010),
+	VOLUME_REPAIR_OBJECT_ID		= cpu_to_le16(0x0020),
 
-	VOLUME_CHKDSK_UNDERWAY		= const_cpu_to_le16(0x4000),
-	VOLUME_MODIFIED_BY_CHKDSK	= const_cpu_to_le16(0x8000),
+	VOLUME_CHKDSK_UNDERWAY		= cpu_to_le16(0x4000),
+	VOLUME_MODIFIED_BY_CHKDSK	= cpu_to_le16(0x8000),
 
-	VOLUME_FLAGS_MASK		= const_cpu_to_le16(0xc03f),
+	VOLUME_FLAGS_MASK		= cpu_to_le16(0xc03f),
 
 	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK	= const_cpu_to_le16(0xc027),
+	VOLUME_MUST_MOUNT_RO_MASK	= cpu_to_le16(0xc027),
 } __attribute__ ((__packed__));
 
 typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
  * The user quota flags.  Names explain meaning.
  */
 enum {
-	QUOTA_FLAG_DEFAULT_LIMITS	= const_cpu_to_le32(0x00000001),
-	QUOTA_FLAG_LIMIT_REACHED	= const_cpu_to_le32(0x00000002),
-	QUOTA_FLAG_ID_DELETED		= const_cpu_to_le32(0x00000004),
+	QUOTA_FLAG_DEFAULT_LIMITS	= cpu_to_le32(0x00000001),
+	QUOTA_FLAG_LIMIT_REACHED	= cpu_to_le32(0x00000002),
+	QUOTA_FLAG_ID_DELETED		= cpu_to_le32(0x00000004),
 
-	QUOTA_FLAG_USER_MASK		= const_cpu_to_le32(0x00000007),
+	QUOTA_FLAG_USER_MASK		= cpu_to_le32(0x00000007),
 	/* This is a bit mask for the user quota flags. */
 
 	/*
 	 * These flags are only present in the quota defaults index entry, i.e.
 	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
 	 */
-	QUOTA_FLAG_TRACKING_ENABLED	= const_cpu_to_le32(0x00000010),
-	QUOTA_FLAG_ENFORCEMENT_ENABLED	= const_cpu_to_le32(0x00000020),
-	QUOTA_FLAG_TRACKING_REQUESTED	= const_cpu_to_le32(0x00000040),
-	QUOTA_FLAG_LOG_THRESHOLD	= const_cpu_to_le32(0x00000080),
-
-	QUOTA_FLAG_LOG_LIMIT		= const_cpu_to_le32(0x00000100),
-	QUOTA_FLAG_OUT_OF_DATE		= const_cpu_to_le32(0x00000200),
-	QUOTA_FLAG_CORRUPT		= const_cpu_to_le32(0x00000400),
-	QUOTA_FLAG_PENDING_DELETES	= const_cpu_to_le32(0x00000800),
+	QUOTA_FLAG_TRACKING_ENABLED	= cpu_to_le32(0x00000010),
+	QUOTA_FLAG_ENFORCEMENT_ENABLED	= cpu_to_le32(0x00000020),
+	QUOTA_FLAG_TRACKING_REQUESTED	= cpu_to_le32(0x00000040),
+	QUOTA_FLAG_LOG_THRESHOLD	= cpu_to_le32(0x00000080),
+
+	QUOTA_FLAG_LOG_LIMIT		= cpu_to_le32(0x00000100),
+	QUOTA_FLAG_OUT_OF_DATE		= cpu_to_le32(0x00000200),
+	QUOTA_FLAG_CORRUPT		= cpu_to_le32(0x00000400),
+	QUOTA_FLAG_PENDING_DELETES	= cpu_to_le32(0x00000800),
 };
 
 typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
  * Predefined owner_id values (32-bit).
  */
 enum {
-	QUOTA_INVALID_ID	= const_cpu_to_le32(0x00000000),
-	QUOTA_DEFAULTS_ID	= const_cpu_to_le32(0x00000001),
-	QUOTA_FIRST_USER_ID	= const_cpu_to_le32(0x00000100),
+	QUOTA_INVALID_ID	= cpu_to_le32(0x00000000),
+	QUOTA_DEFAULTS_ID	= cpu_to_le32(0x00000001),
+	QUOTA_FIRST_USER_ID	= cpu_to_le32(0x00000100),
 };
 
 /*
@@ -2189,14 +2178,14 @@ typedef enum {
  * Index entry flags (16-bit).
  */
 enum {
-	INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a
+	INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
 			sub-node, i.e. a reference to an index block in form of
 			a virtual cluster number (see below). */
-	INDEX_ENTRY_END  = const_cpu_to_le16(2), /* This signifies the last
+	INDEX_ENTRY_END  = cpu_to_le16(2), /* This signifies the last
 			entry in an index block.  The index entry does not
 			represent a file but it can point to a sub-node. */
 
-	INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force
+	INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
 			enum bit width to 16-bit. */
 } __attribute__ ((__packed__));
 
@@ -2334,26 +2323,26 @@ typedef struct {
  * These are the predefined reparse point tags:
  */
 enum {
-	IO_REPARSE_TAG_IS_ALIAS		= const_cpu_to_le32(0x20000000),
-	IO_REPARSE_TAG_IS_HIGH_LATENCY	= const_cpu_to_le32(0x40000000),
-	IO_REPARSE_TAG_IS_MICROSOFT	= const_cpu_to_le32(0x80000000),
+	IO_REPARSE_TAG_IS_ALIAS		= cpu_to_le32(0x20000000),
+	IO_REPARSE_TAG_IS_HIGH_LATENCY	= cpu_to_le32(0x40000000),
+	IO_REPARSE_TAG_IS_MICROSOFT	= cpu_to_le32(0x80000000),
 
-	IO_REPARSE_TAG_RESERVED_ZERO	= const_cpu_to_le32(0x00000000),
-	IO_REPARSE_TAG_RESERVED_ONE	= const_cpu_to_le32(0x00000001),
-	IO_REPARSE_TAG_RESERVED_RANGE	= const_cpu_to_le32(0x00000001),
+	IO_REPARSE_TAG_RESERVED_ZERO	= cpu_to_le32(0x00000000),
+	IO_REPARSE_TAG_RESERVED_ONE	= cpu_to_le32(0x00000001),
+	IO_REPARSE_TAG_RESERVED_RANGE	= cpu_to_le32(0x00000001),
 
-	IO_REPARSE_TAG_NSS		= const_cpu_to_le32(0x68000005),
-	IO_REPARSE_TAG_NSS_RECOVER	= const_cpu_to_le32(0x68000006),
-	IO_REPARSE_TAG_SIS		= const_cpu_to_le32(0x68000007),
-	IO_REPARSE_TAG_DFS		= const_cpu_to_le32(0x68000008),
+	IO_REPARSE_TAG_NSS		= cpu_to_le32(0x68000005),
+	IO_REPARSE_TAG_NSS_RECOVER	= cpu_to_le32(0x68000006),
+	IO_REPARSE_TAG_SIS		= cpu_to_le32(0x68000007),
+	IO_REPARSE_TAG_DFS		= cpu_to_le32(0x68000008),
 
-	IO_REPARSE_TAG_MOUNT_POINT	= const_cpu_to_le32(0x88000003),
+	IO_REPARSE_TAG_MOUNT_POINT	= cpu_to_le32(0x88000003),
 
-	IO_REPARSE_TAG_HSM		= const_cpu_to_le32(0xa8000004),
+	IO_REPARSE_TAG_HSM		= cpu_to_le32(0xa8000004),
 
-	IO_REPARSE_TAG_SYMBOLIC_LINK	= const_cpu_to_le32(0xe8000000),
+	IO_REPARSE_TAG_SYMBOLIC_LINK	= cpu_to_le32(0xe8000000),
 
-	IO_REPARSE_TAG_VALID_VALUES	= const_cpu_to_le32(0xe000ffff),
+	IO_REPARSE_TAG_VALID_VALUES	= cpu_to_le32(0xe000ffff),
 };
 
 /*
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae3..b5a6f08bd35c 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
  * in this particular client array.  Also inside the client records themselves,
  * this means that there are no client records preceding or following this one.
  */
-#define LOGFILE_NO_CLIENT	const_cpu_to_le16(0xffff)
+#define LOGFILE_NO_CLIENT	cpu_to_le16(0xffff)
 #define LOGFILE_NO_CLIENT_CPU	0xffff
 
 /*
@@ -112,8 +112,8 @@ typedef struct {
  * information about the log file in which they are present.
  */
 enum {
-	RESTART_VOLUME_IS_CLEAN	= const_cpu_to_le16(0x0002),
-	RESTART_SPACE_FILLER	= const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
+	RESTART_VOLUME_IS_CLEAN	= cpu_to_le16(0x0002),
+	RESTART_SPACE_FILLER	= cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
 } __attribute__ ((__packed__));
 
 typedef le16 RESTART_AREA_FLAGS;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc35..23bf68453d7d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
 	 */
 
 	/* Mark the mft record as not in use. */
-	m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
+	m->flags &= ~MFT_RECORD_IN_USE;
 
 	/* Increment the sequence number, skipping zero, if it is not zero. */
 	old_seq_no = m->sequence_number;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b5077..f76951dcd4a6 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
 	 * many BIOSes will refuse to boot from a bootsector if the magic is
 	 * incorrect, so we emit a warning.
 	 */
-	if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55))
+	if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
 		ntfs_warning(sb, "Invalid end of sector marker.");
 	return true;
 not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
 	u32 *kaddr, *kend;
 	ntfs_name *name = NULL;
 	int ret = 1;
-	static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'),
-			const_cpu_to_le16('i'), const_cpu_to_le16('b'),
-			const_cpu_to_le16('e'), const_cpu_to_le16('r'),
-			const_cpu_to_le16('f'), const_cpu_to_le16('i'),
-			const_cpu_to_le16('l'), const_cpu_to_le16('.'),
-			const_cpu_to_le16('s'), const_cpu_to_le16('y'),
-			const_cpu_to_le16('s'), 0 };
+	static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
+			cpu_to_le16('i'), cpu_to_le16('b'),
+			cpu_to_le16('e'), cpu_to_le16('r'),
+			cpu_to_le16('f'), cpu_to_le16('i'),
+			cpu_to_le16('l'), cpu_to_le16('.'),
+			cpu_to_le16('s'), cpu_to_le16('y'),
+			cpu_to_le16('s'), 0 };
 
 	ntfs_debug("Entering.");
 	/*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
 		goto iput_out;
 	}
 	kaddr = (u32*)page_address(page);
-	if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) {
+	if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
 		ntfs_debug("Magic \"hibr\" found in hiberfil.sys.  Windows is "
 				"hibernated on the volume.  This is the "
 				"system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
 	MFT_REF mref;
 	struct inode *tmp_ino;
 	ntfs_name *name = NULL;
-	static const ntfschar Quota[7] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('Q'), const_cpu_to_le16('u'),
-			const_cpu_to_le16('o'), const_cpu_to_le16('t'),
-			const_cpu_to_le16('a'), 0 };
-	static ntfschar Q[3] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('Q'), 0 };
+	static const ntfschar Quota[7] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), cpu_to_le16('u'),
+			cpu_to_le16('o'), cpu_to_le16('t'),
+			cpu_to_le16('a'), 0 };
+	static ntfschar Q[3] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), 0 };
 
 	ntfs_debug("Entering.");
 	/*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
 	struct page *page;
 	ntfs_name *name = NULL;
 	USN_HEADER *uh;
-	static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('U'), const_cpu_to_le16('s'),
-			const_cpu_to_le16('n'), const_cpu_to_le16('J'),
-			const_cpu_to_le16('r'), const_cpu_to_le16('n'),
-			const_cpu_to_le16('l'), 0 };
-	static ntfschar Max[5] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('M'), const_cpu_to_le16('a'),
-			const_cpu_to_le16('x'), 0 };
-	static ntfschar J[3] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('J'), 0 };
+	static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
+			cpu_to_le16('U'), cpu_to_le16('s'),
+			cpu_to_le16('n'), cpu_to_le16('J'),
+			cpu_to_le16('r'), cpu_to_le16('n'),
+			cpu_to_le16('l'), 0 };
+	static ntfschar Max[5] = { cpu_to_le16('$'),
+			cpu_to_le16('M'), cpu_to_le16('a'),
+			cpu_to_le16('x'), 0 };
+	static ntfschar J[3] = { cpu_to_le16('$'),
+			cpu_to_le16('J'), 0 };
 
 	ntfs_debug("Entering.");
 	/*
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac327..00d8e6bd7c36 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
  * documentation: http://www.linux-ntfs.org/
  */
 enum {
-	USN_REASON_DATA_OVERWRITE	= const_cpu_to_le32(0x00000001),
-	USN_REASON_DATA_EXTEND		= const_cpu_to_le32(0x00000002),
-	USN_REASON_DATA_TRUNCATION	= const_cpu_to_le32(0x00000004),
-	USN_REASON_NAMED_DATA_OVERWRITE	= const_cpu_to_le32(0x00000010),
-	USN_REASON_NAMED_DATA_EXTEND	= const_cpu_to_le32(0x00000020),
-	USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040),
-	USN_REASON_FILE_CREATE		= const_cpu_to_le32(0x00000100),
-	USN_REASON_FILE_DELETE		= const_cpu_to_le32(0x00000200),
-	USN_REASON_EA_CHANGE		= const_cpu_to_le32(0x00000400),
-	USN_REASON_SECURITY_CHANGE	= const_cpu_to_le32(0x00000800),
-	USN_REASON_RENAME_OLD_NAME	= const_cpu_to_le32(0x00001000),
-	USN_REASON_RENAME_NEW_NAME	= const_cpu_to_le32(0x00002000),
-	USN_REASON_INDEXABLE_CHANGE	= const_cpu_to_le32(0x00004000),
-	USN_REASON_BASIC_INFO_CHANGE	= const_cpu_to_le32(0x00008000),
-	USN_REASON_HARD_LINK_CHANGE	= const_cpu_to_le32(0x00010000),
-	USN_REASON_COMPRESSION_CHANGE	= const_cpu_to_le32(0x00020000),
-	USN_REASON_ENCRYPTION_CHANGE	= const_cpu_to_le32(0x00040000),
-	USN_REASON_OBJECT_ID_CHANGE	= const_cpu_to_le32(0x00080000),
-	USN_REASON_REPARSE_POINT_CHANGE	= const_cpu_to_le32(0x00100000),
-	USN_REASON_STREAM_CHANGE	= const_cpu_to_le32(0x00200000),
-	USN_REASON_CLOSE		= const_cpu_to_le32(0x80000000),
+	USN_REASON_DATA_OVERWRITE	= cpu_to_le32(0x00000001),
+	USN_REASON_DATA_EXTEND		= cpu_to_le32(0x00000002),
+	USN_REASON_DATA_TRUNCATION	= cpu_to_le32(0x00000004),
+	USN_REASON_NAMED_DATA_OVERWRITE	= cpu_to_le32(0x00000010),
+	USN_REASON_NAMED_DATA_EXTEND	= cpu_to_le32(0x00000020),
+	USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
+	USN_REASON_FILE_CREATE		= cpu_to_le32(0x00000100),
+	USN_REASON_FILE_DELETE		= cpu_to_le32(0x00000200),
+	USN_REASON_EA_CHANGE		= cpu_to_le32(0x00000400),
+	USN_REASON_SECURITY_CHANGE	= cpu_to_le32(0x00000800),
+	USN_REASON_RENAME_OLD_NAME	= cpu_to_le32(0x00001000),
+	USN_REASON_RENAME_NEW_NAME	= cpu_to_le32(0x00002000),
+	USN_REASON_INDEXABLE_CHANGE	= cpu_to_le32(0x00004000),
+	USN_REASON_BASIC_INFO_CHANGE	= cpu_to_le32(0x00008000),
+	USN_REASON_HARD_LINK_CHANGE	= cpu_to_le32(0x00010000),
+	USN_REASON_COMPRESSION_CHANGE	= cpu_to_le32(0x00020000),
+	USN_REASON_ENCRYPTION_CHANGE	= cpu_to_le32(0x00040000),
+	USN_REASON_OBJECT_ID_CHANGE	= cpu_to_le32(0x00080000),
+	USN_REASON_REPARSE_POINT_CHANGE	= cpu_to_le32(0x00100000),
+	USN_REASON_STREAM_CHANGE	= cpu_to_le32(0x00200000),
+	USN_REASON_CLOSE		= cpu_to_le32(0x80000000),
 };
 
 typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
  *	http://www.linux-ntfs.org/
  */
 enum {
-	USN_SOURCE_DATA_MANAGEMENT	  = const_cpu_to_le32(0x00000001),
-	USN_SOURCE_AUXILIARY_DATA	  = const_cpu_to_le32(0x00000002),
-	USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004),
+	USN_SOURCE_DATA_MANAGEMENT	  = cpu_to_le32(0x00000001),
+	USN_SOURCE_AUXILIARY_DATA	  = cpu_to_le32(0x00000002),
+	USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
 };
 
 typedef le32 USN_SOURCE_INFO_FLAGS;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 60fe74035db5..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 
 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
-			(OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+			(OCFS2_I(inode)->ip_clusters !=
+			 le32_to_cpu(rec->e_cpos)),
 			"Device %s, asking for sparse allocation: inode %llu, "
 			"cpos %u, clusters %u\n",
 			osb->dev_str,
@@ -293,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
 
+static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					  u64 blkno)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	return le64_to_cpu(dx_root->dr_last_eb_blk);
+}
+
+static void ocfs2_dx_root_update_clusters(struct inode *inode,
+					  struct ocfs2_extent_tree *et,
+					  u32 clusters)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	le32_add_cpu(&dx_root->dr_clusters, clusters);
+}
+
+static int ocfs2_dx_root_sanity_check(struct inode *inode,
+				      struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
+
+	return 0;
+}
+
+static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+	et->et_root_el = &dx_root->dr_list;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
+	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
+	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
+};
+
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct inode *inode,
 				     struct buffer_head *bh,
@@ -338,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 				 &ocfs2_xattr_value_et_ops);
 }
 
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct inode *inode,
+				    struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
+				 NULL, &ocfs2_dx_root_et_ops);
+}
+
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					    u64 new_last_eb_blk)
 {
@@ -4796,6 +4854,29 @@ out:
 	return ret;
 }
 
+static int ocfs2_replace_extent_rec(struct inode *inode,
+				    handle_t *handle,
+				    struct ocfs2_path *path,
+				    struct ocfs2_extent_list *el,
+				    int split_index,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	int ret;
+
+	ret = ocfs2_path_bh_journal_access(handle, inode, path,
+					   path_num_items(path) - 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el->l_recs[split_index] = *split_rec;
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+out:
+	return ret;
+}
+
 /*
  * Mark part or all of the extent record at split_index in the leaf
  * pointed to by path as written. This removes the unwritten
@@ -4885,7 +4966,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 
 	if (ctxt.c_contig_type == CONTIG_NONE) {
 		if (ctxt.c_split_covers_rec)
-			el->l_recs[split_index] = *split_rec;
+			ret = ocfs2_replace_extent_rec(inode, handle,
+						       path, el,
+						       split_index, split_rec);
 		else
 			ret = ocfs2_split_and_insert(inode, handle, path, et,
 						     &last_eb_bh, split_index,
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
 					struct ocfs2_xattr_value_buf *vb);
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+				    struct inode *inode,
+				    struct buffer_head *bh);
 
 /*
  * Read an extent block into *bh.  If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a067a6cffb01..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 	size = i_size_read(inode);
 
 	if (size > PAGE_CACHE_SIZE ||
-	    size > ocfs2_max_inline_data(inode->i_sb)) {
+	    size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %llu has with inline data has bad size: %Lu",
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
 	int ret, written = 0;
 	loff_t end = pos + len;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = NULL;
 
 	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
 	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
@@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
 	/*
 	 * Check whether the write can fit.
 	 */
-	if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
+	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+	if (mmap_page ||
+	    end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
 		return 0;
 
 do_inline_write:
@@ -1953,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 }
 
 const struct address_space_operations ocfs2_aops = {
-	.readpage	= ocfs2_readpage,
-	.readpages	= ocfs2_readpages,
-	.writepage	= ocfs2_writepage,
-	.write_begin	= ocfs2_write_begin,
-	.write_end	= ocfs2_write_end,
-	.bmap		= ocfs2_bmap,
-	.sync_page	= block_sync_page,
-	.direct_IO	= ocfs2_direct_IO,
-	.invalidatepage	= ocfs2_invalidatepage,
-	.releasepage	= ocfs2_releasepage,
-	.migratepage	= buffer_migrate_page,
+	.readpage		= ocfs2_readpage,
+	.readpages		= ocfs2_readpages,
+	.writepage		= ocfs2_writepage,
+	.write_begin		= ocfs2_write_begin,
+	.write_end		= ocfs2_write_end,
+	.bmap			= ocfs2_bmap,
+	.sync_page		= block_sync_page,
+	.direct_IO		= ocfs2_direct_IO,
+	.invalidatepage		= ocfs2_invalidatepage,
+	.releasepage		= ocfs2_releasepage,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate	= block_is_partially_uptodate,
 };
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
 #include <linux/random.h>
 #include <linux/crc32.h>
 #include <linux/time.h>
+#include <linux/debugfs.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+#define O2HB_DEBUG_DIR			"o2hb"
+#define O2HB_DEBUG_LIVENODES		"livenodes"
+static struct dentry *o2hb_debug_dir;
+static struct dentry *o2hb_debug_livenodes;
+
 static LIST_HEAD(o2hb_all_regions);
 
 static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
 	return 0;
 }
 
-void o2hb_init(void)
+#ifdef CONFIG_DEBUG_FS
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	char *buf = NULL;
+	int i = -1;
+	int out = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto bail;
+
+	o2hb_fill_node_map(map, sizeof(map));
+
+	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+	i_size_write(inode, out);
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+				 size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+#else
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
+{
+	return 0;
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+static struct file_operations o2hb_debug_fops = {
+	.open =		o2hb_debug_open,
+	.release =	o2hb_debug_release,
+	.read =		o2hb_debug_read,
+	.llseek =	generic_file_llseek,
+};
+
+void o2hb_exit(void)
+{
+	if (o2hb_debug_livenodes)
+		debugfs_remove(o2hb_debug_livenodes);
+	if (o2hb_debug_dir)
+		debugfs_remove(o2hb_debug_dir);
+}
+
+int o2hb_init(void)
 {
 	int i;
 
@@ -918,6 +994,24 @@ void o2hb_init(void)
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
+						   S_IFREG|S_IRUSR,
+						   o2hb_debug_dir, NULL,
+						   &o2hb_debug_fops);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(-ENOMEM);
+		debugfs_remove(o2hb_debug_dir);
+		return -ENOMEM;
+	}
+
+	return 0;
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
 			      struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
 			unsigned bytes);
-void o2hb_init(void);
+void o2hb_exit(void);
+int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
 	o2cb_sys_shutdown();
 
 	o2net_exit();
+	o2hb_exit();
 }
 
 static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
 
 	cluster_print_version();
 
-	o2hb_init();
+	ret = o2hb_init();
+	if (ret)
+		goto out;
 
 	ret = o2net_init();
 	if (ret)
-		goto out;
+		goto out_o2hb;
 
 	ret = o2net_register_hb_callbacks();
 	if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
 	o2net_unregister_hb_callbacks();
 out_o2net:
 	o2net_exit();
+out_o2hb:
+	o2hb_exit();
 out:
 	return ret;
 }
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index e9d7c2038c0f..7d604480557a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -455,7 +455,7 @@ out_move:
 	d_move(dentry, target);
 }
 
-struct dentry_operations ocfs2_dentry_ops = {
+const struct dentry_operations ocfs2_dentry_ops = {
 	.d_revalidate		= ocfs2_dentry_revalidate,
 	.d_iput			= ocfs2_dentry_iput,
 };
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index d06e16c06640..faa12e75f98d 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -26,7 +26,7 @@
 #ifndef OCFS2_DCACHE_H
 #define OCFS2_DCACHE_H
 
-extern struct dentry_operations ocfs2_dentry_ops;
+extern const struct dentry_operations ocfs2_dentry_ops;
 
 struct ocfs2_dentry_lock {
 	/* Use count of dentry lock */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..e71160cda110 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/quotaops.h>
+#include <linux/sort.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -58,6 +59,7 @@
 #include "namei.h"
 #include "suballoc.h"
 #include "super.h"
+#include "sysfile.h"
 #include "uptodate.h"
 
 #include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ocfs2_extend_dir(struct ocfs2_super *osb,
-			    struct inode *dir,
-			    struct buffer_head *parent_fe_bh,
-			    unsigned int blocks_wanted,
-			    struct buffer_head **new_de_bh);
 static int ocfs2_do_extend_dir(struct super_block *sb,
 			       handle_t *handle,
 			       struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
+static int ocfs2_dir_indexed(struct inode *inode);
 
 /*
  * These are distinct checks because future versions of the file system will
  * want to have a trailing dirent structure independent of indexing.
  */
-static int ocfs2_dir_has_trailer(struct inode *dir)
+static int ocfs2_supports_dir_trailer(struct inode *dir)
 {
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return 0;
 
-	return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+	return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
 }
 
-static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+/*
+ * "new' here refers to the point at which we're creating a new
+ * directory via "mkdir()", but also when we're expanding an inline
+ * directory. In either case, we don't yet have the indexing bit set
+ * on the directory, so the standard checks will fail in when metaecc
+ * is turned off. Only directory-initialization type functions should
+ * use this then. Everything else wants ocfs2_supports_dir_trailer()
+ */
+static int ocfs2_new_dir_wants_trailer(struct inode *dir)
 {
-	return ocfs2_meta_ecc(osb);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	return ocfs2_meta_ecc(osb) ||
+		ocfs2_supports_indexed_dirs(osb);
 }
 
 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 {
 	unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 
-	if (!ocfs2_dir_has_trailer(dir))
+	if (!ocfs2_supports_dir_trailer(dir))
 		return 0;
 
 	if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 }
 
 static void ocfs2_init_dir_trailer(struct inode *inode,
-				   struct buffer_head *bh)
+				   struct buffer_head *bh, u16 rec_len)
 {
 	struct ocfs2_dir_block_trailer *trailer;
 
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
 			cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
 	trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
 	trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+	trailer->db_free_rec_len = cpu_to_le16(rec_len);
+}
+/*
+ * Link an unindexed block with a dir trailer structure into the index free
+ * list. This function will modify dirdata_bh, but assumes you've already
+ * passed it to the journal.
+ */
+static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
+				     struct buffer_head *dx_root_bh,
+				     struct buffer_head *dirdata_bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	trailer->db_free_next = dx_root->dr_free_blk;
+	dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
+{
+	return res->dl_prev_leaf_bh == NULL;
+}
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
+{
+	brelse(res->dl_dx_root_bh);
+	brelse(res->dl_leaf_bh);
+	brelse(res->dl_dx_leaf_bh);
+	brelse(res->dl_prev_leaf_bh);
+}
+
+static int ocfs2_dir_indexed(struct inode *inode)
+{
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
+{
+	return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
+}
+
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32	sum = 0;
+	__u32	b0 = buf[0], b1 = buf[1];
+	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
+	int	n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while (--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = msg[i] + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
+				   struct ocfs2_dx_hinfo *hinfo)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	const char	*p;
+	__u32		in[8], buf[4];
+
+	/*
+	 * XXX: Is this really necessary, if the index is never looked
+	 * at by readdir? Is a hash value of '0' a bad idea?
+	 */
+	if ((len == 1 && !strncmp(".", name, 1)) ||
+	    (len == 2 && !strncmp("..", name, 2))) {
+		buf[0] = buf[1] = 0;
+		goto out;
+	}
+
+#ifdef OCFS2_DEBUG_DX_DIRS
+	/*
+	 * This makes it very easy to debug indexing problems. We
+	 * should never allow this to be selected without hand editing
+	 * this file though.
+	 */
+	buf[0] = buf[1] = len;
+	goto out;
+#endif
+
+	memcpy(buf, osb->osb_dx_seed, sizeof(buf));
+
+	p = name;
+	while (len > 0) {
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		len -= 16;
+		p += 16;
+	}
+
+out:
+	hinfo->major_hash = buf[0];
+	hinfo->minor_hash = buf[1];
 }
 
 /*
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
 }
 
 /*
+ * Validate a directory trailer.
+ *
+ * We check the trailer here rather than in ocfs2_validate_dir_block()
+ * because that function doesn't have the inode to test.
+ */
+static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
+{
+	int rc = 0;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
+	if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Invalid dirblock #%llu: "
+			    "signature = %.*s\n",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    trailer->db_signature);
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu has an invalid "
+			    "db_blkno of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+	if (le64_to_cpu(trailer->db_parent_dinode) !=
+	    OCFS2_I(dir)->ip_blkno) {
+		rc = -EINVAL;
+		ocfs2_error(dir->i_sb,
+			    "Directory block #%llu on dinode "
+			    "#%llu has an invalid parent_dinode "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/*
  * This function forces all errors to -EIO for consistency with its
  * predecessor, ocfs2_bread().  We haven't audited what returning the
  * real error codes would do to callers.  We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 {
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
-	struct ocfs2_dir_block_trailer *trailer;
 
 	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
 				    ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 		goto out;
 	}
 
-	/*
-	 * We check the trailer here rather than in
-	 * ocfs2_validate_dir_block() because that function doesn't have
-	 * the inode to test.
-	 */
 	if (!(flags & OCFS2_BH_READAHEAD) &&
-	    ocfs2_dir_has_trailer(inode)) {
-		trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
-		if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Invalid dirblock #%llu: "
-				    "signature = %.*s\n",
-				    (unsigned long long)tmp->b_blocknr, 7,
-				    trailer->db_signature);
-			goto out;
-		}
-		if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Directory block #%llu has an invalid "
-				    "db_blkno of %llu",
-				    (unsigned long long)tmp->b_blocknr,
-				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
-			goto out;
-		}
-		if (le64_to_cpu(trailer->db_parent_dinode) !=
-		    OCFS2_I(inode)->ip_blkno) {
-			rc = -EINVAL;
-			ocfs2_error(inode->i_sb,
-				    "Directory block #%llu on dinode "
-				    "#%llu has an invalid parent_dinode "
-				    "of %llu",
-				    (unsigned long long)tmp->b_blocknr,
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+	    ocfs2_supports_dir_trailer(inode)) {
+		rc = ocfs2_check_dir_trailer(inode, tmp);
+		if (rc) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(rc);
 			goto out;
 		}
 	}
@@ -379,6 +553,141 @@ out:
 	return rc ? -EIO : 0;
 }
 
+/*
+ * Read the block at 'phys' which belongs to this directory
+ * inode. This function does no virtual->physical block translation -
+ * what's passed in is assumed to be a valid directory block.
+ */
+static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
+				       struct buffer_head **bh)
+{
+	int ret;
+	struct buffer_head *tmp = *bh;
+
+	ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_supports_dir_trailer(dir)) {
+		ret = ocfs2_check_dir_trailer(dir, tmp);
+		if (ret) {
+			if (!*bh)
+				brelse(tmp);
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!ret && !*bh)
+		*bh = tmp;
+out:
+	return ret;
+}
+
+static int ocfs2_validate_dx_root(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_root_block *dx_root;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index root block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
+		ocfs2_error(sb,
+			    "Dir Index Root # %llu has bad signature %.*s",
+			    (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+			    7, dx_root->dr_signature);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
+			      struct buffer_head **dx_root_bh)
+{
+	int ret;
+	u64 blkno = le64_to_cpu(di->i_dx_root);
+	struct buffer_head *tmp = *dx_root_bh;
+
+	ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_root_bh)
+		*dx_root_bh = tmp;
+
+	return ret;
+}
+
+static int ocfs2_validate_dx_leaf(struct super_block *sb,
+				  struct buffer_head *bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
+	if (ret) {
+		mlog(ML_ERROR,
+		     "Checksum failed for dir index leaf block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return ret;
+	}
+
+	if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
+		ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+			    7, dx_leaf->dl_signature);
+		return -EROFS;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
+			      struct buffer_head **dx_leaf_bh)
+{
+	int ret;
+	struct buffer_head *tmp = *dx_leaf_bh;
+
+	ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!ret && !*dx_leaf_bh)
+		*dx_leaf_bh = tmp;
+
+	return ret;
+}
+
+/*
+ * Read a series of dx_leaf blocks. This expects all buffer_head
+ * pointers to be NULL on function entry.
+ */
+static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
+				struct buffer_head **dx_leaf_bhs)
+{
+	int ret;
+
+	ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
+				ocfs2_validate_dx_leaf);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 					       struct inode *dir,
 					       struct ocfs2_dir_entry **res_dir)
@@ -480,39 +789,340 @@ cleanup_and_exit:
 	return ret;
 }
 
+static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
+				   struct ocfs2_extent_list *el,
+				   u32 major_hash,
+				   u32 *ret_cpos,
+				   u64 *ret_phys_blkno,
+				   unsigned int *ret_clen)
+{
+	int ret = 0, i, found;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "btree tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= major_hash) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in btree", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = le64_to_cpu(rec->e_blkno);
+	if (ret_cpos)
+		*ret_cpos = le32_to_cpu(rec->e_cpos);
+	if (ret_clen)
+		*ret_clen = le16_to_cpu(rec->e_leaf_clusters);
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Returns the block index, from the start of the cluster which this
+ * hash belongs too.
+ */
+static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+						   u32 minor_hash)
+{
+	return minor_hash & osb->osb_dx_mask;
+}
+
+static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+					  struct ocfs2_dx_hinfo *hinfo)
+{
+	return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
+}
+
+static int ocfs2_dx_dir_lookup(struct inode *inode,
+			       struct ocfs2_extent_list *el,
+			       struct ocfs2_dx_hinfo *hinfo,
+			       u32 *ret_cpos,
+			       u64 *ret_phys_blkno)
+{
+	int ret = 0;
+	unsigned int cend, uninitialized_var(clen);
+	u32 uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	u32 name_hash = hinfo->major_hash;
+
+	ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
+				      &clen);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cend = cpos + clen;
+	if (name_hash >= cend) {
+		/* We want the last cluster */
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
+		cpos += clen - 1;
+	} else {
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb,
+						  name_hash - cpos);
+		cpos = name_hash;
+	}
+
+	/*
+	 * We now have the cluster which should hold our entry. To
+	 * find the exact block from the start of the cluster to
+	 * search, we take the lower bits of the hash.
+	 */
+	blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
+
+	if (ret_phys_blkno)
+		*ret_phys_blkno = blkno;
+	if (ret_cpos)
+		*ret_cpos = cpos;
+
+out:
+
+	return ret;
+}
+
+static int ocfs2_dx_dir_search(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dx_root_block *dx_root,
+			       struct ocfs2_dir_lookup_result *res)
+{
+	int ret, i, found;
+	u64 uninitialized_var(phys);
+	struct buffer_head *dx_leaf_bh = NULL;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = NULL;
+	struct buffer_head *dir_ent_bh = NULL;
+	struct ocfs2_dir_entry *dir_ent = NULL;
+	struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
+	struct ocfs2_extent_list *dr_el;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
+
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+		goto search;
+	}
+
+	dr_el = &dx_root->dr_list;
+
+	ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
+	     "returns: %llu\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+	     namelen, name, hinfo->major_hash, hinfo->minor_hash,
+	     (unsigned long long)phys);
+
+	ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
+
+	mlog(0, "leaf info: num_used: %d, count: %d\n",
+	     le16_to_cpu(dx_leaf->dl_list.de_num_used),
+	     le16_to_cpu(dx_leaf->dl_list.de_count));
+
+	entry_list = &dx_leaf->dl_list;
+
+search:
+	/*
+	 * Empty leaf is legal, so no need to check for that.
+	 */
+	found = 0;
+	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+		dx_entry = &entry_list->de_entries[i];
+
+		if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
+		    || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
+			continue;
+
+		/*
+		 * Search unindexed leaf block now. We're not
+		 * guaranteed to find anything.
+		 */
+		ret = ocfs2_read_dir_block_direct(dir,
+					  le64_to_cpu(dx_entry->dx_dirent_blk),
+					  &dir_ent_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * XXX: We should check the unindexed block here,
+		 * before using it.
+		 */
+
+		found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
+					      0, dir_ent_bh->b_data,
+					      dir->i_sb->s_blocksize, &dir_ent);
+		if (found == 1)
+			break;
+
+		if (found == -1) {
+			/* This means we found a bad directory entry. */
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		brelse(dir_ent_bh);
+		dir_ent_bh = NULL;
+	}
+
+	if (found <= 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	res->dl_leaf_bh = dir_ent_bh;
+	res->dl_entry = dir_ent;
+	res->dl_dx_leaf_bh = dx_leaf_bh;
+	res->dl_dx_entry = dx_entry;
+
+	ret = 0;
+out:
+	if (ret) {
+		brelse(dx_leaf_bh);
+		brelse(dir_ent_bh);
+	}
+	return ret;
+}
+
+static int ocfs2_find_entry_dx(const char *name, int namelen,
+			       struct inode *dir,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+
+	ret = ocfs2_read_inode_block(dir, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
+	if (ret) {
+		if (ret != -ENOENT)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	lookup->dl_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * Try to find an entry of the provided name within 'dir'.
  *
- * If nothing was found, NULL is returned. Otherwise, a buffer_head
- * and pointer to the dir entry are passed back.
+ * If nothing was found, -ENOENT is returned. Otherwise, zero is
+ * returned and the struct 'res' will contain information useful to
+ * other directory manipulation functions.
  *
  * Caller can NOT assume anything about the contents of the
- * buffer_head - it is passed back only so that it can be passed into
- * any one of the manipulation functions (add entry, delete entry,
- * etc). As an example, bh in the extent directory case is a data
- * block, in the inline-data case it actually points to an inode.
+ * buffer_heads - they are passed back only so that it can be passed
+ * into any one of the manipulation functions (add entry, delete
+ * entry, etc). As an example, bh in the extent directory case is a
+ * data block, in the inline-data case it actually points to an inode,
+ * in the indexed directory case, multiple buffers are involved.
  */
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir)
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
 {
-	*res_dir = NULL;
+	struct buffer_head *bh;
+	struct ocfs2_dir_entry *res_dir = NULL;
 
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+
+	/*
+	 * The unindexed dir code only uses part of the lookup
+	 * structure, so there's no reason to push it down further
+	 * than this.
+	 */
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_find_entry_id(name, namelen, dir, res_dir);
+		bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
+	else
+		bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+
+	if (bh == NULL)
+		return -ENOENT;
 
-	return ocfs2_find_entry_el(name, namelen, dir, res_dir);
+	lookup->dl_leaf_bh = bh;
+	lookup->dl_entry = res_dir;
+	return 0;
 }
 
 /*
  * Update inode number and type of a previously found directory entry.
  */
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode)
 {
 	int ret;
 	ocfs2_journal_access_func access = ocfs2_journal_access_db;
+	struct ocfs2_dir_entry *de = res->dl_entry;
+	struct buffer_head *de_bh = res->dl_leaf_bh;
 
 	/*
 	 * The same code works fine for both inline-data and extent
@@ -538,6 +1148,10 @@ out:
 	return ret;
 }
 
+/*
+ * __ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
 static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 				struct ocfs2_dir_entry *de_del,
 				struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
 	return status;
 }
 
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+	unsigned int hole;
+
+	if (le64_to_cpu(de->inode) == 0)
+		hole = le16_to_cpu(de->rec_len);
+	else
+		hole = le16_to_cpu(de->rec_len) -
+			OCFS2_DIR_REC_LEN(de->name_len);
+
+	return hole;
+}
+
+static int ocfs2_find_max_rec_len(struct super_block *sb,
+				  struct buffer_head *dirblock_bh)
+{
+	int size, this_hole, largest_hole = 0;
+	char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
+	struct ocfs2_dir_entry *de;
+
+	trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
+	size = ocfs2_dir_trailer_blk_off(sb);
+	limit = start + size;
+	de_buf = start;
+	de = (struct ocfs2_dir_entry *)de_buf;
+	do {
+		if (de_buf != trailer) {
+			this_hole = ocfs2_figure_dirent_hole(de);
+			if (this_hole > largest_hole)
+				largest_hole = this_hole;
+		}
+
+		de_buf += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)de_buf;
+	} while (de_buf < limit);
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
+}
+
+static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
+				       int index)
+{
+	int num_used = le16_to_cpu(entry_list->de_num_used);
+
+	if (num_used == 1 || index == (num_used - 1))
+		goto clear;
+
+	memmove(&entry_list->de_entries[index],
+		&entry_list->de_entries[index + 1],
+		(num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
+clear:
+	num_used--;
+	memset(&entry_list->de_entries[num_used], 0,
+	       sizeof(struct ocfs2_dx_entry));
+	entry_list->de_num_used = cpu_to_le16(num_used);
+}
+
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+				 struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, index, max_rec_len, add_to_free_list = 0;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+	struct ocfs2_dir_block_trailer *trailer;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * This function gets a bit messy because we might have to
+	 * modify the root block, regardless of whether the indexed
+	 * entries are stored inline.
+	 */
+
+	/*
+	 * *Only* set 'entry_list' here, based on where we're looking
+	 * for the indexed entries. Later, we might still want to
+	 * journal both blocks, based on free list state.
+	 */
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+	} else {
+		dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+		entry_list = &dx_leaf->dl_list;
+	}
+
+	/* Neither of these are a disk corruption - that should have
+	 * been caught by lookup, before we got here. */
+	BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+	BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+	index = (char *)dx_entry - (char *)entry_list->de_entries;
+	index /= sizeof(*dx_entry);
+
+	if (index >= le16_to_cpu(entry_list->de_num_used)) {
+		mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+		     entry_list, dx_entry);
+		return -EIO;
+	}
+
+	/*
+	 * We know that removal of this dirent will leave enough room
+	 * for a new one, so add this block to the free list if it
+	 * isn't already there.
+	 */
+	trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+	if (trailer->db_free_rec_len == 0)
+		add_to_free_list = 1;
+
+	/*
+	 * Add the block holding our index into the journal before
+	 * removing the unindexed entry. If we get an error return
+	 * from __ocfs2_delete_entry(), then it hasn't removed the
+	 * entry yet. Likewise, successful return means we *must*
+	 * remove the indexed entry.
+	 *
+	 * We're also careful to journal the root tree block here as
+	 * the entry count needs to be updated. Also, we might be
+	 * adding to the start of the free list.
+	 */
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_journal_access_dl(handle, dir,
+					      lookup->dl_dx_leaf_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	mlog(0, "Dir %llu: delete entry at index: %d\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+
+	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+	trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+	if (add_to_free_list) {
+		trailer->db_free_next = dx_root->dr_free_blk;
+		dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+		ocfs2_journal_dirty(handle, dx_root_bh);
+	}
+
+	/* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	le32_add_cpu(&dx_root->dr_num_entries, -1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+	ocfs2_dx_list_remove_entry(entry_list, index);
+
+	if (!ocfs2_dx_root_inline(dx_root))
+		ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+	return ret;
+}
+
 static inline int ocfs2_delete_entry_id(handle_t *handle,
 					struct inode *dir,
 					struct ocfs2_dir_entry *de_del,
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
 }
 
 /*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
  */
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh)
+		       struct ocfs2_dir_lookup_result *res)
 {
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_delete_entry_dx(handle, dir, res);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+		return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+					     res->dl_leaf_bh);
 
-	return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+	return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+				     res->dl_leaf_bh);
 }
 
 /*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
 	return 0;
 }
 
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+					  struct ocfs2_dx_entry *dx_new_entry)
+{
+	int i;
+
+	i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+	le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+				       struct ocfs2_dx_hinfo *hinfo,
+				       u64 dirent_blk)
+{
+	int i;
+	struct ocfs2_dx_entry *dx_entry;
+
+	i = le16_to_cpu(entry_list->de_num_used);
+	dx_entry = &entry_list->de_entries[i];
+
+	memset(dx_entry, 0, sizeof(*dx_entry));
+	dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+	dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+	dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+	le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+				      struct ocfs2_dx_hinfo *hinfo,
+				      u64 dirent_blk,
+				      struct buffer_head *dx_leaf_bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf;
+
+	ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+					struct ocfs2_dx_hinfo *hinfo,
+					u64 dirent_blk,
+					struct ocfs2_dx_root_block *dx_root)
+{
+	ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ocfs2_dx_inline_root_insert(dir, handle,
+					    &lookup->dl_hinfo,
+					    lookup->dl_leaf_bh->b_blocknr,
+					    dx_root);
+	} else {
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+						 lookup->dl_leaf_bh->b_blocknr,
+						 lookup->dl_dx_leaf_bh);
+		if (ret)
+			goto out;
+	}
+
+	le32_add_cpu(&dx_root->dr_num_entries, 1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+				       handle_t *handle,
+				       struct ocfs2_dir_lookup_result *lookup)
+{
+	struct ocfs2_dir_block_trailer *trailer, *prev;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *bh;
+
+	trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+	if (ocfs2_free_list_at_root(lookup)) {
+		bh = lookup->dl_dx_root_bh;
+		dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+		dx_root->dr_free_blk = trailer->db_free_next;
+	} else {
+		bh = lookup->dl_prev_leaf_bh;
+		prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+		prev->db_free_next = trailer->db_free_next;
+	}
+
+	trailer->db_free_rec_len = cpu_to_le16(0);
+	trailer->db_free_next = cpu_to_le64(0);
+
+	ocfs2_journal_dirty(handle, bh);
+	ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int max_rec_len;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	/* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+	if (max_rec_len) {
+		/*
+		 * There's still room in this block, so no need to remove it
+		 * from the free list. In this case, we just want to update
+		 * the rec len accounting.
+		 */
+		trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+		trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+		ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+	} else {
+		ocfs2_remove_block_from_free_list(dir, handle, lookup);
+	}
+}
+
 /* we don't always have a dentry for what we want to add, so people
  * like orphan dir can call this instead.
  *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
  */
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh)
+		      struct ocfs2_dir_lookup_result *lookup)
 {
 	unsigned long offset;
 	unsigned short rec_len;
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
 	struct super_block *sb = dir->i_sb;
 	int retval, status;
 	unsigned int size = sb->s_blocksize;
+	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
 
 	mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
 	if (!namelen)
 		return -EINVAL;
 
-	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+	if (ocfs2_dir_indexed(dir)) {
+		struct buffer_head *bh;
+
+		/*
+		 * An indexed dir may require that we update the free space
+		 * list. Reserve a write to the previous node in the list so
+		 * that we don't fail later.
+		 *
+		 * XXX: This can be either a dx_root_block, or an unindexed
+		 * directory tree leaf block.
+		 */
+		if (ocfs2_free_list_at_root(lookup)) {
+			bh = lookup->dl_dx_root_bh;
+			retval = ocfs2_journal_access_dr(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		} else {
+			bh = lookup->dl_prev_leaf_bh;
+			retval = ocfs2_journal_access_db(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		}
+		if (retval) {
+			mlog_errno(retval);
+			return retval;
+		}
+	} else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		data_start = di->id2.i_data.id_data;
 		size = i_size_read(dir);
 
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
 				status = ocfs2_journal_access_di(handle, dir,
 								 insert_bh,
 								 OCFS2_JOURNAL_ACCESS_WRITE);
-			else
+			else {
 				status = ocfs2_journal_access_db(handle, dir,
 								 insert_bh,
-								 OCFS2_JOURNAL_ACCESS_WRITE);
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+
+				if (ocfs2_dir_indexed(dir)) {
+					status = ocfs2_dx_dir_insert(dir,
+								handle,
+								lookup);
+					if (status) {
+						mlog_errno(status);
+						goto bail;
+					}
+				}
+			}
+
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
 			de->name_len = namelen;
 			memcpy(de->name, name, namelen);
 
+			if (ocfs2_dir_indexed(dir))
+				ocfs2_recalc_free_list(dir, handle, lookup);
+
 			dir->i_version++;
 			status = ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
@@ -870,6 +1851,10 @@ out:
 	return 0;
 }
 
+/*
+ * NOTE: This function can be called against unindexed directories,
+ * and indexed ones.
+ */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    u64 *f_version,
 				    loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent)
+			     struct ocfs2_dir_lookup_result *lookup)
 {
 	int status = -ENOENT;
 
-	mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
-		   namelen, name, blkno, inode, dirent_bh, dirent);
+	mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
-	if (!*dirent_bh || !*dirent) {
-		status = -ENOENT;
+	status = ocfs2_find_entry(name, namelen, inode, lookup);
+	if (status)
 		goto leave;
-	}
 
-	*blkno = le64_to_cpu((*dirent)->inode);
+	*blkno = le64_to_cpu(lookup->dl_entry->inode);
 
 	status = 0;
 leave:
-	if (status < 0) {
-		*dirent = NULL;
-		brelse(*dirent_bh);
-		*dirent_bh = NULL;
-	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
-	brelse(bh);
+	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	return ret;
 }
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 			      int namelen)
 {
 	int ret;
-	struct buffer_head *dirent_bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("dir %llu, name '%.*s'\n",
 		   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
-	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
-	if (dirent_bh)
+	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
 		goto bail;
 
 	ret = 0;
 bail:
-	brelse(dirent_bh);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(ret);
 	return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
 	unsigned seen_dot;
 	unsigned seen_dot_dot;
 	unsigned seen_other;
+	unsigned dx_dir;
 };
 static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 				   loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	/*
 	 * Check the positions of "." and ".." records to be sure
 	 * they're in the correct place.
+	 *
+	 * Indexed directories don't need to proceed past the first
+	 * two entries, so we end the scan after seeing '..'. Despite
+	 * that, we allow the scan to proceed In the event that we
+	 * have a corrupted indexed directory (no dot or dot dot
+	 * entries). This allows us to double check for existing
+	 * entries which might not have been found in the index.
 	 */
 	if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
 		p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	if (name_len == 2 && !strncmp("..", name, 2) &&
 	    pos == OCFS2_DIR_REC_LEN(1)) {
 		p->seen_dot_dot = 1;
+
+		if (p->dx_dir && p->seen_dot)
+			return 1;
+
 		return 0;
 	}
 
 	p->seen_other = 1;
 	return 1;
 }
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+			      struct ocfs2_empty_dir_priv *priv)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_dx_root_block *dx_root;
+
+	priv->dx_dir = 1;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+		priv->seen_other = 1;
+
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * routine to check that the specified directory is empty (for rmdir)
  *
  * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
  */
 int ocfs2_empty_dir(struct inode *inode)
 {
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
 
 	memset(&priv, 0, sizeof(priv));
 
+	if (ocfs2_dir_indexed(inode)) {
+		ret = ocfs2_empty_dir_dx(inode, &priv);
+		if (ret)
+			mlog_errno(ret);
+		/*
+		 * We still run ocfs2_dir_foreach to get the checks
+		 * for "." and "..".
+		 */
+	}
+
 	ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
 	if (ret)
 		mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 				 struct inode *parent,
 				 struct inode *inode,
 				 struct buffer_head *fe_bh,
-				 struct ocfs2_alloc_context *data_ac)
+				 struct ocfs2_alloc_context *data_ac,
+				 struct buffer_head **ret_new_bh)
 {
 	int status;
 	unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
-	if (ocfs2_supports_dir_trailer(osb))
+	if (ocfs2_new_dir_wants_trailer(inode))
 		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
 
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
 	de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
-	if (ocfs2_supports_dir_trailer(osb))
-		ocfs2_init_dir_trailer(inode, new_bh);
+	if (ocfs2_new_dir_wants_trailer(inode)) {
+		int size = le16_to_cpu(de->rec_len);
+
+		/*
+		 * Figure out the size of the hole left over after
+		 * insertion of '.' and '..'. The trailer wants this
+		 * information.
+		 */
+		size -= OCFS2_DIR_REC_LEN(2);
+		size -= sizeof(struct ocfs2_dir_block_trailer);
+
+		ocfs2_init_dir_trailer(inode, new_bh, size);
+	}
 
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	}
 
 	status = 0;
+	if (ret_new_bh) {
+		*ret_new_bh = new_bh;
+		new_bh = NULL;
+	}
 bail:
 	brelse(new_bh);
 
@@ -1336,20 +2384,427 @@ bail:
 	return status;
 }
 
+static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
+				     handle_t *handle, struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dirdata_bh,
+				     struct ocfs2_alloc_context *meta_ac,
+				     int dx_inline, u32 num_entries,
+				     struct buffer_head **ret_dx_root_bh)
+{
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	u16 dr_suballoc_bit;
+	u64 dr_blkno;
+	unsigned int num_bits;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+
+	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
+				   &num_bits, &dr_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Dir %llu, attach new index block: %llu\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+	     (unsigned long long)dr_blkno);
+
+	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
+	if (dx_root_bh == NULL) {
+		ret = -EIO;
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	memset(dx_root, 0, osb->sb->s_blocksize);
+	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
+	dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
+	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
+	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
+	dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
+	dx_root->dr_num_entries = cpu_to_le32(num_entries);
+	if (le16_to_cpu(trailer->db_free_rec_len))
+		dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+	else
+		dx_root->dr_free_blk = cpu_to_le64(0);
+
+	if (dx_inline) {
+		dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+		dx_root->dr_entries.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
+	} else {
+		dx_root->dr_list.l_count =
+			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+	}
+
+	ret = ocfs2_journal_dirty(handle, dx_root_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di->i_dx_root = cpu_to_le64(dr_blkno);
+
+	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	*ret_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+
+out:
+	brelse(dx_root_bh);
+	return ret;
+}
+
+static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
+				       handle_t *handle, struct inode *dir,
+				       struct buffer_head **dx_leaves,
+				       int num_dx_leaves, u64 start_blk)
+{
+	int ret, i;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct buffer_head *bh;
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		bh = sb_getblk(osb->sb, start_blk + i);
+		if (bh == NULL) {
+			ret = -EIO;
+			goto out;
+		}
+		dx_leaves[i] = bh;
+
+		ocfs2_set_new_buffer_uptodate(dir, bh);
+
+		ret = ocfs2_journal_access_dl(handle, dir, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
+
+		memset(dx_leaf, 0, osb->sb->s_blocksize);
+		strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+		dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
+		dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
+		dx_leaf->dl_list.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
+
+		mlog(0,
+		     "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+		     (unsigned long long)bh->b_blocknr,
+		     le16_to_cpu(dx_leaf->dl_list.de_count));
+
+		ocfs2_journal_dirty(handle, bh);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * Allocates and formats a new cluster for use in an indexed dir
+ * leaf. This version will not do the extent insert, so that it can be
+ * used by operations which need careful ordering.
+ */
+static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
+				      u32 cpos, handle_t *handle,
+				      struct ocfs2_alloc_context *data_ac,
+				      struct buffer_head **dx_leaves,
+				      int num_dx_leaves, u64 *ret_phys_blkno)
+{
+	int ret;
+	u32 phys, num;
+	u64 phys_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	/*
+	 * XXX: For create, this should claim cluster for the index
+	 * *before* the unindexed insert so that we have a better
+	 * chance of contiguousness as the directory grows in number
+	 * of entries.
+	 */
+	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Format the new cluster first. That way, we're inserting
+	 * valid data.
+	 */
+	phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
+	ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
+					  num_dx_leaves, phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_phys_blkno = phys_blkno;
+out:
+	return ret;
+}
+
+static int ocfs2_dx_dir_new_cluster(struct inode *dir,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, handle_t *handle,
+				    struct ocfs2_alloc_context *data_ac,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves)
+{
+	int ret;
+	u64 phys_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
+					 num_dx_leaves, &phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
+				  meta_ac);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
+							int *ret_num_leaves)
+{
+	int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
+	struct buffer_head **dx_leaves;
+
+	dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
+			    GFP_NOFS);
+	if (dx_leaves && ret_num_leaves)
+		*ret_num_leaves = num_dx_leaves;
+
+	return dx_leaves;
+}
+
+static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *di_bh,
+				 struct ocfs2_alloc_context *data_ac,
+				 struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct buffer_head *leaf_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_hinfo hinfo;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * Our strategy is to create the directory as though it were
+	 * unindexed, then add the index block. This works with very
+	 * little complication since the state of a new directory is a
+	 * very well known quantity.
+	 *
+	 * Essentially, we have two dirents ("." and ".."), in the 1st
+	 * block which need indexing. These are easily inserted into
+	 * the index block.
+	 */
+
+	ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
+				    data_ac, &leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
+					meta_ac, 1, 2, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	/* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
+	ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+	ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+out:
+	brelse(dx_root_bh);
+	brelse(leaf_bh);
+	return ret;
+}
+
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac)
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac)
+
 {
 	BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
 
+	if (ocfs2_supports_indexed_dirs(osb))
+		return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
+					     data_ac, meta_ac);
+
 	return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
-				     data_ac);
+				     data_ac, NULL);
+}
+
+static int ocfs2_dx_dir_index_block(struct inode *dir,
+				    handle_t *handle,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves,
+				    u32 *num_dx_entries,
+				    struct buffer_head *dirent_bh)
+{
+	int ret, namelen, i;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct buffer_head *dx_leaf_bh;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		namelen = de->name_len;
+		if (!namelen || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
+
+		i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
+		dx_leaf_bh = dx_leaves[i];
+
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
+						 dirent_blk, dx_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		*num_dx_entries = *num_dx_entries + 1;
+
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * XXX: This expects dx_root_bh to already be part of the transaction.
+ */
+static void ocfs2_dx_dir_index_root_block(struct inode *dir,
+					 struct buffer_head *dx_root_bh,
+					 struct buffer_head *dirent_bh)
+{
+	char *de_buf, *limit;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (!de->name_len || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
+
+		mlog(0,
+		     "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
+		     (unsigned long long)dir->i_ino, hinfo.major_hash,
+		     hinfo.minor_hash,
+		     le16_to_cpu(dx_root->dr_entries.de_num_used),
+		     de->name_len, de->name);
+
+		ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
+					   dirent_blk);
+
+		le32_add_cpu(&dx_root->dr_num_entries, 1);
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+}
+
+/*
+ * Count the number of inline directory entries in di_bh and compare
+ * them against the number of entries we can hold in an inline dx root
+ * block.
+ */
+static int ocfs2_new_dx_should_be_inline(struct inode *dir,
+					 struct buffer_head *di_bh)
+{
+	int dirent_count = 0;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	de_buf = di->id2.i_data.id_data;
+	limit = de_buf + i_size_read(dir);
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (de->name_len && de->inode)
+			dirent_count++;
+
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+	/* We are careful to leave room for one extra record. */
+	return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
 }
 
 /*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
  * expansion from an inline directory to one with extents. The first dir block
  * in that case is taken from the inline data portion of the inode block.
  *
+ * This will also return the largest amount of contiguous space for a dirent
+ * in the block. That value is *not* necessarily the last dirent, even after
+ * expansion. The directory indexing code wants this value for free space
+ * accounting. We do this here since we're already walking the entire dir
+ * block.
+ *
  * We add the dir trailer if this filesystem wants it.
  */
-static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-				     struct super_block *sb)
+static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+					     struct inode *dir)
 {
+	struct super_block *sb = dir->i_sb;
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *prev_de;
 	char *de_buf, *limit;
 	unsigned int new_size = sb->s_blocksize;
-	unsigned int bytes;
+	unsigned int bytes, this_hole;
+	unsigned int largest_hole = 0;
 
-	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+	if (ocfs2_new_dir_wants_trailer(dir))
 		new_size = ocfs2_dir_trailer_blk_off(sb);
 
 	bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
 	de_buf = start;
 	de = (struct ocfs2_dir_entry *)de_buf;
 	do {
+		this_hole = ocfs2_figure_dirent_hole(de);
+		if (this_hole > largest_hole)
+			largest_hole = this_hole;
+
 		prev_de = de;
 		de_buf += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *)de_buf;
 	} while (de_buf < limit);
 
 	le16_add_cpu(&prev_de->rec_len, bytes);
+
+	/* We need to double check this after modification of the final
+	 * dirent. */
+	this_hole = ocfs2_figure_dirent_hole(prev_de);
+	if (this_hole > largest_hole)
+		largest_hole = this_hole;
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
 }
 
 /*
@@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
  */
 static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int blocks_wanted,
+				   struct ocfs2_dir_lookup_result *lookup,
 				   struct buffer_head **first_block_bh)
 {
-	u32 alloc, bit_off, len;
+	u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
 	struct super_block *sb = dir->i_sb;
-	int ret, credits = ocfs2_inline_to_extents_credits(sb);
-	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
+	int ret, i, num_dx_leaves = 0, dx_inline = 0,
+		credits = ocfs2_inline_to_extents_credits(sb);
+	u64 dx_insert_blkno, blkno,
+		bytes = blocks_wanted << sb->s_blocksize_bits;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(dir);
 	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct buffer_head *dirdata_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct buffer_head **dx_leaves = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
-	int did_quota = 0;
+	struct ocfs2_extent_tree dx_et;
+	int did_quota = 0, bytes_allocated = 0;
 
 	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
+	dx_alloc = 0;
+
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		credits += ocfs2_add_dir_index_credits(sb);
+
+		dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
+		if (!dx_inline) {
+			/* Add one more cluster for an index leaf */
+			dx_alloc++;
+			dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
+								&num_dx_leaves);
+			if (!dx_leaves) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* This gets us the dx_root */
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
 
 	/*
-	 * We should never need more than 2 clusters for this -
-	 * maximum dirent size is far less than one block. In fact,
-	 * the only time we'd need more than one cluster is if
+	 * We should never need more than 2 clusters for the unindexed
+	 * tree - maximum dirent size is far less than one block. In
+	 * fact, the only time we'd need more than one cluster is if
 	 * blocksize == clustersize and the dirent won't fit in the
 	 * extra space that the expansion to a single block gives. As
 	 * of today, that only happens on 4k/4k file systems.
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
 	/*
 	 * Prepare for worst case allocation scenario of two separate
-	 * extents.
+	 * extents in the unindexed tree.
 	 */
 	if (alloc == 2)
 		credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	}
 
 	if (vfs_dq_alloc_space_nodirty(dir,
-				ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+				ocfs2_clusters_to_bytes(osb->sb,
+							alloc + dx_alloc))) {
 		ret = -EDQUOT;
 		goto out_commit;
 	}
 	did_quota = 1;
+
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Allocate our index cluster first, to maximize the
+		 * possibility that unindexed leaves grow
+		 * contiguously.
+		 */
+		ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
+						 dx_leaves, num_dx_leaves,
+						 &dx_insert_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+	}
+
 	/*
 	 * Try to claim as many clusters as the bitmap can give though
 	 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		mlog_errno(ret);
 		goto out_commit;
 	}
+	bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
 
 	/*
 	 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
 	memset(dirdata_bh->b_data + i_size_read(dir), 0,
 	       sb->s_blocksize - i_size_read(dir));
-	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
-	if (ocfs2_supports_dir_trailer(osb))
-		ocfs2_init_dir_trailer(dir, dirdata_bh);
+	i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
+	if (ocfs2_new_dir_wants_trailer(dir)) {
+		/*
+		 * Prepare the dir trailer up front. It will otherwise look
+		 * like a valid dirent. Even if inserting the index fails
+		 * (unlikely), then all we'll have done is given first dir
+		 * block a small amount of fragmentation.
+		 */
+		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
+	}
 
 	ret = ocfs2_journal_dirty(handle, dirdata_bh);
 	if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_commit;
 	}
 
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Dx dirs with an external cluster need to do this up
+		 * front. Inline dx root's get handled later, after
+		 * we've allocated our root block. We get passed back
+		 * a total number of items so that dr_num_entries can
+		 * be correctly set once the dx_root has been
+		 * allocated.
+		 */
+		ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
+					       num_dx_leaves, &num_dx_entries,
+					       dirdata_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
 	/*
 	 * Set extent, i_size, etc on the directory. After this, the
 	 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_commit;
 	}
 
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
+						dirdata_bh, meta_ac, dx_inline,
+						num_dx_entries, &dx_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		if (dx_inline) {
+			ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
+						      dirdata_bh);
+		} else {
+			ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
+			ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
+						  dx_insert_blkno, 1, 0, NULL);
+			if (ret)
+				mlog_errno(ret);
+		}
+	}
+
 	/*
 	 * We asked for two clusters, but only got one in the 1st
 	 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 			mlog_errno(ret);
 			goto out_commit;
 		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
 	}
 
 	*first_block_bh = dirdata_bh;
 	dirdata_bh = NULL;
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		unsigned int off;
+
+		if (!dx_inline) {
+			/*
+			 * We need to return the correct block within the
+			 * cluster which should hold our entry.
+			 */
+			off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+						    &lookup->dl_hinfo);
+			get_bh(dx_leaves[off]);
+			lookup->dl_dx_leaf_bh = dx_leaves[off];
+		}
+		lookup->dl_dx_root_bh = dx_root_bh;
+		dx_root_bh = NULL;
+	}
 
 out_commit:
 	if (ret < 0 && did_quota)
-		vfs_dq_free_space_nodirty(dir,
-			ocfs2_clusters_to_bytes(osb->sb, 2));
+		vfs_dq_free_space_nodirty(dir, bytes_allocated);
+
 	ocfs2_commit_trans(osb, handle);
 
 out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
 out:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++)
+			brelse(dx_leaves[i]);
+		kfree(dx_leaves);
+	}
 
 	brelse(dirdata_bh);
+	brelse(dx_root_bh);
 
 	return ret;
 }
@@ -1658,11 +3258,14 @@ bail:
  * is to be turned into an extent based one. The size of the dirent to
  * insert might be larger than the space gained by growing to just one
  * block, so we may have to grow the inode by two blocks in that case.
+ *
+ * If the directory is already indexed, dx_root_bh must be provided.
  */
 static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct inode *dir,
 			    struct buffer_head *parent_fe_bh,
 			    unsigned int blocks_wanted,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct buffer_head **new_de_bh)
 {
 	int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	struct ocfs2_dir_entry * de;
 	struct super_block *sb = osb->sb;
 	struct ocfs2_extent_tree et;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
 
 	mlog_entry_void();
 
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		/*
+		 * This would be a code error as an inline directory should
+		 * never have an index root.
+		 */
+		BUG_ON(dx_root_bh);
+
 		status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
-						 blocks_wanted, &new_bh);
+						 blocks_wanted, lookup,
+						 &new_bh);
 		if (status) {
 			mlog_errno(status);
 			goto bail;
 		}
 
+		/* Expansion from inline to an indexed directory will
+		 * have given us this. */
+		dx_root_bh = lookup->dl_dx_root_bh;
+
 		if (blocks_wanted == 1) {
 			/*
 			 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	}
 
 do_extend:
+	if (ocfs2_dir_indexed(dir))
+		credits++; /* For attaching the new dirent block to the
+			    * dx_root */
+
 	down_write(&OCFS2_I(dir)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
@@ -1781,9 +3400,19 @@ do_extend:
 
 	de = (struct ocfs2_dir_entry *) new_bh->b_data;
 	de->inode = 0;
-	if (ocfs2_dir_has_trailer(dir)) {
+	if (ocfs2_supports_dir_trailer(dir)) {
 		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
-		ocfs2_init_dir_trailer(dir, new_bh);
+
+		ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
+
+		if (ocfs2_dir_indexed(dir)) {
+			status = ocfs2_dx_dir_link_trailer(dir, handle,
+							   dx_root_bh, new_bh);
+			if (status) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 	 * This calculates how many free bytes we'd have in block zero, should
 	 * this function force expansion to an extent tree.
 	 */
-	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+	if (ocfs2_new_dir_wants_trailer(dir))
 		free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
 	else
 		free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
 	return status;
 }
 
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+	const struct ocfs2_dx_entry *entry1 = a;
+	const struct ocfs2_dx_entry *entry2 = b;
+	u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
+	u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
+	u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
+	u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
+
+	if (major_hash1 > major_hash2)
+		return 1;
+	if (major_hash1 < major_hash2)
+		return -1;
+
+	/*
+	 * It is not strictly necessary to sort by minor
+	 */
+	if (minor_hash1 > minor_hash2)
+		return 1;
+	if (minor_hash1 < minor_hash2)
+		return -1;
+	return 0;
+}
+
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+	struct ocfs2_dx_entry *entry1 = a;
+	struct ocfs2_dx_entry *entry2 = b;
+	struct ocfs2_dx_entry tmp;
+
+	BUG_ON(size != sizeof(*entry1));
+
+	tmp = *entry1;
+	*entry1 = *entry2;
+	*entry2 = tmp;
+}
+
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+	int i, num = le16_to_cpu(dl_list->de_num_used);
+
+	for (i = 0; i < (num - 1); i++) {
+		if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
+		    le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Find the optimal value to split this leaf on. This expects the leaf
+ * entries to be in sorted order.
+ *
+ * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
+ * the hash we want to insert.
+ *
+ * This function is only concerned with the major hash - that which
+ * determines which cluster an item belongs to.
+ */
+static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
+					u32 leaf_cpos, u32 insert_hash,
+					u32 *split_hash)
+{
+	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+	int i, num_used = le16_to_cpu(dl_list->de_num_used);
+	int allsame;
+
+	/*
+	 * There's a couple rare, but nasty corner cases we have to
+	 * check for here. All of them involve a leaf where all value
+	 * have the same hash, which is what we look for first.
+	 *
+	 * Most of the time, all of the above is false, and we simply
+	 * pick the median value for a split.
+	 */
+	allsame = ocfs2_dx_leaf_same_major(dx_leaf);
+	if (allsame) {
+		u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
+
+		if (val == insert_hash) {
+			/*
+			 * No matter where we would choose to split,
+			 * the new entry would want to occupy the same
+			 * block as these. Since there's no space left
+			 * in their existing block, we know there
+			 * won't be space after the split.
+			 */
+			return -ENOSPC;
+		}
+
+		if (val == leaf_cpos) {
+			/*
+			 * Because val is the same as leaf_cpos (which
+			 * is the smallest value this leaf can have),
+			 * yet is not equal to insert_hash, then we
+			 * know that insert_hash *must* be larger than
+			 * val (and leaf_cpos). At least cpos+1 in value.
+			 *
+			 * We also know then, that there cannot be an
+			 * adjacent extent (otherwise we'd be looking
+			 * at it). Choosing this value gives us a
+			 * chance to get some contiguousness.
+			 */
+			*split_hash = leaf_cpos + 1;
+			return 0;
+		}
+
+		if (val > insert_hash) {
+			/*
+			 * val can not be the same as insert hash, and
+			 * also must be larger than leaf_cpos. Also,
+			 * we know that there can't be a leaf between
+			 * cpos and val, otherwise the entries with
+			 * hash 'val' would be there.
+			 */
+			*split_hash = val;
+			return 0;
+		}
+
+		*split_hash = insert_hash;
+		return 0;
+	}
+
+	/*
+	 * Since the records are sorted and the checks above
+	 * guaranteed that not all records in this block are the same,
+	 * we simple travel forward, from the median, and pick the 1st
+	 * record whose value is larger than leaf_cpos.
+	 */
+	for (i = (num_used / 2); i < num_used; i++)
+		if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
+		    leaf_cpos)
+			break;
+
+	BUG_ON(i == num_used); /* Should be impossible */
+	*split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
+	return 0;
+}
+
+/*
+ * Transfer all entries in orig_dx_leaves whose major hash is equal to or
+ * larger than split_hash into new_dx_leaves. We use a temporary
+ * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
+ *
+ * Since the block offset inside a leaf (cluster) is a constant mask
+ * of minor_hash, we can optimize - an item at block offset X within
+ * the original cluster, will be at offset X within the new cluster.
+ */
+static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
+				       handle_t *handle,
+				       struct ocfs2_dx_leaf *tmp_dx_leaf,
+				       struct buffer_head **orig_dx_leaves,
+				       struct buffer_head **new_dx_leaves,
+				       int num_dx_leaves)
+{
+	int i, j, num_used;
+	u32 major_hash;
+	struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
+	struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+	struct ocfs2_dx_entry *dx_entry;
+
+	tmp_list = &tmp_dx_leaf->dl_list;
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
+		orig_list = &orig_dx_leaf->dl_list;
+		new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
+		new_list = &new_dx_leaf->dl_list;
+
+		num_used = le16_to_cpu(orig_list->de_num_used);
+
+		memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
+		tmp_list->de_num_used = cpu_to_le16(0);
+		memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
+
+		for (j = 0; j < num_used; j++) {
+			dx_entry = &orig_list->de_entries[j];
+			major_hash = le32_to_cpu(dx_entry->dx_major_hash);
+			if (major_hash >= split_hash)
+				ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
+							      dx_entry);
+			else
+				ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
+							      dx_entry);
+		}
+		memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
+
+		ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
+		ocfs2_journal_dirty(handle, new_dx_leaves[i]);
+	}
+}
+
+static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
+					  struct ocfs2_dx_root_block *dx_root)
+{
+	int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
+
+	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+	credits += ocfs2_quota_trans_credits(osb->sb);
+	return credits;
+}
+
+/*
+ * Find the median value in dx_leaf_bh and allocate a new leaf to move
+ * half our entries into.
+ */
+static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
+				  struct buffer_head *dx_root_bh,
+				  struct buffer_head *dx_leaf_bh,
+				  struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
+				  u64 leaf_blkno)
+{
+	struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	int credits, ret, i, num_used, did_quota = 0;
+	u32 cpos, split_hash, insert_hash = hinfo->major_hash;
+	u64 orig_leaves_start;
+	int num_dx_leaves;
+	struct buffer_head **orig_dx_leaves = NULL;
+	struct buffer_head **new_dx_leaves = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
+	struct ocfs2_extent_tree et;
+	handle_t *handle = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
+
+	mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+	     (unsigned long long)leaf_blkno, insert_hash);
+
+	ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	/*
+	 * XXX: This is a rather large limit. We should use a more
+	 * realistic value.
+	 */
+	if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
+		return -ENOSPC;
+
+	num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
+		mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
+		     "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
+		     (unsigned long long)leaf_blkno, num_used);
+		ret = -EIO;
+		goto out;
+	}
+
+	orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+	if (!orig_dx_leaves) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
+	if (!new_dx_leaves) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (vfs_dq_alloc_space_nodirty(dir,
+				       ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
+	did_quota = 1;
+
+	ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * This block is changing anyway, so we can sort it in place.
+	 */
+	sort(dx_leaf->dl_list.de_entries, num_used,
+	     sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
+	     dx_leaf_sort_swap);
+
+	ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
+					   &split_hash);
+	if (ret) {
+		mlog_errno(ret);
+		goto  out_commit;
+	}
+
+	mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
+	     leaf_cpos, split_hash, insert_hash);
+
+	/*
+	 * We have to carefully order operations here. There are items
+	 * which want to be in the new cluster before insert, but in
+	 * order to put those items in the new cluster, we alter the
+	 * old cluster. A failure to insert gets nasty.
+	 *
+	 * So, start by reserving writes to the old
+	 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
+	 * the new cluster for us, before inserting it. The insert
+	 * won't happen if there's an error before that. Once the
+	 * insert is done then, we can transfer from one leaf into the
+	 * other without fear of hitting any error.
+	 */
+
+	/*
+	 * The leaf transfer wants some scratch space so that we don't
+	 * wind up doing a bunch of expensive memmove().
+	 */
+	tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
+	if (!tmp_dx_leaf) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
+	ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
+				   orig_dx_leaves);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	cpos = split_hash;
+	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+				       data_ac, meta_ac, new_dx_leaves,
+				       num_dx_leaves);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
+				   orig_dx_leaves, new_dx_leaves, num_dx_leaves);
+
+out_commit:
+	if (ret < 0 && did_quota)
+		vfs_dq_free_space_nodirty(dir,
+				ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (orig_dx_leaves || new_dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++) {
+			if (orig_dx_leaves)
+				brelse(orig_dx_leaves[i]);
+			if (new_dx_leaves)
+				brelse(new_dx_leaves[i]);
+		}
+		kfree(orig_dx_leaves);
+		kfree(new_dx_leaves);
+	}
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	kfree(tmp_dx_leaf);
+	return ret;
+}
+
+static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
+				   struct buffer_head *di_bh,
+				   struct buffer_head *dx_root_bh,
+				   const char *name, int namelen,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, rebalanced = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_leaf_bh = NULL;
+	struct ocfs2_dx_leaf *dx_leaf;
+	u64 blkno;
+	u32 leaf_cpos;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+restart_search:
+	ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
+				  &leaf_cpos, &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+
+	if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
+	    le16_to_cpu(dx_leaf->dl_list.de_count)) {
+		if (rebalanced) {
+			/*
+			 * Rebalancing should have provided us with
+			 * space in an appropriate leaf.
+			 *
+			 * XXX: Is this an abnormal condition then?
+			 * Should we print a message here?
+			 */
+			ret = -ENOSPC;
+			goto out;
+		}
+
+		ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
+					     &lookup->dl_hinfo, leaf_cpos,
+					     blkno);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Restart the lookup. The rebalance might have
+		 * changed which block our item fits into. Mark our
+		 * progress, so we only execute this once.
+		 */
+		brelse(dx_leaf_bh);
+		dx_leaf_bh = NULL;
+		rebalanced = 1;
+		goto restart_search;
+	}
+
+	lookup->dl_dx_leaf_bh = dx_leaf_bh;
+	dx_leaf_bh = NULL;
+
+out:
+	brelse(dx_leaf_bh);
+	return ret;
+}
+
+static int ocfs2_search_dx_free_list(struct inode *dir,
+				     struct buffer_head *dx_root_bh,
+				     int namelen,
+				     struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = -ENOSPC;
+	struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
+	struct ocfs2_dir_block_trailer *db;
+	u64 next_block;
+	int rec_len = OCFS2_DIR_REC_LEN(namelen);
+	struct ocfs2_dx_root_block *dx_root;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	next_block = le64_to_cpu(dx_root->dr_free_blk);
+
+	while (next_block) {
+		brelse(prev_leaf_bh);
+		prev_leaf_bh = leaf_bh;
+		leaf_bh = NULL;
+
+		ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+		if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
+			lookup->dl_leaf_bh = leaf_bh;
+			lookup->dl_prev_leaf_bh = prev_leaf_bh;
+			leaf_bh = NULL;
+			prev_leaf_bh = NULL;
+			break;
+		}
+
+		next_block = le64_to_cpu(db->db_free_next);
+	}
+
+	if (!next_block)
+		ret = -ENOSPC;
+
+out:
+
+	brelse(leaf_bh);
+	brelse(prev_leaf_bh);
+	return ret;
+}
+
+static int ocfs2_expand_inline_dx_root(struct inode *dir,
+				       struct buffer_head *dx_root_bh)
+{
+	int ret, num_dx_leaves, i, j, did_quota = 0;
+	struct buffer_head **dx_leaves = NULL;
+	struct ocfs2_extent_tree et;
+	u64 insert_blkno;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	handle_t *handle = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+	struct ocfs2_dx_entry *dx_entry;
+	struct ocfs2_dx_leaf *target_leaf;
+
+	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+	if (!dx_leaves) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (vfs_dq_alloc_space_nodirty(dir,
+				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
+	did_quota = 1;
+
+	/*
+	 * We do this up front, before the allocation, so that a
+	 * failure to add the dx_root_bh to the journal won't result
+	 * us losing clusters.
+	 */
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
+					 num_dx_leaves, &insert_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Transfer the entries from our dx_root into the appropriate
+	 * block
+	 */
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+		dx_entry = &entry_list->de_entries[i];
+
+		j = __ocfs2_dx_dir_hash_idx(osb,
+					    le32_to_cpu(dx_entry->dx_minor_hash));
+		target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
+
+		ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
+
+		/* Each leaf has been passed to the journal already
+		 * via __ocfs2_dx_dir_new_cluster() */
+	}
+
+	dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
+	memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
+	       offsetof(struct ocfs2_dx_root_block, dr_list));
+	dx_root->dr_list.l_count =
+		cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+
+	/* This should never fail considering we start with an empty
+	 * dx_root. */
+	ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+	ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
+				  insert_blkno, 1, 0, NULL);
+	if (ret)
+		mlog_errno(ret);
+	did_quota = 0;
+
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out_commit:
+	if (ret < 0 && did_quota)
+		vfs_dq_free_space_nodirty(dir,
+					  ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	if (dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++)
+			brelse(dx_leaves[i]);
+		kfree(dx_leaves);
+	}
+	return ret;
+}
+
+static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
+{
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	if (le16_to_cpu(entry_list->de_num_used) >=
+	    le16_to_cpu(entry_list->de_count))
+		return -ENOSPC;
+
+	return 0;
+}
+
+static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
+					   struct buffer_head *di_bh,
+					   const char *name,
+					   int namelen,
+					   struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, free_dx_root = 1;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct buffer_head *dx_root_bh = NULL;
+	struct buffer_head *leaf_bh = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dx_root_block *dx_root;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
+		ret = -ENOSPC;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_inline_dx_has_space(dx_root_bh);
+
+		if (ret == 0)
+			goto search_el;
+
+		/*
+		 * We ran out of room in the root block. Expand it to
+		 * an extent, then allow ocfs2_find_dir_space_dx to do
+		 * the rest.
+		 */
+		ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Insert preparation for an indexed directory is split into two
+	 * steps. The call to find_dir_space_dx reserves room in the index for
+	 * an additional item. If we run out of space there, it's a real error
+	 * we can't continue on.
+	 */
+	ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
+				      namelen, lookup);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+search_el:
+	/*
+	 * Next, we need to find space in the unindexed tree. This call
+	 * searches using the free space linked list. If the unindexed tree
+	 * lacks sufficient space, we'll expand it below. The expansion code
+	 * is smart enough to add any new blocks to the free space list.
+	 */
+	ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
+	if (ret && ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Do this up here - ocfs2_extend_dir might need the dx_root */
+	lookup->dl_dx_root_bh = dx_root_bh;
+	free_dx_root = 0;
+
+	if (ret == -ENOSPC) {
+		ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
+
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * We make the assumption here that new leaf blocks are added
+		 * to the front of our free list.
+		 */
+		lookup->dl_prev_leaf_bh = NULL;
+		lookup->dl_leaf_bh = leaf_bh;
+	}
+
+out:
+	if (free_dx_root)
+		brelse(dx_root_bh);
+	return ret;
+}
+
+/*
+ * Get a directory ready for insert. Any directory allocation required
+ * happens here. Success returns zero, and enough context in the dir
+ * lookup result that ocfs2_add_entry() will be able complete the task
+ * with minimal performance impact.
+ */
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh)
+				 struct ocfs2_dir_lookup_result *lookup)
 {
 	int ret;
 	unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
 	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-	*ret_de_bh = NULL;
-
 	if (!namelen) {
 		ret = -EINVAL;
 		mlog_errno(ret);
 		goto out;
 	}
 
+	/*
+	 * Do this up front to reduce confusion.
+	 *
+	 * The directory might start inline, then be turned into an
+	 * indexed one, in which case we'd need to hash deep inside
+	 * ocfs2_find_dir_space_id(). Since
+	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+	 * done, there seems no point in spreading out the calls. We
+	 * can optimize away the case where the file system doesn't
+	 * support indexing.
+	 */
+	if (ocfs2_supports_indexed_dirs(osb))
+		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+	if (ocfs2_dir_indexed(dir)) {
+		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+						      name, namelen, lookup);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
 					      namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(bh);
 
 		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-				       &bh);
+				       lookup, &bh);
 		if (ret) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(!bh);
 	}
 
-	*ret_de_bh = bh;
+	lookup->dl_leaf_bh = bh;
 	bh = NULL;
 out:
 	brelse(bh);
 	return ret;
 }
+
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dx_root_bh)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dx_root_block *dx_root;
+	struct inode *dx_alloc_inode = NULL;
+	struct buffer_head *dx_alloc_bh = NULL;
+	handle_t *handle;
+	u64 blk;
+	u16 bit;
+	u64 bg_blkno;
+
+	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+	dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(dx_root->dr_suballoc_slot));
+	if (!dx_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&dx_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+	di->i_dx_root = cpu_to_le64(0ULL);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	blk = le64_to_cpu(dx_root->dr_blkno);
+	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	ocfs2_inode_unlock(dx_alloc_inode, 1);
+
+out_mutex:
+	mutex_unlock(&dx_alloc_inode->i_mutex);
+	brelse(dx_alloc_bh);
+out:
+	iput(dx_alloc_inode);
+	return ret;
+}
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+	int ret;
+	unsigned int uninitialized_var(clen);
+	u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+	u64 uninitialized_var(blkno);
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!ocfs2_dir_indexed(dir))
+		return 0;
+
+	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (ocfs2_dx_root_inline(dx_root))
+		goto remove_index;
+
+	ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+
+	/* XXX: What if dr_clusters is too large? */
+	while (le32_to_cpu(dx_root->dr_clusters)) {
+		ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+					      major_hash, &cpos, &blkno, &clen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
+					       &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (cpos == 0)
+			break;
+
+		major_hash = cpos - 1;
+	}
+
+remove_index:
+	ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_from_cache(dir, dx_root_bh);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	brelse(dx_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
-struct buffer_head *ocfs2_find_entry(const char *name,
-				     int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir);
+struct ocfs2_dx_hinfo {
+	u32	major_hash;
+	u32	minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+	struct buffer_head		*dl_leaf_bh;	/* Unindexed leaf
+							 * block */
+	struct ocfs2_dir_entry		*dl_entry;	/* Target dirent in
+							 * unindexed leaf */
+
+	struct buffer_head		*dl_dx_root_bh;	/* Root of indexed
+							 * tree */
+
+	struct buffer_head		*dl_dx_leaf_bh;	/* Indexed leaf block */
+	struct ocfs2_dx_entry		*dl_dx_entry;	/* Target dx_entry in
+							 * indexed leaf */
+	struct ocfs2_dx_hinfo		dl_hinfo;	/* Name hash results */
+
+	struct buffer_head		*dl_prev_leaf_bh;/* Previous entry in
+							  * dir free space
+							  * list. NULL if
+							  * previous entry is
+							  * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir,
+		     struct ocfs2_dir_lookup_result *lookup);
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh);
+		       struct ocfs2_dir_lookup_result *res);
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh);
+		      struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
 				  struct dentry *dentry,
 				  struct inode *inode, u64 blkno,
 				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh)
+				  struct ocfs2_dir_lookup_result *lookup)
 {
 	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
 				 dentry->d_name.name, dentry->d_name.len,
-				 inode, blkno, parent_fe_bh, insert_bh);
+				 inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode);
 
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
 int ocfs2_empty_dir(struct inode *inode);
+
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent);
+			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh);
+				 struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac);
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
 
 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
 							    void *data);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
 enum dlm_mle_type {
 	DLM_MLE_BLOCK,
 	DLM_MLE_MASTER,
-	DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name {
-	u8 len;
-	u8 name[DLM_LOCKID_NAME_MAX];
+	DLM_MLE_MIGRATION,
+	DLM_MLE_NUM_TYPES
 };
 
 struct dlm_master_list_entry {
-	struct list_head list;
+	struct hlist_node master_hash_node;
 	struct list_head hb_events;
 	struct dlm_ctxt *dlm;
 	spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
 	enum dlm_mle_type type;
 	struct o2hb_callback_func mle_hb_up;
 	struct o2hb_callback_func mle_hb_down;
-	union {
-		struct dlm_lock_resource *res;
-		struct dlm_lock_name name;
-	} u;
+	struct dlm_lock_resource *mleres;
+	unsigned char mname[DLM_LOCKID_NAME_MAX];
+	unsigned int mnamelen;
+	unsigned int mnamehash;
 };
 
 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
 	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct dlm_recovery_ctxt reco;
 	spinlock_t master_lock;
-	struct list_head master_list;
+	struct hlist_head **master_hash;
 	struct list_head mle_hb_events;
 
 	/* these give a really vague idea of the system load */
-	atomic_t local_resources;
-	atomic_t remote_resources;
-	atomic_t unknown_resources;
+	atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+	atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+	atomic_t res_tot_count;
+	atomic_t res_cur_count;
 
 	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
 	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }
 
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+						 unsigned i)
+{
+	return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+			(i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      unsigned int len);
 
 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res,
-			      u8 owner);
+
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 						 const char *lockid,
 						 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 					  DLM_LOCK_RES_MIGRATING));
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
 	return bit;
 }
 
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+					 struct dlm_lock_resource *res,
+					 u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	res->owner = owner;
+}
 
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+					    struct dlm_lock_resource *res,
+					    u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner != res->owner)
+		dlm_set_lockres_owner(dlm, res, owner);
+}
 
 #endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
 	int out = 0;
-	unsigned int namelen;
-	const char *name;
 	char *mle_type;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
 	if (mle->type == DLM_MLE_BLOCK)
 		mle_type = "BLK";
 	else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 	else
 		mle_type = "MIG";
 
-	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
 	out += snprintf(buf + out, len - out,
 			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
 			mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
 	struct dlm_master_list_entry *mle;
-	int out = 0;
-	unsigned long total = 0;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	int i, out = 0;
+	unsigned long total = 0, longest = 0, bktcnt;
 
 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);
 
 	spin_lock(&dlm->master_lock);
-	list_for_each_entry(mle, &dlm->master_list, list) {
-		++total;
-		if (db->len - out < 200)
-			continue;
-		out += dump_mle(mle, db->buf + out, db->len - out);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			++total;
+			++bktcnt;
+			if (db->len - out < 200)
+				continue;
+			out += dump_mle(mle, db->buf + out, db->len - out);
+		}
+		longest = max(longest, bktcnt);
+		bktcnt = 0;
 	}
 	spin_unlock(&dlm->master_lock);
 
 	out += snprintf(db->buf + out, db->len - out,
-			"Total on list: %ld\n", total);
+			"Total: %ld, Longest: %ld\n", total, longest);
 	return out;
 }
 
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	int out = 0;
 	struct dlm_reco_node_data *node;
 	char *state;
-	int lres, rres, ures, tres;
-
-	lres = atomic_read(&dlm->local_resources);
-	rres = atomic_read(&dlm->remote_resources);
-	ures = atomic_read(&dlm->unknown_resources);
-	tres = lres + rres + ures;
+	int cur_mles = 0, tot_mles = 0;
+	int i;
 
 	spin_lock(&dlm->spinlock);
 
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 				 db->buf + out, db->len - out);
 	out += snprintf(db->buf + out, db->len - out, "\n");
 
-	/* Mastered Resources Total: xxx  Locally: xxx  Remotely: ... */
+	/* Lock Resources: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"Lock Resources: %d (%d)\n",
+			atomic_read(&dlm->res_cur_count),
+			atomic_read(&dlm->res_tot_count));
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+	/* MLEs: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+	/*  Blocking: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Blocking: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+	/*  Mastery: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Mastery: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+	/*  Migration: xxx (xxx) */
 	out += snprintf(db->buf + out, db->len - out,
-			"Mastered Resources Total: %d  Locally: %d  "
-			"Remotely: %d  Unknown: %d\n",
-			tres, lres, rres, ures);
+			"  Migration: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
 
 	/* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
 	out += snprintf(db->buf + out, db->len - out,
 			"Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
-			"PendingBASTs=%s  Master=%s\n",
+			"PendingBASTs=%s\n",
 			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
 
 	/* Purge Count: xxx  Refs: xxx */
 	out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 	if (dlm->lockres_hash)
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
+	if (dlm->master_hash)
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
 	if (dlm->name)
 		kfree(dlm->name);
 
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
+	dlm->master_hash = (struct hlist_head **)
+				dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->master_hash) {
+		mlog_errno(-ENOMEM);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
 	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
 	if (ret < 0) {
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 		kfree(dlm->name);
 		kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	init_waitqueue_head(&dlm->reco.event);
 	init_waitqueue_head(&dlm->ast_wq);
 	init_waitqueue_head(&dlm->migration_wq);
-	INIT_LIST_HEAD(&dlm->master_list);
 	INIT_LIST_HEAD(&dlm->mle_hb_events);
 
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-	atomic_set(&dlm->local_resources, 0);
-	atomic_set(&dlm->remote_resources, 0);
-	atomic_set(&dlm->unknown_resources, 0);
+
+	atomic_set(&dlm->res_tot_count, 0);
+	atomic_set(&dlm->res_cur_count, 0);
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+		atomic_set(&dlm->mle_tot_count[i], 0);
+		atomic_set(&dlm->mle_cur_count[i], 0);
+	}
 
 	spin_lock_init(&dlm->work_lock);
 	INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 54e182a27caf..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
 				const char *name,
 				unsigned int namelen)
 {
-	struct dlm_lock_resource *res;
-
 	if (dlm != mle->dlm)
 		return 0;
 
-	if (mle->type == DLM_MLE_BLOCK ||
-	    mle->type == DLM_MLE_MIGRATION) {
-		if (namelen != mle->u.name.len ||
-    	    	    memcmp(name, mle->u.name.name, namelen)!=0)
-			return 0;
-	} else {
-		res = mle->u.res;
-		if (namelen != res->lockname.len ||
-		    memcmp(res->lockname.name, name, namelen) != 0)
-			return 0;
-	}
+	if (namelen != mle->mnamelen ||
+	    memcmp(name, mle->mname, namelen) != 0)
+		return 0;
+
 	return 1;
 }
 
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 
 	mle->dlm = dlm;
 	mle->type = type;
-	INIT_LIST_HEAD(&mle->list);
+	INIT_HLIST_NODE(&mle->master_hash_node);
 	INIT_LIST_HEAD(&mle->hb_events);
 	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
 	spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	mle->new_master = O2NM_MAX_NODES;
 	mle->inuse = 0;
 
+	BUG_ON(mle->type != DLM_MLE_BLOCK &&
+	       mle->type != DLM_MLE_MASTER &&
+	       mle->type != DLM_MLE_MIGRATION);
+
 	if (mle->type == DLM_MLE_MASTER) {
 		BUG_ON(!res);
-		mle->u.res = res;
-	} else if (mle->type == DLM_MLE_BLOCK) {
-		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
-	} else /* DLM_MLE_MIGRATION */ {
+		mle->mleres = res;
+		memcpy(mle->mname, res->lockname.name, res->lockname.len);
+		mle->mnamelen = res->lockname.len;
+		mle->mnamehash = res->lockname.hash;
+	} else {
 		BUG_ON(!name);
-		memcpy(mle->u.name.name, name, namelen);
-		mle->u.name.len = namelen;
+		mle->mleres = NULL;
+		memcpy(mle->mname, name, namelen);
+		mle->mnamelen = namelen;
+		mle->mnamehash = dlm_lockid_hash(name, namelen);
 	}
 
+	atomic_inc(&dlm->mle_tot_count[mle->type]);
+	atomic_inc(&dlm->mle_cur_count[mle->type]);
+
 	/* copy off the node_map and register hb callbacks on our copy */
 	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
 	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	__dlm_mle_attach_hb_events(dlm, mle);
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	if (!hlist_unhashed(&mle->master_hash_node))
+		hlist_del_init(&mle->master_hash_node);
+}
+
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+	struct hlist_head *bucket;
+
+	assert_spin_locked(&dlm->master_lock);
+
+	bucket = dlm_master_hash(dlm, mle->mnamehash);
+	hlist_add_head(&mle->master_hash_node, bucket);
+}
 
 /* returns 1 if found, 0 if not */
 static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
 			char *name, unsigned int namelen)
 {
 	struct dlm_master_list_entry *tmpmle;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int hash;
 
 	assert_spin_locked(&dlm->master_lock);
 
-	list_for_each_entry(tmpmle, &dlm->master_list, list) {
+	hash = dlm_lockid_hash(name, namelen);
+	bucket = dlm_master_hash(dlm, hash);
+	hlist_for_each(list, bucket) {
+		tmpmle = hlist_entry(list, struct dlm_master_list_entry,
+				     master_hash_node);
 		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
 			continue;
 		dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
 	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
 	dlm = mle->dlm;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.name.len, mle->u.name.name, mle->type);
-	} else {
-		mlog(0, "calling mle_release for %.*s, type %d\n",
-		     mle->u.res->lockname.len,
-		     mle->u.res->lockname.name, mle->type);
-	}
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
 
+	mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+	     mle->type);
+
 	/* remove from list if not already */
-	if (!list_empty(&mle->list))
-		list_del_init(&mle->list);
+	__dlm_unlink_mle(dlm, mle);
 
 	/* detach the mle from the domain node up/down events */
 	__dlm_mle_detach_hb_events(dlm, mle);
 
+	atomic_dec(&dlm->mle_cur_count[mle->type]);
+
 	/* NOTE: kfree under spinlock here.
 	 * if this is bad, we can move this to a freelist. */
 	kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
 		kmem_cache_destroy(dlm_lockres_cache);
 }
 
-static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
-				  struct dlm_lock_resource *res,
-				  u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
-
-	if (owner == dlm->node_num)
-		atomic_inc(&dlm->local_resources);
-	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_inc(&dlm->unknown_resources);
-	else
-		atomic_inc(&dlm->remote_resources);
-
-	res->owner = owner;
-}
-
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res, u8 owner)
-{
-	assert_spin_locked(&res->spinlock);
-
-	if (owner == res->owner)
-		return;
-
-	if (res->owner == dlm->node_num)
-		atomic_dec(&dlm->local_resources);
-	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-		atomic_dec(&dlm->unknown_resources);
-	else
-		atomic_dec(&dlm->remote_resources);
-
-	dlm_set_lockres_owner(dlm, res, owner);
-}
-
-
 static void dlm_lockres_release(struct kref *kref)
 {
 	struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
 	}
 	spin_unlock(&dlm->track_lock);
 
+	atomic_dec(&dlm->res_cur_count);
+
 	dlm_put(dlm);
 
 	if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
 	kref_init(&res->refs);
 
+	atomic_inc(&dlm->res_tot_count);
+	atomic_inc(&dlm->res_cur_count);
+
 	/* just for consistency */
 	spin_lock(&res->spinlock);
 	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
 		alloc_mle = NULL;
 		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
 		set_bit(dlm->node_num, mle->maybe_map);
-		list_add(&mle->list, &dlm->master_list);
+		__dlm_insert_mle(dlm, mle);
 
 		/* still holding the dlm spinlock, check the recovery map
 		 * to see if there are any nodes that still need to be 
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 						     res->lockname.len,
 						     res->lockname.name);
 						mle->type = DLM_MLE_MASTER;
-						mle->u.res = res;
+						mle->mleres = res;
 					}
 				}
 			}
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
 
 	BUG_ON(mle->type == DLM_MLE_MIGRATION);
 
-	if (mle->type != DLM_MLE_MASTER) {
-		request.namelen = mle->u.name.len;
-		memcpy(request.name, mle->u.name.name, request.namelen);
-	} else {
-		request.namelen = mle->u.res->lockname.len;
-		memcpy(request.name, mle->u.res->lockname.name,
-			request.namelen);
-	}
+	request.namelen = (u8)mle->mnamelen;
+	memcpy(request.name, mle->mname, request.namelen);
 
 again:
 	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
 		// "add the block.\n");
 		dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
 		set_bit(request->node_idx, mle->maybe_map);
-		list_add(&mle->list, &dlm->master_list);
+		__dlm_insert_mle(dlm, mle);
 		response = DLM_MASTER_RESP_NO;
 	} else {
 		// mlog(0, "mle was found\n");
@@ -1849,12 +1831,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 		if (!mle) {
 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
 			    res->owner != assert->node_idx) {
-				mlog(ML_ERROR, "assert_master from "
-					  "%u, but current owner is "
-					  "%u! (%.*s)\n",
-				       assert->node_idx, res->owner,
-				       namelen, name);
-				goto kill;
+				mlog(ML_ERROR, "DIE! Mastery assert from %u, "
+				     "but current owner is %u! (%.*s)\n",
+				     assert->node_idx, res->owner, namelen,
+				     name);
+				__dlm_print_one_lock_resource(res);
+				BUG();
 			}
 		} else if (mle->type != DLM_MLE_MIGRATION) {
 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
@@ -1967,7 +1949,7 @@ ok:
 			     assert->node_idx, rr, extra_ref, mle->inuse);
 			dlm_print_one_mle(mle);
 		}
-		list_del_init(&mle->list);
+		__dlm_unlink_mle(dlm, mle);
 		__dlm_mle_detach_hb_events(dlm, mle);
 		__dlm_put_mle(mle);
 		if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 			tmp->master = master;
 			atomic_set(&tmp->woken, 1);
 			wake_up(&tmp->wq);
-			/* remove it from the list so that only one
-			 * mle will be found */
-			list_del_init(&tmp->list);
-			/* this was obviously WRONG.  mle is uninited here.  should be tmp. */
+			/* remove it so that only one mle will be found */
+			__dlm_unlink_mle(dlm, tmp);
 			__dlm_mle_detach_hb_events(dlm, tmp);
 			ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
 			mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 	mle->master = master;
 	/* do this for consistency with other mle types */
 	set_bit(new_master, mle->maybe_map);
-	list_add(&mle->list, &dlm->master_list);
+	__dlm_insert_mle(dlm, mle);
 
 	return ret;
 }
 
-
-void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+/*
+ * Sets the owner of the lockres, associated to the mle, to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+					struct dlm_master_list_entry *mle)
 {
-	struct dlm_master_list_entry *mle, *next;
 	struct dlm_lock_resource *res;
-	unsigned int hash;
 
-	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
-top:
-	assert_spin_locked(&dlm->spinlock);
+	/* Find the lockres associated to the mle and set its owner to UNK */
+	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
+				   mle->mnamehash);
+	if (res) {
+		spin_unlock(&dlm->master_lock);
 
-	/* clean the master list */
-	spin_lock(&dlm->master_lock);
-	list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
-		BUG_ON(mle->type != DLM_MLE_BLOCK &&
-		       mle->type != DLM_MLE_MASTER &&
-		       mle->type != DLM_MLE_MIGRATION);
-
-		/* MASTER mles are initiated locally.  the waiting
-		 * process will notice the node map change
-		 * shortly.  let that happen as normal. */
-		if (mle->type == DLM_MLE_MASTER)
-			continue;
+		/* move lockres onto recovery list */
+		spin_lock(&res->spinlock);
+		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+		dlm_move_lockres_to_recovery_list(dlm, res);
+		spin_unlock(&res->spinlock);
+		dlm_lockres_put(res);
 
+		/* about to get rid of mle, detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
 
-		/* BLOCK mles are initiated by other nodes.
-		 * need to clean up if the dead node would have
-		 * been the master. */
-		if (mle->type == DLM_MLE_BLOCK) {
-			int bit;
+		/* dump the mle */
+		spin_lock(&dlm->master_lock);
+		__dlm_put_mle(mle);
+		spin_unlock(&dlm->master_lock);
+	}
 
-			spin_lock(&mle->spinlock);
-			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
-			if (bit != dead_node) {
-				mlog(0, "mle found, but dead node %u would "
-				     "not have been master\n", dead_node);
-				spin_unlock(&mle->spinlock);
-			} else {
-				/* must drop the refcount by one since the
-				 * assert_master will never arrive.  this
-				 * may result in the mle being unlinked and
-				 * freed, but there may still be a process
-				 * waiting in the dlmlock path which is fine. */
-				mlog(0, "node %u was expected master\n",
-				     dead_node);
-				atomic_set(&mle->woken, 1);
-				spin_unlock(&mle->spinlock);
-				wake_up(&mle->wq);
-				/* do not need events any longer, so detach 
-				 * from heartbeat */
-				__dlm_mle_detach_hb_events(dlm, mle);
-				__dlm_put_mle(mle);
-			}
-			continue;
-		}
+	return res;
+}
 
-		/* everything else is a MIGRATION mle */
-
-		/* the rule for MIGRATION mles is that the master
-		 * becomes UNKNOWN if *either* the original or
-		 * the new master dies.  all UNKNOWN lockreses
-		 * are sent to whichever node becomes the recovery
-		 * master.  the new master is responsible for
-		 * determining if there is still a master for
-		 * this lockres, or if he needs to take over
-		 * mastery.  either way, this node should expect
-		 * another message to resolve this. */
-		if (mle->master != dead_node &&
-		    mle->new_master != dead_node)
-			continue;
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+				    struct dlm_master_list_entry *mle)
+{
+	__dlm_mle_detach_hb_events(dlm, mle);
 
-		/* if we have reached this point, this mle needs to
-		 * be removed from the list and freed. */
+	spin_lock(&mle->spinlock);
+	__dlm_unlink_mle(dlm, mle);
+	atomic_set(&mle->woken, 1);
+	spin_unlock(&mle->spinlock);
 
-		/* remove from the list early.  NOTE: unlinking
-		 * list_head while in list_for_each_safe */
-		__dlm_mle_detach_hb_events(dlm, mle);
-		spin_lock(&mle->spinlock);
-		list_del_init(&mle->list);
+	wake_up(&mle->wq);
+}
+
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle, u8 dead_node)
+{
+	int bit;
+
+	BUG_ON(mle->type != DLM_MLE_BLOCK);
+
+	spin_lock(&mle->spinlock);
+	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+	if (bit != dead_node) {
+		mlog(0, "mle found, but dead node %u would not have been "
+		     "master\n", dead_node);
+		spin_unlock(&mle->spinlock);
+	} else {
+		/* Must drop the refcount by one since the assert_master will
+		 * never arrive. This may result in the mle being unlinked and
+		 * freed, but there may still be a process waiting in the
+		 * dlmlock path which is fine. */
+		mlog(0, "node %u was expected master\n", dead_node);
 		atomic_set(&mle->woken, 1);
 		spin_unlock(&mle->spinlock);
 		wake_up(&mle->wq);
 
-		mlog(0, "%s: node %u died during migration from "
-		     "%u to %u!\n", dlm->name, dead_node,
-		     mle->master, mle->new_master);
-		/* if there is a lockres associated with this
-	 	 * mle, find it and set its owner to UNKNOWN */
-		hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
-		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-					   mle->u.name.len, hash);
-		if (res) {
-			/* unfortunately if we hit this rare case, our
-		 	 * lock ordering is messed.  we need to drop
-		 	 * the master lock so that we can take the
-		  	 * lockres lock, meaning that we will have to
-			 * restart from the head of list. */
-			spin_unlock(&dlm->master_lock);
+		/* Do not need events any longer, so detach from heartbeat */
+		__dlm_mle_detach_hb_events(dlm, mle);
+		__dlm_put_mle(mle);
+	}
+}
 
-			/* move lockres onto recovery list */
-			spin_lock(&res->spinlock);
-			dlm_set_lockres_owner(dlm, res,
-				      	DLM_LOCK_RES_OWNER_UNKNOWN);
-			dlm_move_lockres_to_recovery_list(dlm, res);
-			spin_unlock(&res->spinlock);
-			dlm_lockres_put(res);
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	unsigned int i;
 
-			/* about to get rid of mle, detach from heartbeat */
-			__dlm_mle_detach_hb_events(dlm, mle);
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
 
-			/* dump the mle */
-			spin_lock(&dlm->master_lock);
-			__dlm_put_mle(mle);
-			spin_unlock(&dlm->master_lock);
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+
+			BUG_ON(mle->type != DLM_MLE_BLOCK &&
+			       mle->type != DLM_MLE_MASTER &&
+			       mle->type != DLM_MLE_MIGRATION);
+
+			/* MASTER mles are initiated locally. The waiting
+			 * process will notice the node map change shortly.
+			 * Let that happen as normal. */
+			if (mle->type == DLM_MLE_MASTER)
+				continue;
+
+			/* BLOCK mles are initiated by other nodes. Need to
+			 * clean up if the dead node would have been the
+			 * master. */
+			if (mle->type == DLM_MLE_BLOCK) {
+				dlm_clean_block_mle(dlm, mle, dead_node);
+				continue;
+			}
 
-			/* restart */
-			goto top;
-		}
+			/* Everything else is a MIGRATION mle */
+
+			/* The rule for MIGRATION mles is that the master
+			 * becomes UNKNOWN if *either* the original or the new
+			 * master dies. All UNKNOWN lockres' are sent to
+			 * whichever node becomes the recovery master. The new
+			 * master is responsible for determining if there is
+			 * still a master for this lockres, or if he needs to
+			 * take over mastery. Either way, this node should
+			 * expect another message to resolve this. */
+
+			if (mle->master != dead_node &&
+			    mle->new_master != dead_node)
+				continue;
+
+			/* If we have reached this point, this mle needs to be
+			 * removed from the list and freed. */
+			dlm_clean_migration_mle(dlm, mle);
+
+			mlog(0, "%s: node %u died during migration from "
+			     "%u to %u!\n", dlm->name, dead_node, mle->master,
+			     mle->new_master);
+
+			/* If we find a lockres associated with the mle, we've
+			 * hit this rare case that messes up our lock ordering.
+			 * If so, we need to drop the master lock so that we can
+			 * take the lockres lock, meaning that we will have to
+			 * restart from the head of list. */
+			res = dlm_reset_mleres_owner(dlm, mle);
+			if (res)
+				/* restart */
+				goto top;
 
-		/* this may be the last reference */
-		__dlm_put_mle(mle);
+			/* This may be the last reference */
+			__dlm_put_mle(mle);
+		}
 	}
 	spin_unlock(&dlm->master_lock);
 }
 
-
 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 u8 old_master)
 {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d1295203029f..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	if (!__dlm_lockres_unused(res)) {
-		spin_unlock(&res->spinlock);
 		mlog(0, "%s:%.*s: tried to purge but not unused\n",
 		     dlm->name, res->lockname.len, res->lockname.name);
-		return -ENOTEMPTY;
+		__dlm_print_one_lock_resource(res);
+		spin_unlock(&res->spinlock);
+		BUG();
 	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
+		     "being remastered\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		/* Re-add the lockres to the end of the purge list */
+		if (!list_empty(&res->purge)) {
+			list_del_init(&res->purge);
+			list_add_tail(&res->purge, &dlm->purge_list);
+		}
+		spin_unlock(&res->spinlock);
+		return 0;
+	}
+
 	master = (res->owner == dlm->node_num);
+
 	if (!master)
 		res->state |= DLM_LOCK_RES_DROPPING_REF;
 	spin_unlock(&res->spinlock);
@@ -181,8 +197,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 		spin_lock(&res->spinlock);
 		/* This ensures that clear refmap is sent after the set */
-		__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
-						  DLM_LOCK_RES_MIGRATING));
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
 		spin_unlock(&res->spinlock);
 
 		/* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 86ca085ef324..fcf879ed6930 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 	else
 		BUG_ON(res->owner == dlm->node_num);
 
-	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->ast_lock);
 	/* We want to be sure that we're not freeing a lock
 	 * that still has AST's pending... */
 	in_use = !list_empty(&lock->ast_list);
-	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm->ast_lock);
 	if (in_use) {
 	       mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
 		    "while waiting for an ast!", res->lockname.len,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 206a2370876a..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+	.flags		= 0,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.get_osb	= ocfs2_get_dentry_osb,
 	.post_unlock	= ocfs2_dentry_post_unlock,
@@ -320,9 +324,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres);
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert);
-#define ocfs2_log_dlm_error(_func, _err, _lockres) do {			\
-	mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
-	     _err, _func, _lockres->l_name);				\
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {					\
+	if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY)				\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n",	\
+		     _err, _func, _lockres->l_name);					\
+	else										\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n",	\
+		     _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name,	\
+		     (unsigned int)ocfs2_get_dentry_lock_ino(_lockres));		\
 } while (0)
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -617,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
 				   &ocfs2_rename_lops, osb);
 }
 
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+					 struct ocfs2_super *osb)
+{
+	/* nfs_sync lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+				   &ocfs2_nfs_sync_lops, osb);
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp)
 {
@@ -2412,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
 		ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
 
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+				    0, 0);
+	if (status < 0)
+		mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+
+	return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres,
+				     ex ? LKM_EXMODE : LKM_PRMODE);
+}
+
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
 	int ret;
@@ -2793,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
 
 	osb->cconn = conn;
 
@@ -2828,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 
 	ocfs2_lock_res_free(&osb->osb_super_lockres);
 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
 
 	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
 	osb->cconn = NULL;
@@ -3010,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
 {
 	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
 	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
 }
 
 int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
 			int ex);
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..de3da8eb558c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
 
 #include "ocfs2.h"
 
+#include "alloc.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dcache.h"
@@ -38,6 +39,7 @@
 #include "inode.h"
 
 #include "buffer_head_io.h"
+#include "suballoc.h"
 
 struct ocfs2_inode_handle
 {
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 		struct ocfs2_inode_handle *handle)
 {
 	struct inode *inode;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 blkno = handle->ih_blkno;
+	int status, set;
 	struct dentry *result;
 
 	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
 
-	if (handle->ih_blkno == 0) {
-		mlog_errno(-ESTALE);
-		return ERR_PTR(-ESTALE);
+	if (blkno == 0) {
+		mlog(0, "nfs wants inode with blkno: 0\n");
+		result = ERR_PTR(-ESTALE);
+		goto bail;
+	}
+
+	inode = ocfs2_ilookup(sb, blkno);
+	/*
+	 * If the inode exists in memory, we only need to check it's
+	 * generation number
+	 */
+	if (inode)
+		goto check_gen;
+
+	/*
+	 * This will synchronize us against ocfs2_delete_inode() on
+	 * all nodes
+	 */
+	status = ocfs2_nfs_sync_lock(osb, 1);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+		goto check_err;
+	}
+
+	status = ocfs2_test_inode_bit(osb, blkno, &set);
+	if (status < 0) {
+		if (status == -EINVAL) {
+			/*
+			 * The blkno NFS gave us doesn't even show up
+			 * as an inode, we return -ESTALE to be
+			 * nice
+			 */
+			mlog(0, "test inode bit failed %d\n", status);
+			status = -ESTALE;
+		} else {
+			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		}
+		goto unlock_nfs_sync;
+	}
+
+	/* If the inode allocator bit is clear, this inode must be stale */
+	if (!set) {
+		mlog(0, "inode %llu suballoc bit is clear\n", blkno);
+		status = -ESTALE;
+		goto unlock_nfs_sync;
 	}
 
-	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
+	inode = ocfs2_iget(osb, blkno, 0, 0);
 
-	if (IS_ERR(inode))
-		return (void *)inode;
+unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(osb, 1);
 
+check_err:
+	if (status < 0) {
+		if (status == -ESTALE) {
+			mlog(0, "stale inode ino: %llu generation: %u\n",
+			     blkno, handle->ih_generation);
+		}
+		result = ERR_PTR(status);
+		goto bail;
+	}
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		result = (void *)inode;
+		goto bail;
+	}
+
+check_gen:
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		return ERR_PTR(-ESTALE);
+		mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
+		     handle->ih_generation);
+		result = ERR_PTR(-ESTALE);
+		goto bail;
 	}
 
 	result = d_obtain_alias(inode);
 	if (!IS_ERR(result))
 		result->d_op = &ocfs2_dentry_ops;
+	else
+		mlog_errno(PTR_ERR(result));
 
+bail:
 	mlog_exit_ptr(result);
 	return result;
 }
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dir.h"
 #include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
 		oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
 
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+	struct ocfs2_find_inode_args args;
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+	args.fi_sysfile_type = 0;
+
+	return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 			 int sysfile_type)
 {
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
-	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_nlink = ocfs2_read_links_count(fe);
 
 	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	ocfs2_set_inode_flags(inode);
 
+	OCFS2_I(inode)->ip_last_used_slot = 0;
+	OCFS2_I(inode)->ip_last_used_group = 0;
 	mlog_exit_void();
 }
 
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
-					ocfs2_quota_trans_credits(inode->i_sb));
+				   ocfs2_quota_trans_credits(inode->i_sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	/* Remove any dir index tree */
+	if (S_ISDIR(inode->i_mode)) {
+		status = ocfs2_dx_dir_truncate(inode, di_bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail_unlock_dir;
+		}
+	}
+
 	/*Free extended attribute resources associated with this inode.*/
 	status = ocfs2_xattr_remove(inode, di_bh);
 	if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
 		goto bail;
 	}
 
+	/*
+	 * Synchronize us against ocfs2_get_dentry. We take this in
+	 * shared mode so that all nodes can still concurrently
+	 * process deletes.
+	 */
+	status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
 	/* Lock down the inode. This gives us an up to date view of
 	 * it's metadata (for verification), and allows us to
 	 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
 		if (status != -ENOENT)
 			mlog_errno(status);
 		ocfs2_cleanup_delete_inode(inode, 0);
-		goto bail_unblock;
+		goto bail_unlock_nfs_sync;
 	}
 
 	/* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
 bail_unlock_inode:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
+
+bail_unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
 bail_unblock:
 	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
 	if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	fe->i_size = cpu_to_le64(i_size_read(inode));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_uid = cpu_to_le32(inode->i_uid);
 	fe->i_gid = cpu_to_le32(inode->i_gid);
 	fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
 	ocfs2_set_inode_flags(inode);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
-	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_nlink = ocfs2_read_links_count(fe);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
 	inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
 
 	struct inode			vfs_inode;
 	struct jbd2_inode		ip_jinode;
+
+	/* Only valid if the inode is the dir. */
+	u32				ip_last_used_slot;
+	u64				ip_last_used_group;
 };
 
 /*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE		0x1
 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
 	return __ocfs2_wait_on_mount(osb, 1);
 }
 
-
-
 /*
- * The recovery_list is a simple linked list of node numbers to recover.
- * It is protected by the recovery_lock.
+ * This replay_map is to track online/offline slots, so we could recover
+ * offline slots during recovery and mount
  */
 
-struct ocfs2_recovery_map {
-	unsigned int rm_used;
-	unsigned int *rm_entries;
+enum ocfs2_replay_state {
+	REPLAY_UNNEEDED = 0,	/* Replay is not needed, so ignore this map */
+	REPLAY_NEEDED, 		/* Replay slots marked in rm_replay_slots */
+	REPLAY_DONE 		/* Replay was already queued */
 };
 
+struct ocfs2_replay_map {
+	unsigned int rm_slots;
+	enum ocfs2_replay_state rm_state;
+	unsigned char rm_replay_slots[0];
+};
+
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+	if (!osb->replay_map)
+		return;
+
+	/* If we've already queued the replay, we don't have any more to do */
+	if (osb->replay_map->rm_state == REPLAY_DONE)
+		return;
+
+	osb->replay_map->rm_state = state;
+}
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map;
+	int i, node_num;
+
+	/* If replay map is already set, we don't do it again */
+	if (osb->replay_map)
+		return 0;
+
+	replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
+			     (osb->max_slots * sizeof(char)), GFP_KERNEL);
+
+	if (!replay_map) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	spin_lock(&osb->osb_lock);
+
+	replay_map->rm_slots = osb->max_slots;
+	replay_map->rm_state = REPLAY_UNNEEDED;
+
+	/* set rm_replay_slots for offline slot(s) */
+	for (i = 0; i < replay_map->rm_slots; i++) {
+		if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+			replay_map->rm_replay_slots[i] = 1;
+	}
+
+	osb->replay_map = replay_map;
+	spin_unlock(&osb->osb_lock);
+	return 0;
+}
+
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+	int i;
+
+	if (!replay_map)
+		return;
+
+	if (replay_map->rm_state != REPLAY_NEEDED)
+		return;
+
+	for (i = 0; i < replay_map->rm_slots; i++)
+		if (replay_map->rm_replay_slots[i])
+			ocfs2_queue_recovery_completion(osb->journal, i, NULL,
+							NULL, NULL);
+	replay_map->rm_state = REPLAY_DONE;
+}
+
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+{
+	struct ocfs2_replay_map *replay_map = osb->replay_map;
+
+	if (!osb->replay_map)
+		return;
+
+	kfree(replay_map);
+	osb->replay_map = NULL;
+}
+
 int ocfs2_recovery_init(struct ocfs2_super *osb)
 {
 	struct ocfs2_recovery_map *rm;
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
 	},
 };
 
+static struct ocfs2_triggers dr_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_root_block, dr_check),
+};
+
+static struct ocfs2_triggers dl_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dx_leaf, dl_check),
+};
+
 static int __ocfs2_journal_access(handle_t *handle,
 				  struct inode *inode,
 				  struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 				      type);
 }
 
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
+				      type);
+}
+
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type)
 {
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 }
 
 /* Called by the mount code to queue recovery the last part of
- * recovery for it's own slot. */
+ * recovery for it's own and offline slot(s). */
 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 {
 	struct ocfs2_journal *journal = osb->journal;
 
-	if (osb->dirty) {
-		/* No need to queue up our truncate_log as regular
-		 * cleanup will catch that. */
-		ocfs2_queue_recovery_completion(journal,
-						osb->slot_num,
-						osb->local_alloc_copy,
-						NULL,
-						NULL);
-		ocfs2_schedule_truncate_log_flush(osb, 0);
+	/* No need to queue up our truncate_log as regular cleanup will catch
+	 * that */
+	ocfs2_queue_recovery_completion(journal, osb->slot_num,
+					osb->local_alloc_copy, NULL, NULL);
+	ocfs2_schedule_truncate_log_flush(osb, 0);
 
-		osb->local_alloc_copy = NULL;
-		osb->dirty = 0;
-	}
+	osb->local_alloc_copy = NULL;
+	osb->dirty = 0;
+
+	/* queue to recover orphan slots for all offline slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+	ocfs2_queue_replay_slots(osb);
+	ocfs2_free_replay_slots(osb);
 }
 
 void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
 		goto bail;
 	}
 
+	status = ocfs2_compute_replay_slots(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* queue recovery for our own slot */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL, NULL);
+
 	spin_lock(&osb->osb_lock);
 	while (rm->rm_used) {
 		/* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
 
 	ocfs2_super_unlock(osb, 1);
 
-	/* We always run recovery on our own orphan dir - the dead
-	 * node(s) may have disallowd a previos inode delete. Re-processing
-	 * is therefore required. */
-	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-					NULL, NULL);
+	/* queue recovery for offline slots */
+	ocfs2_queue_replay_slots(osb);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
 		goto restart;
 	}
 
+	ocfs2_free_replay_slots(osb);
 	osb->recovery_thread_task = NULL;
 	mb(); /* sync with ocfs2_recovery_thread_running */
 	wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 		goto done;
 	}
 
+	/* we need to run complete recovery for offline orphan slots */
+	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+
 	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
 	     node_num, slot_num,
 	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307c..619dd7f6c053 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
 struct ocfs2_super;
 struct ocfs2_dinode;
 
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+	unsigned int rm_used;
+	unsigned int *rm_entries;
+};
+
+
 struct ocfs2_journal {
 	enum ocfs2_journal_state   j_state;    /* Journals current state   */
 
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
 int ocfs2_recovery_init(struct ocfs2_super *osb);
 void ocfs2_recovery_exit(struct ocfs2_super *osb);
 
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
 /*
  *  Journal Control:
  *  Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 /* dirblock */
 int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
 			    struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
 /* Anything that has no ecc */
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 }
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+ * bitmap block for the new bit) dx_root update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+	/* 1 block for index, 2 allocs (data, metadata), 1 clusters
+	 * worth of blocks for initial extent. */
+	return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
+		ocfs2_clusters_to_blocks(sb, 1);
+}
 
-/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks + quota update */
-static inline int ocfs2_mknod_credits(struct super_block *sb)
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+				      int xattr_credits)
 {
-	return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+	int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+
+	if (is_dir)
+		dir_credits += ocfs2_add_dir_index_credits(sb);
+
+	return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
 	       ocfs2_quota_trans_credits(sb);
 }
 
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir */
+ * update on dir + index leaf + dx root update for free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-	return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
 	       ocfs2_quota_trans_credits(sb);
 }
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
- * dir inode link */
+ * dir inode link + dir inode index leaf + dir index root */
 static inline int ocfs2_unlink_credits(struct super_block *sb)
 {
 	/* The quota update from ocfs2_link_credits is unused here... */
-	return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
 }
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+ * inode alloc group descriptor + orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
 
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
- * directory + target unlink */
+ * directory + target unlink + 3 x dir index leaves */
 static inline int ocfs2_rename_credits(struct super_block *sb)
 {
-	return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
 }
 
 /* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
 					  + OCFS2_INODE_UPDATE_CREDITS \
 					  + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
 
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS +	\
+				      OCFS2_SUBALLOC_FREE)
+
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+	int credits = 1 + OCFS2_SUBALLOC_ALLOC;
+
+	credits += ocfs2_clusters_to_blocks(sb, 1);
+	credits += ocfs2_quota_trans_credits(sb);
+
+	return credits;
+}
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-	int blocks = ocfs2_mknod_credits(sb);
+	int blocks = ocfs2_mknod_credits(sb, 0, 0);
 
 	/* links can be longer than one block so we may update many
 	 * within our single allocated extent. */
@@ -513,8 +560,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
 					       loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+				OCFS2_SB(inode->i_sb)->journal->j_journal,
+				&OCFS2_I(inode)->ip_jinode,
+				new_size);
 }
 
 #endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
-#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-#ifdef CONFIG_OCFS2_FS_STATS
-
-static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
-#define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
-#define LA_DEBUG_VER	1
-static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
-				   size_t count, loff_t *ppos)
-{
-	static DEFINE_MUTEX(la_debug_mutex);
-	struct ocfs2_super *osb = file->private_data;
-	int written, ret;
-	char *buf = osb->local_alloc_debug_buf;
-
-	mutex_lock(&la_debug_mutex);
-	memset(buf, 0, LA_DEBUG_BUF_SZ);
-
-	written = snprintf(buf, LA_DEBUG_BUF_SZ,
-			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
-			   LA_DEBUG_VER,
-			   (unsigned long long)osb->la_last_gd,
-			   osb->local_alloc_default_bits,
-			   osb->local_alloc_bits, osb->local_alloc_state);
-
-	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
-
-	mutex_unlock(&la_debug_mutex);
-	return ret;
-}
-
-static const struct file_operations ocfs2_la_debug_fops = {
-	.open =		ocfs2_la_debug_open,
-	.read =		ocfs2_la_debug_read,
-};
-
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
-	if (!osb->local_alloc_debug_buf)
-		return;
-
-	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
-						     S_IFREG|S_IRUSR,
-						     osb->osb_debug_root,
-						     osb,
-						     &ocfs2_la_debug_fops);
-	if (!osb->local_alloc_debug) {
-		kfree(osb->local_alloc_debug_buf);
-		osb->local_alloc_debug_buf = NULL;
-	}
-}
-
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	if (osb->local_alloc_debug)
-		debugfs_remove(osb->local_alloc_debug);
-
-	if (osb->local_alloc_debug_buf)
-		kfree(osb->local_alloc_debug_buf);
-
-	osb->local_alloc_debug_buf = NULL;
-	osb->local_alloc_debug = NULL;
-}
-#else	/* CONFIG_OCFS2_FS_STATS */
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-	return;
-}
-#endif
-
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	ocfs2_init_la_debug(osb);
-
 	if (osb->local_alloc_bits == 0)
 		goto bail;
 
@@ -299,9 +218,6 @@ bail:
 	if (inode)
 		iput(inode);
 
-	if (status < 0)
-		ocfs2_shutdown_la_debug(osb);
-
 	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	cancel_delayed_work(&osb->la_enable_wq);
 	flush_workqueue(ocfs2_wq);
 
-	ocfs2_shutdown_la_debug(osb);
-
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713ea..b606496b72ec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
 	return ret;
 }
 
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct buffer_head *di_bh = NULL;
 	sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
 	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret2 < 0)
 		mlog_errno(ret2);
-
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 084aba86c3b2..2220f93f668b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
 				    char *name,
-				    struct buffer_head **de_bh);
+				    struct ocfs2_dir_lookup_result *lookup);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
 			    struct ocfs2_dinode *fe,
 			    char *name,
-			    struct buffer_head *de_bh,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
 	struct ocfs2_super *osb;
 	struct ocfs2_dinode *dirfe;
 	struct buffer_head *new_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *xattr_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 	int want_clusters = 0;
+	int want_meta = 0;
 	int xattr_credits = 0;
 	struct ocfs2_security_xattr_info si = {
 		.enable = 1,
 	};
 	int did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
 		return status;
 	}
 
-	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+	if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
 		status = -EMLINK;
 		goto leave;
 	}
 
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-	if (!dirfe->i_links_count) {
+	if (!ocfs2_read_links_count(dirfe)) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
 	/* get a spot inside the dir. */
 	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					      dentry->d_name.name,
-					      dentry->d_name.len, &de_bh);
+					      dentry->d_name.len, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
 
 	/* calculate meta data/clusters for setting security and acl xattr */
 	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
-					&si, &want_clusters,
-					&xattr_credits, &xattr_ac);
+				       &si, &want_clusters,
+				       &xattr_credits, &want_meta);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	/* Reserve a cluster if creating an extent based directory. */
-	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
 		want_clusters += 1;
 
+		/* Dir indexing requires extra space as well */
+		if (ocfs2_supports_indexed_dirs(osb))
+			want_meta++;
+	}
+
+	status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
 	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
-				   xattr_credits);
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+							    S_ISDIR(mode),
+							    xattr_credits));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (S_ISDIR(mode)) {
 		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
-					    new_fe_bh, data_ac);
+					    new_fe_bh, data_ac, meta_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
-		le16_add_cpu(&dirfe->i_links_count, 1);
+		ocfs2_add_links_count(dirfe, 1);
 		status = ocfs2_journal_dirty(handle, parent_fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 
 	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-				xattr_ac, data_ac);
+				meta_ac, data_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	if (si.enable) {
 		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
-						 xattr_ac, data_ac);
+						 meta_ac, data_ac);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
 
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-				 de_bh);
+				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -423,11 +437,12 @@ leave:
 		mlog(0, "Disk is full\n");
 
 	brelse(new_fe_bh);
-	brelse(de_bh);
 	brelse(parent_fe_bh);
 	kfree(si.name);
 	kfree(si.value);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
 		iput(inode);
@@ -439,8 +454,8 @@ leave:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 
-	if (xattr_ac)
-		ocfs2_free_alloc_context(xattr_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
 
 	mlog_exit(status);
 
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
+	u16 feat;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
 		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 	*new_fe_bh = NULL;
 
-	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
-				       &fe_blkno);
+	status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
+				       inode_ac, &suballoc_bit, &fe_blkno);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_mode = cpu_to_le16(inode->i_mode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,14 +542,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_dtime = 0;
 
 	/*
-	 * If supported, directories start with inline data.
+	 * If supported, directories start with inline data. If inline
+	 * isn't supported, but indexing is, we start them as indexed.
 	 */
+	feat = le16_to_cpu(fe->i_dyn_features);
 	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
-		u16 feat = le16_to_cpu(fe->i_dyn_features);
-
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
 
-		fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb));
+		fe->id2.i_data.id_count = cpu_to_le16(
+				ocfs2_max_inline_data_with_xattr(osb->sb, fe));
 	} else {
 		fel = &fe->id2.i_list;
 		fel->l_tree_depth = 0;
@@ -607,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 	int err;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -637,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					   dentry->d_name.name,
-					   dentry->d_name.len, &de_bh);
+					   dentry->d_name.len, &lookup);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out;
@@ -651,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	}
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+	if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
 		err = -EMLINK;
 		goto out_unlock_inode;
 	}
@@ -673,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	err = ocfs2_journal_dirty(handle, fe_bh);
 	if (err < 0) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -687,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	err = ocfs2_add_entry(handle, dentry, inode,
 			      OCFS2_I(inode)->ip_blkno,
-			      parent_fe_bh, de_bh);
+			      parent_fe_bh, &lookup);
 	if (err) {
-		le16_add_cpu(&fe->i_links_count, -1);
+		ocfs2_add_links_count(fe, -1);
 		drop_nlink(inode);
 		mlog_errno(err);
 		goto out_commit;
@@ -713,10 +731,11 @@ out_unlock_inode:
 out:
 	ocfs2_inode_unlock(dir, 1);
 
-	brelse(de_bh);
 	brelse(fe_bh);
 	brelse(parent_fe_bh);
 
+	ocfs2_free_dir_lookup_result(&lookup);
+
 	mlog_exit(err);
 
 	return err;
@@ -765,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_node_bh = NULL;
 	handle_t *handle = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
-	struct buffer_head *dirent_bh = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
-	struct buffer_head *orphan_entry_bh = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
 		   dentry->d_name.len, dentry->d_name.name);
@@ -790,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	status = ocfs2_find_files_on_disk(dentry->d_name.name,
-					  dentry->d_name.len, &blkno,
-					  dir, &dirent_bh, &dirent);
+					  dentry->d_name.len, &blkno, dir,
+					  &lookup);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -816,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
 	child_locked = 1;
 
 	if (S_ISDIR(inode->i_mode)) {
-	       	if (!ocfs2_empty_dir(inode)) {
-			status = -ENOTEMPTY;
-			goto leave;
-		} else if (inode->i_nlink != 2) {
+		if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
 			status = -ENOTEMPTY;
 			goto leave;
 		}
@@ -835,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
-						  orphan_name,
-						  &orphan_entry_bh);
+						  orphan_name, &orphan_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -862,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
-					  orphan_entry_bh, orphan_dir);
+					  &orphan_insert, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -870,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	/* delete the name from the parent dir */
-	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	status = ocfs2_delete_entry(handle, dir, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -879,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
 	if (S_ISDIR(inode->i_mode))
 		drop_nlink(inode);
 	drop_nlink(inode);
-	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	ocfs2_set_links_count(fe, inode->i_nlink);
 
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
@@ -915,9 +929,10 @@ leave:
 	}
 
 	brelse(fe_bh);
-	brelse(dirent_bh);
 	brelse(parent_node_bh);
-	brelse(orphan_entry_bh);
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(status);
 
@@ -1003,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
 			struct inode *new_dir,
 			struct dentry *new_dentry)
 {
-	int status = 0, rename_lock = 0, parents_locked = 0;
-	int old_child_locked = 0, new_child_locked = 0;
+	int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
+	int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *orphan_dir = NULL;
@@ -1019,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
-	struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
-		*new_de = NULL;
-	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
-	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
-						    // this is the 1st dirent bh
 	nlink_t old_dir_nlink = old_dir->i_nlink;
 	struct ocfs2_dinode *old_di;
+	struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+	struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+	struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	struct ocfs2_dir_lookup_result target_insert = { NULL, };
 
 	/* At some point it might be nice to break this function up a
 	 * bit. */
@@ -1107,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
 	if (S_ISDIR(old_inode->i_mode)) {
 		u64 old_inode_parent;
 
+		update_dot_dot = 1;
 		status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
-						  old_inode, &old_inode_de_bh,
-						  &old_inode_dot_dot_de);
+						  old_inode,
+						  &old_inode_dot_dot_res);
 		if (status) {
 			status = -EIO;
 			goto bail;
@@ -1121,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		if (!new_inode && new_dir != old_dir &&
-		    new_dir->i_nlink >= OCFS2_LINK_MAX) {
+		    new_dir->i_nlink >= ocfs2_link_max(osb)) {
 			status = -EMLINK;
 			goto bail;
 		}
@@ -1150,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * to delete it */
 	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
 					  new_dentry->d_name.len,
-					  &newfe_blkno, new_dir, &new_de_bh,
-					  &new_de);
+					  &newfe_blkno, new_dir,
+					  &target_lookup_res);
 	/* The only error we allow here is -ENOENT because the new
 	 * file not existing is perfectly valid. */
 	if ((status < 0) && (status != -ENOENT)) {
@@ -1160,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		mlog_errno(status);
 		goto bail;
 	}
+	if (status == 0)
+		target_exists = 1;
 
-	if (!new_de && new_inode) {
+	if (!target_exists && new_inode) {
 		/*
 		 * Target was unlinked by another node while we were
 		 * waiting to get to ocfs2_rename(). There isn't
@@ -1174,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
-	if (new_de) {
+	if (target_exists) {
 		/* VFS didn't think there existed an inode here, but
 		 * someone else in the cluster must have raced our
 		 * rename to create one. Today we error cleanly, in
@@ -1215,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
-		mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
-		     "newfebh=%p bhblocknr=%llu\n", new_de,
+		mlog(0, "aha rename over existing... new_blkno=%llu "
+		     "newfebh=%p bhblocknr=%llu\n",
 		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
 		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 
@@ -1224,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
 							  new_inode,
 							  orphan_name,
-							  &orphan_entry_bh);
+							  &orphan_insert);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1242,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
 						      new_dentry->d_name.name,
 						      new_dentry->d_name.len,
-						      &insert_entry_bh);
+						      &target_insert);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1257,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		goto bail;
 	}
 
-	if (new_de) {
+	if (target_exists) {
 		if (S_ISDIR(new_inode->i_mode)) {
-			if (!ocfs2_empty_dir(new_inode) ||
-			    new_inode->i_nlink != 2) {
+			if (new_inode->i_nlink != 2 ||
+			    !ocfs2_empty_dir(new_inode)) {
 				status = -ENOTEMPTY;
 				goto bail;
 			}
@@ -1273,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		if (S_ISDIR(new_inode->i_mode) ||
-		    (newfe->i_links_count == cpu_to_le16(1))){
+		    (ocfs2_read_links_count(newfe) == 1)) {
 			status = ocfs2_orphan_add(osb, handle, new_inode,
 						  newfe, orphan_name,
-						  orphan_entry_bh, orphan_dir);
+						  &orphan_insert, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1284,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		/* change the dirent to point to the correct inode */
-		status = ocfs2_update_entry(new_dir, handle, new_de_bh,
-					    new_de, old_inode);
+		status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
+					    old_inode);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1293,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_dir->i_version++;
 
 		if (S_ISDIR(new_inode->i_mode))
-			newfe->i_links_count = 0;
+			ocfs2_set_links_count(newfe, 0);
 		else
-			le16_add_cpu(&newfe->i_links_count, -1);
+			ocfs2_add_links_count(newfe, -1);
 
 		status = ocfs2_journal_dirty(handle, newfe_bh);
 		if (status < 0) {
@@ -1306,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		/* if the name was not found in new_dir, add it now */
 		status = ocfs2_add_entry(handle, new_dentry, old_inode,
 					 OCFS2_I(old_inode)->ip_blkno,
-					 new_dir_bh, insert_entry_bh);
+					 new_dir_bh, &target_insert);
 	}
 
 	old_inode->i_ctime = CURRENT_TIME;
@@ -1333,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * because the insert might have changed the type of directory
 	 * we're dealing with.
 	 */
-	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
-				     old_dentry->d_name.len,
-				     old_dir, &old_de);
-	if (!old_de_bh) {
-		status = -EIO;
+	status = ocfs2_find_entry(old_dentry->d_name.name,
+				  old_dentry->d_name.len, old_dir,
+				  &old_entry_lookup);
+	if (status)
 		goto bail;
-	}
 
-	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1352,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
 		new_inode->i_ctime = CURRENT_TIME;
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-	if (old_inode_de_bh) {
-		status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
-					    old_inode_dot_dot_de, new_dir);
+
+	if (update_dot_dot) {
+		status = ocfs2_update_entry(old_inode, handle,
+					    &old_inode_dot_dot_res, new_dir);
 		old_dir->i_nlink--;
 		if (new_inode) {
 			new_inode->i_nlink--;
@@ -1390,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
 		} else {
 			struct ocfs2_dinode *fe;
 			status = ocfs2_journal_access_di(handle, old_dir,
-							 old_dir_bh,
-							 OCFS2_JOURNAL_ACCESS_WRITE);
+						      old_dir_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
-			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+			ocfs2_set_links_count(fe, old_dir->i_nlink);
 			status = ocfs2_journal_dirty(handle, old_dir_bh);
 		}
 	}
-
 	ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
 	status = 0;
 bail:
@@ -1428,13 +1444,17 @@ bail:
 
 	if (new_inode)
 		iput(new_inode);
+
+	ocfs2_free_dir_lookup_result(&target_lookup_res);
+	ocfs2_free_dir_lookup_result(&old_entry_lookup);
+	ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+	ocfs2_free_dir_lookup_result(&target_insert);
+
 	brelse(newfe_bh);
 	brelse(old_inode_bh);
 	brelse(old_dir_bh);
 	brelse(new_dir_bh);
-	brelse(new_de_bh);
-	brelse(old_de_bh);
-	brelse(old_inode_de_bh);
 	brelse(orphan_entry_bh);
 	brelse(insert_entry_bh);
 
@@ -1557,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
 	struct inode *inode = NULL;
 	struct super_block *sb;
 	struct buffer_head *new_fe_bh = NULL;
-	struct buffer_head *de_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_dinode *dirfe;
@@ -1571,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
 		.enable = 1,
 	};
 	int did_quota = 0, did_quota_inode = 0;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1591,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
 	}
 
 	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-	if (!dirfe->i_links_count) {
+	if (!ocfs2_read_links_count(dirfe)) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
 		goto bail;
@@ -1604,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
 
 	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
 					      dentry->d_name.name,
-					      dentry->d_name.len, &de_bh);
+					      dentry->d_name.len, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1743,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
 
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
-				 de_bh);
+				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1771,9 +1791,9 @@ bail:
 
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
-	brelse(de_bh);
 	kfree(si.name);
 	kfree(si.value);
+	ocfs2_free_dir_lookup_result(&lookup);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
@@ -1825,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
 				    char *name,
-				    struct buffer_head **de_bh)
+				    struct ocfs2_dir_lookup_result *lookup)
 {
 	struct inode *orphan_dir_inode;
 	struct buffer_head *orphan_dir_bh = NULL;
@@ -1856,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 
 	status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
 					      orphan_dir_bh, name,
-					      OCFS2_ORPHAN_NAMELEN, de_bh);
+					      OCFS2_ORPHAN_NAMELEN, lookup);
 	if (status < 0) {
 		ocfs2_inode_unlock(orphan_dir_inode, 1);
 
@@ -1883,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    struct inode *inode,
 			    struct ocfs2_dinode *fe,
 			    char *name,
-			    struct buffer_head *de_bh,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode)
 {
 	struct buffer_head *orphan_dir_bh = NULL;
@@ -1909,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	 * underneath us... */
 	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
 	if (S_ISDIR(inode->i_mode))
-		le16_add_cpu(&orphan_fe->i_links_count, 1);
-	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+		ocfs2_add_links_count(orphan_fe, 1);
+	orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
 
 	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
 	if (status < 0) {
@@ -1921,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
 				   OCFS2_ORPHAN_NAMELEN, inode,
 				   OCFS2_I(inode)->ip_blkno,
-				   orphan_dir_bh, de_bh);
+				   orphan_dir_bh, lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1954,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	char name[OCFS2_ORPHAN_NAMELEN + 1];
 	struct ocfs2_dinode *orphan_fe;
 	int status = 0;
-	struct buffer_head *target_de_bh = NULL;
-	struct ocfs2_dir_entry *target_de = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry_void();
 
@@ -1970,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	     OCFS2_ORPHAN_NAMELEN);
 
 	/* find it's spot in the orphan directory */
-	target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
-					orphan_dir_inode, &target_de);
-	if (!target_de_bh) {
-		status = -ENOENT;
+	status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
+				  &lookup);
+	if (status) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	/* remove it from the orphan directory */
-	status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
-				    target_de_bh);
+	status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1996,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	/* do the i_nlink dance! :) */
 	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
 	if (S_ISDIR(inode->i_mode))
-		le16_add_cpu(&orphan_fe->i_links_count, -1);
-	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+		ocfs2_add_links_count(orphan_fe, -1);
+	orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
 
 	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
 	if (status < 0) {
@@ -2006,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	}
 
 leave:
-	brelse(target_de_bh);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 077384135f4e..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
+struct ocfs2_replay_map;
 struct ocfs2_quota_recovery;
 struct ocfs2_dentry_lock;
 struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
 	atomic_t vol_state;
 	struct mutex recovery_lock;
 	struct ocfs2_recovery_map *recovery_map;
+	struct ocfs2_replay_map *replay_map;
 	struct task_struct *recovery_thread_task;
 	int disable_recovery;
 	wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
 
 	u64 la_last_gd;
 
-#ifdef CONFIG_OCFS2_FS_STATS
-	struct dentry *local_alloc_debug;
-	char *local_alloc_debug_buf;
-#endif
-
 	/* Next three fields are for local node slot recovery during
 	 * mount. */
 	int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
 	struct ocfs2_cluster_connection *cconn;
 	struct ocfs2_lock_res osb_super_lockres;
 	struct ocfs2_lock_res osb_rename_lockres;
+	struct ocfs2_lock_res osb_nfs_sync_lockres;
 	struct ocfs2_dlm_debug *osb_dlm_debug;
 
 	struct dentry *osb_debug_root;
+	struct dentry *osb_ctxt;
 
 	wait_queue_head_t recovery_event;
 
@@ -341,6 +340,15 @@ struct ocfs2_super
 	struct ocfs2_node_map		osb_recovering_orphan_dirs;
 	unsigned int			*osb_orphan_wipes;
 	wait_queue_head_t		osb_wipe_event;
+
+	/* used to protect metaecc calculation check of xattr. */
+	spinlock_t osb_xattr_lock;
+
+	unsigned int			osb_dx_mask;
+	u32				osb_dx_seed[4];
+
+	/* the group we used to allocate inodes. */
+	u64				osb_inode_alloc_group;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -399,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+		return 1;
+	return 0;
+}
+
+static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
+{
+	if (ocfs2_supports_indexed_dirs(osb))
+		return OCFS2_DX_LINK_MAX;
+	return OCFS2_LINK_MAX;
+}
+
+static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
+{
+	u32 nlink = le16_to_cpu(di->i_links_count);
+	u32 hi = le16_to_cpu(di->i_links_count_hi);
+
+	if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
+		nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+
+	return nlink;
+}
+
+static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
+{
+	u16 lo, hi;
+
+	lo = nlink;
+	hi = nlink >> OCFS2_LINKS_HI_SHIFT;
+
+	di->i_links_count = cpu_to_le16(lo);
+	di->i_links_count_hi = cpu_to_le16(hi);
+}
+
+static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
+{
+	u32 links = ocfs2_read_links_count(di);
+
+	links += n;
+
+	ocfs2_set_links_count(di, links);
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -479,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DIR_TRAILER(ptr)					\
 	(!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
 
+#define OCFS2_IS_VALID_DX_ROOT(ptr)					\
+	(!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
+
+#define OCFS2_IS_VALID_DX_LEAF(ptr)					\
+	(!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
@@ -529,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
 	return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
 }
 
+static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
+					       u64 blocks)
+{
+	int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
+	unsigned int clusters;
+
+	clusters = ocfs2_blocks_to_clusters(sb, blocks);
+	return (u64)clusters << bits;
+}
+
 static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
 						u64 bytes)
 {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c7ae45aaa36c..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
 #define OCFS2_DIR_TRAILER_SIGNATURE	"DIRTRL1"
+#define OCFS2_DX_ROOT_SIGNATURE		"DXDIR01"
+#define OCFS2_DX_LEAF_SIGNATURE		"DXLEAF1"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -95,7 +97,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
 					 | OCFS2_FEATURE_INCOMPAT_XATTR \
-					 | OCFS2_FEATURE_INCOMPAT_META_ECC)
+					 | OCFS2_FEATURE_INCOMPAT_META_ECC \
+					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
 
+/* Support for indexed directores */
+#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS	0x0400
+
 /* Metadata checksum and error correction */
 #define OCFS2_FEATURE_INCOMPAT_META_ECC		0x0800
 
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 #define OCFS2_DIR_REC_LEN(name_len)	(((name_len) + OCFS2_DIR_MEMBER_LEN + \
                                           OCFS2_DIR_ROUND) & \
 					 ~OCFS2_DIR_ROUND)
+#define OCFS2_DIR_MIN_REC_LEN	OCFS2_DIR_REC_LEN(1)
 
 #define OCFS2_LINK_MAX		32000
+#define	OCFS2_DX_LINK_MAX	((1U << 31) - 1U)
+#define	OCFS2_LINKS_HI_SHIFT	16
+#define	OCFS2_DX_ENTRIES_MAX	(0xffffffffU)
 
 #define S_SHIFT			12
 static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
 /*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
 					   for this fs*/
 	__le16 s_reserved0;
-	__le32 s_reserved1;
-/*C0*/  __le64 s_reserved2[16];		/* Fill out superblock */
+	__le32 s_dx_seed[3];		/* seed[0-2] for dx dir hash.
+					 * s_uuid_hash serves as seed[3]. */
+/*C0*/  __le64 s_reserved2[15];		/* Fill out superblock */
 /*140*/
 
 	/*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
 					   belongs to */
 	__le16 i_suballoc_bit;		/* Bit offset in suballocator
 					   block group */
-/*10*/	__le16 i_reserved0;
+/*10*/	__le16 i_links_count_hi;	/* High 16 bits of links count */
 	__le16 i_xattr_inline_size;
 	__le32 i_clusters;		/* Cluster count */
 	__le32 i_uid;			/* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
 	__le16 i_dyn_features;
 	__le64 i_xattr_loc;
 /*80*/	struct ocfs2_block_check i_check;	/* Error checking */
-/*88*/	__le64 i_reserved2[6];
+/*88*/	__le64 i_dx_root;		/* Pointer to dir index root block */
+	__le64 i_reserved2[5];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
 /*40*/
 };
 
+ /*
+ * A directory entry in the indexed tree. We don't store the full name here,
+ * but instead provide a pointer to the full dirent in the unindexed tree.
+ *
+ * We also store name_len here so as to reduce the number of leaf blocks we
+ * need to search in case of collisions.
+ */
+struct ocfs2_dx_entry {
+	__le32		dx_major_hash;	/* Used to find logical
+					 * cluster in index */
+	__le32		dx_minor_hash;	/* Lower bits used to find
+					 * block in cluster */
+	__le64		dx_dirent_blk;	/* Physical block in unindexed
+					 * tree holding this dirent. */
+};
+
+struct ocfs2_dx_entry_list {
+	__le32		de_reserved;
+	__le16		de_count;	/* Maximum number of entries
+					 * possible in de_entries */
+	__le16		de_num_used;	/* Current number of
+					 * de_entries entries */
+	struct	ocfs2_dx_entry		de_entries[0];	/* Indexed dir entries
+							 * in a packed array of
+							 * length de_num_used */
+};
+
+#define OCFS2_DX_FLAG_INLINE	0x01
+
+/*
+ * A directory indexing block. Each indexed directory has one of these,
+ * pointed to by ocfs2_dinode.
+ *
+ * This block stores an indexed btree root, and a set of free space
+ * start-of-list pointers.
+ */
+struct ocfs2_dx_root_block {
+	__u8		dr_signature[8];	/* Signature for verification */
+	struct ocfs2_block_check dr_check;	/* Error checking */
+	__le16		dr_suballoc_slot;	/* Slot suballocator this
+						 * block belongs to. */
+	__le16		dr_suballoc_bit;	/* Bit offset in suballocator
+						 * block group */
+	__le32		dr_fs_generation;	/* Must match super block */
+	__le64		dr_blkno;		/* Offset on disk, in blocks */
+	__le64		dr_last_eb_blk;		/* Pointer to last
+						 * extent block */
+	__le32		dr_clusters;		/* Clusters allocated
+						 * to the indexed tree. */
+	__u8		dr_flags;		/* OCFS2_DX_FLAG_* flags */
+	__u8		dr_reserved0;
+	__le16		dr_reserved1;
+	__le64		dr_dir_blkno;		/* Pointer to parent inode */
+	__le32		dr_num_entries;		/* Total number of
+						 * names stored in
+						 * this directory.*/
+	__le32		dr_reserved2;
+	__le64		dr_free_blk;		/* Pointer to head of free
+						 * unindexed block list. */
+	__le64		dr_reserved3[15];
+	union {
+		struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
+						   * bits for maximum space
+						   * efficiency. */
+		struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
+							* entries. We grow out
+							* to extents if this
+							* gets too big. */
+	};
+};
+
+/*
+ * The header of a leaf block in the indexed tree.
+ */
+struct ocfs2_dx_leaf {
+	__u8		dl_signature[8];/* Signature for verification */
+	struct ocfs2_block_check dl_check;	/* Error checking */
+	__le64		dl_blkno;	/* Offset on disk, in blocks */
+	__le32		dl_fs_generation;/* Must match super block */
+	__le32		dl_reserved0;
+	__le64		dl_reserved1;
+	struct ocfs2_dx_entry_list	dl_list;
+};
+
 /*
  * On disk allocator group structure for OCFS2
  */
@@ -1070,12 +1166,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 		 offsetof(struct ocfs2_dinode, id2.i_symlink);
 }
 
-static inline int ocfs2_max_inline_data(struct super_block *sb)
-{
-	return sb->s_blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_data.id_data);
-}
-
 static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
 						   struct ocfs2_dinode *di)
 {
@@ -1118,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
 	return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
 static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
 {
 	int size;
@@ -1138,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
 	return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
+
+	return size / sizeof(struct ocfs2_dx_entry);
+}
+
+static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
+
+	return size / sizeof(struct ocfs2_dx_entry);
+}
+
 static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
 {
 	u16 size;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_OPEN,
 	OCFS2_LOCK_TYPE_FLOCK,
 	OCFS2_LOCK_TYPE_QINFO,
+	OCFS2_LOCK_TYPE_NFS_SYNC,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_QINFO:
 			c = 'Q';
 			break;
+		case OCFS2_LOCK_TYPE_NFS_SYNC:
+			c = 'Y';
+			break;
 		default:
 			c = '\0';
 	}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..b4ca5911caaf 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
 #include "buffer_head_io.h"
 
 #define NOT_ALLOC_NEW_GROUP		0
-#define ALLOC_NEW_GROUP			1
+#define ALLOC_NEW_GROUP			0x1
+#define ALLOC_GROUPS_FROM_GLOBAL	0x2
 
 #define OCFS2_MAX_INODES_TO_STEAL	1024
 
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
 				   struct buffer_head *bh,
-				   u64 max_block);
+				   u64 max_block,
+				   u64 *last_alloc_group,
+				   int flags);
 
 static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 						u16 *bg_bit_off);
 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 					     u32 bits_wanted, u64 max_block,
+					     int flags,
 					     struct ocfs2_alloc_context **ac);
 
 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
 				   struct buffer_head *bh,
-				   u64 max_block)
+				   u64 max_block,
+				   u64 *last_alloc_group,
+				   int flags)
 {
 	int status, credits;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	cl = &fe->id2.i_chain;
 	status = ocfs2_reserve_clusters_with_limit(osb,
 						   le16_to_cpu(cl->cl_cpg),
-						   max_block, &ac);
+						   max_block, flags, &ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	if (last_alloc_group && *last_alloc_group != 0) {
+		mlog(0, "use old allocation group %llu for block group alloc\n",
+		     (unsigned long long)*last_alloc_group);
+		ac->ac_last_group = *last_alloc_group;
+	}
 	status = ocfs2_claim_clusters(osb,
 				      handle,
 				      ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
 
 	status = 0;
+
+	/* save the new last alloc group so that the caller can cache it. */
+	if (last_alloc_group)
+		*last_alloc_group = ac->ac_last_group;
+
 bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 				       struct ocfs2_alloc_context *ac,
 				       int type,
 				       u32 slot,
-				       int alloc_new_group)
+				       u64 *last_alloc_group,
+				       int flags)
 {
 	int status;
 	u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		if (alloc_new_group != ALLOC_NEW_GROUP) {
+		if (!(flags & ALLOC_NEW_GROUP)) {
 			mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
 			     "and we don't alloc a new group for it.\n",
 			     slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 		}
 
 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
-						 ac->ac_max_block);
+						 ac->ac_max_block,
+						 last_alloc_group, flags);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 
 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
 					     EXTENT_ALLOC_SYSTEM_INODE,
-					     slot, ALLOC_NEW_GROUP);
+					     slot, NULL, ALLOC_NEW_GROUP);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
 
 		status = ocfs2_reserve_suballoc_bits(osb, ac,
 						     INODE_ALLOC_SYSTEM_INODE,
-						     slot, NOT_ALLOC_NEW_GROUP);
+						     slot, NULL,
+						     NOT_ALLOC_NEW_GROUP);
 		if (status >= 0) {
 			ocfs2_set_inode_steal_slot(osb, slot);
 			break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 {
 	int status;
 	s16 slot = ocfs2_get_inode_steal_slot(osb);
+	u64 alloc_group;
 
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 		goto inode_steal;
 
 	atomic_set(&osb->s_num_inodes_stolen, 0);
+	alloc_group = osb->osb_inode_alloc_group;
 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
 					     INODE_ALLOC_SYSTEM_INODE,
-					     osb->slot_num, ALLOC_NEW_GROUP);
+					     osb->slot_num,
+					     &alloc_group,
+					     ALLOC_NEW_GROUP |
+					     ALLOC_GROUPS_FROM_GLOBAL);
 	if (status >= 0) {
 		status = 0;
 
+		spin_lock(&osb->osb_lock);
+		osb->osb_inode_alloc_group = alloc_group;
+		spin_unlock(&osb->osb_lock);
+		mlog(0, "after reservation, new allocation group is "
+		     "%llu\n", (unsigned long long)alloc_group);
+
 		/*
 		 * Some inodes must be freed by us, so try to allocate
 		 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 
 	status = ocfs2_reserve_suballoc_bits(osb, ac,
 					     GLOBAL_BITMAP_SYSTEM_INODE,
-					     OCFS2_INVALID_SLOT,
+					     OCFS2_INVALID_SLOT, NULL,
 					     ALLOC_NEW_GROUP);
 	if (status < 0 && status != -ENOSPC) {
 		mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
  * things a bit. */
 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 					     u32 bits_wanted, u64 max_block,
+					     int flags,
 					     struct ocfs2_alloc_context **ac)
 {
 	int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 	(*ac)->ac_max_block = max_block;
 
 	status = -ENOSPC;
-	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
+	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
 		status = ocfs2_reserve_local_alloc_bits(osb,
 							bits_wanted,
 							*ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 			   u32 bits_wanted,
 			   struct ocfs2_alloc_context **ac)
 {
-	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
+						 ALLOC_NEW_GROUP, ac);
 }
 
 /*
@@ -1618,8 +1651,41 @@ bail:
 	return status;
 }
 
+static void ocfs2_init_inode_ac_group(struct inode *dir,
+				      struct buffer_head *parent_fe_bh,
+				      struct ocfs2_alloc_context *ac)
+{
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+	/*
+	 * Try to allocate inodes from some specific group.
+	 *
+	 * If the parent dir has recorded the last group used in allocation,
+	 * cool, use it. Otherwise if we try to allocate new inode from the
+	 * same slot the parent dir belongs to, use the same chunk.
+	 *
+	 * We are very careful here to avoid the mistake of setting
+	 * ac_last_group to a group descriptor from a different (unlocked) slot.
+	 */
+	if (OCFS2_I(dir)->ip_last_used_group &&
+	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
+		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
+	else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
+		ac->ac_last_group = ocfs2_which_suballoc_group(
+					le64_to_cpu(fe->i_blkno),
+					le16_to_cpu(fe->i_suballoc_bit));
+}
+
+static inline void ocfs2_save_inode_ac_group(struct inode *dir,
+					     struct ocfs2_alloc_context *ac)
+{
+	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
+	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
+}
+
 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
 			  handle_t *handle,
+			  struct inode *dir,
+			  struct buffer_head *parent_fe_bh,
 			  struct ocfs2_alloc_context *ac,
 			  u16 *suballoc_bit,
 			  u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
 	BUG_ON(ac->ac_bits_wanted != 1);
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
 
+	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
 	status = ocfs2_claim_suballoc_bits(osb,
 					   ac,
 					   handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
 
 	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
 	ac->ac_bits_given++;
+	ocfs2_save_inode_ac_group(dir, ac);
 	status = 0;
 bail:
 	mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
 
 	return ret;
 }
+
+/*
+ * Read the inode specified by blkno to get suballoc_slot and
+ * suballoc_bit.
+ */
+static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
+				       u16 *suballoc_slot, u16 *suballoc_bit)
+{
+	int status;
+	struct buffer_head *inode_bh = NULL;
+	struct ocfs2_dinode *inode_fe;
+
+	mlog_entry("blkno: %llu\n", blkno);
+
+	/* dirty read disk */
+	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
+	if (status < 0) {
+		mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
+		goto bail;
+	}
+
+	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
+		mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
+	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
+		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
+		     blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (suballoc_slot)
+		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
+	if (suballoc_bit)
+		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+
+bail:
+	brelse(inode_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * test whether bit is SET in allocator bitmap or not.  on success, 0
+ * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
+ * is returned and *res is meaningless.  Call this after you have
+ * cluster locked against suballoc, or you may get a result based on
+ * non-up2date contents
+ */
+static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
+				   struct inode *suballoc,
+				   struct buffer_head *alloc_bh, u64 blkno,
+				   u16 bit, int *res)
+{
+	struct ocfs2_dinode *alloc_fe;
+	struct ocfs2_group_desc *group;
+	struct buffer_head *group_bh = NULL;
+	u64 bg_blkno;
+	int status;
+
+	mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
+
+	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
+	if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
+		     (unsigned int)bit,
+		     ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+	status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+					     &group_bh);
+	if (status < 0) {
+		mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
+		goto bail;
+	}
+
+	group = (struct ocfs2_group_desc *) group_bh->b_data;
+	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
+
+bail:
+	brelse(group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Test if the bit representing this inode (blkno) is set in the
+ * suballocator.
+ *
+ * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
+ *
+ * In the event of failure, a negative value is returned and *res is
+ * meaningless.
+ *
+ * Callers must make sure to hold nfs_sync_lock to prevent
+ * ocfs2_delete_inode() on another node from accessing the same
+ * suballocator concurrently.
+ */
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
+{
+	int status;
+	u16 suballoc_bit = 0, suballoc_slot = 0;
+	struct inode *inode_alloc_inode;
+	struct buffer_head *alloc_bh = NULL;
+
+	mlog_entry("blkno: %llu", blkno);
+
+	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
+					     &suballoc_bit);
+	if (status < 0) {
+		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
+		goto bail;
+	}
+
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    suballoc_slot);
+	if (!inode_alloc_inode) {
+		/* the error code could be inaccurate, but we are not able to
+		 * get the correct one. */
+		status = -EINVAL;
+		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
+		     (u32)suballoc_slot);
+		goto bail;
+	}
+
+	mutex_lock(&inode_alloc_inode->i_mutex);
+	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
+	if (status < 0) {
+		mutex_unlock(&inode_alloc_inode->i_mutex);
+		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
+		     (u32)suballoc_slot, status);
+		goto bail;
+	}
+
+	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
+					 blkno, suballoc_bit, res);
+	if (status < 0)
+		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+
+	ocfs2_inode_unlock(inode_alloc_inode, 0);
+	mutex_unlock(&inode_alloc_inode->i_mutex);
+
+	iput(inode_alloc_inode);
+	brelse(alloc_bh);
+bail:
+	mlog_exit(status);
+	return status;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
 			 u64 *blkno_start);
 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
 			  handle_t *handle,
+			  struct inode *dir,
+			  struct buffer_head *parent_fe_bh,
 			  struct ocfs2_alloc_context *ac,
 			  u16 *suballoc_bit,
 			  u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac);
+
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b1cb38fbe807..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
+#ifdef CONFIG_DEBUG_FS
+static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
+{
+	int out = 0;
+	int i;
+	struct ocfs2_cluster_connection *cconn = osb->cconn;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Id: %-s  Uuid: %-s  Gen: 0x%X  Label: %-s\n",
+			"Device", osb->dev_str, osb->uuid_str,
+			osb->fs_generation, osb->vol_label);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => State: %d  Flags: 0x%lX\n", "Volume",
+			atomic_read(&osb->vol_state), osb->osb_flags);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Block: %lu  Cluster: %d\n", "Sizes",
+			osb->sb->s_blocksize, osb->s_clustersize);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Compat: 0x%X  Incompat: 0x%X  "
+			"ROcompat: 0x%X\n",
+			"Features", osb->s_feature_compat,
+			osb->s_feature_incompat, osb->s_feature_ro_compat);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Opts: 0x%lX  AtimeQuanta: %u\n", "Mount",
+			osb->s_mount_opt, osb->s_atime_quantum);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Stack: %s  Name: %*s  Version: %d.%d\n",
+			"Cluster",
+			(*osb->osb_cluster_stack == '\0' ?
+			 "o2cb" : osb->osb_cluster_stack),
+			cconn->cc_namelen, cconn->cc_name,
+			cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
+
+	spin_lock(&osb->dc_task_lock);
+	out += snprintf(buf + out, len - out,
+			"%10s => Pid: %d  Count: %lu  WakeSeq: %lu  "
+			"WorkSeq: %lu\n", "DownCnvt",
+			task_pid_nr(osb->dc_task), osb->blocked_lock_count,
+			osb->dc_wake_sequence, osb->dc_work_sequence);
+	spin_unlock(&osb->dc_task_lock);
+
+	spin_lock(&osb->osb_lock);
+	out += snprintf(buf + out, len - out, "%10s => Pid: %d  Nodes:",
+			"Recovery",
+			(osb->recovery_thread_task ?
+			 task_pid_nr(osb->recovery_thread_task) : -1));
+	if (rm->rm_used == 0)
+		out += snprintf(buf + out, len - out, " None\n");
+	else {
+		for (i = 0; i < rm->rm_used; i++)
+			out += snprintf(buf + out, len - out, " %d",
+					rm->rm_entries[i]);
+		out += snprintf(buf + out, len - out, "\n");
+	}
+	spin_unlock(&osb->osb_lock);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+			task_pid_nr(osb->commit_task), osb->osb_commit_interval,
+			atomic_read(&osb->needs_checkpoint));
+
+	out += snprintf(buf + out, len - out,
+			"%10s => State: %d  NumTxns: %d  TxnId: %lu\n",
+			"Journal", osb->journal->j_state,
+			atomic_read(&osb->journal->j_num_trans),
+			osb->journal->j_trans_id);
+
+	out += snprintf(buf + out, len - out,
+			"%10s => GlobalAllocs: %d  LocalAllocs: %d  "
+			"SubAllocs: %d  LAWinMoves: %d  SAExtends: %d\n",
+			"Stats",
+			atomic_read(&osb->alloc_stats.bitmap_data),
+			atomic_read(&osb->alloc_stats.local_data),
+			atomic_read(&osb->alloc_stats.bg_allocs),
+			atomic_read(&osb->alloc_stats.moves),
+			atomic_read(&osb->alloc_stats.bg_extends));
+
+	out += snprintf(buf + out, len - out,
+			"%10s => State: %u  Descriptor: %llu  Size: %u bits  "
+			"Default: %u bits\n",
+			"LocalAlloc", osb->local_alloc_state,
+			(unsigned long long)osb->la_last_gd,
+			osb->local_alloc_bits, osb->local_alloc_default_bits);
+
+	spin_lock(&osb->osb_lock);
+	out += snprintf(buf + out, len - out,
+			"%10s => Slot: %d  NumStolen: %d\n", "Steal",
+			osb->s_inode_steal_slot,
+			atomic_read(&osb->s_num_inodes_stolen));
+	spin_unlock(&osb->osb_lock);
+
+	out += snprintf(buf + out, len - out, "%10s => %3s  %10s\n",
+			"Slots", "Num", "RecoGen");
+
+	for (i = 0; i < osb->max_slots; ++i) {
+		out += snprintf(buf + out, len - out,
+				"%10s  %c %3d  %10d\n",
+				" ",
+				(i == osb->slot_num ? '*' : ' '),
+				i, osb->slot_recovery_generations[i]);
+	}
+
+	return out;
+}
+
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+	struct ocfs2_super *osb = inode->i_private;
+	char *buf = NULL;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto bail;
+
+	i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
+
+	file->private_data = buf;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+				size_t nbytes, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+				       i_size_read(file->f_mapping->host));
+}
+#else
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+				size_t nbytes, loff_t *ppos)
+{
+	return 0;
+}
+#endif	/* CONFIG_DEBUG_FS */
+
+static struct file_operations ocfs2_osb_debug_fops = {
+	.open =		ocfs2_osb_debug_open,
+	.release =	ocfs2_debug_release,
+	.read =		ocfs2_debug_read,
+	.llseek =	generic_file_llseek,
+};
+
 /*
  * write_super and sync_fs ripped right out of ext3.
  */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 		goto read_super_error;
 	}
 
+	osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
+					    osb->osb_debug_root,
+					    osb,
+					    &ocfs2_osb_debug_fops);
+	if (!osb->osb_ctxt) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
 	status = ocfs2_mount_volume(sb);
 	if (osb->root_inode)
 		inode = igrab(osb->root_inode);
@@ -1537,6 +1711,13 @@ static int ocfs2_get_sector(struct super_block *sb,
 	unlock_buffer(*bh);
 	ll_rw_block(READ, 1, bh);
 	wait_on_buffer(*bh);
+	if (!buffer_uptodate(*bh)) {
+		mlog_errno(-EIO);
+		brelse(*bh);
+		*bh = NULL;
+		return -EIO;
+	}
+
 	return 0;
 }
 
@@ -1613,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
+	debugfs_remove(osb->osb_ctxt);
+
 	ocfs2_disable_quotas(osb);
 
 	ocfs2_shutdown_local_alloc(osb);
@@ -1735,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
 	sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
 
+	osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
+
+	for (i = 0; i < 3; i++)
+		osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
+	osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
+
 	osb->sb = sb;
 	/* Save off for ocfs2_rw_direct */
 	osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -1747,6 +1936,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	INIT_LIST_HEAD(&osb->blocked_lock_list);
 	osb->blocked_lock_count = 0;
 	spin_lock_init(&osb->osb_lock);
+	spin_lock_init(&osb->osb_xattr_lock);
 	ocfs2_init_inode_steal_slot(osb);
 
 	atomic_set(&osb->alloc_stats.moves, 0);
@@ -2122,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 	 * lock, and it's marked as dirty, set the bit in the recover
 	 * map and launch a recovery thread for it. */
 	status = ocfs2_mark_dead_nodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto finally;
+	}
+
+	status = ocfs2_compute_replay_slots(osb);
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 915039fffe6e..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt {
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE	80
+#define OCFS2_XATTR_HEADER_GAP	4
 #define OCFS2_XATTR_FREE_IN_IBODY	(OCFS2_MIN_XATTR_INLINE_SIZE \
 					 - sizeof(struct ocfs2_xattr_header) \
-					 - sizeof(__u32))
+					 - OCFS2_XATTR_HEADER_GAP)
 #define OCFS2_XATTR_FREE_IN_BLOCK(ptr)	((ptr)->i_sb->s_blocksize \
 					 - sizeof(struct ocfs2_xattr_block) \
 					 - sizeof(struct ocfs2_xattr_header) \
-					 - sizeof(__u32))
+					 - OCFS2_XATTR_HEADER_GAP)
 
 static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
@@ -274,10 +275,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 			       bucket->bu_blocks, bucket->bu_bhs, 0,
 			       NULL);
 	if (!rc) {
+		spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 		rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
 						 bucket->bu_bhs,
 						 bucket->bu_blocks,
 						 &bucket_xh(bucket)->xh_check);
+		spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 		if (rc)
 			mlog_errno(rc);
 	}
@@ -310,9 +313,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
 {
 	int i;
 
+	spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 	ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
 				   bucket->bu_bhs, bucket->bu_blocks,
 				   &bucket_xh(bucket)->xh_check);
+	spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 
 	for (i = 0; i < bucket->bu_blocks; i++)
 		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
@@ -507,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
 			  struct ocfs2_security_xattr_info *si,
 			  int *want_clusters,
 			  int *xattr_credits,
-			  struct ocfs2_alloc_context **xattr_ac)
+			  int *want_meta)
 {
 	int ret = 0;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -542,14 +547,14 @@ int ocfs2_calc_xattr_init(struct inode *dir,
 	 * when blocksize = 512, may reserve one more cluser for
 	 * xattr bucket, otherwise reserve one metadata block
 	 * for them is ok.
+	 * If this is a new directory with inline data,
+	 * we choose to reserve the entire inline area for
+	 * directory contents and force an external xattr block.
 	 */
 	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
 	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
-		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
-		if (ret) {
-			mlog_errno(ret);
-			return ret;
-		}
+		*want_meta = *want_meta + 1;
 		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
 	}
 
@@ -1507,7 +1512,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		last += 1;
 	}
 
-	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
 	if (free < 0)
 		return -EIO;
 
@@ -2190,7 +2195,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 		last += 1;
 	}
 
-	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
 	if (free < 0)
 		return 0;
 
@@ -2592,8 +2597,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 
 	if (!ret) {
 		/* Update inode ctime. */
-		ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_journal_access_di(ctxt->handle, inode,
+					      xis->inode_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4785,19 +4791,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 						char *val,
 						int value_len)
 {
-	int offset;
+	int ret, offset, block_off;
 	struct ocfs2_xattr_value_root *xv;
 	struct ocfs2_xattr_entry *xe = xs->here;
+	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
+	void *base;
 
 	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
 
-	offset = le16_to_cpu(xe->xe_name_offset) +
-		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+	ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
+						xe - xh->xh_entries,
+						&block_off,
+						&offset);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-	xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
+	base = bucket_block(xs->bucket, block_off);
+	xv = (struct ocfs2_xattr_value_root *)(base + offset +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len));
 
-	return __ocfs2_xattr_set_value_outside(inode, handle,
-					       xv, val, value_len);
+	ret = __ocfs2_xattr_set_value_outside(inode, handle,
+					      xv, val, value_len);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
 }
 
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -5060,8 +5080,8 @@ try_again:
 	xh_free_start = le16_to_cpu(xh->xh_free_start);
 	header_size = sizeof(struct ocfs2_xattr_header) +
 			count * sizeof(struct ocfs2_xattr_entry);
-	max_free = OCFS2_XATTR_BUCKET_SIZE -
-		le16_to_cpu(xh->xh_name_value_len) - header_size;
+	max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
+		le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
 
 	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
 			"of %u which exceed block size\n",
@@ -5094,7 +5114,7 @@ try_again:
 			need = 0;
 	}
 
-	free = xh_free_start - header_size;
+	free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
 	/*
 	 * We need to make sure the new name/value pair
 	 * can exist in the same block.
@@ -5127,7 +5147,8 @@ try_again:
 			}
 
 			xh_free_start = le16_to_cpu(xh->xh_free_start);
-			free = xh_free_start - header_size;
+			free = xh_free_start - header_size
+				- OCFS2_XATTR_HEADER_GAP;
 			if (xh_free_start % blocksize < need)
 				free -= xh_free_start % blocksize;
 
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
 			     int *, int *, struct ocfs2_alloc_context **);
 int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
 			  int, struct ocfs2_security_xattr_info *,
-			  int *, int *, struct ocfs2_alloc_context **);
+			  int *, int *, int *);
 
 /*
  * xattrs can live inside an inode, as part of an external xattr block,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972bb..379ae5fb4411 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *s = dentry->d_sb;
 	struct omfs_sb_info *sbi = OMFS_SB(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
+
 	buf->f_type = OMFS_MAGIC;
 	buf->f_bsize = sbi->s_blocksize;
 	buf->f_blocks = sbi->s_num_blocks;
 	buf->f_files = sbi->s_num_blocks;
 	buf->f_namelen = OMFS_NAMELEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	buf->f_bfree = buf->f_bavail = buf->f_ffree =
 		omfs_count_free(s);
+
 	return 0;
 }
 
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_uid = current_uid();
 	sbi->s_gid = current_gid();
-	sbi->s_dmask = sbi->s_fmask = current->fs->umask;
+	sbi->s_dmask = sbi->s_fmask = current_umask();
 
 	if (!parse_options((char *) data, sbi))
 		goto end;
diff --git a/fs/open.c b/fs/open.c
index a3a78ceb2a2b..377eb25b6abf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/audit.h>
 #include <linux/falloc.h>
+#include <linux/fs_struct.h>
 
 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
@@ -273,7 +274,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 	if (!error)
 		error = security_path_truncate(&path, length, 0);
 	if (!error) {
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 		error = do_truncate(path.dentry, length, 0, NULL);
 	}
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d720243f5f4..99e33ef40be4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
+#include <linux/blktrace_api.h>
 
 #include "check.h"
 
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
 
 static struct attribute_group *part_attr_groups[] = {
 	&part_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	&blk_trace_attr_group,
+#endif
 	NULL
 };
 
@@ -400,7 +404,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
-	pdev->uevent_suppress = 1;
+	dev_set_uevent_suppress(pdev, 1);
 	err = device_add(pdev);
 	if (err)
 		goto out_put;
@@ -410,7 +414,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	if (!p->holder_dir)
 		goto out_del;
 
-	pdev->uevent_suppress = 0;
+	dev_set_uevent_suppress(pdev, 0);
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		err = device_create_file(pdev, &dev_attr_whole_disk);
 		if (err)
@@ -422,7 +426,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
-	if (!ddev->uevent_suppress)
+	if (!dev_get_uevent_suppress(pdev))
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
 	return p;
@@ -455,7 +459,7 @@ void register_disk(struct gendisk *disk)
 	dev_set_name(ddev, disk->disk_name);
 
 	/* delay uevents, until we scanned partition table */
-	ddev->uevent_suppress = 1;
+	dev_set_uevent_suppress(ddev, 1);
 
 	if (device_add(ddev))
 		return;
@@ -490,7 +494,7 @@ void register_disk(struct gendisk *disk)
 
 exit:
 	/* announce disk after possible partitions are created */
-	ddev->uevent_suppress = 0;
+	dev_set_uevent_suppress(ddev, 0);
 	kobject_uevent(&ddev->kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 1e064c4a4f86..46297683cd34 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -21,20 +21,38 @@
  * compute the block number from a
  * cyl-cyl-head-head structure
  */
-static inline int
+static sector_t
 cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) {
-        return ptr->cc * geo->heads * geo->sectors +
-	       ptr->hh * geo->sectors;
+
+	sector_t cyl;
+	__u16 head;
+
+	/*decode cylinder and heads for large volumes */
+	cyl = ptr->hh & 0xFFF0;
+	cyl <<= 12;
+	cyl |= ptr->cc;
+	head = ptr->hh & 0x000F;
+	return cyl * geo->heads * geo->sectors +
+	       head * geo->sectors;
 }
 
 /*
  * compute the block number from a
  * cyl-cyl-head-head-block structure
  */
-static inline int
+static sector_t
 cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
-        return ptr->cc * geo->heads * geo->sectors +
-		ptr->hh * geo->sectors +
+
+	sector_t cyl;
+	__u16 head;
+
+	/*decode cylinder and heads for large volumes */
+	cyl = ptr->hh & 0xFFF0;
+	cyl <<= 12;
+	cyl |= ptr->cc;
+	head = ptr->hh & 0x000F;
+	return	cyl * geo->heads * geo->sectors +
+		head * geo->sectors +
 		ptr->b;
 }
 
@@ -43,14 +61,15 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 int
 ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int blocksize, offset, size,res;
-	loff_t i_size;
+	int blocksize, res;
+	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
 	struct hd_geometry *geo;
 	char type[5] = {0,};
 	char name[7] = {0,};
 	union label_t {
-		struct vtoc_volume_label vol;
+		struct vtoc_volume_label_cdl vol;
+		struct vtoc_volume_label_ldl lnx;
 		struct vtoc_cms_label cms;
 	} *label;
 	unsigned char *data;
@@ -85,14 +104,16 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	if (data == NULL)
 		goto out_readerr;
 
-	strncpy (type, data, 4);
-	if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD")))
-		strncpy(name, data + 8, 6);
-	else
-		strncpy(name, data + 4, 6);
 	memcpy(label, data, sizeof(union label_t));
 	put_dev_sector(sect);
 
+	if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
+		strncpy(type, label->vol.vollbl, 4);
+		strncpy(name, label->vol.volid, 6);
+	} else {
+		strncpy(type, label->lnx.vollbl, 4);
+		strncpy(name, label->lnx.volid, 6);
+	}
 	EBCASC(type, 4);
 	EBCASC(name, 6);
 
@@ -110,36 +131,54 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 			/*
 			 * VM style CMS1 labeled disk
 			 */
+			blocksize = label->cms.block_size;
 			if (label->cms.disk_offset != 0) {
 				printk("CMS1/%8s(MDSK):", name);
 				/* disk is reserved minidisk */
-				blocksize = label->cms.block_size;
 				offset = label->cms.disk_offset;
 				size = (label->cms.block_count - 1)
 					* (blocksize >> 9);
 			} else {
 				printk("CMS1/%8s:", name);
 				offset = (info->label_block + 1);
-				size = i_size >> 9;
+				size = label->cms.block_count
+					* (blocksize >> 9);
 			}
+			put_partition(state, 1, offset*(blocksize >> 9),
+				      size-offset*(blocksize >> 9));
 		} else {
-			/*
-			 * Old style LNX1 or unlabeled disk
-			 */
-			if (strncmp(type, "LNX1", 4) == 0)
-				printk ("LNX1/%8s:", name);
-			else
+			if (strncmp(type, "LNX1", 4) == 0) {
+				printk("LNX1/%8s:", name);
+				if (label->lnx.ldl_version == 0xf2) {
+					fmt_size = label->lnx.formatted_blocks
+						* (blocksize >> 9);
+				} else if (!strcmp(info->type, "ECKD")) {
+					/* formated w/o large volume support */
+					fmt_size = geo->cylinders * geo->heads
+					      * geo->sectors * (blocksize >> 9);
+				} else {
+					/* old label and no usable disk geometry
+					 * (e.g. DIAG) */
+					fmt_size = i_size >> 9;
+				}
+				size = i_size >> 9;
+				if (fmt_size < size)
+					size = fmt_size;
+				offset = (info->label_block + 1);
+			} else {
+				/* unlabeled disk */
 				printk("(nonl)");
-			offset = (info->label_block + 1);
-			size = i_size >> 9;
-		}
-		put_partition(state, 1, offset*(blocksize >> 9),
+				size = i_size >> 9;
+				offset = (info->label_block + 1);
+			}
+			put_partition(state, 1, offset*(blocksize >> 9),
 				      size-offset*(blocksize >> 9));
+		}
 	} else if (info->format == DASD_FORMAT_CDL) {
 		/*
 		 * New style CDL formatted disk
 		 */
-		unsigned int blk;
+		sector_t blk;
 		int counter;
 
 		/*
@@ -166,7 +205,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 				/* skip FMT4 / FMT5 / FMT7 labels */
 				if (f1.DS1FMTID == _ascebc['4']
 				    || f1.DS1FMTID == _ascebc['5']
-				    || f1.DS1FMTID == _ascebc['7']) {
+				    || f1.DS1FMTID == _ascebc['7']
+				    || f1.DS1FMTID == _ascebc['9']) {
 					blk++;
 					data = read_dev_sector(bdev, blk *
 							       (blocksize/512),
@@ -174,8 +214,9 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 					continue;
 				}
 
-				/* only FMT1 valid at this point */
-				if (f1.DS1FMTID != _ascebc['1'])
+				/* only FMT1 and 8 labels valid at this point */
+				if (f1.DS1FMTID != _ascebc['1'] &&
+				    f1.DS1FMTID != _ascebc['8'])
 					break;
 
 				/* OK, we got valid partition data */
diff --git a/fs/pipe.c b/fs/pipe.c
index 3a48ba5179d5..4af7aa521813 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -667,10 +667,7 @@ pipe_read_fasync(int fd, struct file *filp, int on)
 	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
 	mutex_unlock(&inode->i_mutex);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -684,10 +681,7 @@ pipe_write_fasync(int fd, struct file *filp, int on)
 	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
 	mutex_unlock(&inode->i_mutex);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -699,18 +693,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 	int retval;
 
 	mutex_lock(&inode->i_mutex);
-
 	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
-
-	if (retval >= 0)
+	if (retval >= 0) {
 		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
-
+		if (retval < 0) /* this can happen only if on == T */
+			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
+	}
 	mutex_unlock(&inode->i_mutex);
-
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -870,7 +860,7 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
 				dentry->d_inode->i_ino);
 }
 
-static struct dentry_operations pipefs_dentry_operations = {
+static const struct dentry_operations pipefs_dentry_operations = {
 	.d_delete	= pipefs_delete_dentry,
 	.d_dname	= pipefs_dname,
 };
@@ -1034,11 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
 	return error;
 }
 
-int do_pipe(int *fd)
-{
-	return do_pipe_flags(fd, 0);
-}
-
 /*
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0c9de19a1633..f71559784bfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
 #include <linux/oom.h>
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
+#include <linux/fs_struct.h>
 #include "internal.h"
 
 /* NOTE:
@@ -146,15 +147,22 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 	return count;
 }
 
-static struct fs_struct *get_fs_struct(struct task_struct *task)
+static int get_fs_path(struct task_struct *task, struct path *path, bool root)
 {
 	struct fs_struct *fs;
+	int result = -ENOENT;
+
 	task_lock(task);
 	fs = task->fs;
-	if(fs)
-		atomic_inc(&fs->count);
+	if (fs) {
+		read_lock(&fs->lock);
+		*path = root ? fs->root : fs->pwd;
+		path_get(path);
+		read_unlock(&fs->lock);
+		result = 0;
+	}
 	task_unlock(task);
-	return fs;
+	return result;
 }
 
 static int get_nr_threads(struct task_struct *tsk)
@@ -172,42 +180,24 @@ static int get_nr_threads(struct task_struct *tsk)
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
-	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
 
 	if (task) {
-		fs = get_fs_struct(task);
+		result = get_fs_path(task, path, 0);
 		put_task_struct(task);
 	}
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = fs->pwd;
-		path_get(&fs->pwd);
-		read_unlock(&fs->lock);
-		result = 0;
-		put_fs_struct(fs);
-	}
 	return result;
 }
 
 static int proc_root_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
-	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
 
 	if (task) {
-		fs = get_fs_struct(task);
+		result = get_fs_path(task, path, 1);
 		put_task_struct(task);
 	}
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = fs->root;
-		path_get(&fs->root);
-		read_unlock(&fs->lock);
-		result = 0;
-		put_fs_struct(fs);
-	}
 	return result;
 }
 
@@ -596,7 +586,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	struct task_struct *task = get_proc_task(inode);
 	struct nsproxy *nsp;
 	struct mnt_namespace *ns = NULL;
-	struct fs_struct *fs = NULL;
 	struct path root;
 	struct proc_mounts *p;
 	int ret = -EINVAL;
@@ -610,22 +599,16 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 				get_mnt_ns(ns);
 		}
 		rcu_read_unlock();
-		if (ns)
-			fs = get_fs_struct(task);
+		if (ns && get_fs_path(task, &root, 1) == 0)
+			ret = 0;
 		put_task_struct(task);
 	}
 
 	if (!ns)
 		goto err;
-	if (!fs)
+	if (ret)
 		goto err_put_ns;
 
-	read_lock(&fs->lock);
-	root = fs->root;
-	path_get(&root);
-	read_unlock(&fs->lock);
-	put_fs_struct(fs);
-
 	ret = -ENOMEM;
 	p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
 	if (!p)
@@ -1545,7 +1528,7 @@ static int pid_delete_dentry(struct dentry * dentry)
 	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 }
 
-static struct dentry_operations pid_dentry_operations =
+static const struct dentry_operations pid_dentry_operations =
 {
 	.d_revalidate	= pid_revalidate,
 	.d_delete	= pid_delete_dentry,
@@ -1717,7 +1700,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static struct dentry_operations tid_fd_dentry_operations =
+static const struct dentry_operations tid_fd_dentry_operations =
 {
 	.d_revalidate	= tid_fd_revalidate,
 	.d_delete	= pid_delete_dentry,
@@ -2339,7 +2322,7 @@ static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static struct dentry_operations proc_base_dentry_operations =
+static const struct dentry_operations proc_base_dentry_operations =
 {
 	.d_revalidate	= proc_base_revalidate,
 	.d_delete	= pid_delete_dentry,
@@ -3066,7 +3049,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	int retval = -ENOENT;
 	ino_t ino;
 	int tid;
-	unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
 	struct pid_namespace *ns;
 
 	task = get_proc_task(inode);
@@ -3083,18 +3065,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 		goto out_no_task;
 	retval = 0;
 
-	switch (pos) {
+	switch ((unsigned long)filp->f_pos) {
 	case 0:
 		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
+		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
 			goto out;
-		pos++;
+		filp->f_pos++;
 		/* fall through */
 	case 1:
 		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
+		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
 			goto out;
-		pos++;
+		filp->f_pos++;
 		/* fall through */
 	}
 
@@ -3104,9 +3086,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	ns = filp->f_dentry->d_sb->s_fs_info;
 	tid = (int)filp->f_version;
 	filp->f_version = 0;
-	for (task = first_tid(leader, tid, pos - 2, ns);
+	for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
 	     task;
-	     task = next_tid(task), pos++) {
+	     task = next_tid(task), filp->f_pos++) {
 		tid = task_pid_nr_ns(task, ns);
 		if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
 			/* returning this tgid failed, save it as the first
@@ -3117,7 +3099,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 		}
 	}
 out:
-	filp->f_pos = pos;
 	put_task_struct(leader);
 out_no_task:
 	return retval;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index db7fa5cab988..fa678abc9db1 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -37,7 +37,7 @@ static int proc_match(int len, const char *name, struct proc_dir_entry *de)
 #define PROC_BLOCK_SIZE	(PAGE_SIZE - 1024)
 
 static ssize_t
-proc_file_read(struct file *file, char __user *buf, size_t nbytes,
+__proc_file_read(struct file *file, char __user *buf, size_t nbytes,
 	       loff_t *ppos)
 {
 	struct inode * inode = file->f_path.dentry->d_inode;
@@ -183,19 +183,47 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
 }
 
 static ssize_t
+proc_file_read(struct file *file, char __user *buf, size_t nbytes,
+	       loff_t *ppos)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	ssize_t rv = -EIO;
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	spin_unlock(&pde->pde_unload_lock);
+
+	rv = __proc_file_read(file, buf, nbytes, ppos);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static ssize_t
 proc_file_write(struct file *file, const char __user *buffer,
 		size_t count, loff_t *ppos)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct proc_dir_entry * dp;
-	
-	dp = PDE(inode);
-
-	if (!dp->write_proc)
-		return -EIO;
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	ssize_t rv = -EIO;
+
+	if (pde->write_proc) {
+		spin_lock(&pde->pde_unload_lock);
+		if (!pde->proc_fops) {
+			spin_unlock(&pde->pde_unload_lock);
+			return rv;
+		}
+		pde->pde_users++;
+		spin_unlock(&pde->pde_unload_lock);
 
-	/* FIXME: does this routine need ppos?  probably... */
-	return dp->write_proc(file, buffer, count, dp->data);
+		/* FIXME: does this routine need ppos?  probably... */
+		rv = pde->write_proc(file, buffer, count, pde->data);
+		pde_users_dec(pde);
+	}
+	return rv;
 }
 
 
@@ -307,6 +335,21 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
+ *
+ * Current inode allocations in the proc-fs (hex-numbers):
+ *
+ * 00000000		reserved
+ * 00000001-00000fff	static entries	(goners)
+ *      001		root-ino
+ *
+ * 00001000-00001fff	unused
+ * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
+ * 80000000-efffffff	unused
+ * f0000000-ffffffff	dynamic entries
+ *
+ * Goal:
+ *	Once we split the thing into several virtual filesystems,
+ *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
@@ -363,7 +406,7 @@ static int proc_delete_dentry(struct dentry * dentry)
 	return 1;
 }
 
-static struct dentry_operations proc_dentry_operations =
+static const struct dentry_operations proc_dentry_operations =
 {
 	.d_delete	= proc_delete_dentry,
 };
diff --git a/fs/proc/inode-alloc.txt b/fs/proc/inode-alloc.txt
deleted file mode 100644
index 77212f938c2c..000000000000
--- a/fs/proc/inode-alloc.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Current inode allocations in the proc-fs (hex-numbers):
-
-  00000000		reserved
-  00000001-00000fff	static entries	(goners)
-       001		root-ino
-
-  00001000-00001fff	unused
-  0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
-  80000000-efffffff	unused
-  f0000000-ffffffff	dynamic entries
-
-Goal:
-	a) once we'll split the thing into several virtual filesystems we
-	will get rid of magical ranges (and this file, BTW).
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3e76bb9b3ad6..d78ade305541 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,11 +58,8 @@ static void proc_delete_inode(struct inode *inode)
 
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
-	if (de) {
-		if (de->owner)
-			module_put(de->owner);
+	if (de)
 		de_put(de);
-	}
 	if (PROC_I(inode)->sysctl)
 		sysctl_head_put(PROC_I(inode)->sysctl);
 	clear_inode(inode);
@@ -127,7 +124,7 @@ static void __pde_users_dec(struct proc_dir_entry *pde)
 		complete(pde->pde_unload_completion);
 }
 
-static void pde_users_dec(struct proc_dir_entry *pde)
+void pde_users_dec(struct proc_dir_entry *pde)
 {
 	spin_lock(&pde->pde_unload_lock);
 	__pde_users_dec(pde);
@@ -449,12 +446,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 {
 	struct inode * inode;
 
-	if (!try_module_get(de->owner))
-		goto out_mod;
-
 	inode = iget_locked(sb, ino);
 	if (!inode)
-		goto out_ino;
+		return NULL;
 	if (inode->i_state & I_NEW) {
 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		PROC_I(inode)->fd = 0;
@@ -486,13 +480,8 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 		}
 		unlock_new_inode(inode);
 	} else
-	       module_put(de->owner);
+	       de_put(de);
 	return inode;
-
-out_ino:
-	module_put(de->owner);
-out_mod:
-	return NULL;
 }			
 
 int proc_fill_super(struct super_block *s)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd53ff838498..f6db9618a888 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -91,3 +91,4 @@ struct pde_opener {
 	int (*release)(struct inode *, struct file *);
 	struct list_head lh;
 };
+void pde_users_dec(struct proc_dir_entry *pde);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..74ea974f5ca6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeram-i.freehigh),
 #endif
 #ifndef CONFIG_MMU
-		K((unsigned long) atomic_read(&mmap_pages_allocated)),
+		K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
 #endif
 		K(i.totalswap),
 		K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b446d7ad0b0d..7e14d1a04001 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 /*
  * display a list of all the REGIONs the kernel knows about
- * - nommu kernals have a single flat list
+ * - nommu kernels have a single flat list
  */
 static int nommu_region_list_show(struct seq_file *m, void *_p)
 {
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 767d95a6d1b1..e9983837d08d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = {
 #define KPF_RECLAIM    9
 #define KPF_BUDDY     10
 
-#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
+#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
 
 static ssize_t kpageflags_read(struct file *file, char __user *buf,
 			     size_t count, loff_t *ppos)
@@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 		else
 			kflags = ppage->flags;
 
-		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
+		uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
 			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
 			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
 			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 94fcfff6863a..9b1e4e9a16bf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -7,7 +7,7 @@
 #include <linux/security.h>
 #include "internal.h"
 
-static struct dentry_operations proc_sys_dentry_operations;
+static const struct dentry_operations proc_sys_dentry_operations;
 static const struct file_operations proc_sys_file_operations;
 static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
@@ -396,7 +396,7 @@ static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
 	return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
 }
 
-static struct dentry_operations proc_sys_dentry_operations = {
+static const struct dentry_operations proc_sys_dentry_operations = {
 	.d_revalidate	= proc_sys_revalidate,
 	.d_delete	= proc_sys_delete,
 	.d_compare	= proc_sys_compare,
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index d153946d6d15..83adcc869437 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,17 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
 {
 	struct proc_dir_entry *ent;
 		
-	if (!driver->ops->read_proc || !driver->driver_name ||
-	    driver->proc_entry)
+	if (!driver->driver_name || driver->proc_entry ||
+	    !driver->ops->proc_fops)
 		return;
 
-	ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver);
-	if (!ent)
-		return;
-	ent->read_proc = driver->ops->read_proc;
-	ent->owner = driver->owner;
-	ent->data = driver;
-
+	ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
+			       driver->ops->proc_fops, driver);
 	driver->proc_entry = ent;
 }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f6299a25594e..1e15a2b176e8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -83,7 +83,8 @@ static int proc_get_sb(struct file_system_type *fs_type,
 		ns->proc_mnt = mnt;
 	}
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 }
 
 static void proc_kill_sb(struct super_block *sb)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 94063840832a..b0ae0be4801f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -693,8 +693,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		goto out_pages;
 	}
 
-	pm.out = (u64 *)buf;
-	pm.end = (u64 *)(buf + count);
+	pm.out = (u64 __user *)buf;
+	pm.end = (u64 __user *)(buf + count);
 
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
@@ -720,9 +720,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (ret == PM_END_OF_BUFFER)
 		ret = 0;
 	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char *)pm.out - buf;
+	*ppos += (char __user *)pm.out - buf;
 	if (!ret)
-		ret = (char *)pm.out - buf;
+		ret = (char __user *)pm.out - buf;
 
 out_pages:
 	for (; pagecount; pagecount--) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..863464d5519c 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
 #include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	else
 		bytes += kobjsize(mm);
 	
-	if (current->fs && atomic_read(&current->fs->count) > 1)
+	if (current->fs && current->fs->users > 1)
 		sbytes += kobjsize(current->fs);
 	else
 		bytes += kobjsize(current->fs);
@@ -136,14 +137,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 	}
 
 	seq_printf(m,
-		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
 		   vma->vm_start,
 		   vma->vm_end,
 		   flags & VM_READ ? 'r' : '-',
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-		   vma->vm_pgoff << PAGE_SHIFT,
+		   (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index df26aa88fa47..0c10a0b3f146 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,45 +1,43 @@
+#include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
+#include <linux/seq_file.h>
 #include <linux/time.h>
 #include <asm/cputime.h>
 
-static int proc_calc_metrics(char *page, char **start, off_t off,
-				 int count, int *eof, int len)
-{
-	if (len <= off + count)
-		*eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len > count)
-		len = count;
-	if (len < 0)
-		len = 0;
-	return len;
-}
-
-static int uptime_read_proc(char *page, char **start, off_t off, int count,
-			    int *eof, void *data)
+static int uptime_proc_show(struct seq_file *m, void *v)
 {
 	struct timespec uptime;
 	struct timespec idle;
-	int len;
 	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
 	cputime_to_timespec(idletime, &idle);
-	len = sprintf(page, "%lu.%02lu %lu.%02lu\n",
+	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
 			(unsigned long) idle.tv_sec,
 			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
-	return proc_calc_metrics(page, start, off, count, eof, len);
+	return 0;
 }
 
+static int uptime_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, uptime_proc_show, NULL);
+}
+
+static const struct file_operations uptime_proc_fops = {
+	.open		= uptime_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int __init proc_uptime_init(void)
 {
-	create_proc_read_entry("uptime", 0, NULL, uptime_read_proc, NULL);
+	proc_create("uptime", 0, NULL, &uptime_proc_fops);
 	return 0;
 }
 module_init(proc_uptime_init);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84c..fe1f0f31d11c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	lock_kernel();
 
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bfree   = qnx4_count_free_blocks(sb);
 	buf->f_bavail  = buf->f_bfree;
 	buf->f_namelen = QNX4_NAME_MAX;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	unlock_kernel();
 
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
new file mode 100644
index 000000000000..8047e01ef46b
--- /dev/null
+++ b/fs/quota/Kconfig
@@ -0,0 +1,59 @@
+#
+#  Quota configuration
+#
+
+config QUOTA
+	bool "Quota support"
+	help
+	  If you say Y here, you will be able to set per user limits for disk
+	  usage (also called disk quotas). Currently, it works for the
+	  ext2, ext3, and reiserfs file system. ext3 also supports journalled
+	  quotas for which you don't need to run quotacheck(8) after an unclean
+	  shutdown.
+	  For further details, read the Quota mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>, or the documentation provided
+	  with the quota tools. Probably the quota support is only useful for
+	  multi user systems. If unsure, say N.
+
+config QUOTA_NETLINK_INTERFACE
+	bool "Report quota messages through netlink interface"
+	depends on QUOTA && NET
+	help
+	  If you say Y here, quota warnings (about exceeding softlimit, reaching
+	  hardlimit, etc.) will be reported through netlink interface. If unsure,
+	  say Y.
+
+config PRINT_QUOTA_WARNING
+	bool "Print quota warnings to console (OBSOLETE)"
+	depends on QUOTA
+	default y
+	help
+	  If you say Y here, quota warnings (about exceeding softlimit, reaching
+	  hardlimit, etc.) will be printed to the process' controlling terminal.
+	  Note that this behavior is currently deprecated and may go away in
+	  future. Please use notification via netlink socket instead.
+
+# Generic support for tree structured quota files. Selected when needed.
+config QUOTA_TREE
+	 tristate
+
+config QFMT_V1
+	tristate "Old quota format support"
+	depends on QUOTA
+	help
+	  This quota format was (is) used by kernels earlier than 2.4.22. If
+	  you have quota working and you don't want to convert to new quota
+	  format say Y here.
+
+config QFMT_V2
+	tristate "Quota format v2 support"
+	depends on QUOTA
+	select QUOTA_TREE
+	help
+	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
+	  need this functionality say Y here.
+
+config QUOTACTL
+	bool
+	depends on XFS_QUOTA || QUOTA
+	default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
new file mode 100644
index 000000000000..385a0831cc99
--- /dev/null
+++ b/fs/quota/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for the Linux filesystems.
+#
+# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+#
+
+obj-y :=
+
+obj-$(CONFIG_QUOTA)		+= dquot.o
+obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
+obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
+obj-$(CONFIG_QUOTACTL)		+= quota.o
diff --git a/fs/dquot.c b/fs/quota/dquot.c
index bca3cac4bee7..607c579e5eca 100644
--- a/fs/dquot.c
+++ b/fs/quota/dquot.c
@@ -129,9 +129,10 @@
  * i_mutex on quota files is special (it's below dqio_mutex)
  */
 
-static DEFINE_SPINLOCK(dq_list_lock);
-static DEFINE_SPINLOCK(dq_state_lock);
-DEFINE_SPINLOCK(dq_data_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
+EXPORT_SYMBOL(dq_data_lock);
 
 static char *quotatypes[] = INITQFNAMES;
 static struct quota_format_type *quota_formats;	/* List of registered formats */
@@ -148,35 +149,46 @@ int register_quota_format(struct quota_format_type *fmt)
 	spin_unlock(&dq_list_lock);
 	return 0;
 }
+EXPORT_SYMBOL(register_quota_format);
 
 void unregister_quota_format(struct quota_format_type *fmt)
 {
 	struct quota_format_type **actqf;
 
 	spin_lock(&dq_list_lock);
-	for (actqf = &quota_formats; *actqf && *actqf != fmt; actqf = &(*actqf)->qf_next);
+	for (actqf = &quota_formats; *actqf && *actqf != fmt;
+	     actqf = &(*actqf)->qf_next)
+		;
 	if (*actqf)
 		*actqf = (*actqf)->qf_next;
 	spin_unlock(&dq_list_lock);
 }
+EXPORT_SYMBOL(unregister_quota_format);
 
 static struct quota_format_type *find_quota_format(int id)
 {
 	struct quota_format_type *actqf;
 
 	spin_lock(&dq_list_lock);
-	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next);
+	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
+	     actqf = actqf->qf_next)
+		;
 	if (!actqf || !try_module_get(actqf->qf_owner)) {
 		int qm;
 
 		spin_unlock(&dq_list_lock);
 		
-		for (qm = 0; module_names[qm].qm_fmt_id && module_names[qm].qm_fmt_id != id; qm++);
-		if (!module_names[qm].qm_fmt_id || request_module(module_names[qm].qm_mod_name))
+		for (qm = 0; module_names[qm].qm_fmt_id &&
+			     module_names[qm].qm_fmt_id != id; qm++)
+			;
+		if (!module_names[qm].qm_fmt_id ||
+		    request_module(module_names[qm].qm_mod_name))
 			return NULL;
 
 		spin_lock(&dq_list_lock);
-		for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next);
+		for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
+		     actqf = actqf->qf_next)
+			;
 		if (actqf && !try_module_get(actqf->qf_owner))
 			actqf = NULL;
 	}
@@ -215,6 +227,7 @@ static unsigned int dq_hash_bits, dq_hash_mask;
 static struct hlist_head *dquot_hash;
 
 struct dqstats dqstats;
+EXPORT_SYMBOL(dqstats);
 
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
@@ -230,7 +243,8 @@ hashfn(const struct super_block *sb, unsigned int id, int type)
  */
 static inline void insert_dquot_hash(struct dquot *dquot)
 {
-	struct hlist_head *head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type);
+	struct hlist_head *head;
+	head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type);
 	hlist_add_head(&dquot->dq_hash, head);
 }
 
@@ -239,17 +253,19 @@ static inline void remove_dquot_hash(struct dquot *dquot)
 	hlist_del_init(&dquot->dq_hash);
 }
 
-static inline struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, unsigned int id, int type)
+static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
+				unsigned int id, int type)
 {
 	struct hlist_node *node;
 	struct dquot *dquot;
 
 	hlist_for_each (node, dquot_hash+hashent) {
 		dquot = hlist_entry(node, struct dquot, dq_hash);
-		if (dquot->dq_sb == sb && dquot->dq_id == id && dquot->dq_type == type)
+		if (dquot->dq_sb == sb && dquot->dq_id == id &&
+		    dquot->dq_type == type)
 			return dquot;
 	}
-	return NODQUOT;
+	return NULL;
 }
 
 /* Add a dquot to the tail of the free list */
@@ -309,6 +325,7 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
 	spin_unlock(&dq_list_lock);
 	return 0;
 }
+EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 
 /* This function needs dq_list_lock */
 static inline int clear_dquot_dirty(struct dquot *dquot)
@@ -345,8 +362,10 @@ int dquot_acquire(struct dquot *dquot)
 	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
 		ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
 		/* Write the info if needed */
-		if (info_dirty(&dqopt->info[dquot->dq_type]))
-			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type);
+		if (info_dirty(&dqopt->info[dquot->dq_type])) {
+			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
+						dquot->dq_sb, dquot->dq_type);
+		}
 		if (ret < 0)
 			goto out_iolock;
 		if (ret2 < 0) {
@@ -360,6 +379,7 @@ out_iolock:
 	mutex_unlock(&dquot->dq_lock);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_acquire);
 
 /*
  *	Write dquot to disk
@@ -380,8 +400,10 @@ int dquot_commit(struct dquot *dquot)
 	 * => we have better not writing it */
 	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
 		ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
-		if (info_dirty(&dqopt->info[dquot->dq_type]))
-			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type);
+		if (info_dirty(&dqopt->info[dquot->dq_type])) {
+			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
+						dquot->dq_sb, dquot->dq_type);
+		}
 		if (ret >= 0)
 			ret = ret2;
 	}
@@ -389,6 +411,7 @@ out_sem:
 	mutex_unlock(&dqopt->dqio_mutex);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_commit);
 
 /*
  *	Release dquot
@@ -406,8 +429,10 @@ int dquot_release(struct dquot *dquot)
 	if (dqopt->ops[dquot->dq_type]->release_dqblk) {
 		ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot);
 		/* Write the info */
-		if (info_dirty(&dqopt->info[dquot->dq_type]))
-			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type);
+		if (info_dirty(&dqopt->info[dquot->dq_type])) {
+			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
+						dquot->dq_sb, dquot->dq_type);
+		}
 		if (ret >= 0)
 			ret = ret2;
 	}
@@ -417,6 +442,7 @@ out_dqlock:
 	mutex_unlock(&dquot->dq_lock);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_release);
 
 void dquot_destroy(struct dquot *dquot)
 {
@@ -516,6 +542,7 @@ out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_scan_active);
 
 int vfs_quota_sync(struct super_block *sb, int type)
 {
@@ -533,7 +560,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
 		spin_lock(&dq_list_lock);
 		dirty = &dqopt->info[cnt].dqi_dirty_list;
 		while (!list_empty(dirty)) {
-			dquot = list_first_entry(dirty, struct dquot, dq_dirty);
+			dquot = list_first_entry(dirty, struct dquot,
+						 dq_dirty);
 			/* Dirty and inactive can be only bad dquot... */
 			if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
 				clear_dquot_dirty(dquot);
@@ -563,6 +591,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 
 	return 0;
 }
+EXPORT_SYMBOL(vfs_quota_sync);
 
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -672,6 +701,7 @@ we_slept:
 	put_dquot_last(dquot);
 	spin_unlock(&dq_list_lock);
 }
+EXPORT_SYMBOL(dqput);
 
 struct dquot *dquot_alloc(struct super_block *sb, int type)
 {
@@ -685,7 +715,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 
 	dquot = sb->dq_op->alloc_dquot(sb, type);
 	if(!dquot)
-		return NODQUOT;
+		return NULL;
 
 	mutex_init(&dquot->dq_lock);
 	INIT_LIST_HEAD(&dquot->dq_free);
@@ -711,10 +741,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
 	unsigned int hashent = hashfn(sb, id, type);
-	struct dquot *dquot = NODQUOT, *empty = NODQUOT;
+	struct dquot *dquot = NULL, *empty = NULL;
 
         if (!sb_has_quota_active(sb, type))
-		return NODQUOT;
+		return NULL;
 we_slept:
 	spin_lock(&dq_list_lock);
 	spin_lock(&dq_state_lock);
@@ -725,15 +755,17 @@ we_slept:
 	}
 	spin_unlock(&dq_state_lock);
 
-	if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) {
-		if (empty == NODQUOT) {
+	dquot = find_dquot(hashent, sb, id, type);
+	if (!dquot) {
+		if (!empty) {
 			spin_unlock(&dq_list_lock);
-			if ((empty = get_empty_dquot(sb, type)) == NODQUOT)
+			empty = get_empty_dquot(sb, type);
+			if (!empty)
 				schedule();	/* Try to wait for a moment... */
 			goto we_slept;
 		}
 		dquot = empty;
-		empty = NODQUOT;
+		empty = NULL;
 		dquot->dq_id = id;
 		/* all dquots go on the inuse_list */
 		put_inuse(dquot);
@@ -749,13 +781,14 @@ we_slept:
 		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
 	}
-	/* Wait for dq_lock - after this we know that either dquot_release() is already
-	 * finished or it will be canceled due to dq_count > 1 test */
+	/* Wait for dq_lock - after this we know that either dquot_release() is
+	 * already finished or it will be canceled due to dq_count > 1 test */
 	wait_on_dquot(dquot);
-	/* Read the dquot and instantiate it (everything done only if needed) */
-	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) {
+	/* Read the dquot / allocate space in quota file */
+	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) &&
+	    sb->dq_op->acquire_dquot(dquot) < 0) {
 		dqput(dquot);
-		dquot = NODQUOT;
+		dquot = NULL;
 		goto out;
 	}
 #ifdef __DQUOT_PARANOIA
@@ -767,6 +800,7 @@ out:
 
 	return dquot;
 }
+EXPORT_SYMBOL(dqget);
 
 static int dqinit_needed(struct inode *inode, int type)
 {
@@ -775,9 +809,9 @@ static int dqinit_needed(struct inode *inode, int type)
 	if (IS_NOQUOTA(inode))
 		return 0;
 	if (type != -1)
-		return inode->i_dquot[type] == NODQUOT;
+		return !inode->i_dquot[type];
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			return 1;
 	return 0;
 }
@@ -789,12 +823,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
 		if (!atomic_read(&inode->i_writecount))
 			continue;
 		if (!dqinit_needed(inode, type))
 			continue;
-		if (inode->i_state & (I_FREEING|I_WILL_FREE))
-			continue;
 
 		__iget(inode);
 		spin_unlock(&inode_lock);
@@ -813,7 +847,10 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	iput(old_inode);
 }
 
-/* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */
+/*
+ * Return 0 if dqput() won't block.
+ * (note that 1 doesn't necessarily mean blocking)
+ */
 static inline int dqput_blocks(struct dquot *dquot)
 {
 	if (atomic_read(&dquot->dq_count) <= 1)
@@ -821,22 +858,27 @@ static inline int dqput_blocks(struct dquot *dquot)
 	return 0;
 }
 
-/* Remove references to dquots from inode - add dquot to list for freeing if needed */
-/* We can't race with anybody because we hold dqptr_sem for writing... */
+/*
+ * Remove references to dquots from inode and add dquot to list for freeing
+ * if we have the last referece to dquot
+ * We can't race with anybody because we hold dqptr_sem for writing...
+ */
 static int remove_inode_dquot_ref(struct inode *inode, int type,
 				  struct list_head *tofree_head)
 {
 	struct dquot *dquot = inode->i_dquot[type];
 
-	inode->i_dquot[type] = NODQUOT;
-	if (dquot != NODQUOT) {
+	inode->i_dquot[type] = NULL;
+	if (dquot) {
 		if (dqput_blocks(dquot)) {
 #ifdef __DQUOT_PARANOIA
 			if (atomic_read(&dquot->dq_count) != 1)
 				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
 			spin_lock(&dq_list_lock);
-			list_add(&dquot->dq_free, tofree_head);	/* As dquot must have currently users it can't be on the free list... */
+			/* As dquot must have currently users it can't be on
+			 * the free list... */
+			list_add(&dquot->dq_free, tofree_head);
 			spin_unlock(&dq_list_lock);
 			return 1;
 		}
@@ -846,19 +888,22 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 	return 0;
 }
 
-/* Free list of dquots - called from inode.c */
-/* dquots are removed from inodes, no new references can be got so we are the only ones holding reference */
+/*
+ * Free list of dquots
+ * Dquots are removed from inodes and no new references can be got so we are
+ * the only ones holding reference
+ */
 static void put_dquot_list(struct list_head *tofree_head)
 {
 	struct list_head *act_head;
 	struct dquot *dquot;
 
 	act_head = tofree_head->next;
-	/* So now we have dquots on the list... Just free them */
 	while (act_head != tofree_head) {
 		dquot = list_entry(act_head, struct dquot, dq_free);
 		act_head = act_head->next;
-		list_del_init(&dquot->dq_free);	/* Remove dquot from the list so we won't have problems... */
+		/* Remove dquot from the list so we won't have problems... */
+		list_del_init(&dquot->dq_free);
 		dqput(dquot);
 	}
 }
@@ -870,6 +915,12 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		/*
+		 *  We have to scan also I_NEW inodes because they can already
+		 *  have quota pointer initialized. Luckily, we need to touch
+		 *  only quota pointers and these have separate locking
+		 *  (dqptr_sem).
+		 */
 		if (!IS_NOQUOTA(inode))
 			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
@@ -899,7 +950,29 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
 	dquot->dq_dqb.dqb_curspace += number;
 }
 
-static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
+static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
+{
+	dquot->dq_dqb.dqb_rsvspace += number;
+}
+
+/*
+ * Claim reserved quota space
+ */
+static void dquot_claim_reserved_space(struct dquot *dquot,
+						qsize_t number)
+{
+	WARN_ON(dquot->dq_dqb.dqb_rsvspace < number);
+	dquot->dq_dqb.dqb_curspace += number;
+	dquot->dq_dqb.dqb_rsvspace -= number;
+}
+
+static inline
+void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
+{
+	dquot->dq_dqb.dqb_rsvspace -= number;
+}
+
+static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
 	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
 	    dquot->dq_dqb.dqb_curinodes >= number)
@@ -911,7 +984,7 @@ static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 	clear_bit(DQ_INODES_B, &dquot->dq_flags);
 }
 
-static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
+static void dquot_decr_space(struct dquot *dquot, qsize_t number)
 {
 	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
 	    dquot->dq_dqb.dqb_curspace >= number)
@@ -938,7 +1011,7 @@ static int warning_issued(struct dquot *dquot, const int warntype)
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 static int flag_print_warnings = 1;
 
-static inline int need_print_warning(struct dquot *dquot)
+static int need_print_warning(struct dquot *dquot)
 {
 	if (!flag_print_warnings)
 		return 0;
@@ -1057,10 +1130,7 @@ static void send_warning(const struct dquot *dquot, const char warntype)
 		goto attr_err_out;
 	genlmsg_end(skb, msg_head);
 
-	ret = genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
-	if (ret < 0 && ret != -ESRCH)
-		printk(KERN_ERR
-			"VFS: Failed to send notification message: %d\n", ret);
+	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
 	return;
 attr_err_out:
 	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
@@ -1068,13 +1138,17 @@ err_out:
 	kfree_skb(skb);
 }
 #endif
-
-static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
+/*
+ * Write warnings to the console and send warning messages over netlink.
+ *
+ * Note that this function can sleep.
+ */
+static void flush_warnings(struct dquot *const *dquots, char *warntype)
 {
 	int i;
 
 	for (i = 0; i < MAXQUOTAS; i++)
-		if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN &&
+		if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
 		    !warning_issued(dquots[i], warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 			print_warning(dquots[i], warntype[i]);
@@ -1085,42 +1159,47 @@ static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
 		}
 }
 
-static inline char ignore_hardlimit(struct dquot *dquot)
+static int ignore_hardlimit(struct dquot *dquot)
 {
 	struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
 
 	return capable(CAP_SYS_RESOURCE) &&
-	    (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & V1_DQF_RSQUASH));
+	       (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+		!(info->dqi_flags & V1_DQF_RSQUASH));
 }
 
 /* needs dq_data_lock */
 static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
+	qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
+
 	*warntype = QUOTA_NL_NOWARN;
 	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
 	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_ihardlimit &&
-	   (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_ihardlimit &&
+	    newinodes > dquot->dq_dqb.dqb_ihardlimit &&
             !ignore_hardlimit(dquot)) {
 		*warntype = QUOTA_NL_IHARDWARN;
 		return NO_QUOTA;
 	}
 
 	if (dquot->dq_dqb.dqb_isoftlimit &&
-	   (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit &&
-	    dquot->dq_dqb.dqb_itime && get_seconds() >= dquot->dq_dqb.dqb_itime &&
+	    newinodes > dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_itime &&
+	    get_seconds() >= dquot->dq_dqb.dqb_itime &&
             !ignore_hardlimit(dquot)) {
 		*warntype = QUOTA_NL_ISOFTLONGWARN;
 		return NO_QUOTA;
 	}
 
 	if (dquot->dq_dqb.dqb_isoftlimit &&
-	   (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit &&
+	    newinodes > dquot->dq_dqb.dqb_isoftlimit &&
 	    dquot->dq_dqb.dqb_itime == 0) {
 		*warntype = QUOTA_NL_ISOFTWARN;
-		dquot->dq_dqb.dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
+		dquot->dq_dqb.dqb_itime = get_seconds() +
+		    sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
 	}
 
 	return QUOTA_OK;
@@ -1129,13 +1208,19 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 /* needs dq_data_lock */
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
+	qsize_t tspace;
+	struct super_block *sb = dquot->dq_sb;
+
 	*warntype = QUOTA_NL_NOWARN;
-	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
 	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
+	tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
+		+ space;
+
 	if (dquot->dq_dqb.dqb_bhardlimit &&
-	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
+	    tspace > dquot->dq_dqb.dqb_bhardlimit &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BHARDWARN;
@@ -1143,8 +1228,9 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
-	    dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
+	    tspace > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_btime &&
+	    get_seconds() >= dquot->dq_dqb.dqb_btime &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BSOFTLONGWARN;
@@ -1152,11 +1238,12 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
+	    tspace > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime == 0) {
 		if (!prealloc) {
 			*warntype = QUOTA_NL_BSOFTWARN;
-			dquot->dq_dqb.dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace;
+			dquot->dq_dqb.dqb_btime = get_seconds() +
+			    sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace;
 		}
 		else
 			/*
@@ -1171,15 +1258,18 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 
 static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
+	qsize_t newinodes;
+
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
 	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
 	    !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
 		return QUOTA_NL_NOWARN;
 
-	if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
+	newinodes = dquot->dq_dqb.dqb_curinodes - inodes;
+	if (newinodes <= dquot->dq_dqb.dqb_isoftlimit)
 		return QUOTA_NL_ISOFTBELOW;
 	if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit &&
-	    dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit)
+	    newinodes < dquot->dq_dqb.dqb_ihardlimit)
 		return QUOTA_NL_IHARDBELOW;
 	return QUOTA_NL_NOWARN;
 }
@@ -1206,7 +1296,7 @@ int dquot_initialize(struct inode *inode, int type)
 {
 	unsigned int id = 0;
 	int cnt, ret = 0;
-	struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT };
+	struct dquot *got[MAXQUOTAS] = { NULL, NULL };
 	struct super_block *sb = inode->i_sb;
 
 	/* First test before acquiring mutex - solves deadlocks when we
@@ -1239,9 +1329,9 @@ int dquot_initialize(struct inode *inode, int type)
 		/* Avoid races with quotaoff() */
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
-		if (inode->i_dquot[cnt] == NODQUOT) {
+		if (!inode->i_dquot[cnt]) {
 			inode->i_dquot[cnt] = got[cnt];
-			got[cnt] = NODQUOT;
+			got[cnt] = NULL;
 		}
 	}
 out_err:
@@ -1251,6 +1341,7 @@ out_err:
 		dqput(got[cnt]);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_initialize);
 
 /*
  * 	Release all quotas referenced by inode
@@ -1263,7 +1354,7 @@ int dquot_drop(struct inode *inode)
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		put[cnt] = inode->i_dquot[cnt];
-		inode->i_dquot[cnt] = NODQUOT;
+		inode->i_dquot[cnt] = NULL;
 	}
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 
@@ -1271,6 +1362,7 @@ int dquot_drop(struct inode *inode)
 		dqput(put[cnt]);
 	return 0;
 }
+EXPORT_SYMBOL(dquot_drop);
 
 /* Wrapper to remove references to quota structures from inode */
 void vfs_dq_drop(struct inode *inode)
@@ -1287,12 +1379,13 @@ void vfs_dq_drop(struct inode *inode)
 		 * must assure that nobody can come after the DQUOT_DROP and
 		 * add quota pointers back anyway */
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-			if (inode->i_dquot[cnt] != NODQUOT)
+			if (inode->i_dquot[cnt])
 				break;
 		if (cnt < MAXQUOTAS)
 			inode->i_sb->dq_op->drop(inode);
 	}
 }
+EXPORT_SYMBOL(vfs_dq_drop);
 
 /*
  * Following four functions update i_blocks+i_bytes fields and
@@ -1306,51 +1399,93 @@ void vfs_dq_drop(struct inode *inode)
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
+int __dquot_alloc_space(struct inode *inode, qsize_t number,
+			int warn, int reserve)
 {
-	int cnt, ret = NO_QUOTA;
+	int cnt, ret = QUOTA_OK;
 	char warntype[MAXQUOTAS];
 
-	/* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
-	if (IS_NOQUOTA(inode)) {
-out_add:
-		inode_add_bytes(inode, number);
-		return QUOTA_OK;
-	}
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = QUOTA_NL_NOWARN;
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	if (IS_NOQUOTA(inode)) {	/* Now we can do reliable test... */
-		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		goto out_add;
-	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
-		if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA)
-			goto warn_put_all;
+		if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
+		    == NO_QUOTA) {
+			ret = NO_QUOTA;
+			goto out_unlock;
+		}
 	}
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
-		dquot_incr_space(inode->i_dquot[cnt], number);
+		if (reserve)
+			dquot_resv_space(inode->i_dquot[cnt], number);
+		else
+			dquot_incr_space(inode->i_dquot[cnt], number);
 	}
-	inode_add_bytes(inode, number);
-	ret = QUOTA_OK;
-warn_put_all:
+	if (!reserve)
+		inode_add_bytes(inode, number);
+out_unlock:
 	spin_unlock(&dq_data_lock);
-	if (ret == QUOTA_OK)
-		/* Dirtify all the dquots - this can block when journalling */
-		for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-			if (inode->i_dquot[cnt])
-				mark_dquot_dirty(inode->i_dquot[cnt]);
 	flush_warnings(inode->i_dquot, warntype);
+	return ret;
+}
+
+int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
+{
+	int cnt, ret = QUOTA_OK;
+
+	/*
+	 * First test before acquiring mutex - solves deadlocks when we
+	 * re-enter the quota code and are already holding the mutex
+	 */
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		goto out;
+	}
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		goto out_unlock;
+	}
+
+	ret = __dquot_alloc_space(inode, number, warn, 0);
+	if (ret == NO_QUOTA)
+		goto out_unlock;
+
+	/* Dirtify all the dquots - this can block when journalling */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (inode->i_dquot[cnt])
+			mark_dquot_dirty(inode->i_dquot[cnt]);
+out_unlock:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
 	return ret;
 }
+EXPORT_SYMBOL(dquot_alloc_space);
+
+int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
+{
+	int ret = QUOTA_OK;
+
+	if (IS_NOQUOTA(inode))
+		goto out;
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode))
+		goto out_unlock;
+
+	ret = __dquot_alloc_space(inode, number, warn, 1);
+out_unlock:
+	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(dquot_reserve_space);
 
 /*
  * This operation can block, but only after everything is updated
@@ -1373,14 +1508,15 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
-		if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) == NO_QUOTA)
+		if (check_idq(inode->i_dquot[cnt], number, warntype+cnt)
+		    == NO_QUOTA)
 			goto warn_put_all;
 	}
 
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
 		dquot_incr_inodes(inode->i_dquot[cnt], number);
 	}
@@ -1396,6 +1532,73 @@ warn_put_all:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_alloc_inode);
+
+int dquot_claim_space(struct inode *inode, qsize_t number)
+{
+	int cnt;
+	int ret = QUOTA_OK;
+
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		goto out;
+	}
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode))	{
+		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+		inode_add_bytes(inode, number);
+		goto out;
+	}
+
+	spin_lock(&dq_data_lock);
+	/* Claim reserved quotas to allocated quotas */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt])
+			dquot_claim_reserved_space(inode->i_dquot[cnt],
+							number);
+	}
+	/* Update inode bytes */
+	inode_add_bytes(inode, number);
+	spin_unlock(&dq_data_lock);
+	/* Dirtify all the dquots - this can block when journalling */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (inode->i_dquot[cnt])
+			mark_dquot_dirty(inode->i_dquot[cnt]);
+	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(dquot_claim_space);
+
+/*
+ * Release reserved quota space
+ */
+void dquot_release_reserved_space(struct inode *inode, qsize_t number)
+{
+	int cnt;
+
+	if (IS_NOQUOTA(inode))
+		goto out;
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode))
+		goto out_unlock;
+
+	spin_lock(&dq_data_lock);
+	/* Release reserved dquots */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt])
+			dquot_free_reserved_space(inode->i_dquot[cnt], number);
+	}
+	spin_unlock(&dq_data_lock);
+
+out_unlock:
+	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
+	return;
+}
+EXPORT_SYMBOL(dquot_release_reserved_space);
 
 /*
  * This operation can block, but only after everything is updated
@@ -1421,7 +1624,7 @@ out_sub:
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
 		warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
 		dquot_decr_space(inode->i_dquot[cnt], number);
@@ -1436,6 +1639,7 @@ out_sub:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
+EXPORT_SYMBOL(dquot_free_space);
 
 /*
  * This operation can block, but only after everything is updated
@@ -1458,7 +1662,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
 		warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
 		dquot_decr_inodes(inode->i_dquot[cnt], number);
@@ -1472,6 +1676,20 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
+EXPORT_SYMBOL(dquot_free_inode);
+
+/*
+ * call back function, get reserved quota space from underlying fs
+ */
+qsize_t dquot_get_reserved_space(struct inode *inode)
+{
+	qsize_t reserved_space = 0;
+
+	if (sb_any_quota_active(inode->i_sb) &&
+	    inode->i_sb->dq_op->get_reserved_space)
+		reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
+	return reserved_space;
+}
 
 /*
  * Transfer the number of inode and blocks from one diskquota to an other.
@@ -1481,7 +1699,8 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
  */
 int dquot_transfer(struct inode *inode, struct iattr *iattr)
 {
-	qsize_t space;
+	qsize_t space, cur_space;
+	qsize_t rsv_space = 0;
 	struct dquot *transfer_from[MAXQUOTAS];
 	struct dquot *transfer_to[MAXQUOTAS];
 	int cnt, ret = QUOTA_OK;
@@ -1496,22 +1715,16 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		return QUOTA_OK;
 	/* Initialize the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		transfer_from[cnt] = NODQUOT;
-		transfer_to[cnt] = NODQUOT;
+		transfer_from[cnt] = NULL;
+		transfer_to[cnt] = NULL;
 		warntype_to[cnt] = QUOTA_NL_NOWARN;
-		switch (cnt) {
-			case USRQUOTA:
-				if (!chuid)
-					continue;
-				transfer_to[cnt] = dqget(inode->i_sb, iattr->ia_uid, cnt);
-				break;
-			case GRPQUOTA:
-				if (!chgid)
-					continue;
-				transfer_to[cnt] = dqget(inode->i_sb, iattr->ia_gid, cnt);
-				break;
-		}
 	}
+	if (chuid)
+		transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid,
+					      USRQUOTA);
+	if (chgid)
+		transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
+					      GRPQUOTA);
 
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Now recheck reliably when holding dqptr_sem */
@@ -1520,10 +1733,12 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		goto put_all;
 	}
 	spin_lock(&dq_data_lock);
-	space = inode_get_bytes(inode);
+	cur_space = inode_get_bytes(inode);
+	rsv_space = dquot_get_reserved_space(inode);
+	space = cur_space + rsv_space;
 	/* Build the transfer_from list and check the limits */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (transfer_to[cnt] == NODQUOT)
+		if (!transfer_to[cnt])
 			continue;
 		transfer_from[cnt] = inode->i_dquot[cnt];
 		if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
@@ -1539,7 +1754,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		/*
 		 * Skip changes for same uid or gid or for turned off quota-type.
 		 */
-		if (transfer_to[cnt] == NODQUOT)
+		if (!transfer_to[cnt])
 			continue;
 
 		/* Due to IO error we might not have transfer_from[] structure */
@@ -1549,11 +1764,14 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 			warntype_from_space[cnt] =
 				info_bdq_free(transfer_from[cnt], space);
 			dquot_decr_inodes(transfer_from[cnt], 1);
-			dquot_decr_space(transfer_from[cnt], space);
+			dquot_decr_space(transfer_from[cnt], cur_space);
+			dquot_free_reserved_space(transfer_from[cnt],
+						  rsv_space);
 		}
 
 		dquot_incr_inodes(transfer_to[cnt], 1);
-		dquot_incr_space(transfer_to[cnt], space);
+		dquot_incr_space(transfer_to[cnt], cur_space);
+		dquot_resv_space(transfer_to[cnt], rsv_space);
 
 		inode->i_dquot[cnt] = transfer_to[cnt];
 	}
@@ -1567,7 +1785,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		if (transfer_to[cnt]) {
 			mark_dquot_dirty(transfer_to[cnt]);
 			/* The reference we got is transferred to the inode */
-			transfer_to[cnt] = NODQUOT;
+			transfer_to[cnt] = NULL;
 		}
 	}
 warn_put_all:
@@ -1585,10 +1803,11 @@ over_quota:
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Clear dquot pointers we don't want to dqput() */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		transfer_from[cnt] = NODQUOT;
+		transfer_from[cnt] = NULL;
 	ret = NO_QUOTA;
 	goto warn_put_all;
 }
+EXPORT_SYMBOL(dquot_transfer);
 
 /* Wrapper for transferring ownership of an inode */
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
@@ -1600,7 +1819,7 @@ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 	}
 	return 0;
 }
-
+EXPORT_SYMBOL(vfs_dq_transfer);
 
 /*
  * Write info of quota file to disk
@@ -1615,6 +1834,7 @@ int dquot_commit_info(struct super_block *sb, int type)
 	mutex_unlock(&dqopt->dqio_mutex);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_commit_info);
 
 /*
  * Definitions of diskquota operations.
@@ -1700,8 +1920,8 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 		drop_dquot_ref(sb, cnt);
 		invalidate_dquots(sb, cnt);
 		/*
-		 * Now all dquots should be invalidated, all writes done so we should be only
-		 * users of the info. No locks needed.
+		 * Now all dquots should be invalidated, all writes done so we
+		 * should be only users of the info. No locks needed.
 		 */
 		if (info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
@@ -1739,10 +1959,12 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
 			if (!sb_has_quota_loaded(sb, cnt)) {
-				mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
+				mutex_lock_nested(&toputinode[cnt]->i_mutex,
+						  I_MUTEX_QUOTA);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
-				truncate_inode_pages(&toputinode[cnt]->i_data, 0);
+				truncate_inode_pages(&toputinode[cnt]->i_data,
+						     0);
 				mutex_unlock(&toputinode[cnt]->i_mutex);
 				mark_inode_dirty(toputinode[cnt]);
 			}
@@ -1767,13 +1989,14 @@ put_inodes:
 		}
 	return ret;
 }
+EXPORT_SYMBOL(vfs_quota_disable);
 
 int vfs_quota_off(struct super_block *sb, int type, int remount)
 {
 	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
 				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
 }
-
+EXPORT_SYMBOL(vfs_quota_off);
 /*
  *	Turn quotas on on a device
  */
@@ -1831,7 +2054,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		 * possible) Also nobody should write to the file - we use
 		 * special IO operations which ignore the immutable bit. */
 		down_write(&dqopt->dqptr_sem);
-		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
+					     S_NOQUOTA);
 		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
 		up_write(&dqopt->dqptr_sem);
 		sb->dq_op->drop(inode);
@@ -1850,7 +2074,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 	dqopt->info[type].dqi_fmt_id = format_id;
 	INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
 	mutex_lock(&dqopt->dqio_mutex);
-	if ((error = dqopt->ops[type]->read_file_info(sb, type)) < 0) {
+	error = dqopt->ops[type]->read_file_info(sb, type);
+	if (error < 0) {
 		mutex_unlock(&dqopt->dqio_mutex);
 		goto out_file_init;
 	}
@@ -1930,6 +2155,7 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 					     DQUOT_LIMITS_ENABLED);
 	return error;
 }
+EXPORT_SYMBOL(vfs_quota_on_path);
 
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 		 int remount)
@@ -1947,6 +2173,7 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 	}
 	return error;
 }
+EXPORT_SYMBOL(vfs_quota_on);
 
 /*
  * More powerful function for turning on quotas allowing setting
@@ -1993,6 +2220,7 @@ out_lock:
 load_quota:
 	return vfs_load_quota_inode(inode, type, format_id, flags);
 }
+EXPORT_SYMBOL(vfs_quota_enable);
 
 /*
  * This function is used when filesystem needs to initialize quotas
@@ -2022,6 +2250,7 @@ out:
 	dput(dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_quota_on_mount);
 
 /* Wrapper to turn on quotas when remounting rw */
 int vfs_dq_quota_on_remount(struct super_block *sb)
@@ -2038,6 +2267,7 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
 	}
 	return ret;
 }
+EXPORT_SYMBOL(vfs_dq_quota_on_remount);
 
 static inline qsize_t qbtos(qsize_t blocks)
 {
@@ -2057,7 +2287,7 @@ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	spin_lock(&dq_data_lock);
 	di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
 	di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
-	di->dqb_curspace = dm->dqb_curspace;
+	di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace;
 	di->dqb_ihardlimit = dm->dqb_ihardlimit;
 	di->dqb_isoftlimit = dm->dqb_isoftlimit;
 	di->dqb_curinodes = dm->dqb_curinodes;
@@ -2067,18 +2297,20 @@ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	spin_unlock(&dq_data_lock);
 }
 
-int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di)
+int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
+		  struct if_dqblk *di)
 {
 	struct dquot *dquot;
 
 	dquot = dqget(sb, id, type);
-	if (dquot == NODQUOT)
+	if (!dquot)
 		return -ESRCH;
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
 
 	return 0;
 }
+EXPORT_SYMBOL(vfs_get_dqblk);
 
 /* Generic routine for setting common part of quota structure */
 static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
@@ -2097,7 +2329,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 
 	spin_lock(&dq_data_lock);
 	if (di->dqb_valid & QIF_SPACE) {
-		dm->dqb_curspace = di->dqb_curspace;
+		dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
 		check_blim = 1;
 		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
@@ -2130,22 +2362,25 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	}
 
 	if (check_blim) {
-		if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
+		if (!dm->dqb_bsoftlimit ||
+		    dm->dqb_curspace < dm->dqb_bsoftlimit) {
 			dm->dqb_btime = 0;
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
-		}
-		else if (!(di->dqb_valid & QIF_BTIME))	/* Set grace only if user hasn't provided his own... */
+		} else if (!(di->dqb_valid & QIF_BTIME))
+			/* Set grace only if user hasn't provided his own... */
 			dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
 	}
 	if (check_ilim) {
-		if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) {
+		if (!dm->dqb_isoftlimit ||
+		    dm->dqb_curinodes < dm->dqb_isoftlimit) {
 			dm->dqb_itime = 0;
 			clear_bit(DQ_INODES_B, &dquot->dq_flags);
-		}
-		else if (!(di->dqb_valid & QIF_ITIME))	/* Set grace only if user hasn't provided his own... */
+		} else if (!(di->dqb_valid & QIF_ITIME))
+			/* Set grace only if user hasn't provided his own... */
 			dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
 	}
-	if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit)
+	if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
+	    dm->dqb_isoftlimit)
 		clear_bit(DQ_FAKE_B, &dquot->dq_flags);
 	else
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -2155,7 +2390,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	return 0;
 }
 
-int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di)
+int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
+		  struct if_dqblk *di)
 {
 	struct dquot *dquot;
 	int rc;
@@ -2170,6 +2406,7 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 out:
 	return rc;
 }
+EXPORT_SYMBOL(vfs_set_dqblk);
 
 /* Generic routine for getting common part of quota file information */
 int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
@@ -2191,6 +2428,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
+EXPORT_SYMBOL(vfs_get_dqinfo);
 
 /* Generic routine for setting common part of quota file information */
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
@@ -2210,7 +2448,8 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	if (ii->dqi_valid & IIF_IGRACE)
 		mi->dqi_igrace = ii->dqi_igrace;
 	if (ii->dqi_valid & IIF_FLAGS)
-		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK);
+		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) |
+				(ii->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
 	mark_info_dirty(sb, type);
 	/* Force write to disk */
@@ -2219,6 +2458,7 @@ out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return err;
 }
+EXPORT_SYMBOL(vfs_set_dqinfo);
 
 struct quotactl_ops vfs_quotactl_ops = {
 	.quota_on	= vfs_quota_on,
@@ -2368,43 +2608,10 @@ static int __init dquot_init(void)
 
 #ifdef CONFIG_QUOTA_NETLINK_INTERFACE
 	if (genl_register_family(&quota_genl_family) != 0)
-		printk(KERN_ERR "VFS: Failed to create quota netlink interface.\n");
+		printk(KERN_ERR
+		       "VFS: Failed to create quota netlink interface.\n");
 #endif
 
 	return 0;
 }
 module_init(dquot_init);
-
-EXPORT_SYMBOL(register_quota_format);
-EXPORT_SYMBOL(unregister_quota_format);
-EXPORT_SYMBOL(dqstats);
-EXPORT_SYMBOL(dq_data_lock);
-EXPORT_SYMBOL(vfs_quota_enable);
-EXPORT_SYMBOL(vfs_quota_on);
-EXPORT_SYMBOL(vfs_quota_on_path);
-EXPORT_SYMBOL(vfs_quota_on_mount);
-EXPORT_SYMBOL(vfs_quota_disable);
-EXPORT_SYMBOL(vfs_quota_off);
-EXPORT_SYMBOL(dquot_scan_active);
-EXPORT_SYMBOL(vfs_quota_sync);
-EXPORT_SYMBOL(vfs_get_dqinfo);
-EXPORT_SYMBOL(vfs_set_dqinfo);
-EXPORT_SYMBOL(vfs_get_dqblk);
-EXPORT_SYMBOL(vfs_set_dqblk);
-EXPORT_SYMBOL(dquot_commit);
-EXPORT_SYMBOL(dquot_commit_info);
-EXPORT_SYMBOL(dquot_acquire);
-EXPORT_SYMBOL(dquot_release);
-EXPORT_SYMBOL(dquot_mark_dquot_dirty);
-EXPORT_SYMBOL(dquot_initialize);
-EXPORT_SYMBOL(dquot_drop);
-EXPORT_SYMBOL(vfs_dq_drop);
-EXPORT_SYMBOL(dqget);
-EXPORT_SYMBOL(dqput);
-EXPORT_SYMBOL(dquot_alloc_space);
-EXPORT_SYMBOL(dquot_alloc_inode);
-EXPORT_SYMBOL(dquot_free_space);
-EXPORT_SYMBOL(dquot_free_inode);
-EXPORT_SYMBOL(dquot_transfer);
-EXPORT_SYMBOL(vfs_dq_transfer);
-EXPORT_SYMBOL(vfs_dq_quota_on_remount);
diff --git a/fs/quota.c b/fs/quota/quota.c
index d76ada914f98..b7f5a468f076 100644
--- a/fs/quota.c
+++ b/fs/quota/quota.c
@@ -20,7 +20,8 @@
 #include <linux/types.h>
 
 /* Check validity of generic quotactl commands */
-static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
+static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
+				  qid_t id)
 {
 	if (type >= MAXQUOTAS)
 		return -EINVAL;
@@ -72,7 +73,8 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 		case Q_SETINFO:
 		case Q_SETQUOTA:
 		case Q_GETQUOTA:
-			/* This is just informative test so we are satisfied without a lock */
+			/* This is just an informative test so we are satisfied
+			 * without the lock */
 			if (!sb_has_quota_active(sb, type))
 				return -ESRCH;
 	}
@@ -92,7 +94,8 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 }
 
 /* Check validity of XFS Quota Manager commands */
-static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
+static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd,
+			      qid_t id)
 {
 	if (type >= XQM_MAXQUOTAS)
 		return -EINVAL;
@@ -142,7 +145,8 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i
 	return 0;
 }
 
-static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
+static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
+				qid_t id)
 {
 	int error;
 
@@ -180,7 +184,8 @@ static void quota_sync_sb(struct super_block *sb, int type)
 			continue;
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
-		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
+		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
+				  I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
 		mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
 	}
@@ -200,14 +205,15 @@ void sync_dquots(struct super_block *sb, int type)
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		/* This test just improves performance so it needn't be reliable... */
+		/* This test just improves performance so it needn't be
+		 * reliable... */
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 			if (type != -1 && type != cnt)
 				continue;
 			if (!sb_has_quota_active(sb, cnt))
 				continue;
 			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
-			    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
+			   list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
 				continue;
 			break;
 		}
@@ -227,7 +233,8 @@ restart:
 }
 
 /* Copy parameters and call proper function */
-static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr)
+static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
+		       void __user *addr)
 {
 	int ret;
 
@@ -235,7 +242,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_QUOTAON: {
 			char *pathname;
 
-			if (IS_ERR(pathname = getname(addr)))
+			pathname = getname(addr);
+			if (IS_ERR(pathname))
 				return PTR_ERR(pathname);
 			ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
 			putname(pathname);
@@ -261,7 +269,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_GETINFO: {
 			struct if_dqinfo info;
 
-			if ((ret = sb->s_qcop->get_info(sb, type, &info)))
+			ret = sb->s_qcop->get_info(sb, type, &info);
+			if (ret)
 				return ret;
 			if (copy_to_user(addr, &info, sizeof(info)))
 				return -EFAULT;
@@ -277,7 +286,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_GETQUOTA: {
 			struct if_dqblk idq;
 
-			if ((ret = sb->s_qcop->get_dqblk(sb, type, id, &idq)))
+			ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
+			if (ret)
 				return ret;
 			if (copy_to_user(addr, &idq, sizeof(idq)))
 				return -EFAULT;
@@ -322,7 +332,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_XGETQUOTA: {
 			struct fs_disk_quota fdq;
 
-			if ((ret = sb->s_qcop->get_xquota(sb, type, id, &fdq)))
+			ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
+			if (ret)
 				return ret;
 			if (copy_to_user(addr, &fdq, sizeof(fdq)))
 				return -EFAULT;
@@ -341,7 +352,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
  * look up a superblock on which quota ops will be performed
  * - use the name of a block device to find the superblock thereon
  */
-static inline struct super_block *quotactl_block(const char __user *special)
+static struct super_block *quotactl_block(const char __user *special)
 {
 #ifdef CONFIG_BLOCK
 	struct block_device *bdev;
diff --git a/fs/quota_tree.c b/fs/quota/quota_tree.c
index 953404c95b17..f81f4bcfb178 100644
--- a/fs/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,8 +22,6 @@ MODULE_LICENSE("GPL");
 
 #define __QUOTA_QT_PARANOIA
 
-typedef char *dqbuf_t;
-
 static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
 {
 	unsigned int epb = info->dqi_usable_bs >> 2;
@@ -35,46 +33,42 @@ static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
 }
 
 /* Number of entries in one blocks */
-static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
 {
 	return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
 	       / info->dqi_entry_size;
 }
 
-static dqbuf_t getdqbuf(size_t size)
+static char *getdqbuf(size_t size)
 {
-	dqbuf_t buf = kmalloc(size, GFP_NOFS);
+	char *buf = kmalloc(size, GFP_NOFS);
 	if (!buf)
-		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+		printk(KERN_WARNING
+		       "VFS: Not enough memory for quota buffers.\n");
 	return buf;
 }
 
-static inline void freedqbuf(dqbuf_t buf)
-{
-	kfree(buf);
-}
-
-static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 {
 	struct super_block *sb = info->dqi_sb;
 
 	memset(buf, 0, info->dqi_usable_bs);
-	return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+	return sb->s_op->quota_read(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
 }
 
-static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 {
 	struct super_block *sb = info->dqi_sb;
 
-	return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+	return sb->s_op->quota_write(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
 }
 
 /* Remove empty block from list and return it */
 static int get_free_dqblk(struct qtree_mem_dqinfo *info)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int ret, blk;
 
@@ -98,12 +92,12 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info)
 	mark_info_dirty(info->dqi_sb, info->dqi_type);
 	ret = blk;
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
 /* Insert empty block to the list */
-static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
 {
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int err;
@@ -120,9 +114,10 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
 }
 
 /* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
+			       uint blk)
 {
-	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	char *tmpbuf = getdqbuf(info->dqi_usable_bs);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
 	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
@@ -153,21 +148,24 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint
 		info->dqi_free_entry = nextblk;
 		mark_info_dirty(info->dqi_sb, info->dqi_type);
 	}
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
 	/* No matter whether write succeeds block is out of list */
 	if (write_blk(info, blk, buf) < 0)
-		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+		printk(KERN_ERR
+		       "VFS: Can't write block (%u) with free entries.\n",
+		       blk);
 	return 0;
 out_buf:
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	return err;
 }
 
 /* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
+			       uint blk)
 {
-	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	char *tmpbuf = getdqbuf(info->dqi_usable_bs);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int err;
 
@@ -188,12 +186,12 @@ static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint
 		if (err < 0)
 			goto out_buf;
 	}
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	info->dqi_free_entry = blk;
 	mark_info_dirty(info->dqi_sb, info->dqi_type);
 	return 0;
 out_buf:
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	return err;
 }
 
@@ -215,7 +213,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 {
 	uint blk, i;
 	struct qt_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	char *ddquot;
 
 	*err = 0;
@@ -233,11 +231,12 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 		blk = get_free_dqblk(info);
 		if ((int)blk < 0) {
 			*err = blk;
-			freedqbuf(buf);
+			kfree(buf);
 			return 0;
 		}
 		memset(buf, 0, info->dqi_usable_bs);
-		/* This is enough as block is already zeroed and entry list is empty... */
+		/* This is enough as the block is already zeroed and the entry
+		 * list is empty... */
 		info->dqi_free_entry = blk;
 		mark_info_dirty(dquot->dq_sb, dquot->dq_type);
 	}
@@ -253,9 +252,12 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	}
 	le16_add_cpu(&dh->dqdh_entries, 1);
 	/* Find free structure in block */
-	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
-	     i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
-	     i++, ddquot += info->dqi_entry_size);
+	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
+	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
+		if (qtree_entry_unused(info, ddquot))
+			break;
+		ddquot += info->dqi_entry_size;
+	}
 #ifdef __QUOTA_QT_PARANOIA
 	if (i == qtree_dqstr_in_blk(info)) {
 		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
@@ -273,10 +275,10 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
 			sizeof(struct qt_disk_dqdbheader) +
 			i * info->dqi_entry_size;
-	freedqbuf(buf);
+	kfree(buf);
 	return blk;
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return 0;
 }
 
@@ -284,7 +286,7 @@ out_buf:
 static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			  uint *treeblk, int depth)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	int ret = 0, newson = 0, newact = 0;
 	__le32 *ref;
 	uint newblk;
@@ -333,7 +335,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		put_free_dqblk(info, buf, *treeblk);
 	}
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -346,14 +348,15 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
 }
 
 /*
- *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
+ * We don't have to be afraid of deadlocks as we never have quotas on quota
+ * files...
  */
 int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 {
 	int type = dquot->dq_type;
 	struct super_block *sb = dquot->dq_sb;
 	ssize_t ret;
-	dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
+	char *ddquot = getdqbuf(info->dqi_entry_size);
 
 	if (!ddquot)
 		return -ENOMEM;
@@ -364,15 +367,15 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		if (ret < 0) {
 			printk(KERN_ERR "VFS: Error %zd occurred while "
 					"creating quota.\n", ret);
-			freedqbuf(ddquot);
+			kfree(ddquot);
 			return ret;
 		}
 	}
 	spin_lock(&dq_data_lock);
 	info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
 	spin_unlock(&dq_data_lock);
-	ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
-					info->dqi_entry_size, dquot->dq_off);
+	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
+				    dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
 		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
 		       sb->s_id);
@@ -382,7 +385,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		ret = 0;
 	}
 	dqstats.writes++;
-	freedqbuf(ddquot);
+	kfree(ddquot);
 
 	return ret;
 }
@@ -393,7 +396,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			uint blk)
 {
 	struct qt_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	int ret = 0;
 
 	if (!buf)
@@ -444,7 +447,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	}
 	dquot->dq_off = 0;	/* Quota is now unattached */
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -452,7 +455,7 @@ out_buf:
 static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		       uint *blk, int depth)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	int ret = 0;
 	uint newblk;
 	__le32 *ref = (__le32 *)buf;
@@ -475,9 +478,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		int i;
 		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
 		/* Block got empty? */
-		for (i = 0;
-		     i < (info->dqi_usable_bs >> 2) && !ref[i];
-		     i++);
+		for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
+			;
 		/* Don't put the root block into the free block list */
 		if (i == (info->dqi_usable_bs >> 2)
 		    && *blk != QT_TREEOFF) {
@@ -491,7 +493,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		}
 	}
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -510,7 +512,7 @@ EXPORT_SYMBOL(qtree_delete_dquot);
 static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 				 struct dquot *dquot, uint blk)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	loff_t ret = 0;
 	int i;
 	char *ddquot;
@@ -522,9 +524,12 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
 		goto out_buf;
 	}
-	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
-	     i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
-	     i++, ddquot += info->dqi_entry_size);
+	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
+	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
+		if (info->dqi_ops->is_id(ddquot, dquot))
+			break;
+		ddquot += info->dqi_entry_size;
+	}
 	if (i == qtree_dqstr_in_blk(info)) {
 		printk(KERN_ERR "VFS: Quota for id %u referenced "
 		  "but not present.\n", dquot->dq_id);
@@ -535,7 +540,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
 	}
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -543,7 +548,7 @@ out_buf:
 static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 				struct dquot *dquot, uint blk, int depth)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	loff_t ret = 0;
 	__le32 *ref = (__le32 *)buf;
 
@@ -563,7 +568,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 	else
 		ret = find_block_dqentry(info, dquot, blk);
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -579,7 +584,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	int type = dquot->dq_type;
 	struct super_block *sb = dquot->dq_sb;
 	loff_t offset;
-	dqbuf_t ddquot;
+	char *ddquot;
 	int ret = 0;
 
 #ifdef __QUOTA_QT_PARANOIA
@@ -607,8 +612,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	ddquot = getdqbuf(info->dqi_entry_size);
 	if (!ddquot)
 		return -ENOMEM;
-	ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
-				   info->dqi_entry_size, dquot->dq_off);
+	ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
+				   dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
 		if (ret >= 0)
 			ret = -EIO;
@@ -616,7 +621,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 				"structure for id %u.\n", dquot->dq_id);
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
-		freedqbuf(ddquot);
+		kfree(ddquot);
 		goto out;
 	}
 	spin_lock(&dq_data_lock);
@@ -627,7 +632,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	    !dquot->dq_dqb.dqb_isoftlimit)
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 	spin_unlock(&dq_data_lock);
-	freedqbuf(ddquot);
+	kfree(ddquot);
 out:
 	dqstats.reads++;
 	return ret;
@@ -638,7 +643,8 @@ EXPORT_SYMBOL(qtree_read_dquot);
  * the only one operating on dquot (thanks to dq_lock) */
 int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 {
-	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) &&
+	    !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
 		return qtree_delete_dquot(info, dquot);
 	return 0;
 }
diff --git a/fs/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..a1ab8db81a51 100644
--- a/fs/quota_tree.h
+++ b/fs/quota/quota_tree.h
diff --git a/fs/quota_v1.c b/fs/quota/quota_v1.c
index b4af1c69ad16..0edcf42b1778 100644
--- a/fs/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -62,11 +62,14 @@ static int v1_read_dqblk(struct dquot *dquot)
 
 	/* Set structure to 0s in case read fails/is after end of file */
 	memset(&dqblk, 0, sizeof(struct v1_disk_dqblk));
-	dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id));
+	dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk,
+			sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id));
 
 	v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk);
-	if (dquot->dq_dqb.dqb_bhardlimit == 0 && dquot->dq_dqb.dqb_bsoftlimit == 0 &&
-	    dquot->dq_dqb.dqb_ihardlimit == 0 && dquot->dq_dqb.dqb_isoftlimit == 0)
+	if (dquot->dq_dqb.dqb_bhardlimit == 0 &&
+	    dquot->dq_dqb.dqb_bsoftlimit == 0 &&
+	    dquot->dq_dqb.dqb_ihardlimit == 0 &&
+	    dquot->dq_dqb.dqb_isoftlimit == 0)
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 	dqstats.reads++;
 
@@ -81,13 +84,16 @@ static int v1_commit_dqblk(struct dquot *dquot)
 
 	v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb);
 	if (dquot->dq_id == 0) {
-		dqblk.dqb_btime = sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace;
-		dqblk.dqb_itime = sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace;
+		dqblk.dqb_btime =
+			sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace;
+		dqblk.dqb_itime =
+			sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace;
 	}
 	ret = 0;
 	if (sb_dqopt(dquot->dq_sb)->files[type])
-		ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, (char *)&dqblk,
-					sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id));
+		ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
+			(char *)&dqblk, sizeof(struct v1_disk_dqblk),
+			v1_dqoff(dquot->dq_id));
 	if (ret != sizeof(struct v1_disk_dqblk)) {
 		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
 			dquot->dq_sb->s_id);
@@ -130,15 +136,20 @@ static int v1_check_quota_file(struct super_block *sb, int type)
 		return 0;
 	blocks = isize >> BLOCK_SIZE_BITS;
 	off = isize & (BLOCK_SIZE - 1);
-	if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % sizeof(struct v1_disk_dqblk))
+	if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) %
+	    sizeof(struct v1_disk_dqblk))
 		return 0;
-	/* Doublecheck whether we didn't get file with new format - with old quotactl() this could happen */
-	size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
+	/* Doublecheck whether we didn't get file with new format - with old
+	 * quotactl() this could happen */
+	size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
+				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader))
 		return 1;	/* Probably not new format */
 	if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type])
 		return 1;	/* Definitely not new format */
-	printk(KERN_INFO "VFS: %s: Refusing to turn on old quota format on given file. It probably contains newer quota format.\n", sb->s_id);
+	printk(KERN_INFO
+	       "VFS: %s: Refusing to turn on old quota format on given file."
+	       " It probably contains newer quota format.\n", sb->s_id);
         return 0;		/* Seems like a new format file -> refuse it */
 }
 
@@ -148,7 +159,9 @@ static int v1_read_file_info(struct super_block *sb, int type)
 	struct v1_disk_dqblk dqblk;
 	int ret;
 
-	if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) {
+	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
+	if (ret != sizeof(struct v1_disk_dqblk)) {
 		if (ret >= 0)
 			ret = -EIO;
 		goto out;
@@ -157,8 +170,10 @@ static int v1_read_file_info(struct super_block *sb, int type)
 	/* limits are stored as unsigned 32-bit data */
 	dqopt->info[type].dqi_maxblimit = 0xffffffff;
 	dqopt->info[type].dqi_maxilimit = 0xffffffff;
-	dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
-	dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
+	dqopt->info[type].dqi_igrace =
+			dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
+	dqopt->info[type].dqi_bgrace =
+			dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
 out:
 	return ret;
 }
@@ -170,8 +185,9 @@ static int v1_write_file_info(struct super_block *sb, int type)
 	int ret;
 
 	dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY;
-	if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
-	    sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) {
+	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
+	if (ret != sizeof(struct v1_disk_dqblk)) {
 		if (ret >= 0)
 			ret = -EIO;
 		goto out;
diff --git a/fs/quota_v2.c b/fs/quota/quota_v2.c
index b618b563635c..a5475fb1ae44 100644
--- a/fs/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -54,7 +54,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 	static const uint quota_magics[] = V2_INITQMAGICS;
 	static const uint quota_versions[] = V2_INITQVERSIONS;
  
-	size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
+	size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
+				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader)) {
 		printk("quota_v2: failed read expected=%zd got=%zd\n",
 			sizeof(struct v2_disk_dqheader), size);
diff --git a/fs/quotaio_v1.h b/fs/quota/quotaio_v1.h
index 746654b5de70..746654b5de70 100644
--- a/fs/quotaio_v1.h
+++ b/fs/quota/quotaio_v1.h
diff --git a/fs/quotaio_v2.h b/fs/quota/quotaio_v2.h
index 530fe580685c..530fe580685c 100644
--- a/fs/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index b9b567a28376..ebb2c417912c 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -18,7 +18,6 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
-#include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 
@@ -60,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
  */
 int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 {
-	struct pagevec lru_pvec;
 	unsigned long npages, xpages, loop, limit;
 	struct page *pages;
 	unsigned order;
@@ -103,21 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	memset(data, 0, newsize);
 
 	/* attach all the pages to the inode's address space */
-	pagevec_init(&lru_pvec, 0);
 	for (loop = 0; loop < npages; loop++) {
 		struct page *page = pages + loop;
 
-		ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+		ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
+					GFP_KERNEL);
 		if (ret < 0)
 			goto add_error;
 
-		if (!pagevec_add(&lru_pvec, page))
-			__pagevec_lru_add_file(&lru_pvec);
+		/* prevent the page from being discarded on memory pressure */
+		SetPageDirty(page);
 
 		unlock_page(page);
 	}
 
-	pagevec_lru_add_file(&lru_pvec);
 	return 0;
 
  fsize_exceeded:
@@ -126,9 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	return -EFBIG;
 
  add_error:
-	page_cache_release(pages + loop);
-	for (loop++; loop < npages; loop++)
-		__free_page(pages + loop);
+	while (loop < npages)
+		__free_page(pages + loop++);
 	return ret;
 }
 
@@ -201,11 +197,6 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 	if (ret)
 		return ret;
 
-	/* by providing our own setattr() method, we skip this quotaism */
-	if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
-	    (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
-		ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
-
 	/* pick out size-changing events */
 	if (ia->ia_valid & ATTR_SIZE) {
 		loff_t size = i_size_read(inode);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b87..a404fb88e456 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
 #include <linux/sched.h>
+#include <linux/parser.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
+#define RAMFS_DEFAULT_MODE	0755
+
 static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
@@ -158,12 +161,75 @@ static const struct inode_operations ramfs_dir_inode_operations = {
 static const struct super_operations ramfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
+	.show_options	= generic_show_options,
+};
+
+struct ramfs_mount_opts {
+	umode_t mode;
+};
+
+enum {
+	Opt_mode,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_mode, "mode=%o"},
+	{Opt_err, NULL}
+};
+
+struct ramfs_fs_info {
+	struct ramfs_mount_opts mount_opts;
 };
 
+static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int token;
+	char *p;
+
+	opts->mode = RAMFS_DEFAULT_MODE;
+
+	while ((p = strsep(&data, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->mode = option & S_IALLUGO;
+			break;
+		default:
+			printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root;
+	struct ramfs_fs_info *fsi;
+	struct inode *inode = NULL;
+	struct dentry *root;
+	int err;
+
+	save_mount_options(sb, data);
+
+	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+	if (!fsi) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	sb->s_fs_info = fsi;
+
+	err = ramfs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		goto fail;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_magic = RAMFS_MAGIC;
 	sb->s_op = &ramfs_ops;
 	sb->s_time_gran = 1;
-	inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
-	if (!inode)
-		return -ENOMEM;
+	inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
+	if (!inode) {
+		err = -ENOMEM;
+		goto fail;
+	}
 
 	root = d_alloc_root(inode);
 	if (!root) {
-		iput(inode);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto fail;
 	}
 	sb->s_root = root;
 	return 0;
+fail:
+	kfree(fsi);
+	iput(inode);
+	return err;
 }
 
 int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
 			    mnt);
 }
 
+static void ramfs_kill_sb(struct super_block *sb)
+{
+	kfree(sb->s_fs_info);
+	kill_litter_super(sb);
+}
+
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
 	.get_sb		= ramfs_get_sb,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973e..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
+static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
+{
+#define HALF_LONG_BITS (BITS_PER_LONG / 2)
+	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
+}
+
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+	struct file *file;
+	ssize_t ret = -EBADF;
+	int fput_needed;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	file = fget_light(fd, &fput_needed);
+	if (file) {
+		ret = -ESPIPE;
+		if (file->f_mode & FMODE_PREAD)
+			ret = vfs_readv(file, vec, vlen, &pos);
+		fput_light(file, fput_needed);
+	}
+
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+	struct file *file;
+	ssize_t ret = -EBADF;
+	int fput_needed;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	file = fget_light(fd, &fput_needed);
+	if (file) {
+		ret = -ESPIPE;
+		if (file->f_mode & FMODE_PWRITE)
+			ret = vfs_writev(file, vec, vlen, &pos);
+		fput_light(file, fput_needed);
+	}
+
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
+	return ret;
+}
+
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 			   size_t count, loff_t max)
 {
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
 config REISERFS_FS
 	tristate "Reiserfs support"
+	select CRC32
 	help
 	  Stores not just filenames but the files themselves in a balanced
 	  tree.  Uses journalling.
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 0eb7ac080484..7c5ab6330dd6 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,10 +7,10 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
 		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
 		 hashes.o tail_conversion.o journal.o resize.o \
-		 item_ops.o ioctl.o procfs.o
+		 item_ops.o ioctl.o procfs.o xattr.o
 
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
-reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o
+reiserfs-objs += xattr_user.o xattr_trusted.o
 endif
 
 ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 90e1670e4e6f..14e8c9d460e5 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -1,4 +1,4 @@
-[LICENSING] 
+[LICENSING]
 
 ReiserFS is hereby licensed under the GNU General
 Public License version 2.
@@ -31,7 +31,7 @@ the GPL as not allowing those additional licensing options, you read
 it wrongly, and Richard Stallman agrees with me, when carefully read
 you can see that those restrictions on additional terms do not apply
 to the owner of the copyright, and my interpretation of this shall
-govern for this license.  
+govern for this license.
 
 Finally, nothing in this license shall be interpreted to allow you to
 fail to fairly credit me, or to remove my credits, without my
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4646caa60455..e716161ab325 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -40,8 +40,8 @@
 
 #define SET_OPTION(optname) \
    do { \
-        reiserfs_warning(s, "reiserfs: option \"%s\" is set", #optname); \
-        set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
+	reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
+	set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
     } while(0)
 #define TEST_OPTION(optname, s) \
     test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
@@ -64,9 +64,9 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
 	unsigned int bmap_count = reiserfs_bmap_count(s);
 
 	if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
-		reiserfs_warning(s,
-				 "vs-4010: is_reusable: block number is out of range %lu (%u)",
-				 block, SB_BLOCK_COUNT(s));
+		reiserfs_error(s, "vs-4010",
+			       "block number is out of range %lu (%u)",
+			       block, SB_BLOCK_COUNT(s));
 		return 0;
 	}
 
@@ -79,31 +79,30 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
 		b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
 		if (block >= bmap1 &&
 		    block <= bmap1 + bmap_count) {
-			reiserfs_warning(s, "vs: 4019: is_reusable: "
-					 "bitmap block %lu(%u) can't be freed or reused",
-					 block, bmap_count);
+			reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
+				       "can't be freed or reused",
+				       block, bmap_count);
 			return 0;
 		}
 	} else {
 		if (offset == 0) {
-			reiserfs_warning(s, "vs: 4020: is_reusable: "
-					 "bitmap block %lu(%u) can't be freed or reused",
-					 block, bmap_count);
+			reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
+				       "can't be freed or reused",
+				       block, bmap_count);
 			return 0;
 		}
 	}
 
 	if (bmap >= bmap_count) {
-		reiserfs_warning(s,
-				 "vs-4030: is_reusable: there is no so many bitmap blocks: "
-				 "block=%lu, bitmap_nr=%u", block, bmap);
+		reiserfs_error(s, "vs-4030", "bitmap for requested block "
+			       "is out of range: block=%lu, bitmap_nr=%u",
+			       block, bmap);
 		return 0;
 	}
 
 	if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
-		reiserfs_warning(s,
-				 "vs-4050: is_reusable: this is root block (%u), "
-				 "it must be busy", SB_ROOT_BLOCK(s));
+		reiserfs_error(s, "vs-4050", "this is root block (%u), "
+			       "it must be busy", SB_ROOT_BLOCK(s));
 		return 0;
 	}
 
@@ -154,8 +153,8 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 /* - I mean `a window of zero bits' as in description of this function - Zam. */
 
 	if (!bi) {
-		reiserfs_warning(s, "NULL bitmap info pointer for bitmap %d",
-				 bmap_n);
+		reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
+			       "for bitmap %d", bmap_n);
 		return 0;
 	}
 
@@ -400,11 +399,8 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 	get_bit_address(s, block, &nr, &offset);
 
 	if (nr >= reiserfs_bmap_count(s)) {
-		reiserfs_warning(s, "vs-4075: reiserfs_free_block: "
-				 "block %lu is out of range on %s "
-				 "(nr=%u,max=%u)", block,
-				 reiserfs_bdevname(s), nr,
-				 reiserfs_bmap_count(s));
+		reiserfs_error(s, "vs-4075", "block %lu is out of range",
+			       block);
 		return;
 	}
 
@@ -416,9 +412,8 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 
 	/* clear bit for the given block in bit map */
 	if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
-		reiserfs_warning(s, "vs-4080: reiserfs_free_block: "
-				 "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
-				 reiserfs_bdevname(s), block);
+		reiserfs_error(s, "vs-4080",
+			       "block %lu: bit already cleared", block);
 	}
 	apbi[nr].free_count++;
 	journal_mark_dirty(th, s, bmbh);
@@ -430,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 
 	journal_mark_dirty(th, s, sbh);
 	if (for_unformatted)
-		DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
+		vfs_dq_free_block_nodirty(inode, 1);
 }
 
 void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -445,7 +440,7 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th,
 		return;
 
 	if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
-		reiserfs_panic(th->t_super, "bitmap-4072",
+		reiserfs_error(th->t_super, "bitmap-4072",
 			       "Trying to free block outside file system "
 			       "boundaries (%lu > %lu)",
 			       block, sb_block_count(REISERFS_SB(s)->s_rs));
@@ -477,9 +472,8 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th,
 	BUG_ON(!th->t_trans_id);
 #ifdef CONFIG_REISERFS_CHECK
 	if (ei->i_prealloc_count < 0)
-		reiserfs_warning(th->t_super,
-				 "zam-4001:%s: inode has negative prealloc blocks count.",
-				 __func__);
+		reiserfs_error(th->t_super, "zam-4001",
+			       "inode has negative prealloc blocks count.");
 #endif
 	while (ei->i_prealloc_count > 0) {
 		reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
@@ -515,9 +509,9 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
 				i_prealloc_list);
 #ifdef CONFIG_REISERFS_CHECK
 		if (!ei->i_prealloc_count) {
-			reiserfs_warning(th->t_super,
-					 "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.",
-					 __func__);
+			reiserfs_error(th->t_super, "zam-4001",
+				       "inode is in prealloc list but has "
+				       "no preallocated blocks.");
 		}
 #endif
 		__discard_prealloc(th, ei);
@@ -631,12 +625,12 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options)
 			continue;
 		}
 
-		reiserfs_warning(s, "zam-4001: %s : unknown option - %s",
-				 __func__, this_char);
+		reiserfs_warning(s, "zam-4001", "unknown option - %s",
+				 this_char);
 		return 1;
 	}
 
-	reiserfs_warning(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
+	reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
 	return 0;
 }
 
@@ -1055,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 			       amount_needed, hint->inode->i_uid);
 #endif
 		quota_ret =
-		    DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
+		    vfs_dq_alloc_block_nodirty(hint->inode, amount_needed);
 		if (quota_ret)	/* Quota exceeded? */
 			return QUOTA_EXCEEDED;
 		if (hint->preallocate && hint->prealloc_size) {
@@ -1064,8 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 				       "reiserquota: allocating (prealloc) %d blocks id=%u",
 				       hint->prealloc_size, hint->inode->i_uid);
 #endif
-			quota_ret =
-			    DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode,
+			quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode,
 							 hint->prealloc_size);
 			if (quota_ret)
 				hint->preallocate = hint->prealloc_size = 0;
@@ -1098,7 +1091,10 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 					       nr_allocated,
 					       hint->inode->i_uid);
 #endif
-				DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);	/* Free not allocated blocks */
+				/* Free not allocated blocks */
+				vfs_dq_free_block_nodirty(hint->inode,
+					amount_needed + hint->prealloc_size -
+					nr_allocated);
 			}
 			while (nr_allocated--)
 				reiserfs_free_block(hint->th, hint->inode,
@@ -1129,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 			       REISERFS_I(hint->inode)->i_prealloc_count,
 			       hint->inode->i_uid);
 #endif
-		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
+		vfs_dq_free_block_nodirty(hint->inode, amount_needed +
 					 hint->prealloc_size - nr_allocated -
 					 REISERFS_I(hint->inode)->
 					 i_prealloc_count);
@@ -1219,7 +1215,9 @@ void reiserfs_cache_bitmap_metadata(struct super_block *sb,
 	unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
 
 	/* The first bit must ALWAYS be 1 */
-	BUG_ON(!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data));
+	if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
+		reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
+			       "corrupted: first bit must be 1", bh->b_blocknr);
 
 	info->free_count = 0;
 
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index e6b03d2020c1..67a80d7e59e2 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -41,10 +41,10 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 
 #define store_ih(where,what) copy_item_head (where, what)
 
-//
-static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
+			   filldir_t filldir, loff_t *pos)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = dentry->d_inode;
 	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
 	INITIALIZE_PATH(path_to_entry);
 	struct buffer_head *bh;
@@ -64,13 +64,9 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/* form key for search the next directory entry using f_pos field of
 	   file structure */
-	make_cpu_key(&pos_key, inode,
-		     (filp->f_pos) ? (filp->f_pos) : DOT_OFFSET, TYPE_DIRENTRY,
-		     3);
+	make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
 	next_pos = cpu_key_k_offset(&pos_key);
 
-	/*  reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos); */
-
 	path_to_entry.reada = PATH_READA;
 	while (1) {
 	      research:
@@ -144,7 +140,7 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				/* Ignore the .reiserfs_priv entry */
 				if (reiserfs_xattrs(inode->i_sb) &&
 				    !old_format_only(inode->i_sb) &&
-				    filp->f_path.dentry == inode->i_sb->s_root &&
+				    dentry == inode->i_sb->s_root &&
 				    REISERFS_SB(inode->i_sb)->priv_root &&
 				    REISERFS_SB(inode->i_sb)->priv_root->d_inode
 				    && deh_objectid(deh) ==
@@ -156,7 +152,7 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				}
 
 				d_off = deh_offset(deh);
-				filp->f_pos = d_off;
+				*pos = d_off;
 				d_ino = deh_objectid(deh);
 				if (d_reclen <= 32) {
 					local_buf = small_buf;
@@ -223,15 +219,21 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	}			/* while */
 
-      end:
-	filp->f_pos = next_pos;
+end:
+	*pos = next_pos;
 	pathrelse(&path_to_entry);
 	reiserfs_check_path(&path_to_entry);
-      out:
+out:
 	reiserfs_write_unlock(inode->i_sb);
 	return ret;
 }
 
+static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+}
+
 /* compose directory item containing "." and ".." entries (entries are
    not aligned to 4 byte boundary) */
 /* the last four params are LE */
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 2f87f5b14630..4beb964a2a3e 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -29,6 +29,43 @@ struct tree_balance *cur_tb = NULL;	/* detects whether more than one
 					   is interrupting do_balance */
 #endif
 
+static inline void buffer_info_init_left(struct tree_balance *tb,
+                                         struct buffer_info *bi)
+{
+	bi->tb          = tb;
+	bi->bi_bh       = tb->L[0];
+	bi->bi_parent   = tb->FL[0];
+	bi->bi_position = get_left_neighbor_position(tb, 0);
+}
+
+static inline void buffer_info_init_right(struct tree_balance *tb,
+                                          struct buffer_info *bi)
+{
+	bi->tb          = tb;
+	bi->bi_bh       = tb->R[0];
+	bi->bi_parent   = tb->FR[0];
+	bi->bi_position = get_right_neighbor_position(tb, 0);
+}
+
+static inline void buffer_info_init_tbS0(struct tree_balance *tb,
+                                         struct buffer_info *bi)
+{
+	bi->tb          = tb;
+	bi->bi_bh        = PATH_PLAST_BUFFER(tb->tb_path);
+	bi->bi_parent   = PATH_H_PPARENT(tb->tb_path, 0);
+	bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
+}
+
+static inline void buffer_info_init_bh(struct tree_balance *tb,
+                                       struct buffer_info *bi,
+                                       struct buffer_head *bh)
+{
+	bi->tb          = tb;
+	bi->bi_bh       = bh;
+	bi->bi_parent   = NULL;
+	bi->bi_position = 0;
+}
+
 inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
 				       struct buffer_head *bh, int flag)
 {
@@ -39,21 +76,21 @@ inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
 #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
 
-/* summary: 
+/* summary:
  if deleting something ( tb->insert_size[0] < 0 )
    return(balance_leaf_when_delete()); (flag d handled here)
  else
    if lnum is larger than 0 we put items into the left node
    if rnum is larger than 0 we put items into the right node
    if snum1 is larger than 0 we put items into the new node s1
-   if snum2 is larger than 0 we put items into the new node s2 
+   if snum2 is larger than 0 we put items into the new node s2
 Note that all *num* count new items being created.
 
 It would be easier to read balance_leaf() if each of these summary
 lines was a separate procedure rather than being inlined.  I think
 that there are many passages here and in balance_leaf_when_delete() in
 which two calls to one procedure can replace two passages, and it
-might save cache space and improve software maintenance costs to do so.  
+might save cache space and improve software maintenance costs to do so.
 
 Vladimir made the perceptive comment that we should offload most of
 the decision making in this function into fix_nodes/check_balance, and
@@ -86,6 +123,7 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 	       "PAP-12010: tree can not be empty");
 
 	ih = B_N_PITEM_HEAD(tbS0, item_pos);
+	buffer_info_init_tbS0(tb, &bi);
 
 	/* Delete or truncate the item */
 
@@ -96,10 +134,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 		       "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
 		       -tb->insert_size[0], ih);
 
-		bi.tb = tb;
-		bi.bi_bh = tbS0;
-		bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-		bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
 		leaf_delete_items(&bi, 0, item_pos, 1, -1);
 
 		if (!item_pos && tb->CFL[0]) {
@@ -121,10 +155,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 		break;
 
 	case M_CUT:{		/* cut item in S[0] */
-			bi.tb = tb;
-			bi.bi_bh = tbS0;
-			bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-			bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
 			if (is_direntry_le_ih(ih)) {
 
 				/* UFS unlink semantics are such that you can only delete one directory entry at a time. */
@@ -153,8 +183,8 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 
 	default:
 		print_cur_tb("12040");
-		reiserfs_panic(tb->tb_sb,
-			       "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)",
+		reiserfs_panic(tb->tb_sb, "PAP-12040",
+			       "unexpected mode: %s(%d)",
 			       (flag ==
 				M_PASTE) ? "PASTE" : ((flag ==
 						       M_INSERT) ? "INSERT" :
@@ -258,15 +288,15 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
     )
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int item_pos = PATH_LAST_POSITION(tb->tb_path);	/*  index into the array of item headers in S[0] 
+	int item_pos = PATH_LAST_POSITION(tb->tb_path);	/*  index into the array of item headers in S[0]
 							   of the affected item */
 	struct buffer_info bi;
 	struct buffer_head *S_new[2];	/* new nodes allocated to hold what could not fit into S */
 	int snum[2];		/* number of items that will be placed
 				   into S_new (includes partially shifted
 				   items) */
-	int sbytes[2];		/* if an item is partially shifted into S_new then 
-				   if it is a directory item 
+	int sbytes[2];		/* if an item is partially shifted into S_new then
+				   if it is a directory item
 				   it is the number of entries from the item that are shifted into S_new
 				   else
 				   it is the number of bytes from the item that are shifted into S_new
@@ -325,11 +355,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 					       ih_item_len(ih));
 
 					/* Insert new item into L[0] */
-					bi.tb = tb;
-					bi.bi_bh = tb->L[0];
-					bi.bi_parent = tb->FL[0];
-					bi.bi_position =
-					    get_left_neighbor_position(tb, 0);
+					buffer_info_init_left(tb, &bi);
 					leaf_insert_into_buf(&bi,
 							     n + item_pos -
 							     ret_val, ih, body,
@@ -369,11 +395,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 					    leaf_shift_left(tb, tb->lnum[0] - 1,
 							    tb->lbytes);
 					/* Insert new item into L[0] */
-					bi.tb = tb;
-					bi.bi_bh = tb->L[0];
-					bi.bi_parent = tb->FL[0];
-					bi.bi_position =
-					    get_left_neighbor_position(tb, 0);
+					buffer_info_init_left(tb, &bi);
 					leaf_insert_into_buf(&bi,
 							     n + item_pos -
 							     ret_val, ih, body,
@@ -429,13 +451,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							}
 
 							/* Append given directory entry to directory item */
-							bi.tb = tb;
-							bi.bi_bh = tb->L[0];
-							bi.bi_parent =
-							    tb->FL[0];
-							bi.bi_position =
-							    get_left_neighbor_position
-							    (tb, 0);
+							buffer_info_init_left(tb, &bi);
 							leaf_paste_in_buffer
 							    (&bi,
 							     n + item_pos -
@@ -449,8 +465,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							/* when we have merge directory item, pos_in_item has been changed too */
 
 							/* paste new directory entry. 1 is entry number */
-							leaf_paste_entries(bi.
-									   bi_bh,
+							leaf_paste_entries(&bi,
 									   n +
 									   item_pos
 									   -
@@ -524,13 +539,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 									     (tbS0,
 									      item_pos)));
 							/* Append to body of item in L[0] */
-							bi.tb = tb;
-							bi.bi_bh = tb->L[0];
-							bi.bi_parent =
-							    tb->FL[0];
-							bi.bi_position =
-							    get_left_neighbor_position
-							    (tb, 0);
+							buffer_info_init_left(tb, &bi);
 							leaf_paste_in_buffer
 							    (&bi,
 							     n + item_pos -
@@ -681,11 +690,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 					    leaf_shift_left(tb, tb->lnum[0],
 							    tb->lbytes);
 					/* Append to body of item in L[0] */
-					bi.tb = tb;
-					bi.bi_bh = tb->L[0];
-					bi.bi_parent = tb->FL[0];
-					bi.bi_position =
-					    get_left_neighbor_position(tb, 0);
+					buffer_info_init_left(tb, &bi);
 					leaf_paste_in_buffer(&bi,
 							     n + item_pos -
 							     ret_val,
@@ -699,7 +704,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							   n + item_pos -
 							   ret_val);
 					if (is_direntry_le_ih(pasted))
-						leaf_paste_entries(bi.bi_bh,
+						leaf_paste_entries(&bi,
 								   n +
 								   item_pos -
 								   ret_val,
@@ -722,8 +727,9 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 				}
 				break;
 			default:	/* cases d and t */
-				reiserfs_panic(tb->tb_sb,
-					       "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)",
+				reiserfs_panic(tb->tb_sb, "PAP-12130",
+					       "lnum > 0: unexpected mode: "
+					       " %s(%d)",
 					       (flag ==
 						M_DELETE) ? "DELETE" : ((flag ==
 									 M_CUT)
@@ -776,11 +782,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 					set_le_ih_k_offset(ih, offset);
 					put_ih_item_len(ih, tb->rbytes);
 					/* Insert part of the item into R[0] */
-					bi.tb = tb;
-					bi.bi_bh = tb->R[0];
-					bi.bi_parent = tb->FR[0];
-					bi.bi_position =
-					    get_right_neighbor_position(tb, 0);
+					buffer_info_init_right(tb, &bi);
 					if ((old_len - tb->rbytes) > zeros_num) {
 						r_zeros_number = 0;
 						r_body =
@@ -817,11 +819,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							     tb->rnum[0] - 1,
 							     tb->rbytes);
 					/* Insert new item into R[0] */
-					bi.tb = tb;
-					bi.bi_bh = tb->R[0];
-					bi.bi_parent = tb->FR[0];
-					bi.bi_position =
-					    get_right_neighbor_position(tb, 0);
+					buffer_info_init_right(tb, &bi);
 					leaf_insert_into_buf(&bi,
 							     item_pos - n +
 							     tb->rnum[0] - 1,
@@ -881,21 +879,14 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							    pos_in_item -
 							    entry_count +
 							    tb->rbytes - 1;
-							bi.tb = tb;
-							bi.bi_bh = tb->R[0];
-							bi.bi_parent =
-							    tb->FR[0];
-							bi.bi_position =
-							    get_right_neighbor_position
-							    (tb, 0);
+							buffer_info_init_right(tb, &bi);
 							leaf_paste_in_buffer
 							    (&bi, 0,
 							     paste_entry_position,
 							     tb->insert_size[0],
 							     body, zeros_num);
 							/* paste entry */
-							leaf_paste_entries(bi.
-									   bi_bh,
+							leaf_paste_entries(&bi,
 									   0,
 									   paste_entry_position,
 									   1,
@@ -1019,12 +1010,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 						    (tb, tb->CFR[0], 0);
 
 						/* Append part of body into R[0] */
-						bi.tb = tb;
-						bi.bi_bh = tb->R[0];
-						bi.bi_parent = tb->FR[0];
-						bi.bi_position =
-						    get_right_neighbor_position
-						    (tb, 0);
+						buffer_info_init_right(tb, &bi);
 						if (n_rem > zeros_num) {
 							r_zeros_number = 0;
 							r_body =
@@ -1071,12 +1057,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							     tb->rbytes);
 					/* append item in R[0] */
 					if (pos_in_item >= 0) {
-						bi.tb = tb;
-						bi.bi_bh = tb->R[0];
-						bi.bi_parent = tb->FR[0];
-						bi.bi_position =
-						    get_right_neighbor_position
-						    (tb, 0);
+						buffer_info_init_right(tb, &bi);
 						leaf_paste_in_buffer(&bi,
 								     item_pos -
 								     n +
@@ -1096,7 +1077,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							   tb->rnum[0]);
 					if (is_direntry_le_ih(pasted)
 					    && pos_in_item >= 0) {
-						leaf_paste_entries(bi.bi_bh,
+						leaf_paste_entries(&bi,
 								   item_pos -
 								   n +
 								   tb->rnum[0],
@@ -1136,8 +1117,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 			}
 			break;
 		default:	/* cases d and t */
-			reiserfs_panic(tb->tb_sb,
-				       "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)",
+			reiserfs_panic(tb->tb_sb, "PAP-12175",
+				       "rnum > 0: unexpected mode: %s(%d)",
 				       (flag ==
 					M_DELETE) ? "DELETE" : ((flag ==
 								 M_CUT) ? "CUT"
@@ -1167,8 +1148,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 		   not set correctly */
 		if (tb->CFL[0]) {
 			if (!tb->CFR[0])
-				reiserfs_panic(tb->tb_sb,
-					       "vs-12195: balance_leaf: CFR not initialized");
+				reiserfs_panic(tb->tb_sb, "vs-12195",
+					       "CFR not initialized");
 			copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
 				 B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]));
 			do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
@@ -1232,10 +1213,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 					put_ih_item_len(ih, sbytes[i]);
 
 					/* Insert part of the item into S_new[i] before 0-th item */
-					bi.tb = tb;
-					bi.bi_bh = S_new[i];
-					bi.bi_parent = NULL;
-					bi.bi_position = 0;
+					buffer_info_init_bh(tb, &bi, S_new[i]);
 
 					if ((old_len - sbytes[i]) > zeros_num) {
 						r_zeros_number = 0;
@@ -1267,10 +1245,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							S_new[i]);
 
 					/* Insert new item into S_new[i] */
-					bi.tb = tb;
-					bi.bi_bh = S_new[i];
-					bi.bi_parent = NULL;
-					bi.bi_position = 0;
+					buffer_info_init_bh(tb, &bi, S_new[i]);
 					leaf_insert_into_buf(&bi,
 							     item_pos - n +
 							     snum[i] - 1, ih,
@@ -1327,10 +1302,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							     sbytes[i] - 1,
 							     S_new[i]);
 							/* Paste given directory entry to directory item */
-							bi.tb = tb;
-							bi.bi_bh = S_new[i];
-							bi.bi_parent = NULL;
-							bi.bi_position = 0;
+							buffer_info_init_bh(tb, &bi, S_new[i]);
 							leaf_paste_in_buffer
 							    (&bi, 0,
 							     pos_in_item -
@@ -1339,8 +1311,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							     tb->insert_size[0],
 							     body, zeros_num);
 							/* paste new directory entry */
-							leaf_paste_entries(bi.
-									   bi_bh,
+							leaf_paste_entries(&bi,
 									   0,
 									   pos_in_item
 									   -
@@ -1401,11 +1372,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 						if (n_rem < 0)
 							n_rem = 0;
 						/* Append part of body into S_new[0] */
-						bi.tb = tb;
-						bi.bi_bh = S_new[i];
-						bi.bi_parent = NULL;
-						bi.bi_position = 0;
-
+						buffer_info_init_bh(tb, &bi, S_new[i]);
 						if (n_rem > zeros_num) {
 							r_zeros_number = 0;
 							r_body =
@@ -1475,7 +1442,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 					    && (pos_in_item != ih_item_len(ih_check)
 						|| tb->insert_size[0] <= 0))
 						reiserfs_panic(tb->tb_sb,
-							       "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len");
+							     "PAP-12235",
+							     "pos_in_item "
+							     "must be equal "
+							     "to ih_item_len");
 #endif				/* CONFIG_REISERFS_CHECK */
 
 					leaf_mi =
@@ -1489,10 +1459,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 					       leaf_mi);
 
 					/* paste into item */
-					bi.tb = tb;
-					bi.bi_bh = S_new[i];
-					bi.bi_parent = NULL;
-					bi.bi_position = 0;
+					buffer_info_init_bh(tb, &bi, S_new[i]);
 					leaf_paste_in_buffer(&bi,
 							     item_pos - n +
 							     snum[i],
@@ -1505,7 +1472,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							   item_pos - n +
 							   snum[i]);
 					if (is_direntry_le_ih(pasted)) {
-						leaf_paste_entries(bi.bi_bh,
+						leaf_paste_entries(&bi,
 								   item_pos -
 								   n + snum[i],
 								   pos_in_item,
@@ -1535,8 +1502,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 			}
 			break;
 		default:	/* cases d and t */
-			reiserfs_panic(tb->tb_sb,
-				       "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)",
+			reiserfs_panic(tb->tb_sb, "PAP-12245",
+				       "blknum > 2: unexpected mode: %s(%d)",
 				       (flag ==
 					M_DELETE) ? "DELETE" : ((flag ==
 								 M_CUT) ? "CUT"
@@ -1559,10 +1526,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 
 		switch (flag) {
 		case M_INSERT:	/* insert item into S[0] */
-			bi.tb = tb;
-			bi.bi_bh = tbS0;
-			bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-			bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
+			buffer_info_init_tbS0(tb, &bi);
 			leaf_insert_into_buf(&bi, item_pos, ih, body,
 					     zeros_num);
 
@@ -1589,14 +1553,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 						       "PAP-12260: insert_size is 0 already");
 
 						/* prepare space */
-						bi.tb = tb;
-						bi.bi_bh = tbS0;
-						bi.bi_parent =
-						    PATH_H_PPARENT(tb->tb_path,
-								   0);
-						bi.bi_position =
-						    PATH_H_POSITION(tb->tb_path,
-								    1);
+						buffer_info_init_tbS0(tb, &bi);
 						leaf_paste_in_buffer(&bi,
 								     item_pos,
 								     pos_in_item,
@@ -1606,7 +1563,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 								     zeros_num);
 
 						/* paste entry */
-						leaf_paste_entries(bi.bi_bh,
+						leaf_paste_entries(&bi,
 								   item_pos,
 								   pos_in_item,
 								   1,
@@ -1644,14 +1601,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 						RFALSE(tb->insert_size[0] <= 0,
 						       "PAP-12275: insert size must not be %d",
 						       tb->insert_size[0]);
-						bi.tb = tb;
-						bi.bi_bh = tbS0;
-						bi.bi_parent =
-						    PATH_H_PPARENT(tb->tb_path,
-								   0);
-						bi.bi_position =
-						    PATH_H_POSITION(tb->tb_path,
-								    1);
+						buffer_info_init_tbS0(tb, &bi);
 						leaf_paste_in_buffer(&bi,
 								     item_pos,
 								     pos_in_item,
@@ -1681,10 +1631,11 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 							print_cur_tb("12285");
 							reiserfs_panic(tb->
 								       tb_sb,
-								       "PAP-12285: balance_leaf: insert_size must be 0 (%d)",
-								       tb->
-								       insert_size
-								       [0]);
+							    "PAP-12285",
+							    "insert_size "
+							    "must be 0 "
+							    "(%d)",
+							    tb->insert_size[0]);
 						}
 					}
 #endif				/* CONFIG_REISERFS_CHECK */
@@ -1697,11 +1648,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 	if (flag == M_PASTE && tb->insert_size[0]) {
 		print_cur_tb("12290");
 		reiserfs_panic(tb->tb_sb,
-			       "PAP-12290: balance_leaf: insert_size is still not 0 (%d)",
+			       "PAP-12290", "insert_size is still not 0 (%d)",
 			       tb->insert_size[0]);
 	}
 #endif				/* CONFIG_REISERFS_CHECK */
-
 	return 0;
 }				/* Leaf level of the tree is balanced (end of balance_leaf) */
 
@@ -1724,7 +1674,6 @@ void make_empty_node(struct buffer_info *bi)
 struct buffer_head *get_FEB(struct tree_balance *tb)
 {
 	int i;
-	struct buffer_head *first_b;
 	struct buffer_info bi;
 
 	for (i = 0; i < MAX_FEB_SIZE; i++)
@@ -1732,19 +1681,15 @@ struct buffer_head *get_FEB(struct tree_balance *tb)
 			break;
 
 	if (i == MAX_FEB_SIZE)
-		reiserfs_panic(tb->tb_sb,
-			       "vs-12300: get_FEB: FEB list is empty");
+		reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
 
-	bi.tb = tb;
-	bi.bi_bh = first_b = tb->FEB[i];
-	bi.bi_parent = NULL;
-	bi.bi_position = 0;
+	buffer_info_init_bh(tb, &bi, tb->FEB[i]);
 	make_empty_node(&bi);
-	set_buffer_uptodate(first_b);
+	set_buffer_uptodate(tb->FEB[i]);
+	tb->used[i] = tb->FEB[i];
 	tb->FEB[i] = NULL;
-	tb->used[i] = first_b;
 
-	return (first_b);
+	return tb->used[i];
 }
 
 /* This is now used because reiserfs_free_block has to be able to
@@ -1755,15 +1700,16 @@ static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
 	int i;
 
 	if (buffer_dirty(bh))
-		reiserfs_warning(tb->tb_sb,
-				 "store_thrown deals with dirty buffer");
+		reiserfs_warning(tb->tb_sb, "reiserfs-12320",
+				 "called with dirty buffer");
 	for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
 		if (!tb->thrown[i]) {
 			tb->thrown[i] = bh;
 			get_bh(bh);	/* free_thrown puts this */
 			return;
 		}
-	reiserfs_warning(tb->tb_sb, "store_thrown: too many thrown buffers");
+	reiserfs_warning(tb->tb_sb, "reiserfs-12321",
+			 "too many thrown buffers");
 }
 
 static void free_thrown(struct tree_balance *tb)
@@ -1774,8 +1720,8 @@ static void free_thrown(struct tree_balance *tb)
 		if (tb->thrown[i]) {
 			blocknr = tb->thrown[i]->b_blocknr;
 			if (buffer_dirty(tb->thrown[i]))
-				reiserfs_warning(tb->tb_sb,
-						 "free_thrown deals with dirty buffer %d",
+				reiserfs_warning(tb->tb_sb, "reiserfs-12322",
+						 "called with dirty buffer %d",
 						 blocknr);
 			brelse(tb->thrown[i]);	/* incremented in store_thrown */
 			reiserfs_free_block(tb->transaction_handle, NULL,
@@ -1873,20 +1819,19 @@ static void check_internal_node(struct super_block *s, struct buffer_head *bh,
 	for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
 		if (!is_reusable(s, dc_block_number(dc), 1)) {
 			print_cur_tb(mes);
-			reiserfs_panic(s,
-				       "PAP-12338: check_internal_node: invalid child pointer %y in %b",
+			reiserfs_panic(s, "PAP-12338",
+				       "invalid child pointer %y in %b",
 				       dc, bh);
 		}
 	}
 }
 
-static int locked_or_not_in_tree(struct buffer_head *bh, char *which)
+static int locked_or_not_in_tree(struct tree_balance *tb,
+				  struct buffer_head *bh, char *which)
 {
 	if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
 	    !B_IS_IN_TREE(bh)) {
-		reiserfs_warning(NULL,
-				 "vs-12339: locked_or_not_in_tree: %s (%b)",
-				 which, bh);
+		reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
 		return 1;
 	}
 	return 0;
@@ -1897,26 +1842,28 @@ static int check_before_balancing(struct tree_balance *tb)
 	int retval = 0;
 
 	if (cur_tb) {
-		reiserfs_panic(tb->tb_sb, "vs-12335: check_before_balancing: "
-			       "suspect that schedule occurred based on cur_tb not being null at this point in code. "
-			       "do_balance cannot properly handle schedule occurring while it runs.");
+		reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
+			       "occurred based on cur_tb not being null at "
+			       "this point in code. do_balance cannot properly "
+			       "handle schedule occurring while it runs.");
 	}
 
 	/* double check that buffers that we will modify are unlocked. (fix_nodes should already have
 	   prepped all of these for us). */
 	if (tb->lnum[0]) {
-		retval |= locked_or_not_in_tree(tb->L[0], "L[0]");
-		retval |= locked_or_not_in_tree(tb->FL[0], "FL[0]");
-		retval |= locked_or_not_in_tree(tb->CFL[0], "CFL[0]");
+		retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
+		retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
+		retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
 		check_leaf(tb->L[0]);
 	}
 	if (tb->rnum[0]) {
-		retval |= locked_or_not_in_tree(tb->R[0], "R[0]");
-		retval |= locked_or_not_in_tree(tb->FR[0], "FR[0]");
-		retval |= locked_or_not_in_tree(tb->CFR[0], "CFR[0]");
+		retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
+		retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
+		retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
 		check_leaf(tb->R[0]);
 	}
-	retval |= locked_or_not_in_tree(PATH_PLAST_BUFFER(tb->tb_path), "S[0]");
+	retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
+					"S[0]");
 	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
 
 	return retval;
@@ -1930,8 +1877,8 @@ static void check_after_balance_leaf(struct tree_balance *tb)
 		    dc_size(B_N_CHILD
 			    (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
 			print_cur_tb("12221");
-			reiserfs_panic(tb->tb_sb,
-				       "PAP-12355: check_after_balance_leaf: shift to left was incorrect");
+			reiserfs_panic(tb->tb_sb, "PAP-12355",
+				       "shift to left was incorrect");
 		}
 	}
 	if (tb->rnum[0]) {
@@ -1940,8 +1887,8 @@ static void check_after_balance_leaf(struct tree_balance *tb)
 		    dc_size(B_N_CHILD
 			    (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
 			print_cur_tb("12222");
-			reiserfs_panic(tb->tb_sb,
-				       "PAP-12360: check_after_balance_leaf: shift to right was incorrect");
+			reiserfs_panic(tb->tb_sb, "PAP-12360",
+				       "shift to right was incorrect");
 		}
 	}
 	if (PATH_H_PBUFFER(tb->tb_path, 1) &&
@@ -1955,7 +1902,7 @@ static void check_after_balance_leaf(struct tree_balance *tb)
 					       PATH_H_POSITION(tb->tb_path,
 							       1))));
 		print_cur_tb("12223");
-		reiserfs_warning(tb->tb_sb,
+		reiserfs_warning(tb->tb_sb, "reiserfs-12363",
 				 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
 				 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
 				 left,
@@ -1966,8 +1913,7 @@ static void check_after_balance_leaf(struct tree_balance *tb)
 					 (PATH_H_PBUFFER(tb->tb_path, 1),
 					  PATH_H_POSITION(tb->tb_path, 1))),
 				 right);
-		reiserfs_panic(tb->tb_sb,
-			       "PAP-12365: check_after_balance_leaf: S is incorrect");
+		reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
 	}
 }
 
@@ -2037,7 +1983,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
 	/* store_print_tb (tb); */
 
 	/* do not delete, just comment it out */
-/*    print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, 
+/*    print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb,
 	     "check");*/
 	RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
 #ifdef CONFIG_REISERFS_CHECK
@@ -2102,14 +2048,13 @@ void do_balance(struct tree_balance *tb,	/* tree_balance structure */
 	tb->need_balance_dirty = 0;
 
 	if (FILESYSTEM_CHANGED_TB(tb)) {
-		reiserfs_panic(tb->tb_sb,
-			       "clm-6000: do_balance, fs generation has changed\n");
+		reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
+			       "changed");
 	}
 	/* if we have no real work to do  */
 	if (!tb->insert_size[0]) {
-		reiserfs_warning(tb->tb_sb,
-				 "PAP-12350: do_balance: insert_size == 0, mode == %c",
-				 flag);
+		reiserfs_warning(tb->tb_sb, "PAP-12350",
+				 "insert_size == 0, mode == %c", flag);
 		unfix_nodes(tb);
 		return;
 	}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 33408417038c..9f436668b7f8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -20,14 +20,14 @@
 ** insertion/balancing, for files that are written in one write.
 ** It avoids unnecessary tail packings (balances) for files that are written in
 ** multiple writes and are small enough to have tails.
-** 
+**
 ** file_release is called by the VFS layer when the file is closed.  If
 ** this is the last open file descriptor, and the file
 ** small enough to have a tail, and the tail is currently in an
 ** unformatted node, the tail is converted back into a direct item.
-** 
+**
 ** We use reiserfs_truncate_file to pack the tail, since it already has
-** all the conditions coded.  
+** all the conditions coded.
 */
 static int reiserfs_file_release(struct inode *inode, struct file *filp)
 {
@@ -76,7 +76,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 			 * and let the admin know what is going on.
 			 */
 			igrab(inode);
-			reiserfs_warning(inode->i_sb,
+			reiserfs_warning(inode->i_sb, "clm-9001",
 					 "pinning inode %lu because the "
 					 "preallocation can't be freed",
 					 inode->i_ino);
@@ -134,23 +134,23 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
  * be removed...
  */
 
-static int reiserfs_sync_file(struct file *p_s_filp,
-			      struct dentry *p_s_dentry, int datasync)
+static int reiserfs_sync_file(struct file *filp,
+			      struct dentry *dentry, int datasync)
 {
-	struct inode *p_s_inode = p_s_dentry->d_inode;
-	int n_err;
+	struct inode *inode = dentry->d_inode;
+	int err;
 	int barrier_done;
 
-	BUG_ON(!S_ISREG(p_s_inode->i_mode));
-	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
-	reiserfs_write_lock(p_s_inode->i_sb);
-	barrier_done = reiserfs_commit_for_inode(p_s_inode);
-	reiserfs_write_unlock(p_s_inode->i_sb);
-	if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb))
-		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
+	BUG_ON(!S_ISREG(inode->i_mode));
+	err = sync_mapping_buffers(inode->i_mapping);
+	reiserfs_write_lock(inode->i_sb);
+	barrier_done = reiserfs_commit_for_inode(inode);
+	reiserfs_write_unlock(inode->i_sb);
+	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	if (barrier_done < 0)
 		return barrier_done;
-	return (n_err < 0) ? -EIO : 0;
+	return (err < 0) ? -EIO : 0;
 }
 
 /* taken fs/buffer.c:__block_commit_write */
@@ -223,7 +223,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
 }
 
 /* Write @count bytes at position @ppos in a file indicated by @file
-   from the buffer @buf.  
+   from the buffer @buf.
 
    generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
    something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 07d05e0842b7..5e5a4e6fbaf8 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -30,8 +30,8 @@
  ** get_direct_parent
  ** get_neighbors
  ** fix_nodes
- ** 
- ** 
+ **
+ **
  **/
 
 #include <linux/time.h>
@@ -135,8 +135,7 @@ static void create_virtual_node(struct tree_balance *tb, int h)
 		vn->vn_free_ptr +=
 		    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
 		if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
-			reiserfs_panic(tb->tb_sb,
-				       "vs-8030: create_virtual_node: "
+			reiserfs_panic(tb->tb_sb, "vs-8030",
 				       "virtual node space consumed");
 
 		if (!is_affected)
@@ -186,8 +185,9 @@ static void create_virtual_node(struct tree_balance *tb, int h)
 			     && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
 				/* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
 				print_block(Sh, 0, -1, -1);
-				reiserfs_panic(tb->tb_sb,
-					       "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
+				reiserfs_panic(tb->tb_sb, "vs-8045",
+					       "rdkey %k, affected item==%d "
+					       "(mode==%c) Must be %c",
 					       key, vn->vn_affected_item_num,
 					       vn->vn_mode, M_DELETE);
 			}
@@ -377,9 +377,9 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
 	int needed_nodes;
 	int start_item,		/* position of item we start filling node from */
 	 end_item,		/* position of item we finish filling node by */
-	 start_bytes,		/* number of first bytes (entries for directory) of start_item-th item 
+	 start_bytes,		/* number of first bytes (entries for directory) of start_item-th item
 				   we do not include into node that is being filled */
-	 end_bytes;		/* number of last bytes (entries for directory) of end_item-th item 
+	 end_bytes;		/* number of last bytes (entries for directory) of end_item-th item
 				   we do node include into node that is being filled */
 	int split_item_positions[2];	/* these are positions in virtual item of
 					   items, that are split between S[0] and
@@ -496,8 +496,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
 		snum012[needed_nodes - 1 + 3] = units;
 
 		if (needed_nodes > 2)
-			reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: "
-					 "split_item_position is out of boundary");
+			reiserfs_warning(tb->tb_sb, "vs-8111",
+					 "split_item_position is out of range");
 		snum012[needed_nodes - 1]++;
 		split_item_positions[needed_nodes - 1] = i;
 		needed_nodes++;
@@ -533,8 +533,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
 
 		if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
 		    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
-			reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not "
-					 "directory or indirect item");
+			reiserfs_warning(tb->tb_sb, "vs-8115",
+					 "not directory or indirect item");
 	}
 
 	/* now we know S2bytes, calculate S1bytes */
@@ -569,7 +569,7 @@ extern struct tree_balance *cur_tb;
 
 /* Set parameters for balancing.
  * Performs write of results of analysis of balancing into structure tb,
- * where it will later be used by the functions that actually do the balancing. 
+ * where it will later be used by the functions that actually do the balancing.
  * Parameters:
  *	tb	tree_balance structure;
  *	h	current level of the node;
@@ -749,25 +749,26 @@ else \
 		  -1, -1);\
 }
 
-static void free_buffers_in_tb(struct tree_balance *p_s_tb)
+static void free_buffers_in_tb(struct tree_balance *tb)
 {
-	int n_counter;
-
-	decrement_counters_in_path(p_s_tb->tb_path);
-
-	for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) {
-		decrement_bcount(p_s_tb->L[n_counter]);
-		p_s_tb->L[n_counter] = NULL;
-		decrement_bcount(p_s_tb->R[n_counter]);
-		p_s_tb->R[n_counter] = NULL;
-		decrement_bcount(p_s_tb->FL[n_counter]);
-		p_s_tb->FL[n_counter] = NULL;
-		decrement_bcount(p_s_tb->FR[n_counter]);
-		p_s_tb->FR[n_counter] = NULL;
-		decrement_bcount(p_s_tb->CFL[n_counter]);
-		p_s_tb->CFL[n_counter] = NULL;
-		decrement_bcount(p_s_tb->CFR[n_counter]);
-		p_s_tb->CFR[n_counter] = NULL;
+	int i;
+
+	pathrelse(tb->tb_path);
+
+	for (i = 0; i < MAX_HEIGHT; i++) {
+		brelse(tb->L[i]);
+		brelse(tb->R[i]);
+		brelse(tb->FL[i]);
+		brelse(tb->FR[i]);
+		brelse(tb->CFL[i]);
+		brelse(tb->CFR[i]);
+
+		tb->L[i] = NULL;
+		tb->R[i] = NULL;
+		tb->FL[i] = NULL;
+		tb->FR[i] = NULL;
+		tb->CFL[i] = NULL;
+		tb->CFR[i] = NULL;
 	}
 }
 
@@ -777,14 +778,14 @@ static void free_buffers_in_tb(struct tree_balance *p_s_tb)
  *	        NO_DISK_SPACE - no disk space.
  */
 /* The function is NOT SCHEDULE-SAFE! */
-static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
+static int get_empty_nodes(struct tree_balance *tb, int h)
 {
-	struct buffer_head *p_s_new_bh,
-	    *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h);
-	b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
-	int n_counter, n_number_of_freeblk, n_amount_needed,	/* number of needed empty blocks */
-	 n_retval = CARRY_ON;
-	struct super_block *p_s_sb = p_s_tb->tb_sb;
+	struct buffer_head *new_bh,
+	    *Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
+	int counter, number_of_freeblk, amount_needed,	/* number of needed empty blocks */
+	 retval = CARRY_ON;
+	struct super_block *sb = tb->tb_sb;
 
 	/* number_of_freeblk is the number of empty blocks which have been
 	   acquired for use by the balancing algorithm minus the number of
@@ -792,7 +793,7 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
 	   number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
 	   after empty blocks are acquired, and the balancing analysis is
 	   then restarted, amount_needed is the number needed by this level
-	   (n_h) of the balancing analysis.
+	   (h) of the balancing analysis.
 
 	   Note that for systems with many processes writing, it would be
 	   more layout optimal to calculate the total number needed by all
@@ -800,54 +801,54 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
 
 	/* Initiate number_of_freeblk to the amount acquired prior to the restart of
 	   the analysis or 0 if not restarted, then subtract the amount needed
-	   by all of the levels of the tree below n_h. */
-	/* blknum includes S[n_h], so we subtract 1 in this calculation */
-	for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum;
-	     n_counter < n_h; n_counter++)
-		n_number_of_freeblk -=
-		    (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] -
+	   by all of the levels of the tree below h. */
+	/* blknum includes S[h], so we subtract 1 in this calculation */
+	for (counter = 0, number_of_freeblk = tb->cur_blknum;
+	     counter < h; counter++)
+		number_of_freeblk -=
+		    (tb->blknum[counter]) ? (tb->blknum[counter] -
 						   1) : 0;
 
 	/* Allocate missing empty blocks. */
-	/* if p_s_Sh == 0  then we are getting a new root */
-	n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1;
+	/* if Sh == 0  then we are getting a new root */
+	amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
 	/*  Amount_needed = the amount that we need more than the amount that we have. */
-	if (n_amount_needed > n_number_of_freeblk)
-		n_amount_needed -= n_number_of_freeblk;
+	if (amount_needed > number_of_freeblk)
+		amount_needed -= number_of_freeblk;
 	else			/* If we have enough already then there is nothing to do. */
 		return CARRY_ON;
 
 	/* No need to check quota - is not allocated for blocks used for formatted nodes */
-	if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs,
-				       n_amount_needed) == NO_DISK_SPACE)
+	if (reiserfs_new_form_blocknrs(tb, blocknrs,
+				       amount_needed) == NO_DISK_SPACE)
 		return NO_DISK_SPACE;
 
 	/* for each blocknumber we just got, get a buffer and stick it on FEB */
-	for (p_n_blocknr = a_n_blocknrs, n_counter = 0;
-	     n_counter < n_amount_needed; p_n_blocknr++, n_counter++) {
+	for (blocknr = blocknrs, counter = 0;
+	     counter < amount_needed; blocknr++, counter++) {
 
-		RFALSE(!*p_n_blocknr,
+		RFALSE(!*blocknr,
 		       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
 
-		p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
-		RFALSE(buffer_dirty(p_s_new_bh) ||
-		       buffer_journaled(p_s_new_bh) ||
-		       buffer_journal_dirty(p_s_new_bh),
+		new_bh = sb_getblk(sb, *blocknr);
+		RFALSE(buffer_dirty(new_bh) ||
+		       buffer_journaled(new_bh) ||
+		       buffer_journal_dirty(new_bh),
 		       "PAP-8140: journlaled or dirty buffer %b for the new block",
-		       p_s_new_bh);
+		       new_bh);
 
 		/* Put empty buffers into the array. */
-		RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum],
+		RFALSE(tb->FEB[tb->cur_blknum],
 		       "PAP-8141: busy slot for new buffer");
 
-		set_buffer_journal_new(p_s_new_bh);
-		p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
+		set_buffer_journal_new(new_bh);
+		tb->FEB[tb->cur_blknum++] = new_bh;
 	}
 
-	if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb))
-		n_retval = REPEAT_SEARCH;
+	if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
+		retval = REPEAT_SEARCH;
 
-	return n_retval;
+	return retval;
 }
 
 /* Get free space of the left neighbor, which is stored in the parent
@@ -895,35 +896,36 @@ static int get_rfree(struct tree_balance *tb, int h)
 }
 
 /* Check whether left neighbor is in memory. */
-static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
+static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
 {
-	struct buffer_head *p_s_father, *left;
-	struct super_block *p_s_sb = p_s_tb->tb_sb;
-	b_blocknr_t n_left_neighbor_blocknr;
-	int n_left_neighbor_position;
+	struct buffer_head *father, *left;
+	struct super_block *sb = tb->tb_sb;
+	b_blocknr_t left_neighbor_blocknr;
+	int left_neighbor_position;
 
-	if (!p_s_tb->FL[n_h])	/* Father of the left neighbor does not exist. */
+	/* Father of the left neighbor does not exist. */
+	if (!tb->FL[h])
 		return 0;
 
 	/* Calculate father of the node to be balanced. */
-	p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
+	father = PATH_H_PBUFFER(tb->tb_path, h + 1);
 
-	RFALSE(!p_s_father ||
-	       !B_IS_IN_TREE(p_s_father) ||
-	       !B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
-	       !buffer_uptodate(p_s_father) ||
-	       !buffer_uptodate(p_s_tb->FL[n_h]),
+	RFALSE(!father ||
+	       !B_IS_IN_TREE(father) ||
+	       !B_IS_IN_TREE(tb->FL[h]) ||
+	       !buffer_uptodate(father) ||
+	       !buffer_uptodate(tb->FL[h]),
 	       "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
-	       p_s_father, p_s_tb->FL[n_h]);
+	       father, tb->FL[h]);
 
 	/* Get position of the pointer to the left neighbor into the left father. */
-	n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ?
-	    p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]);
+	left_neighbor_position = (father == tb->FL[h]) ?
+	    tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
 	/* Get left neighbor block number. */
-	n_left_neighbor_blocknr =
-	    B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
+	left_neighbor_blocknr =
+	    B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
 	/* Look for the left neighbor in the cache. */
-	if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) {
+	if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
 
 		RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
 		       "vs-8170: left neighbor (%b %z) is not in the tree",
@@ -938,10 +940,10 @@ static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
 #define LEFT_PARENTS  'l'
 #define RIGHT_PARENTS 'r'
 
-static void decrement_key(struct cpu_key *p_s_key)
+static void decrement_key(struct cpu_key *key)
 {
 	// call item specific function for this key
-	item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key);
+	item_ops[cpu_key_k_type(key)]->decrement_key(key);
 }
 
 /* Calculate far left/right parent of the left/right neighbor of the current node, that
@@ -952,77 +954,77 @@ static void decrement_key(struct cpu_key *p_s_key)
  		SCHEDULE_OCCURRED - schedule occurred while the function worked;
  *	        CARRY_ON         - schedule didn't occur while the function worked;
  */
-static int get_far_parent(struct tree_balance *p_s_tb,
-			  int n_h,
-			  struct buffer_head **pp_s_father,
-			  struct buffer_head **pp_s_com_father, char c_lr_par)
+static int get_far_parent(struct tree_balance *tb,
+			  int h,
+			  struct buffer_head **pfather,
+			  struct buffer_head **pcom_father, char c_lr_par)
 {
-	struct buffer_head *p_s_parent;
+	struct buffer_head *parent;
 	INITIALIZE_PATH(s_path_to_neighbor_father);
-	struct treepath *p_s_path = p_s_tb->tb_path;
+	struct treepath *path = tb->tb_path;
 	struct cpu_key s_lr_father_key;
-	int n_counter,
-	    n_position = INT_MAX,
-	    n_first_last_position = 0,
-	    n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
+	int counter,
+	    position = INT_MAX,
+	    first_last_position = 0,
+	    path_offset = PATH_H_PATH_OFFSET(path, h);
 
-	/* Starting from F[n_h] go upwards in the tree, and look for the common
-	   ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
+	/* Starting from F[h] go upwards in the tree, and look for the common
+	   ancestor of F[h], and its neighbor l/r, that should be obtained. */
 
-	n_counter = n_path_offset;
+	counter = path_offset;
 
-	RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET,
+	RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
 	       "PAP-8180: invalid path length");
 
-	for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) {
+	for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
 		/* Check whether parent of the current buffer in the path is really parent in the tree. */
 		if (!B_IS_IN_TREE
-		    (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)))
+		    (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
 			return REPEAT_SEARCH;
 		/* Check whether position in the parent is correct. */
-		if ((n_position =
-		     PATH_OFFSET_POSITION(p_s_path,
-					  n_counter - 1)) >
-		    B_NR_ITEMS(p_s_parent))
+		if ((position =
+		     PATH_OFFSET_POSITION(path,
+					  counter - 1)) >
+		    B_NR_ITEMS(parent))
 			return REPEAT_SEARCH;
 		/* Check whether parent at the path really points to the child. */
-		if (B_N_CHILD_NUM(p_s_parent, n_position) !=
-		    PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr)
+		if (B_N_CHILD_NUM(parent, position) !=
+		    PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
 			return REPEAT_SEARCH;
 		/* Return delimiting key if position in the parent is not equal to first/last one. */
 		if (c_lr_par == RIGHT_PARENTS)
-			n_first_last_position = B_NR_ITEMS(p_s_parent);
-		if (n_position != n_first_last_position) {
-			*pp_s_com_father = p_s_parent;
-			get_bh(*pp_s_com_father);
-			/*(*pp_s_com_father = p_s_parent)->b_count++; */
+			first_last_position = B_NR_ITEMS(parent);
+		if (position != first_last_position) {
+			*pcom_father = parent;
+			get_bh(*pcom_father);
+			/*(*pcom_father = parent)->b_count++; */
 			break;
 		}
 	}
 
 	/* if we are in the root of the tree, then there is no common father */
-	if (n_counter == FIRST_PATH_ELEMENT_OFFSET) {
+	if (counter == FIRST_PATH_ELEMENT_OFFSET) {
 		/* Check whether first buffer in the path is the root of the tree. */
 		if (PATH_OFFSET_PBUFFER
-		    (p_s_tb->tb_path,
+		    (tb->tb_path,
 		     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
-		    SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
-			*pp_s_father = *pp_s_com_father = NULL;
+		    SB_ROOT_BLOCK(tb->tb_sb)) {
+			*pfather = *pcom_father = NULL;
 			return CARRY_ON;
 		}
 		return REPEAT_SEARCH;
 	}
 
-	RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
+	RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
 	       "PAP-8185: (%b %z) level too small",
-	       *pp_s_com_father, *pp_s_com_father);
+	       *pcom_father, *pcom_father);
 
 	/* Check whether the common parent is locked. */
 
-	if (buffer_locked(*pp_s_com_father)) {
-		__wait_on_buffer(*pp_s_com_father);
-		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
-			decrement_bcount(*pp_s_com_father);
+	if (buffer_locked(*pcom_father)) {
+		__wait_on_buffer(*pcom_father);
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			brelse(*pcom_father);
 			return REPEAT_SEARCH;
 		}
 	}
@@ -1032,128 +1034,131 @@ static int get_far_parent(struct tree_balance *p_s_tb,
 
 	/* Form key to get parent of the left/right neighbor. */
 	le_key2cpu_key(&s_lr_father_key,
-		       B_N_PDELIM_KEY(*pp_s_com_father,
+		       B_N_PDELIM_KEY(*pcom_father,
 				      (c_lr_par ==
-				       LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] =
-							n_position -
-							1) : (p_s_tb->rkey[n_h -
+				       LEFT_PARENTS) ? (tb->lkey[h - 1] =
+							position -
+							1) : (tb->rkey[h -
 									   1] =
-							      n_position)));
+							      position)));
 
 	if (c_lr_par == LEFT_PARENTS)
 		decrement_key(&s_lr_father_key);
 
 	if (search_by_key
-	    (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
-	     n_h + 1) == IO_ERROR)
+	    (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
+	     h + 1) == IO_ERROR)
 		// path is released
 		return IO_ERROR;
 
-	if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
-		decrement_counters_in_path(&s_path_to_neighbor_father);
-		decrement_bcount(*pp_s_com_father);
+	if (FILESYSTEM_CHANGED_TB(tb)) {
+		pathrelse(&s_path_to_neighbor_father);
+		brelse(*pcom_father);
 		return REPEAT_SEARCH;
 	}
 
-	*pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
+	*pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
 
-	RFALSE(B_LEVEL(*pp_s_father) != n_h + 1,
-	       "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
+	RFALSE(B_LEVEL(*pfather) != h + 1,
+	       "PAP-8190: (%b %z) level too small", *pfather, *pfather);
 	RFALSE(s_path_to_neighbor_father.path_length <
 	       FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
 
 	s_path_to_neighbor_father.path_length--;
-	decrement_counters_in_path(&s_path_to_neighbor_father);
+	pathrelse(&s_path_to_neighbor_father);
 	return CARRY_ON;
 }
 
-/* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
- * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
- * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
- * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
+/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of
+ * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset],
+ * FR[path_offset], CFL[path_offset], CFR[path_offset].
+ * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset].
  * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
  *	        CARRY_ON - schedule didn't occur while the function worked;
  */
-static int get_parents(struct tree_balance *p_s_tb, int n_h)
+static int get_parents(struct tree_balance *tb, int h)
 {
-	struct treepath *p_s_path = p_s_tb->tb_path;
-	int n_position,
-	    n_ret_value,
-	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
-	struct buffer_head *p_s_curf, *p_s_curcf;
+	struct treepath *path = tb->tb_path;
+	int position,
+	    ret,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
+	struct buffer_head *curf, *curcf;
 
 	/* Current node is the root of the tree or will be root of the tree */
-	if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
+	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
 		/* The root can not have parents.
 		   Release nodes which previously were obtained as parents of the current node neighbors. */
-		decrement_bcount(p_s_tb->FL[n_h]);
-		decrement_bcount(p_s_tb->CFL[n_h]);
-		decrement_bcount(p_s_tb->FR[n_h]);
-		decrement_bcount(p_s_tb->CFR[n_h]);
-		p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] =
-		    p_s_tb->CFR[n_h] = NULL;
+		brelse(tb->FL[h]);
+		brelse(tb->CFL[h]);
+		brelse(tb->FR[h]);
+		brelse(tb->CFR[h]);
+		tb->FL[h]  = NULL;
+		tb->CFL[h] = NULL;
+		tb->FR[h]  = NULL;
+		tb->CFR[h] = NULL;
 		return CARRY_ON;
 	}
 
-	/* Get parent FL[n_path_offset] of L[n_path_offset]. */
-	if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) {
+	/* Get parent FL[path_offset] of L[path_offset]. */
+	position = PATH_OFFSET_POSITION(path, path_offset - 1);
+	if (position) {
 		/* Current node is not the first child of its parent. */
-		/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
-		p_s_curf = p_s_curcf =
-		    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
-		get_bh(p_s_curf);
-		get_bh(p_s_curf);
-		p_s_tb->lkey[n_h] = n_position - 1;
+		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		get_bh(curf);
+		get_bh(curf);
+		tb->lkey[h] = position - 1;
 	} else {
-		/* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
-		   Calculate current common parent of L[n_path_offset] and the current node. Note that
-		   CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
-		   Calculate lkey[n_path_offset]. */
-		if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
-						  &p_s_curcf,
+		/* Calculate current parent of L[path_offset], which is the left neighbor of the current node.
+		   Calculate current common parent of L[path_offset] and the current node. Note that
+		   CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset].
+		   Calculate lkey[path_offset]. */
+		if ((ret = get_far_parent(tb, h + 1, &curf,
+						  &curcf,
 						  LEFT_PARENTS)) != CARRY_ON)
-			return n_ret_value;
+			return ret;
 	}
 
-	decrement_bcount(p_s_tb->FL[n_h]);
-	p_s_tb->FL[n_h] = p_s_curf;	/* New initialization of FL[n_h]. */
-	decrement_bcount(p_s_tb->CFL[n_h]);
-	p_s_tb->CFL[n_h] = p_s_curcf;	/* New initialization of CFL[n_h]. */
+	brelse(tb->FL[h]);
+	tb->FL[h] = curf;	/* New initialization of FL[h]. */
+	brelse(tb->CFL[h]);
+	tb->CFL[h] = curcf;	/* New initialization of CFL[h]. */
 
-	RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
-	       (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
-	       "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
+	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
+	       (curcf && !B_IS_IN_TREE(curcf)),
+	       "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
 
-/* Get parent FR[n_h] of R[n_h]. */
+/* Get parent FR[h] of R[h]. */
 
-/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
-	if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) {
-/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
-   Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
-   not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
-		if ((n_ret_value =
-		     get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf,
+/* Current node is the last child of F[h]. FR[h] != F[h]. */
+	if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
+/* Calculate current parent of R[h], which is the right neighbor of F[h].
+   Calculate current common parent of R[h] and current node. Note that CFR[h]
+   not equal FR[path_offset] and CFR[h] not equal F[h]. */
+		if ((ret =
+		     get_far_parent(tb, h + 1, &curf, &curcf,
 				    RIGHT_PARENTS)) != CARRY_ON)
-			return n_ret_value;
+			return ret;
 	} else {
-/* Current node is not the last child of its parent F[n_h]. */
-		/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
-		p_s_curf = p_s_curcf =
-		    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
-		get_bh(p_s_curf);
-		get_bh(p_s_curf);
-		p_s_tb->rkey[n_h] = n_position;
+/* Current node is not the last child of its parent F[h]. */
+		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
+		get_bh(curf);
+		get_bh(curf);
+		tb->rkey[h] = position;
 	}
 
-	decrement_bcount(p_s_tb->FR[n_h]);
-	p_s_tb->FR[n_h] = p_s_curf;	/* New initialization of FR[n_path_offset]. */
+	brelse(tb->FR[h]);
+	/* New initialization of FR[path_offset]. */
+	tb->FR[h] = curf;
 
-	decrement_bcount(p_s_tb->CFR[n_h]);
-	p_s_tb->CFR[n_h] = p_s_curcf;	/* New initialization of CFR[n_path_offset]. */
+	brelse(tb->CFR[h]);
+	/* New initialization of CFR[path_offset]. */
+	tb->CFR[h] = curcf;
 
-	RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
-	       (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
-	       "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
+	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
+	       (curcf && !B_IS_IN_TREE(curcf)),
+	       "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
 
 	return CARRY_ON;
 }
@@ -1203,7 +1208,7 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
  *	h	current level of the node;
  *	inum	item number in S[h];
  *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred; 
+ * Returns:	1 - schedule occurred;
  *	        0 - balancing for higher levels needed;
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
@@ -1217,7 +1222,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
 				   contains node being balanced.  The mnemonic is
 				   that the attempted change in node space used level
 				   is levbytes bytes. */
-	 n_ret_value;
+	 ret;
 
 	int lfree, sfree, rfree /* free space in L, S and R */ ;
 
@@ -1238,7 +1243,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
 	/* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
 	   where 4th parameter is s1bytes and 5th - s2bytes
 	 */
-	short snum012[40] = { 0, };	/* s0num, s1num, s2num for 8 cases 
+	short snum012[40] = { 0, };	/* s0num, s1num, s2num for 8 cases
 					   0,1 - do not shift and do not shift but bottle
 					   2 - shift only whole item to left
 					   3 - shift to left and bottle as much as possible
@@ -1255,24 +1260,24 @@ static int ip_check_balance(struct tree_balance *tb, int h)
 	/* Calculate balance parameters for creating new root. */
 	if (!Sh) {
 		if (!h)
-			reiserfs_panic(tb->tb_sb,
-				       "vs-8210: ip_check_balance: S[0] can not be 0");
-		switch (n_ret_value = get_empty_nodes(tb, h)) {
+			reiserfs_panic(tb->tb_sb, "vs-8210",
+				       "S[0] can not be 0");
+		switch (ret = get_empty_nodes(tb, h)) {
 		case CARRY_ON:
 			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
 			return NO_BALANCING_NEEDED;	/* no balancing for higher levels needed */
 
 		case NO_DISK_SPACE:
 		case REPEAT_SEARCH:
-			return n_ret_value;
+			return ret;
 		default:
-			reiserfs_panic(tb->tb_sb,
-				       "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
+			reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
+				       "return value of get_empty_nodes");
 		}
 	}
 
-	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)	/* get parents of S[h] neighbors. */
-		return n_ret_value;
+	if ((ret = get_parents(tb, h)) != CARRY_ON)	/* get parents of S[h] neighbors. */
+		return ret;
 
 	sfree = B_FREE_SPACE(Sh);
 
@@ -1287,7 +1292,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
 
 	create_virtual_node(tb, h);
 
-	/*  
+	/*
 	   determine maximal number of items we can shift to the left neighbor (in tb structure)
 	   and the maximal number of bytes that can flow to the left neighbor
 	   from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
@@ -1348,13 +1353,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
 
 	{
 		int lpar, rpar, nset, lset, rset, lrset;
-		/* 
+		/*
 		 * regular overflowing of the node
 		 */
 
-		/* get_num_ver works in 2 modes (FLOW & NO_FLOW) 
+		/* get_num_ver works in 2 modes (FLOW & NO_FLOW)
 		   lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
-		   nset, lset, rset, lrset - shows, whether flowing items give better packing 
+		   nset, lset, rset, lrset - shows, whether flowing items give better packing
 		 */
 #define FLOW 1
 #define NO_FLOW 0		/* do not any splitting */
@@ -1544,7 +1549,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
  *	h	current level of the node;
  *	inum	item number in S[h];
  *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred; 
+ * Returns:	1 - schedule occurred;
  *	        0 - balancing for higher levels needed;
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
@@ -1559,7 +1564,7 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
 	/* Sh is the node whose balance is currently being checked,
 	   and Fh is its father.  */
 	struct buffer_head *Sh, *Fh;
-	int maxsize, n_ret_value;
+	int maxsize, ret;
 	int lfree, rfree /* free space in L and R */ ;
 
 	Sh = PATH_H_PBUFFER(tb->tb_path, h);
@@ -1584,8 +1589,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
 		return CARRY_ON;
 	}
 
-	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
-		return n_ret_value;
+	if ((ret = get_parents(tb, h)) != CARRY_ON)
+		return ret;
 
 	/* get free space of neighbors */
 	rfree = get_rfree(tb, h);
@@ -1727,7 +1732,7 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
  *	h	current level of the node;
  *	inum	item number in S[h];
  *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred; 
+ * Returns:	1 - schedule occurred;
  *	        0 - balancing for higher levels needed;
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
@@ -1742,7 +1747,7 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
 	   attempted change in node space used level is levbytes bytes. */
 	int levbytes;
 	/* the maximal item size */
-	int maxsize, n_ret_value;
+	int maxsize, ret;
 	/* S0 is the node whose balance is currently being checked,
 	   and F0 is its father.  */
 	struct buffer_head *S0, *F0;
@@ -1764,8 +1769,8 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
 		return NO_BALANCING_NEEDED;
 	}
 
-	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
-		return n_ret_value;
+	if ((ret = get_parents(tb, h)) != CARRY_ON)
+		return ret;
 
 	/* get free space of neighbors */
 	rfree = get_rfree(tb, h);
@@ -1821,7 +1826,7 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
  *	h	current level of the node;
  *	inum	item number in S[h];
  *	mode	d - delete, c - cut.
- * Returns:	1 - schedule occurred; 
+ * Returns:	1 - schedule occurred;
  *	        0 - balancing for higher levels needed;
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
@@ -1850,7 +1855,7 @@ static int dc_check_balance(struct tree_balance *tb, int h)
  *	h	current level of the node;
  *	inum	item number in S[h];
  *	mode	i - insert, p - paste, d - delete, c - cut.
- * Returns:	1 - schedule occurred; 
+ * Returns:	1 - schedule occurred;
  *	        0 - balancing for higher levels needed;
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
@@ -1884,137 +1889,138 @@ static int check_balance(int mode,
 }
 
 /* Check whether parent at the path is the really parent of the current node.*/
-static int get_direct_parent(struct tree_balance *p_s_tb, int n_h)
+static int get_direct_parent(struct tree_balance *tb, int h)
 {
-	struct buffer_head *p_s_bh;
-	struct treepath *p_s_path = p_s_tb->tb_path;
-	int n_position,
-	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
+	struct buffer_head *bh;
+	struct treepath *path = tb->tb_path;
+	int position,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
 
 	/* We are in the root or in the new root. */
-	if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
+	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
 
-		RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
+		RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
 		       "PAP-8260: invalid offset in the path");
 
-		if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->
-		    b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
+		if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
+		    b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
 			/* Root is not changed. */
-			PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
-			PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
+			PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
+			PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
 			return CARRY_ON;
 		}
 		return REPEAT_SEARCH;	/* Root is changed and we must recalculate the path. */
 	}
 
 	if (!B_IS_IN_TREE
-	    (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)))
+	    (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
 		return REPEAT_SEARCH;	/* Parent in the path is not in the tree. */
 
-	if ((n_position =
-	     PATH_OFFSET_POSITION(p_s_path,
-				  n_path_offset - 1)) > B_NR_ITEMS(p_s_bh))
+	if ((position =
+	     PATH_OFFSET_POSITION(path,
+				  path_offset - 1)) > B_NR_ITEMS(bh))
 		return REPEAT_SEARCH;
 
-	if (B_N_CHILD_NUM(p_s_bh, n_position) !=
-	    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr)
+	if (B_N_CHILD_NUM(bh, position) !=
+	    PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
 		/* Parent in the path is not parent of the current node in the tree. */
 		return REPEAT_SEARCH;
 
-	if (buffer_locked(p_s_bh)) {
-		__wait_on_buffer(p_s_bh);
-		if (FILESYSTEM_CHANGED_TB(p_s_tb))
+	if (buffer_locked(bh)) {
+		__wait_on_buffer(bh);
+		if (FILESYSTEM_CHANGED_TB(tb))
 			return REPEAT_SEARCH;
 	}
 
 	return CARRY_ON;	/* Parent in the path is unlocked and really parent of the current node.  */
 }
 
-/* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
- * of S[n_h] we
- * need in order to balance S[n_h], and get them if necessary.
+/* Using lnum[h] and rnum[h] we should determine what neighbors
+ * of S[h] we
+ * need in order to balance S[h], and get them if necessary.
  * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
  *	        CARRY_ON - schedule didn't occur while the function worked;
  */
-static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
+static int get_neighbors(struct tree_balance *tb, int h)
 {
-	int n_child_position,
-	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
-	unsigned long n_son_number;
-	struct super_block *p_s_sb = p_s_tb->tb_sb;
-	struct buffer_head *p_s_bh;
+	int child_position,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
+	unsigned long son_number;
+	struct super_block *sb = tb->tb_sb;
+	struct buffer_head *bh;
 
-	PROC_INFO_INC(p_s_sb, get_neighbors[n_h]);
+	PROC_INFO_INC(sb, get_neighbors[h]);
 
-	if (p_s_tb->lnum[n_h]) {
-		/* We need left neighbor to balance S[n_h]. */
-		PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]);
-		p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
+	if (tb->lnum[h]) {
+		/* We need left neighbor to balance S[h]. */
+		PROC_INFO_INC(sb, need_l_neighbor[h]);
+		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
 
-		RFALSE(p_s_bh == p_s_tb->FL[n_h] &&
-		       !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
+		RFALSE(bh == tb->FL[h] &&
+		       !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
 		       "PAP-8270: invalid position in the parent");
 
-		n_child_position =
-		    (p_s_bh ==
-		     p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->
-								       FL[n_h]);
-		n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
-		p_s_bh = sb_bread(p_s_sb, n_son_number);
-		if (!p_s_bh)
+		child_position =
+		    (bh ==
+		     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
+								       FL[h]);
+		son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
+		bh = sb_bread(sb, son_number);
+		if (!bh)
 			return IO_ERROR;
-		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
-			decrement_bcount(p_s_bh);
-			PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			brelse(bh);
+			PROC_INFO_INC(sb, get_neighbors_restart[h]);
 			return REPEAT_SEARCH;
 		}
 
-		RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
-		       n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
-		       B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
-		       p_s_bh->b_blocknr, "PAP-8275: invalid parent");
-		RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
-		RFALSE(!n_h &&
-		       B_FREE_SPACE(p_s_bh) !=
-		       MAX_CHILD_SIZE(p_s_bh) -
-		       dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)),
+		RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
+		       child_position > B_NR_ITEMS(tb->FL[h]) ||
+		       B_N_CHILD_NUM(tb->FL[h], child_position) !=
+		       bh->b_blocknr, "PAP-8275: invalid parent");
+		RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
+		RFALSE(!h &&
+		       B_FREE_SPACE(bh) !=
+		       MAX_CHILD_SIZE(bh) -
+		       dc_size(B_N_CHILD(tb->FL[0], child_position)),
 		       "PAP-8290: invalid child size of left neighbor");
 
-		decrement_bcount(p_s_tb->L[n_h]);
-		p_s_tb->L[n_h] = p_s_bh;
+		brelse(tb->L[h]);
+		tb->L[h] = bh;
 	}
 
-	if (p_s_tb->rnum[n_h]) {	/* We need right neighbor to balance S[n_path_offset]. */
-		PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]);
-		p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
+	/* We need right neighbor to balance S[path_offset]. */
+	if (tb->rnum[h]) {	/* We need right neighbor to balance S[path_offset]. */
+		PROC_INFO_INC(sb, need_r_neighbor[h]);
+		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
 
-		RFALSE(p_s_bh == p_s_tb->FR[n_h] &&
-		       PATH_OFFSET_POSITION(p_s_tb->tb_path,
-					    n_path_offset) >=
-		       B_NR_ITEMS(p_s_bh),
+		RFALSE(bh == tb->FR[h] &&
+		       PATH_OFFSET_POSITION(tb->tb_path,
+					    path_offset) >=
+		       B_NR_ITEMS(bh),
 		       "PAP-8295: invalid position in the parent");
 
-		n_child_position =
-		    (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0;
-		n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
-		p_s_bh = sb_bread(p_s_sb, n_son_number);
-		if (!p_s_bh)
+		child_position =
+		    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
+		son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+		bh = sb_bread(sb, son_number);
+		if (!bh)
 			return IO_ERROR;
-		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
-			decrement_bcount(p_s_bh);
-			PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			brelse(bh);
+			PROC_INFO_INC(sb, get_neighbors_restart[h]);
 			return REPEAT_SEARCH;
 		}
-		decrement_bcount(p_s_tb->R[n_h]);
-		p_s_tb->R[n_h] = p_s_bh;
+		brelse(tb->R[h]);
+		tb->R[h] = bh;
 
-		RFALSE(!n_h
-		       && B_FREE_SPACE(p_s_bh) !=
-		       MAX_CHILD_SIZE(p_s_bh) -
-		       dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)),
+		RFALSE(!h
+		       && B_FREE_SPACE(bh) !=
+		       MAX_CHILD_SIZE(bh) -
+		       dc_size(B_N_CHILD(tb->FR[0], child_position)),
 		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
-		       B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh),
-		       dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)));
+		       B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
+		       dc_size(B_N_CHILD(tb->FR[0], child_position)));
 
 	}
 	return CARRY_ON;
@@ -2088,52 +2094,46 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
 }
 
 #ifdef CONFIG_REISERFS_CHECK
-static void tb_buffer_sanity_check(struct super_block *p_s_sb,
-				   struct buffer_head *p_s_bh,
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
 				   const char *descr, int level)
 {
-	if (p_s_bh) {
-		if (atomic_read(&(p_s_bh->b_count)) <= 0) {
-
-			reiserfs_panic(p_s_sb,
-				       "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (!buffer_uptodate(p_s_bh)) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (!B_IS_IN_TREE(p_s_bh)) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (p_s_bh->b_size != p_s_sb->s_blocksize) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
+	if (bh) {
+		if (atomic_read(&(bh->b_count)) <= 0)
+
+			reiserfs_panic(sb, "jmacd-1", "negative or zero "
+				       "reference counter for buffer %s[%d] "
+				       "(%b)", descr, level, bh);
+
+		if (!buffer_uptodate(bh))
+			reiserfs_panic(sb, "jmacd-2", "buffer is not up "
+				       "to date %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (!B_IS_IN_TREE(bh))
+			reiserfs_panic(sb, "jmacd-3", "buffer is not "
+				       "in tree %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_bdev != sb->s_bdev)
+			reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
+				       "device %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_size != sb->s_blocksize)
+			reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
+				       "blocksize %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
+			reiserfs_panic(sb, "jmacd-6", "buffer block "
+				       "number too high %s[%d] (%b)",
+				       descr, level, bh);
 	}
 }
 #else
-static void tb_buffer_sanity_check(struct super_block *p_s_sb,
-				   struct buffer_head *p_s_bh,
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
 				   const char *descr, int level)
 {;
 }
@@ -2144,7 +2144,7 @@ static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
 	return reiserfs_prepare_for_journal(s, bh, 0);
 }
 
-static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
+static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
 {
 	struct buffer_head *locked;
 #ifdef CONFIG_REISERFS_CHECK
@@ -2156,95 +2156,94 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
 
 		locked = NULL;
 
-		for (i = p_s_tb->tb_path->path_length;
+		for (i = tb->tb_path->path_length;
 		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
-			if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
+			if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
 				/* if I understand correctly, we can only be sure the last buffer
 				 ** in the path is in the tree --clm
 				 */
 #ifdef CONFIG_REISERFS_CHECK
-				if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
-				    PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
-					tb_buffer_sanity_check(p_s_tb->tb_sb,
+				if (PATH_PLAST_BUFFER(tb->tb_path) ==
+				    PATH_OFFSET_PBUFFER(tb->tb_path, i))
+					tb_buffer_sanity_check(tb->tb_sb,
 							       PATH_OFFSET_PBUFFER
-							       (p_s_tb->tb_path,
+							       (tb->tb_path,
 								i), "S",
-							       p_s_tb->tb_path->
+							       tb->tb_path->
 							       path_length - i);
-				}
 #endif
-				if (!clear_all_dirty_bits(p_s_tb->tb_sb,
+				if (!clear_all_dirty_bits(tb->tb_sb,
 							  PATH_OFFSET_PBUFFER
-							  (p_s_tb->tb_path,
+							  (tb->tb_path,
 							   i))) {
 					locked =
-					    PATH_OFFSET_PBUFFER(p_s_tb->tb_path,
+					    PATH_OFFSET_PBUFFER(tb->tb_path,
 								i);
 				}
 			}
 		}
 
-		for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i];
+		for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
 		     i++) {
 
-			if (p_s_tb->lnum[i]) {
+			if (tb->lnum[i]) {
 
-				if (p_s_tb->L[i]) {
-					tb_buffer_sanity_check(p_s_tb->tb_sb,
-							       p_s_tb->L[i],
+				if (tb->L[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->L[i],
 							       "L", i);
 					if (!clear_all_dirty_bits
-					    (p_s_tb->tb_sb, p_s_tb->L[i]))
-						locked = p_s_tb->L[i];
+					    (tb->tb_sb, tb->L[i]))
+						locked = tb->L[i];
 				}
 
-				if (!locked && p_s_tb->FL[i]) {
-					tb_buffer_sanity_check(p_s_tb->tb_sb,
-							       p_s_tb->FL[i],
+				if (!locked && tb->FL[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->FL[i],
 							       "FL", i);
 					if (!clear_all_dirty_bits
-					    (p_s_tb->tb_sb, p_s_tb->FL[i]))
-						locked = p_s_tb->FL[i];
+					    (tb->tb_sb, tb->FL[i]))
+						locked = tb->FL[i];
 				}
 
-				if (!locked && p_s_tb->CFL[i]) {
-					tb_buffer_sanity_check(p_s_tb->tb_sb,
-							       p_s_tb->CFL[i],
+				if (!locked && tb->CFL[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->CFL[i],
 							       "CFL", i);
 					if (!clear_all_dirty_bits
-					    (p_s_tb->tb_sb, p_s_tb->CFL[i]))
-						locked = p_s_tb->CFL[i];
+					    (tb->tb_sb, tb->CFL[i]))
+						locked = tb->CFL[i];
 				}
 
 			}
 
-			if (!locked && (p_s_tb->rnum[i])) {
+			if (!locked && (tb->rnum[i])) {
 
-				if (p_s_tb->R[i]) {
-					tb_buffer_sanity_check(p_s_tb->tb_sb,
-							       p_s_tb->R[i],
+				if (tb->R[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->R[i],
 							       "R", i);
 					if (!clear_all_dirty_bits
-					    (p_s_tb->tb_sb, p_s_tb->R[i]))
-						locked = p_s_tb->R[i];
+					    (tb->tb_sb, tb->R[i]))
+						locked = tb->R[i];
 				}
 
-				if (!locked && p_s_tb->FR[i]) {
-					tb_buffer_sanity_check(p_s_tb->tb_sb,
-							       p_s_tb->FR[i],
+				if (!locked && tb->FR[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->FR[i],
 							       "FR", i);
 					if (!clear_all_dirty_bits
-					    (p_s_tb->tb_sb, p_s_tb->FR[i]))
-						locked = p_s_tb->FR[i];
+					    (tb->tb_sb, tb->FR[i]))
+						locked = tb->FR[i];
 				}
 
-				if (!locked && p_s_tb->CFR[i]) {
-					tb_buffer_sanity_check(p_s_tb->tb_sb,
-							       p_s_tb->CFR[i],
+				if (!locked && tb->CFR[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->CFR[i],
 							       "CFR", i);
 					if (!clear_all_dirty_bits
-					    (p_s_tb->tb_sb, p_s_tb->CFR[i]))
-						locked = p_s_tb->CFR[i];
+					    (tb->tb_sb, tb->CFR[i]))
+						locked = tb->CFR[i];
 				}
 			}
 		}
@@ -2257,10 +2256,10 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
 		 ** --clm
 		 */
 		for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
-			if (p_s_tb->FEB[i]) {
+			if (tb->FEB[i]) {
 				if (!clear_all_dirty_bits
-				    (p_s_tb->tb_sb, p_s_tb->FEB[i]))
-					locked = p_s_tb->FEB[i];
+				    (tb->tb_sb, tb->FEB[i]))
+					locked = tb->FEB[i];
 			}
 		}
 
@@ -2268,21 +2267,20 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
 #ifdef CONFIG_REISERFS_CHECK
 			repeat_counter++;
 			if ((repeat_counter % 10000) == 0) {
-				reiserfs_warning(p_s_tb->tb_sb,
-						 "wait_tb_buffers_until_released(): too many "
-						 "iterations waiting for buffer to unlock "
+				reiserfs_warning(tb->tb_sb, "reiserfs-8200",
+						 "too many iterations waiting "
+						 "for buffer to unlock "
 						 "(%b)", locked);
 
 				/* Don't loop forever.  Try to recover from possible error. */
 
-				return (FILESYSTEM_CHANGED_TB(p_s_tb)) ?
+				return (FILESYSTEM_CHANGED_TB(tb)) ?
 				    REPEAT_SEARCH : CARRY_ON;
 			}
 #endif
 			__wait_on_buffer(locked);
-			if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+			if (FILESYSTEM_CHANGED_TB(tb))
 				return REPEAT_SEARCH;
-			}
 		}
 
 	} while (locked);
@@ -2295,15 +2293,15 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
  *	analyze what and where should be moved;
  *	get sufficient number of new nodes;
  * Balancing will start only after all resources will be collected at a time.
- * 
+ *
  * When ported to SMP kernels, only at the last moment after all needed nodes
  * are collected in cache, will the resources be locked using the usual
  * textbook ordered lock acquisition algorithms.  Note that ensuring that
  * this code neither write locks what it does not need to write lock nor locks out of order
  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
- * 
+ *
  * fix is meant in the sense of render unchanging
- * 
+ *
  * Latency might be improved by first gathering a list of what buffers are needed
  * and then getting as many of them in parallel as possible? -Hans
  *
@@ -2312,159 +2310,160 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
  *	tb	tree_balance structure;
  *	inum	item number in S[h];
  *      pos_in_item - comment this if you can
- *      ins_ih & ins_sd are used when inserting
+ *      ins_ih	item head of item being inserted
+ *	data	inserted item or data to be pasted
  * Returns:	1 - schedule occurred while the function worked;
  *	        0 - schedule didn't occur while the function worked;
- *             -1 - if no_disk_space 
+ *             -1 - if no_disk_space
  */
 
-int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih,	// item head of item being inserted
-	      const void *data	// inserted item or data to be pasted
-    )
+int fix_nodes(int op_mode, struct tree_balance *tb,
+	      struct item_head *ins_ih, const void *data)
 {
-	int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
-	int n_pos_in_item;
+	int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
+	int pos_in_item;
 
 	/* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
 	 ** during wait_tb_buffers_run
 	 */
 	int wait_tb_buffers_run = 0;
-	struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 
-	++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes;
+	++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
 
-	n_pos_in_item = p_s_tb->tb_path->pos_in_item;
+	pos_in_item = tb->tb_path->pos_in_item;
 
-	p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb);
+	tb->fs_gen = get_generation(tb->tb_sb);
 
 	/* we prepare and log the super here so it will already be in the
 	 ** transaction when do_balance needs to change it.
 	 ** This way do_balance won't have to schedule when trying to prepare
 	 ** the super for logging
 	 */
-	reiserfs_prepare_for_journal(p_s_tb->tb_sb,
-				     SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1);
-	journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
-			   SB_BUFFER_WITH_SB(p_s_tb->tb_sb));
-	if (FILESYSTEM_CHANGED_TB(p_s_tb))
+	reiserfs_prepare_for_journal(tb->tb_sb,
+				     SB_BUFFER_WITH_SB(tb->tb_sb), 1);
+	journal_mark_dirty(tb->transaction_handle, tb->tb_sb,
+			   SB_BUFFER_WITH_SB(tb->tb_sb));
+	if (FILESYSTEM_CHANGED_TB(tb))
 		return REPEAT_SEARCH;
 
 	/* if it possible in indirect_to_direct conversion */
-	if (buffer_locked(p_s_tbS0)) {
-		__wait_on_buffer(p_s_tbS0);
-		if (FILESYSTEM_CHANGED_TB(p_s_tb))
+	if (buffer_locked(tbS0)) {
+		__wait_on_buffer(tbS0);
+		if (FILESYSTEM_CHANGED_TB(tb))
 			return REPEAT_SEARCH;
 	}
 #ifdef CONFIG_REISERFS_CHECK
 	if (cur_tb) {
 		print_cur_tb("fix_nodes");
-		reiserfs_panic(p_s_tb->tb_sb,
-			       "PAP-8305: fix_nodes:  there is pending do_balance");
+		reiserfs_panic(tb->tb_sb, "PAP-8305",
+			       "there is pending do_balance");
 	}
 
-	if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) {
-		reiserfs_panic(p_s_tb->tb_sb,
-			       "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
-			       "at the beginning of fix_nodes or not in tree (mode %c)",
-			       p_s_tbS0, p_s_tbS0, n_op_mode);
-	}
+	if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
+		reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
+			       "not uptodate at the beginning of fix_nodes "
+			       "or not in tree (mode %c)",
+			       tbS0, tbS0, op_mode);
 
 	/* Check parameters. */
-	switch (n_op_mode) {
+	switch (op_mode) {
 	case M_INSERT:
-		if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0))
-			reiserfs_panic(p_s_tb->tb_sb,
-				       "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
-				       n_item_num, B_NR_ITEMS(p_s_tbS0));
+		if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
+			reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
+				       "item number %d (in S0 - %d) in case "
+				       "of insert", item_num,
+				       B_NR_ITEMS(tbS0));
 		break;
 	case M_PASTE:
 	case M_DELETE:
 	case M_CUT:
-		if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) {
-			print_block(p_s_tbS0, 0, -1, -1);
-			reiserfs_panic(p_s_tb->tb_sb,
-				       "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n",
-				       n_item_num, n_op_mode,
-				       p_s_tb->insert_size[0]);
+		if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
+			print_block(tbS0, 0, -1, -1);
+			reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
+				       "item number(%d); mode = %c "
+				       "insert_size = %d",
+				       item_num, op_mode,
+				       tb->insert_size[0]);
 		}
 		break;
 	default:
-		reiserfs_panic(p_s_tb->tb_sb,
-			       "PAP-8340: fix_nodes: Incorrect mode of operation");
+		reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
+			       "of operation");
 	}
 #endif
 
-	if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH)
+	if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
 		// FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
 		return REPEAT_SEARCH;
 
-	/* Starting from the leaf level; for all levels n_h of the tree. */
-	for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) {
-		if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) {
+	/* Starting from the leaf level; for all levels h of the tree. */
+	for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
+		ret = get_direct_parent(tb, h);
+		if (ret != CARRY_ON)
 			goto repeat;
-		}
 
-		if ((n_ret_value =
-		     check_balance(n_op_mode, p_s_tb, n_h, n_item_num,
-				   n_pos_in_item, p_s_ins_ih,
-				   data)) != CARRY_ON) {
-			if (n_ret_value == NO_BALANCING_NEEDED) {
+		ret = check_balance(op_mode, tb, h, item_num,
+				    pos_in_item, ins_ih, data);
+		if (ret != CARRY_ON) {
+			if (ret == NO_BALANCING_NEEDED) {
 				/* No balancing for higher levels needed. */
-				if ((n_ret_value =
-				     get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
+				ret = get_neighbors(tb, h);
+				if (ret != CARRY_ON)
 					goto repeat;
-				}
-				if (n_h != MAX_HEIGHT - 1)
-					p_s_tb->insert_size[n_h + 1] = 0;
+				if (h != MAX_HEIGHT - 1)
+					tb->insert_size[h + 1] = 0;
 				/* ok, analysis and resource gathering are complete */
 				break;
 			}
 			goto repeat;
 		}
 
-		if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
+		ret = get_neighbors(tb, h);
+		if (ret != CARRY_ON)
 			goto repeat;
-		}
 
-		if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) {
-			goto repeat;	/* No disk space, or schedule occurred and
-					   analysis may be invalid and needs to be redone. */
-		}
+		/* No disk space, or schedule occurred and analysis may be
+		 * invalid and needs to be redone. */
+		ret = get_empty_nodes(tb, h);
+		if (ret != CARRY_ON)
+			goto repeat;
 
-		if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) {
+		if (!PATH_H_PBUFFER(tb->tb_path, h)) {
 			/* We have a positive insert size but no nodes exist on this
 			   level, this means that we are creating a new root. */
 
-			RFALSE(p_s_tb->blknum[n_h] != 1,
+			RFALSE(tb->blknum[h] != 1,
 			       "PAP-8350: creating new empty root");
 
-			if (n_h < MAX_HEIGHT - 1)
-				p_s_tb->insert_size[n_h + 1] = 0;
-		} else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) {
-			if (p_s_tb->blknum[n_h] > 1) {
-				/* The tree needs to be grown, so this node S[n_h]
+			if (h < MAX_HEIGHT - 1)
+				tb->insert_size[h + 1] = 0;
+		} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
+			if (tb->blknum[h] > 1) {
+				/* The tree needs to be grown, so this node S[h]
 				   which is the root node is split into two nodes,
-				   and a new node (S[n_h+1]) will be created to
+				   and a new node (S[h+1]) will be created to
 				   become the root node.  */
 
-				RFALSE(n_h == MAX_HEIGHT - 1,
+				RFALSE(h == MAX_HEIGHT - 1,
 				       "PAP-8355: attempt to create too high of a tree");
 
-				p_s_tb->insert_size[n_h + 1] =
+				tb->insert_size[h + 1] =
 				    (DC_SIZE +
-				     KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) +
+				     KEY_SIZE) * (tb->blknum[h] - 1) +
 				    DC_SIZE;
-			} else if (n_h < MAX_HEIGHT - 1)
-				p_s_tb->insert_size[n_h + 1] = 0;
+			} else if (h < MAX_HEIGHT - 1)
+				tb->insert_size[h + 1] = 0;
 		} else
-			p_s_tb->insert_size[n_h + 1] =
-			    (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
+			tb->insert_size[h + 1] =
+			    (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
 	}
 
-	if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) {
-		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+	ret = wait_tb_buffers_until_unlocked(tb);
+	if (ret == CARRY_ON) {
+		if (FILESYSTEM_CHANGED_TB(tb)) {
 			wait_tb_buffers_run = 1;
-			n_ret_value = REPEAT_SEARCH;
+			ret = REPEAT_SEARCH;
 			goto repeat;
 		} else {
 			return CARRY_ON;
@@ -2485,57 +2484,57 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_
 
 		/* Release path buffers. */
 		if (wait_tb_buffers_run) {
-			pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path);
+			pathrelse_and_restore(tb->tb_sb, tb->tb_path);
 		} else {
-			pathrelse(p_s_tb->tb_path);
+			pathrelse(tb->tb_path);
 		}
 		/* brelse all resources collected for balancing */
 		for (i = 0; i < MAX_HEIGHT; i++) {
 			if (wait_tb_buffers_run) {
-				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-								 p_s_tb->L[i]);
-				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-								 p_s_tb->R[i]);
-				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-								 p_s_tb->FL[i]);
-				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-								 p_s_tb->FR[i]);
-				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-								 p_s_tb->
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->L[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->R[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->FL[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->FR[i]);
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->
 								 CFL[i]);
-				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-								 p_s_tb->
+				reiserfs_restore_prepared_buffer(tb->tb_sb,
+								 tb->
 								 CFR[i]);
 			}
 
-			brelse(p_s_tb->L[i]);
-			p_s_tb->L[i] = NULL;
-			brelse(p_s_tb->R[i]);
-			p_s_tb->R[i] = NULL;
-			brelse(p_s_tb->FL[i]);
-			p_s_tb->FL[i] = NULL;
-			brelse(p_s_tb->FR[i]);
-			p_s_tb->FR[i] = NULL;
-			brelse(p_s_tb->CFL[i]);
-			p_s_tb->CFL[i] = NULL;
-			brelse(p_s_tb->CFR[i]);
-			p_s_tb->CFR[i] = NULL;
+			brelse(tb->L[i]);
+			brelse(tb->R[i]);
+			brelse(tb->FL[i]);
+			brelse(tb->FR[i]);
+			brelse(tb->CFL[i]);
+			brelse(tb->CFR[i]);
+
+			tb->L[i] = NULL;
+			tb->R[i] = NULL;
+			tb->FL[i] = NULL;
+			tb->FR[i] = NULL;
+			tb->CFL[i] = NULL;
+			tb->CFR[i] = NULL;
 		}
 
 		if (wait_tb_buffers_run) {
 			for (i = 0; i < MAX_FEB_SIZE; i++) {
-				if (p_s_tb->FEB[i]) {
+				if (tb->FEB[i])
 					reiserfs_restore_prepared_buffer
-					    (p_s_tb->tb_sb, p_s_tb->FEB[i]);
-				}
+					    (tb->tb_sb, tb->FEB[i]);
 			}
 		}
-		return n_ret_value;
+		return ret;
 	}
 
 }
 
-/* Anatoly will probably forgive me renaming p_s_tb to tb. I just
+/* Anatoly will probably forgive me renaming tb to tb. I just
    wanted to make lines shorter */
 void unfix_nodes(struct tree_balance *tb)
 {
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index e664ac16fad9..6471c670743e 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -7,7 +7,7 @@
  * (see Applied Cryptography, 2nd edition, p448).
  *
  * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
- * 
+ *
  * Jeremy has agreed to the contents of reiserfs/README. -Hans
  * Yura's function is added (04/07/2000)
  */
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index de391a82b999..2074fd95046b 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -105,8 +105,8 @@ static void internal_define_dest_src_infos(int shift_mode,
 		break;
 
 	default:
-		reiserfs_panic(tb->tb_sb,
-			       "internal_define_dest_src_infos: shift type is unknown (%d)",
+		reiserfs_panic(tb->tb_sb, "ibalance-1",
+			       "shift type is unknown (%d)",
 			       shift_mode);
 	}
 }
@@ -278,7 +278,7 @@ static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
 
 /* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest
 * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest
- * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest 
+ * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest
  */
 static void internal_copy_pointers_items(struct buffer_info *dest_bi,
 					 struct buffer_head *src,
@@ -385,7 +385,7 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi,
 	if (last_first == FIRST_TO_LAST) {	/* shift_left occurs */
 		first_pointer = 0;
 		first_item = 0;
-		/* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, 
+		/* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer,
 		   for key - with first_item */
 		internal_delete_pointers_items(src_bi, first_pointer,
 					       first_item, cpy_num - del_par);
@@ -453,7 +453,7 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b
 	}
 }
 
-/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. 
+/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
  * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest.
  * Replace  d_key'th key in buffer cfl.
  * Delete pointer_amount items and node pointers from buffer src.
@@ -518,7 +518,7 @@ static void internal_shift1_left(struct tree_balance *tb,
 	/*    internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */
 }
 
-/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. 
+/* Insert d_key'th (delimiting) key from buffer cfr to head of dest.
  * Copy n node pointers and n - 1 items from buffer src to buffer dest.
  * Replace  d_key'th key in buffer cfr.
  * Delete n items and node pointers from buffer src.
@@ -702,8 +702,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
 
 		return;
 	}
-	reiserfs_panic(tb->tb_sb,
-		       "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
+	reiserfs_panic(tb->tb_sb, "ibalance-2",
+		       "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
 		       h, tb->lnum[h], h, tb->rnum[h]);
 }
 
@@ -749,7 +749,7 @@ int balance_internal(struct tree_balance *tb,	/* tree_balance structure
        this means that new pointers and items must be inserted AFTER *
        child_pos
        }
-       else 
+       else
        {
        it is the position of the leftmost pointer that must be deleted (together with
        its corresponding key to the left of the pointer)
@@ -940,8 +940,8 @@ int balance_internal(struct tree_balance *tb,	/* tree_balance structure
 		struct block_head *blkh;
 
 		if (tb->blknum[h] != 1)
-			reiserfs_panic(NULL,
-				       "balance_internal: One new node required for creating the new root");
+			reiserfs_panic(NULL, "ibalance-3", "One new node "
+				       "required for creating the new root");
 		/* S[h] = empty buffer from the list FEB. */
 		tbSh = get_FEB(tb);
 		blkh = B_BLK_HEAD(tbSh);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 55fce92cdf18..6fd0f47e45db 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -53,7 +53,7 @@ void reiserfs_delete_inode(struct inode *inode)
 		 * after delete_object so that quota updates go into the same transaction as
 		 * stat data deletion */
 		if (!err) 
-			DQUOT_FREE_INODE(inode);
+			vfs_dq_free_inode(inode);
 
 		if (journal_end(&th, inode->i_sb, jbegin_count))
 			goto out;
@@ -363,7 +363,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
 		}
 		/* make sure we don't read more bytes than actually exist in
 		 ** the file.  This can happen in odd cases where i_size isn't
-		 ** correct, and when direct item padding results in a few 
+		 ** correct, and when direct item padding results in a few
 		 ** extra bytes at the end of the direct item
 		 */
 		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
@@ -438,15 +438,15 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
-** 
+**
 ** So, this allows block_prepare_write to be used for reading a single block
 ** in a page.  Where it does not produce a valid page for holes, or past the
 ** end of the file.  This turns out to be exactly what we need for reading
 ** tails for conversion.
 **
 ** The point of the wrapper is forcing a certain value for create, even
-** though the VFS layer is calling this function with create==1.  If you 
-** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, 
+** though the VFS layer is calling this function with create==1.  If you
+** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 ** don't use this function.
 */
 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
@@ -602,7 +602,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 	int done;
 	int fs_gen;
 	struct reiserfs_transaction_handle *th = NULL;
-	/* space reserved in transaction batch: 
+	/* space reserved in transaction batch:
 	   . 3 balancings in direct->indirect conversion
 	   . 1 block involved into reiserfs_update_sd()
 	   XXX in practically impossible worst case direct2indirect()
@@ -754,7 +754,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 		reiserfs_write_unlock(inode->i_sb);
 
 		/* the item was found, so new blocks were not added to the file
-		 ** there is no need to make sure the inode is updated with this 
+		 ** there is no need to make sure the inode is updated with this
 		 ** transaction
 		 */
 		return retval;
@@ -841,10 +841,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 							  tail_offset);
 				if (retval) {
 					if (retval != -ENOSPC)
-						reiserfs_warning(inode->i_sb,
-								 "clm-6004: convert tail failed inode %lu, error %d",
-								 inode->i_ino,
-								 retval);
+						reiserfs_error(inode->i_sb,
+							"clm-6004",
+							"convert tail failed "
+							"inode %lu, error %d",
+							inode->i_ino,
+							retval);
 					if (allocated_block_nr) {
 						/* the bitmap, the super, and the stat data == 3 */
 						if (!th)
@@ -984,7 +986,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 
 		/* this loop could log more blocks than we had originally asked
 		 ** for.  So, we have to allow the transaction to end if it is
-		 ** too big or too full.  Update the inode so things are 
+		 ** too big or too full.  Update the inode so things are
 		 ** consistent if we crash before the function returns
 		 **
 		 ** release the path so that anybody waiting on the path before
@@ -995,7 +997,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			if (retval)
 				goto failure;
 		}
-		/* inserting indirect pointers for a hole can take a 
+		/* inserting indirect pointers for a hole can take a
 		 ** long time.  reschedule if needed
 		 */
 		cond_resched();
@@ -1006,8 +1008,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			goto failure;
 		}
 		if (retval == POSITION_FOUND) {
-			reiserfs_warning(inode->i_sb,
-					 "vs-825: reiserfs_get_block: "
+			reiserfs_warning(inode->i_sb, "vs-825",
 					 "%K should not be found", &key);
 			retval = -EEXIST;
 			if (allocated_block_nr)
@@ -1299,8 +1300,7 @@ static void update_stat_data(struct treepath *path, struct inode *inode,
 	ih = PATH_PITEM_HEAD(path);
 
 	if (!is_statdata_le_ih(ih))
-		reiserfs_panic(inode->i_sb,
-			       "vs-13065: update_stat_data: key %k, found item %h",
+		reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
 			       INODE_PKEY(inode), ih);
 
 	if (stat_data_v1(ih)) {
@@ -1332,10 +1332,9 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
 		/* look for the object's stat data */
 		retval = search_item(inode->i_sb, &key, &path);
 		if (retval == IO_ERROR) {
-			reiserfs_warning(inode->i_sb,
-					 "vs-13050: reiserfs_update_sd: "
-					 "i/o failure occurred trying to update %K stat data",
-					 &key);
+			reiserfs_error(inode->i_sb, "vs-13050",
+				       "i/o failure occurred trying to "
+				       "update %K stat data", &key);
 			return;
 		}
 		if (retval == ITEM_NOT_FOUND) {
@@ -1345,9 +1344,9 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
 				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
 				return;
 			}
-			reiserfs_warning(inode->i_sb,
-					 "vs-13060: reiserfs_update_sd: "
-					 "stat data of object %k (nlink == %d) not found (pos %d)",
+			reiserfs_warning(inode->i_sb, "vs-13060",
+					 "stat data of object %k (nlink == %d) "
+					 "not found (pos %d)",
 					 INODE_PKEY(inode), inode->i_nlink,
 					 pos);
 			reiserfs_check_path(&path);
@@ -1424,10 +1423,9 @@ void reiserfs_read_locked_inode(struct inode *inode,
 	/* look for the object's stat data */
 	retval = search_item(inode->i_sb, &key, &path_to_sd);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(inode->i_sb,
-				 "vs-13070: reiserfs_read_locked_inode: "
-				 "i/o failure occurred trying to find stat data of %K",
-				 &key);
+		reiserfs_error(inode->i_sb, "vs-13070",
+			       "i/o failure occurred trying to find "
+			       "stat data of %K", &key);
 		reiserfs_make_bad_inode(inode);
 		return;
 	}
@@ -1446,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
 	   update sd on unlink all that is required is to check for nlink
 	   here. This bug was first found by Sizif when debugging
 	   SquidNG/Butterfly, forgotten, and found again after Philippe
-	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it. 
+	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
 
 	   More logical fix would require changes in fs/inode.c:iput() to
 	   remove inode from hash-table _after_ fs cleaned disk stuff up and
@@ -1457,8 +1455,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
 	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
 	if ((inode->i_nlink == 0) &&
 	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
-		reiserfs_warning(inode->i_sb,
-				 "vs-13075: reiserfs_read_locked_inode: "
+		reiserfs_warning(inode->i_sb, "vs-13075",
 				 "dead inode read from disk %K. "
 				 "This is likely to be race with knfsd. Ignore",
 				 &key);
@@ -1555,7 +1552,7 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
 	 */
 	if (fh_type > fh_len) {
 		if (fh_type != 6 || fh_len != 5)
-			reiserfs_warning(sb,
+			reiserfs_warning(sb, "reiserfs-13077",
 				"nfsd/reiserfs, fhtype=%d, len=%d - odd",
 				fh_type, fh_len);
 		fh_type = 5;
@@ -1622,7 +1619,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return -EROFS;
 	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
-	 ** these cases are just when the system needs ram, not when the 
+	 ** these cases are just when the system needs ram, not when the
 	 ** inode needs to reach disk for safety, and they can safely be
 	 ** ignored because the altered inode has already been logged.
 	 */
@@ -1680,13 +1677,13 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
 	/* look for place in the tree for new item */
 	retval = search_item(sb, &key, path);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: "
-				 "i/o failure occurred creating new directory");
+		reiserfs_error(sb, "vs-13080",
+			       "i/o failure occurred creating new directory");
 		return -EIO;
 	}
 	if (retval == ITEM_FOUND) {
 		pathrelse(path);
-		reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: "
+		reiserfs_warning(sb, "vs-13070",
 				 "object with this key exists (%k)",
 				 &(ih->ih_key));
 		return -EEXIST;
@@ -1720,13 +1717,13 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
 	/* look for place in the tree for new item */
 	retval = search_item(sb, &key, path);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: "
-				 "i/o failure occurred creating new symlink");
+		reiserfs_error(sb, "vs-13080",
+			       "i/o failure occurred creating new symlink");
 		return -EIO;
 	}
 	if (retval == ITEM_FOUND) {
 		pathrelse(path);
-		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: "
+		reiserfs_warning(sb, "vs-13080",
 				 "object with this key exists (%k)",
 				 &(ih->ih_key));
 		return -EEXIST;
@@ -1739,7 +1736,7 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
 /* inserts the stat data into the tree, and then calls
    reiserfs_new_directory (to insert ".", ".." item if new object is
    directory) or reiserfs_new_symlink (to insert symlink body if new
-   object is symlink) or nothing (if new object is regular file) 
+   object is symlink) or nothing (if new object is regular file)
 
    NOTE! uid and gid must already be set in the inode.  If we return
    non-zero due to an error, we have to drop the quota previously allocated
@@ -1747,10 +1744,11 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
    if we return non-zero, we also end the transaction.  */
 int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		       struct inode *dir, int mode, const char *symname,
-		       /* 0 for regular, EMTRY_DIR_SIZE for dirs, 
+		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
 		          strlen (symname) for symlinks) */
 		       loff_t i_size, struct dentry *dentry,
-		       struct inode *inode)
+		       struct inode *inode,
+		       struct reiserfs_security_handle *security)
 {
 	struct super_block *sb;
 	struct reiserfs_iget_args args;
@@ -1763,7 +1761,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto out_end_trans;
 	}
@@ -1796,7 +1794,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		goto out_bad_inode;
 	}
 	if (old_format_only(sb))
-		/* not a perfect generation count, as object ids can be reused, but 
+		/* not a perfect generation count, as object ids can be reused, but
 		 ** this is as good as reiserfs can do right now.
 		 ** note that the private part of inode isn't filled in yet, we have
 		 ** to use the directory.
@@ -1917,9 +1915,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		goto out_inserted_sd;
 	}
 
-	/* XXX CHECK THIS */
 	if (reiserfs_posixacl(inode->i_sb)) {
-		retval = reiserfs_inherit_default_acl(dir, dentry, inode);
+		retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
 		if (retval) {
 			err = retval;
 			reiserfs_check_path(&path_to_key);
@@ -1927,10 +1924,23 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 			goto out_inserted_sd;
 		}
 	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
-		reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, "
+		reiserfs_warning(inode->i_sb, "jdm-13090",
+				 "ACLs aren't enabled in the fs, "
 				 "but vfs thinks they are!");
-	} else if (is_reiserfs_priv_object(dir)) {
-		reiserfs_mark_inode_private(inode);
+	} else if (IS_PRIVATE(dir))
+		inode->i_flags |= S_PRIVATE;
+
+	if (security->name) {
+		retval = reiserfs_security_write(th, inode, security);
+		if (retval) {
+			err = retval;
+			reiserfs_check_path(&path_to_key);
+			retval = journal_end(th, th->t_super,
+					     th->t_blocks_allocated);
+			if (retval)
+				err = retval;
+			goto out_inserted_sd;
+		}
 	}
 
 	reiserfs_update_sd(th, inode);
@@ -1947,12 +1957,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	INODE_PKEY(inode)->k_objectid = 0;
 
 	/* Quota change must be inside a transaction for journaling */
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
       out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);
 
@@ -1960,19 +1970,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	inode->i_nlink = 0;
 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
 	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
-
-	/* If we were inheriting an ACL, we need to release the lock so that
-	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
-	 * code really needs to be reworked, but this will take care of it
-	 * for now. -jeffm */
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
-		reiserfs_write_unlock_xattrs(dir->i_sb);
-		iput(inode);
-		reiserfs_write_lock_xattrs(dir->i_sb);
-	} else
-#endif
-		iput(inode);
+	iput(inode);
 	return err;
 }
 
@@ -1989,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 **
 ** on failure, nonzero is returned, page_result and bh_result are untouched.
 */
-static int grab_tail_page(struct inode *p_s_inode,
+static int grab_tail_page(struct inode *inode,
 			  struct page **page_result,
 			  struct buffer_head **bh_result)
 {
@@ -1997,11 +1995,11 @@ static int grab_tail_page(struct inode *p_s_inode,
 	/* we want the page with the last byte in the file,
 	 ** not the page that will hold the next byte for appending
 	 */
-	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
 	unsigned long pos = 0;
 	unsigned long start = 0;
-	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
-	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
+	unsigned long blocksize = inode->i_sb->s_blocksize;
+	unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
 	struct buffer_head *bh;
 	struct buffer_head *head;
 	struct page *page;
@@ -2015,7 +2013,7 @@ static int grab_tail_page(struct inode *p_s_inode,
 	if ((offset & (blocksize - 1)) == 0) {
 		return -ENOENT;
 	}
-	page = grab_cache_page(p_s_inode->i_mapping, index);
+	page = grab_cache_page(inode->i_mapping, index);
 	error = -ENOMEM;
 	if (!page) {
 		goto out;
@@ -2044,10 +2042,8 @@ static int grab_tail_page(struct inode *p_s_inode,
 		 ** I've screwed up the code to find the buffer, or the code to
 		 ** call prepare_write
 		 */
-		reiserfs_warning(p_s_inode->i_sb,
-				 "clm-6000: error reading block %lu on dev %s",
-				 bh->b_blocknr,
-				 reiserfs_bdevname(p_s_inode->i_sb));
+		reiserfs_error(inode->i_sb, "clm-6000",
+			       "error reading block %lu", bh->b_blocknr);
 		error = -EIO;
 		goto unlock;
 	}
@@ -2069,57 +2065,58 @@ static int grab_tail_page(struct inode *p_s_inode,
 **
 ** some code taken from block_truncate_page
 */
-int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
+int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
 {
 	struct reiserfs_transaction_handle th;
 	/* we want the offset for the first byte after the end of the file */
-	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
-	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
+	unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	unsigned blocksize = inode->i_sb->s_blocksize;
 	unsigned length;
 	struct page *page = NULL;
 	int error;
 	struct buffer_head *bh = NULL;
 	int err2;
 
-	reiserfs_write_lock(p_s_inode->i_sb);
+	reiserfs_write_lock(inode->i_sb);
 
-	if (p_s_inode->i_size > 0) {
-		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
-			// -ENOENT means we truncated past the end of the file, 
+	if (inode->i_size > 0) {
+		error = grab_tail_page(inode, &page, &bh);
+		if (error) {
+			// -ENOENT means we truncated past the end of the file,
 			// and get_block_create_0 could not find a block to read in,
 			// which is ok.
 			if (error != -ENOENT)
-				reiserfs_warning(p_s_inode->i_sb,
-						 "clm-6001: grab_tail_page failed %d",
-						 error);
+				reiserfs_error(inode->i_sb, "clm-6001",
+					       "grab_tail_page failed %d",
+					       error);
 			page = NULL;
 			bh = NULL;
 		}
 	}
 
-	/* so, if page != NULL, we have a buffer head for the offset at 
-	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, 
-	 ** then we have an unformatted node.  Otherwise, we have a direct item, 
-	 ** and no zeroing is required on disk.  We zero after the truncate, 
-	 ** because the truncate might pack the item anyway 
+	/* so, if page != NULL, we have a buffer head for the offset at
+	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
+	 ** then we have an unformatted node.  Otherwise, we have a direct item,
+	 ** and no zeroing is required on disk.  We zero after the truncate,
+	 ** because the truncate might pack the item anyway
 	 ** (it will unmap bh if it packs).
 	 */
 	/* it is enough to reserve space in transaction for 2 balancings:
 	   one for "save" link adding and another for the first
 	   cut_from_item. 1 is for update_sd */
-	error = journal_begin(&th, p_s_inode->i_sb,
+	error = journal_begin(&th, inode->i_sb,
 			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
 	if (error)
 		goto out;
-	reiserfs_update_inode_transaction(p_s_inode);
+	reiserfs_update_inode_transaction(inode);
 	if (update_timestamps)
 		/* we are doing real truncate: if the system crashes before the last
 		   transaction of truncating gets committed - on reboot the file
 		   either appears truncated properly or not truncated at all */
-		add_save_link(&th, p_s_inode, 1);
-	err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
+		add_save_link(&th, inode, 1);
+	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
 	error =
-	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
+	    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
 	if (error)
 		goto out;
 
@@ -2130,7 +2127,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
 	}
 	
 	if (update_timestamps) {
-		error = remove_save_link(p_s_inode, 1 /* truncate */ );
+		error = remove_save_link(inode, 1 /* truncate */);
 		if (error)
 			goto out;
 	}
@@ -2149,14 +2146,14 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
 		page_cache_release(page);
 	}
 
-	reiserfs_write_unlock(p_s_inode->i_sb);
+	reiserfs_write_unlock(inode->i_sb);
 	return 0;
       out:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	reiserfs_write_unlock(p_s_inode->i_sb);
+	reiserfs_write_unlock(inode->i_sb);
 	return error;
 }
 
@@ -2208,9 +2205,8 @@ static int map_block_for_writepage(struct inode *inode,
 	/* we've found an unformatted node */
 	if (indirect_item_found(retval, ih)) {
 		if (bytes_copied > 0) {
-			reiserfs_warning(inode->i_sb,
-					 "clm-6002: bytes_copied %d",
-					 bytes_copied);
+			reiserfs_warning(inode->i_sb, "clm-6002",
+					 "bytes_copied %d", bytes_copied);
 		}
 		if (!get_block_num(item, pos_in_item)) {
 			/* crap, we are writing to a hole */
@@ -2267,9 +2263,8 @@ static int map_block_for_writepage(struct inode *inode,
 			goto research;
 		}
 	} else {
-		reiserfs_warning(inode->i_sb,
-				 "clm-6003: bad item inode %lu, device %s",
-				 inode->i_ino, reiserfs_bdevname(inode->i_sb));
+		reiserfs_warning(inode->i_sb, "clm-6003",
+				 "bad item inode %lu", inode->i_ino);
 		retval = -EIO;
 		goto out;
 	}
@@ -2312,8 +2307,8 @@ static int map_block_for_writepage(struct inode *inode,
 	return retval;
 }
 
-/* 
- * mason@suse.com: updated in 2.5.54 to follow the same general io 
+/*
+ * mason@suse.com: updated in 2.5.54 to follow the same general io
  * start/recovery path as __block_write_full_page, along with special
  * code to handle reiserfs tails.
  */
@@ -2453,7 +2448,7 @@ static int reiserfs_write_full_page(struct page *page,
 	unlock_page(page);
 
 	/*
-	 * since any buffer might be the only dirty buffer on the page, 
+	 * since any buffer might be the only dirty buffer on the page,
 	 * the first submit_bh can bring the page out of writeback.
 	 * be careful with the buffers.
 	 */
@@ -2472,8 +2467,8 @@ static int reiserfs_write_full_page(struct page *page,
 	if (nr == 0) {
 		/*
 		 * if this page only had a direct item, it is very possible for
-		 * no io to be required without there being an error.  Or, 
-		 * someone else could have locked them and sent them down the 
+		 * no io to be required without there being an error.  Or,
+		 * someone else could have locked them and sent them down the
 		 * pipe without locking the page
 		 */
 		bh = head;
@@ -2492,7 +2487,7 @@ static int reiserfs_write_full_page(struct page *page,
 
       fail:
 	/* catches various errors, we need to make sure any valid dirty blocks
-	 * get to the media.  The page is currently locked and not marked for 
+	 * get to the media.  The page is currently locked and not marked for
 	 * writeback
 	 */
 	ClearPageUptodate(page);
@@ -3119,7 +3114,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 				if (error)
 					goto out;
 				error =
-				    DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+				    vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 				if (error) {
 					journal_end(&th, inode->i_sb,
 						    jbegin_count);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 830332021ed4..0ccc3fdda7bf 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -189,7 +189,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	}
 
 	/* we unpack by finding the page with the tail, and calling
-	 ** reiserfs_prepare_write on that page.  This will force a 
+	 ** reiserfs_prepare_write on that page.  This will force a
 	 ** reiserfs_get_block to unpack the tail for us.
 	 */
 	index = inode->i_size >> PAGE_CACHE_SHIFT;
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index 9475557ab499..72cb1cc51b87 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -97,7 +97,8 @@ static int sd_unit_num(struct virtual_item *vi)
 
 static void sd_print_vi(struct virtual_item *vi)
 {
-	reiserfs_warning(NULL, "STATDATA, index %d, type 0x%x, %h",
+	reiserfs_warning(NULL, "reiserfs-16100",
+			 "STATDATA, index %d, type 0x%x, %h",
 			 vi->vi_index, vi->vi_type, vi->vi_ih);
 }
 
@@ -190,7 +191,8 @@ static int direct_unit_num(struct virtual_item *vi)
 
 static void direct_print_vi(struct virtual_item *vi)
 {
-	reiserfs_warning(NULL, "DIRECT, index %d, type 0x%x, %h",
+	reiserfs_warning(NULL, "reiserfs-16101",
+			 "DIRECT, index %d, type 0x%x, %h",
 			 vi->vi_index, vi->vi_type, vi->vi_ih);
 }
 
@@ -278,7 +280,7 @@ static void indirect_print_item(struct item_head *ih, char *item)
 	unp = (__le32 *) item;
 
 	if (ih_item_len(ih) % UNFM_P_SIZE)
-		reiserfs_warning(NULL, "indirect_print_item: invalid item len");
+		reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
 
 	printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
 	for (j = 0; j < I_UNFM_NUM(ih); j++) {
@@ -334,7 +336,8 @@ static int indirect_unit_num(struct virtual_item *vi)
 
 static void indirect_print_vi(struct virtual_item *vi)
 {
-	reiserfs_warning(NULL, "INDIRECT, index %d, type 0x%x, %h",
+	reiserfs_warning(NULL, "reiserfs-16103",
+			 "INDIRECT, index %d, type 0x%x, %h",
 			 vi->vi_index, vi->vi_type, vi->vi_ih);
 }
 
@@ -359,7 +362,7 @@ static struct item_operations indirect_ops = {
 
 static int direntry_bytes_number(struct item_head *ih, int block_size)
 {
-	reiserfs_warning(NULL, "vs-16090: direntry_bytes_number: "
+	reiserfs_warning(NULL, "vs-16090",
 			 "bytes number is asked for direntry");
 	return 0;
 }
@@ -514,8 +517,9 @@ static int direntry_create_vi(struct virtual_node *vn,
 		    ((is_affected
 		      && (vn->vn_mode == M_PASTE
 			  || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
-			reiserfs_panic(NULL,
-				       "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item",
+			reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
+				       "insert_size==%d), invalid length of "
+				       "directory item",
 				       vn->vn_mode, insert_size);
 		}
 	}
@@ -546,7 +550,8 @@ static int direntry_check_left(struct virtual_item *vi, int free,
 	}
 
 	if (entries == dir_u->entry_count) {
-		reiserfs_panic(NULL, "free space %d, entry_count %d\n", free,
+		reiserfs_panic(NULL, "item_ops-1",
+			       "free space %d, entry_count %d", free,
 			       dir_u->entry_count);
 	}
 
@@ -614,7 +619,8 @@ static void direntry_print_vi(struct virtual_item *vi)
 	int i;
 	struct direntry_uarea *dir_u = vi->vi_uarea;
 
-	reiserfs_warning(NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
+	reiserfs_warning(NULL, "reiserfs-16104",
+			 "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
 			 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
 	printk("%d entries: ", dir_u->entry_count);
 	for (i = 0; i < dir_u->entry_count; i++)
@@ -642,43 +648,43 @@ static struct item_operations direntry_ops = {
 //
 static int errcatch_bytes_number(struct item_head *ih, int block_size)
 {
-	reiserfs_warning(NULL,
-			 "green-16001: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16001",
+			 "Invalid item type observed, run fsck ASAP");
 	return 0;
 }
 
 static void errcatch_decrement_key(struct cpu_key *key)
 {
-	reiserfs_warning(NULL,
-			 "green-16002: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16002",
+			 "Invalid item type observed, run fsck ASAP");
 }
 
 static int errcatch_is_left_mergeable(struct reiserfs_key *key,
 				      unsigned long bsize)
 {
-	reiserfs_warning(NULL,
-			 "green-16003: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16003",
+			 "Invalid item type observed, run fsck ASAP");
 	return 0;
 }
 
 static void errcatch_print_item(struct item_head *ih, char *item)
 {
-	reiserfs_warning(NULL,
-			 "green-16004: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16004",
+			 "Invalid item type observed, run fsck ASAP");
 }
 
 static void errcatch_check_item(struct item_head *ih, char *item)
 {
-	reiserfs_warning(NULL,
-			 "green-16005: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16005",
+			 "Invalid item type observed, run fsck ASAP");
 }
 
 static int errcatch_create_vi(struct virtual_node *vn,
 			      struct virtual_item *vi,
 			      int is_affected, int insert_size)
 {
-	reiserfs_warning(NULL,
-			 "green-16006: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16006",
+			 "Invalid item type observed, run fsck ASAP");
 	return 0;		// We might return -1 here as well, but it won't help as create_virtual_node() from where
 	// this operation is called from is of return type void.
 }
@@ -686,36 +692,36 @@ static int errcatch_create_vi(struct virtual_node *vn,
 static int errcatch_check_left(struct virtual_item *vi, int free,
 			       int start_skip, int end_skip)
 {
-	reiserfs_warning(NULL,
-			 "green-16007: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16007",
+			 "Invalid item type observed, run fsck ASAP");
 	return -1;
 }
 
 static int errcatch_check_right(struct virtual_item *vi, int free)
 {
-	reiserfs_warning(NULL,
-			 "green-16008: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16008",
+			 "Invalid item type observed, run fsck ASAP");
 	return -1;
 }
 
 static int errcatch_part_size(struct virtual_item *vi, int first, int count)
 {
-	reiserfs_warning(NULL,
-			 "green-16009: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16009",
+			 "Invalid item type observed, run fsck ASAP");
 	return 0;
 }
 
 static int errcatch_unit_num(struct virtual_item *vi)
 {
-	reiserfs_warning(NULL,
-			 "green-16010: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16010",
+			 "Invalid item type observed, run fsck ASAP");
 	return 0;
 }
 
 static void errcatch_print_vi(struct virtual_item *vi)
 {
-	reiserfs_warning(NULL,
-			 "green-16011: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL, "green-16011",
+			 "Invalid item type observed, run fsck ASAP");
 }
 
 static struct item_operations errcatch_ops = {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9643c3bbeb3b..77f5bb746bf0 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1,36 +1,36 @@
 /*
 ** Write ahead logging implementation copyright Chris Mason 2000
 **
-** The background commits make this code very interelated, and 
+** The background commits make this code very interelated, and
 ** overly complex.  I need to rethink things a bit....The major players:
 **
-** journal_begin -- call with the number of blocks you expect to log.  
+** journal_begin -- call with the number of blocks you expect to log.
 **                  If the current transaction is too
-** 		    old, it will block until the current transaction is 
+** 		    old, it will block until the current transaction is
 ** 		    finished, and then start a new one.
-**		    Usually, your transaction will get joined in with 
+**		    Usually, your transaction will get joined in with
 **                  previous ones for speed.
 **
-** journal_join  -- same as journal_begin, but won't block on the current 
+** journal_join  -- same as journal_begin, but won't block on the current
 **                  transaction regardless of age.  Don't ever call
-**                  this.  Ever.  There are only two places it should be 
+**                  this.  Ever.  There are only two places it should be
 **                  called from, and they are both inside this file.
 **
-** journal_mark_dirty -- adds blocks into this transaction.  clears any flags 
+** journal_mark_dirty -- adds blocks into this transaction.  clears any flags
 **                       that might make them get sent to disk
-**                       and then marks them BH_JDirty.  Puts the buffer head 
-**                       into the current transaction hash.  
+**                       and then marks them BH_JDirty.  Puts the buffer head
+**                       into the current transaction hash.
 **
 ** journal_end -- if the current transaction is batchable, it does nothing
 **                   otherwise, it could do an async/synchronous commit, or
-**                   a full flush of all log and real blocks in the 
+**                   a full flush of all log and real blocks in the
 **                   transaction.
 **
-** flush_old_commits -- if the current transaction is too old, it is ended and 
-**                      commit blocks are sent to disk.  Forces commit blocks 
-**                      to disk for all backgrounded commits that have been 
+** flush_old_commits -- if the current transaction is too old, it is ended and
+**                      commit blocks are sent to disk.  Forces commit blocks
+**                      to disk for all backgrounded commits that have been
 **                      around too long.
-**		     -- Note, if you call this as an immediate flush from 
+**		     -- Note, if you call this as an immediate flush from
 **		        from within kupdate, it will ignore the immediate flag
 */
 
@@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s,
 			     struct reiserfs_journal_list *jl, int flushall);
 static int can_dirty(struct reiserfs_journal_cnode *cn);
 static int journal_join(struct reiserfs_transaction_handle *th,
-			struct super_block *p_s_sb, unsigned long nblocks);
+			struct super_block *sb, unsigned long nblocks);
 static int release_journal_dev(struct super_block *super,
 			       struct reiserfs_journal *journal);
 static int dirty_one_transaction(struct super_block *s,
@@ -113,12 +113,12 @@ enum {
 };
 
 static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
-			      struct super_block *p_s_sb,
+			      struct super_block *sb,
 			      unsigned long nblocks, int join);
 
-static void init_journal_hash(struct super_block *p_s_sb)
+static void init_journal_hash(struct super_block *sb)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	memset(journal->j_hash_table, 0,
 	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
 }
@@ -145,7 +145,7 @@ static void disable_barrier(struct super_block *s)
 }
 
 static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
-							 *p_s_sb)
+							 *sb)
 {
 	struct reiserfs_bitmap_node *bn;
 	static int id;
@@ -154,7 +154,7 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 	if (!bn) {
 		return NULL;
 	}
-	bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS);
+	bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
 	if (!bn->data) {
 		kfree(bn);
 		return NULL;
@@ -164,9 +164,9 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 	return bn;
 }
 
-static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb)
+static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_bitmap_node *bn = NULL;
 	struct list_head *entry = journal->j_bitmap_nodes.next;
 
@@ -176,21 +176,21 @@ static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb)
 	if (entry != &journal->j_bitmap_nodes) {
 		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
 		list_del(entry);
-		memset(bn->data, 0, p_s_sb->s_blocksize);
+		memset(bn->data, 0, sb->s_blocksize);
 		journal->j_free_bitmap_nodes--;
 		return bn;
 	}
-	bn = allocate_bitmap_node(p_s_sb);
+	bn = allocate_bitmap_node(sb);
 	if (!bn) {
 		yield();
 		goto repeat;
 	}
 	return bn;
 }
-static inline void free_bitmap_node(struct super_block *p_s_sb,
+static inline void free_bitmap_node(struct super_block *sb,
 				    struct reiserfs_bitmap_node *bn)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	journal->j_used_bitmap_nodes--;
 	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
 		kfree(bn->data);
@@ -201,46 +201,46 @@ static inline void free_bitmap_node(struct super_block *p_s_sb,
 	}
 }
 
-static void allocate_bitmap_nodes(struct super_block *p_s_sb)
+static void allocate_bitmap_nodes(struct super_block *sb)
 {
 	int i;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_bitmap_node *bn = NULL;
 	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
-		bn = allocate_bitmap_node(p_s_sb);
+		bn = allocate_bitmap_node(sb);
 		if (bn) {
 			list_add(&bn->list, &journal->j_bitmap_nodes);
 			journal->j_free_bitmap_nodes++;
 		} else {
-			break;	// this is ok, we'll try again when more are needed 
+			break;	/* this is ok, we'll try again when more are needed */
 		}
 	}
 }
 
-static int set_bit_in_list_bitmap(struct super_block *p_s_sb,
+static int set_bit_in_list_bitmap(struct super_block *sb,
 				  b_blocknr_t block,
 				  struct reiserfs_list_bitmap *jb)
 {
-	unsigned int bmap_nr = block / (p_s_sb->s_blocksize << 3);
-	unsigned int bit_nr = block % (p_s_sb->s_blocksize << 3);
+	unsigned int bmap_nr = block / (sb->s_blocksize << 3);
+	unsigned int bit_nr = block % (sb->s_blocksize << 3);
 
 	if (!jb->bitmaps[bmap_nr]) {
-		jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb);
+		jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
 	}
 	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
 	return 0;
 }
 
-static void cleanup_bitmap_list(struct super_block *p_s_sb,
+static void cleanup_bitmap_list(struct super_block *sb,
 				struct reiserfs_list_bitmap *jb)
 {
 	int i;
 	if (jb->bitmaps == NULL)
 		return;
 
-	for (i = 0; i < reiserfs_bmap_count(p_s_sb); i++) {
+	for (i = 0; i < reiserfs_bmap_count(sb); i++) {
 		if (jb->bitmaps[i]) {
-			free_bitmap_node(p_s_sb, jb->bitmaps[i]);
+			free_bitmap_node(sb, jb->bitmaps[i]);
 			jb->bitmaps[i] = NULL;
 		}
 	}
@@ -249,7 +249,7 @@ static void cleanup_bitmap_list(struct super_block *p_s_sb,
 /*
 ** only call this on FS unmount.
 */
-static int free_list_bitmaps(struct super_block *p_s_sb,
+static int free_list_bitmaps(struct super_block *sb,
 			     struct reiserfs_list_bitmap *jb_array)
 {
 	int i;
@@ -257,16 +257,16 @@ static int free_list_bitmaps(struct super_block *p_s_sb,
 	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
 		jb = jb_array + i;
 		jb->journal_list = NULL;
-		cleanup_bitmap_list(p_s_sb, jb);
+		cleanup_bitmap_list(sb, jb);
 		vfree(jb->bitmaps);
 		jb->bitmaps = NULL;
 	}
 	return 0;
 }
 
-static int free_bitmap_nodes(struct super_block *p_s_sb)
+static int free_bitmap_nodes(struct super_block *sb)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct list_head *next = journal->j_bitmap_nodes.next;
 	struct reiserfs_bitmap_node *bn;
 
@@ -283,10 +283,10 @@ static int free_bitmap_nodes(struct super_block *p_s_sb)
 }
 
 /*
-** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. 
+** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
 ** jb_array is the array to be filled in.
 */
-int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb,
+int reiserfs_allocate_list_bitmaps(struct super_block *sb,
 				   struct reiserfs_list_bitmap *jb_array,
 				   unsigned int bmap_nr)
 {
@@ -300,30 +300,30 @@ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb,
 		jb->journal_list = NULL;
 		jb->bitmaps = vmalloc(mem);
 		if (!jb->bitmaps) {
-			reiserfs_warning(p_s_sb,
-					 "clm-2000, unable to allocate bitmaps for journal lists");
+			reiserfs_warning(sb, "clm-2000", "unable to "
+					 "allocate bitmaps for journal lists");
 			failed = 1;
 			break;
 		}
 		memset(jb->bitmaps, 0, mem);
 	}
 	if (failed) {
-		free_list_bitmaps(p_s_sb, jb_array);
+		free_list_bitmaps(sb, jb_array);
 		return -1;
 	}
 	return 0;
 }
 
 /*
-** find an available list bitmap.  If you can't find one, flush a commit list 
+** find an available list bitmap.  If you can't find one, flush a commit list
 ** and try again
 */
-static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb,
+static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
 						    struct reiserfs_journal_list
 						    *jl)
 {
 	int i, j;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_list_bitmap *jb = NULL;
 
 	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
@@ -331,7 +331,7 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb,
 		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
 		jb = journal->j_list_bitmap + i;
 		if (journal->j_list_bitmap[i].journal_list) {
-			flush_commit_list(p_s_sb,
+			flush_commit_list(sb,
 					  journal->j_list_bitmap[i].
 					  journal_list, 1);
 			if (!journal->j_list_bitmap[i].journal_list) {
@@ -348,7 +348,7 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb,
 	return jb;
 }
 
-/* 
+/*
 ** allocates a new chunk of X nodes, and links them all together as a list.
 ** Uses the cnode->next and cnode->prev pointers
 ** returns NULL on failure
@@ -376,14 +376,14 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
 }
 
 /*
-** pulls a cnode off the free list, or returns NULL on failure 
+** pulls a cnode off the free list, or returns NULL on failure
 */
-static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb)
+static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
 {
 	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
-	reiserfs_check_lock_depth(p_s_sb, "get_cnode");
+	reiserfs_check_lock_depth(sb, "get_cnode");
 
 	if (journal->j_cnode_free <= 0) {
 		return NULL;
@@ -403,14 +403,14 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb)
 }
 
 /*
-** returns a cnode to the free list 
+** returns a cnode to the free list
 */
-static void free_cnode(struct super_block *p_s_sb,
+static void free_cnode(struct super_block *sb,
 		       struct reiserfs_journal_cnode *cn)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
-	reiserfs_check_lock_depth(p_s_sb, "free_cnode");
+	reiserfs_check_lock_depth(sb, "free_cnode");
 
 	journal->j_cnode_used--;
 	journal->j_cnode_free++;
@@ -436,8 +436,8 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
 {
 #ifdef CONFIG_SMP
 	if (current->lock_depth < 0) {
-		reiserfs_panic(sb, "%s called without kernel lock held",
-			       caller);
+		reiserfs_panic(sb, "journal-1", "%s called without kernel "
+			       "lock held", caller);
 	}
 #else
 	;
@@ -481,11 +481,11 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
 ** reject it on the next call to reiserfs_in_journal
 **
 */
-int reiserfs_in_journal(struct super_block *p_s_sb,
+int reiserfs_in_journal(struct super_block *sb,
 			unsigned int bmap_nr, int bit_nr, int search_all,
 			b_blocknr_t * next_zero_bit)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_journal_cnode *cn;
 	struct reiserfs_list_bitmap *jb;
 	int i;
@@ -493,14 +493,14 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
 
 	*next_zero_bit = 0;	/* always start this at zero. */
 
-	PROC_INFO_INC(p_s_sb, journal.in_journal);
+	PROC_INFO_INC(sb, journal.in_journal);
 	/* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
 	 ** if we crash before the transaction that freed it commits,  this transaction won't
 	 ** have committed either, and the block will never be written
 	 */
 	if (search_all) {
 		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-			PROC_INFO_INC(p_s_sb, journal.in_journal_bitmap);
+			PROC_INFO_INC(sb, journal.in_journal_bitmap);
 			jb = journal->j_list_bitmap + i;
 			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
 			    test_bit(bit_nr,
@@ -510,28 +510,28 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
 				    find_next_zero_bit((unsigned long *)
 						       (jb->bitmaps[bmap_nr]->
 							data),
-						       p_s_sb->s_blocksize << 3,
+						       sb->s_blocksize << 3,
 						       bit_nr + 1);
 				return 1;
 			}
 		}
 	}
 
-	bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr;
+	bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
 	/* is it in any old transactions? */
 	if (search_all
 	    && (cn =
-		get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) {
+		get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
 		return 1;
 	}
 
 	/* is it in the current transaction.  This should never happen */
-	if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) {
+	if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
 		BUG();
 		return 1;
 	}
 
-	PROC_INFO_INC(p_s_sb, journal.in_journal_reusable);
+	PROC_INFO_INC(sb, journal.in_journal_reusable);
 	/* safe for reuse */
 	return 0;
 }
@@ -553,16 +553,16 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
 }
 
 /* lock the current transaction */
-static inline void lock_journal(struct super_block *p_s_sb)
+static inline void lock_journal(struct super_block *sb)
 {
-	PROC_INFO_INC(p_s_sb, journal.lock_journal);
-	mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex);
+	PROC_INFO_INC(sb, journal.lock_journal);
+	mutex_lock(&SB_JOURNAL(sb)->j_mutex);
 }
 
 /* unlock the current transaction */
-static inline void unlock_journal(struct super_block *p_s_sb)
+static inline void unlock_journal(struct super_block *sb)
 {
-	mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex);
+	mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
 }
 
 static inline void get_journal_list(struct reiserfs_journal_list *jl)
@@ -574,7 +574,7 @@ static inline void put_journal_list(struct super_block *s,
 				    struct reiserfs_journal_list *jl)
 {
 	if (jl->j_refcount < 1) {
-		reiserfs_panic(s, "trans id %lu, refcount at %d",
+		reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
 			       jl->j_trans_id, jl->j_refcount);
 	}
 	if (--jl->j_refcount == 0)
@@ -586,20 +586,20 @@ static inline void put_journal_list(struct super_block *s,
 ** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
 ** transaction.
 */
-static void cleanup_freed_for_journal_list(struct super_block *p_s_sb,
+static void cleanup_freed_for_journal_list(struct super_block *sb,
 					   struct reiserfs_journal_list *jl)
 {
 
 	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
 	if (jb) {
-		cleanup_bitmap_list(p_s_sb, jb);
+		cleanup_bitmap_list(sb, jb);
 	}
 	jl->j_list_bitmap->journal_list = NULL;
 	jl->j_list_bitmap = NULL;
 }
 
 static int journal_list_still_alive(struct super_block *s,
-				    unsigned long trans_id)
+				    unsigned int trans_id)
 {
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	struct list_head *entry = &journal->j_journal_list;
@@ -644,8 +644,8 @@ static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 	char b[BDEVNAME_SIZE];
 
 	if (buffer_journaled(bh)) {
-		reiserfs_warning(NULL,
-				 "clm-2084: pinned buffer %lu:%s sent to disk",
+		reiserfs_warning(NULL, "clm-2084",
+				 "pinned buffer %lu:%s sent to disk",
 				 bh->b_blocknr, bdevname(bh->b_bdev, b));
 	}
 	if (uptodate)
@@ -933,9 +933,9 @@ static int flush_older_commits(struct super_block *s,
 	struct reiserfs_journal_list *other_jl;
 	struct reiserfs_journal_list *first_jl;
 	struct list_head *entry;
-	unsigned long trans_id = jl->j_trans_id;
-	unsigned long other_trans_id;
-	unsigned long first_trans_id;
+	unsigned int trans_id = jl->j_trans_id;
+	unsigned int other_trans_id;
+	unsigned int first_trans_id;
 
       find_first:
 	/*
@@ -1014,7 +1014,7 @@ static int flush_commit_list(struct super_block *s,
 	int i;
 	b_blocknr_t bn;
 	struct buffer_head *tbh = NULL;
-	unsigned long trans_id = jl->j_trans_id;
+	unsigned int trans_id = jl->j_trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	int barrier = 0;
 	int retval = 0;
@@ -1122,7 +1122,8 @@ static int flush_commit_list(struct super_block *s,
 			sync_dirty_buffer(tbh);
 		if (unlikely(!buffer_uptodate(tbh))) {
 #ifdef CONFIG_REISERFS_CHECK
-			reiserfs_warning(s, "journal-601, buffer write failed");
+			reiserfs_warning(s, "journal-601",
+					 "buffer write failed");
 #endif
 			retval = -EIO;
 		}
@@ -1154,14 +1155,14 @@ static int flush_commit_list(struct super_block *s,
 	 * up propagating the write error out to the filesystem. */
 	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
-		reiserfs_warning(s, "journal-615: buffer write failed");
+		reiserfs_warning(s, "journal-615", "buffer write failed");
 #endif
 		retval = -EIO;
 	}
 	bforget(jl->j_commit_bh);
 	if (journal->j_last_commit_id != 0 &&
 	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
-		reiserfs_warning(s, "clm-2200: last commit %lu, current %lu",
+		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
 				 journal->j_last_commit_id, jl->j_trans_id);
 	}
 	journal->j_last_commit_id = jl->j_trans_id;
@@ -1191,8 +1192,8 @@ static int flush_commit_list(struct super_block *s,
 }
 
 /*
-** flush_journal_list frequently needs to find a newer transaction for a given block.  This does that, or 
-** returns NULL if it can't find anything 
+** flush_journal_list frequently needs to find a newer transaction for a given block.  This does that, or
+** returns NULL if it can't find anything
 */
 static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
 							  reiserfs_journal_cnode
@@ -1236,11 +1237,11 @@ static void remove_journal_hash(struct super_block *,
 ** journal list for this transaction.  Aside from freeing the cnode, this also allows the
 ** block to be reallocated for data blocks if it had been deleted.
 */
-static void remove_all_from_journal_list(struct super_block *p_s_sb,
+static void remove_all_from_journal_list(struct super_block *sb,
 					 struct reiserfs_journal_list *jl,
 					 int debug)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_journal_cnode *cn, *last;
 	cn = jl->j_realblock;
 
@@ -1250,18 +1251,18 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb,
 	while (cn) {
 		if (cn->blocknr != 0) {
 			if (debug) {
-				reiserfs_warning(p_s_sb,
+				reiserfs_warning(sb, "reiserfs-2201",
 						 "block %u, bh is %d, state %ld",
 						 cn->blocknr, cn->bh ? 1 : 0,
 						 cn->state);
 			}
 			cn->state = 0;
-			remove_journal_hash(p_s_sb, journal->j_list_hash_table,
+			remove_journal_hash(sb, journal->j_list_hash_table,
 					    jl, cn->blocknr, 1);
 		}
 		last = cn;
 		cn = cn->next;
-		free_cnode(p_s_sb, last);
+		free_cnode(sb, last);
 	}
 	jl->j_realblock = NULL;
 }
@@ -1273,12 +1274,12 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb,
 ** called by flush_journal_list, before it calls remove_all_from_journal_list
 **
 */
-static int _update_journal_header_block(struct super_block *p_s_sb,
+static int _update_journal_header_block(struct super_block *sb,
 					unsigned long offset,
-					unsigned long trans_id)
+					unsigned int trans_id)
 {
 	struct reiserfs_journal_header *jh;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
 	if (reiserfs_is_journal_aborted(journal))
 		return -EIO;
@@ -1288,8 +1289,8 @@ static int _update_journal_header_block(struct super_block *p_s_sb,
 			wait_on_buffer((journal->j_header_bh));
 			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
-				reiserfs_warning(p_s_sb,
-						 "journal-699: buffer write failed");
+				reiserfs_warning(sb, "journal-699",
+						 "buffer write failed");
 #endif
 				return -EIO;
 			}
@@ -1302,49 +1303,49 @@ static int _update_journal_header_block(struct super_block *p_s_sb,
 		jh->j_first_unflushed_offset = cpu_to_le32(offset);
 		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
 
-		if (reiserfs_barrier_flush(p_s_sb)) {
+		if (reiserfs_barrier_flush(sb)) {
 			int ret;
 			lock_buffer(journal->j_header_bh);
 			ret = submit_barrier_buffer(journal->j_header_bh);
 			if (ret == -EOPNOTSUPP) {
 				set_buffer_uptodate(journal->j_header_bh);
-				disable_barrier(p_s_sb);
+				disable_barrier(sb);
 				goto sync;
 			}
 			wait_on_buffer(journal->j_header_bh);
-			check_barrier_completion(p_s_sb, journal->j_header_bh);
+			check_barrier_completion(sb, journal->j_header_bh);
 		} else {
 		      sync:
 			set_buffer_dirty(journal->j_header_bh);
 			sync_dirty_buffer(journal->j_header_bh);
 		}
 		if (!buffer_uptodate(journal->j_header_bh)) {
-			reiserfs_warning(p_s_sb,
-					 "journal-837: IO error during journal replay");
+			reiserfs_warning(sb, "journal-837",
+					 "IO error during journal replay");
 			return -EIO;
 		}
 	}
 	return 0;
 }
 
-static int update_journal_header_block(struct super_block *p_s_sb,
+static int update_journal_header_block(struct super_block *sb,
 				       unsigned long offset,
-				       unsigned long trans_id)
+				       unsigned int trans_id)
 {
-	return _update_journal_header_block(p_s_sb, offset, trans_id);
+	return _update_journal_header_block(sb, offset, trans_id);
 }
 
-/* 
-** flush any and all journal lists older than you are 
+/*
+** flush any and all journal lists older than you are
 ** can only be called from flush_journal_list
 */
-static int flush_older_journal_lists(struct super_block *p_s_sb,
+static int flush_older_journal_lists(struct super_block *sb,
 				     struct reiserfs_journal_list *jl)
 {
 	struct list_head *entry;
 	struct reiserfs_journal_list *other_jl;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
-	unsigned long trans_id = jl->j_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	unsigned int trans_id = jl->j_trans_id;
 
 	/* we know we are the only ones flushing things, no extra race
 	 * protection is required.
@@ -1358,7 +1359,7 @@ static int flush_older_journal_lists(struct super_block *p_s_sb,
 	if (other_jl->j_trans_id < trans_id) {
 		BUG_ON(other_jl->j_refcount <= 0);
 		/* do not flush all */
-		flush_journal_list(p_s_sb, other_jl, 0);
+		flush_journal_list(sb, other_jl, 0);
 
 		/* other_jl is now deleted from the list */
 		goto restart;
@@ -1381,8 +1382,8 @@ static void del_from_work_list(struct super_block *s,
 ** always set flushall to 1, unless you are calling from inside
 ** flush_journal_list
 **
-** IMPORTANT.  This can only be called while there are no journal writers, 
-** and the journal is locked.  That means it can only be called from 
+** IMPORTANT.  This can only be called while there are no journal writers,
+** and the journal is locked.  That means it can only be called from
 ** do_journal_end, or by journal_release
 */
 static int flush_journal_list(struct super_block *s,
@@ -1401,8 +1402,7 @@ static int flush_journal_list(struct super_block *s,
 	BUG_ON(j_len_saved <= 0);
 
 	if (atomic_read(&journal->j_wcount) != 0) {
-		reiserfs_warning(s,
-				 "clm-2048: flush_journal_list called with wcount %d",
+		reiserfs_warning(s, "clm-2048", "called with wcount %d",
 				 atomic_read(&journal->j_wcount));
 	}
 	BUG_ON(jl->j_trans_id == 0);
@@ -1416,8 +1416,7 @@ static int flush_journal_list(struct super_block *s,
 
 	count = 0;
 	if (j_len_saved > journal->j_trans_max) {
-		reiserfs_panic(s,
-			       "journal-715: flush_journal_list, length is %lu, trans id %lu\n",
+		reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
 			       j_len_saved, jl->j_trans_id);
 		return 0;
 	}
@@ -1430,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
 		goto flush_older_and_return;
 	}
 
-	/* start by putting the commit list on disk.  This will also flush 
+	/* start by putting the commit list on disk.  This will also flush
 	 ** the commit lists of any olders transactions
 	 */
 	flush_commit_list(s, jl, 1);
@@ -1445,12 +1444,12 @@ static int flush_journal_list(struct super_block *s,
 		goto flush_older_and_return;
 	}
 
-	/* loop through each cnode, see if we need to write it, 
-	 ** or wait on a more recent transaction, or just ignore it 
+	/* loop through each cnode, see if we need to write it,
+	 ** or wait on a more recent transaction, or just ignore it
 	 */
 	if (atomic_read(&(journal->j_wcount)) != 0) {
-		reiserfs_panic(s,
-			       "journal-844: panic journal list is flushing, wcount is not 0\n");
+		reiserfs_panic(s, "journal-844", "journal list is flushing, "
+			       "wcount is not 0");
 	}
 	cn = jl->j_realblock;
 	while (cn) {
@@ -1474,8 +1473,8 @@ static int flush_journal_list(struct super_block *s,
 		if (!pjl && cn->bh) {
 			saved_bh = cn->bh;
 
-			/* we do this to make sure nobody releases the buffer while 
-			 ** we are working with it 
+			/* we do this to make sure nobody releases the buffer while
+			 ** we are working with it
 			 */
 			get_bh(saved_bh);
 
@@ -1498,8 +1497,8 @@ static int flush_journal_list(struct super_block *s,
 			goto free_cnode;
 		}
 
-		/* bh == NULL when the block got to disk on its own, OR, 
-		 ** the block got freed in a future transaction 
+		/* bh == NULL when the block got to disk on its own, OR,
+		 ** the block got freed in a future transaction
 		 */
 		if (saved_bh == NULL) {
 			goto free_cnode;
@@ -1510,8 +1509,8 @@ static int flush_journal_list(struct super_block *s,
 		 ** is not marked JDirty_wait
 		 */
 		if ((!was_jwait) && !buffer_locked(saved_bh)) {
-			reiserfs_warning(s,
-					 "journal-813: BAD! buffer %llu %cdirty %cjwait, "
+			reiserfs_warning(s, "journal-813",
+					 "BAD! buffer %llu %cdirty %cjwait, "
 					 "not in a newer tranasction",
 					 (unsigned long long)saved_bh->
 					 b_blocknr, was_dirty ? ' ' : '!',
@@ -1529,8 +1528,8 @@ static int flush_journal_list(struct super_block *s,
 				unlock_buffer(saved_bh);
 			count++;
 		} else {
-			reiserfs_warning(s,
-					 "clm-2082: Unable to flush buffer %llu in %s",
+			reiserfs_warning(s, "clm-2082",
+					 "Unable to flush buffer %llu in %s",
 					 (unsigned long long)saved_bh->
 					 b_blocknr, __func__);
 		}
@@ -1541,8 +1540,8 @@ static int flush_journal_list(struct super_block *s,
 			/* we incremented this to keep others from taking the buffer head away */
 			put_bh(saved_bh);
 			if (atomic_read(&(saved_bh->b_count)) < 0) {
-				reiserfs_warning(s,
-						 "journal-945: saved_bh->b_count < 0");
+				reiserfs_warning(s, "journal-945",
+						 "saved_bh->b_count < 0");
 			}
 		}
 	}
@@ -1551,18 +1550,18 @@ static int flush_journal_list(struct super_block *s,
 		while (cn) {
 			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
 				if (!cn->bh) {
-					reiserfs_panic(s,
-						       "journal-1011: cn->bh is NULL\n");
+					reiserfs_panic(s, "journal-1011",
+						       "cn->bh is NULL");
 				}
 				wait_on_buffer(cn->bh);
 				if (!cn->bh) {
-					reiserfs_panic(s,
-						       "journal-1012: cn->bh is NULL\n");
+					reiserfs_panic(s, "journal-1012",
+						       "cn->bh is NULL");
 				}
 				if (unlikely(!buffer_uptodate(cn->bh))) {
 #ifdef CONFIG_REISERFS_CHECK
-					reiserfs_warning(s,
-							 "journal-949: buffer write failed\n");
+					reiserfs_warning(s, "journal-949",
+							 "buffer write failed");
 #endif
 					err = -EIO;
 				}
@@ -1587,7 +1586,7 @@ static int flush_journal_list(struct super_block *s,
 			       __func__);
       flush_older_and_return:
 
-	/* before we can update the journal header block, we _must_ flush all 
+	/* before we can update the journal header block, we _must_ flush all
 	 ** real blocks from all older transactions to disk.  This is because
 	 ** once the header block is updated, this transaction will not be
 	 ** replayed after a crash
@@ -1597,7 +1596,7 @@ static int flush_journal_list(struct super_block *s,
 	}
 
 	err = journal->j_errno;
-	/* before we can remove everything from the hash tables for this 
+	/* before we can remove everything from the hash tables for this
 	 ** transaction, we must make sure it can never be replayed
 	 **
 	 ** since we are only called from do_journal_end, we know for sure there
@@ -1623,7 +1622,7 @@ static int flush_journal_list(struct super_block *s,
 
 	if (journal->j_last_flush_id != 0 &&
 	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
-		reiserfs_warning(s, "clm-2201: last flush %lu, current %lu",
+		reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
 				 journal->j_last_flush_id, jl->j_trans_id);
 	}
 	journal->j_last_flush_id = jl->j_trans_id;
@@ -1758,13 +1757,13 @@ static int dirty_one_transaction(struct super_block *s,
 static int kupdate_transactions(struct super_block *s,
 				struct reiserfs_journal_list *jl,
 				struct reiserfs_journal_list **next_jl,
-				unsigned long *next_trans_id,
+				unsigned int *next_trans_id,
 				int num_blocks, int num_trans)
 {
 	int ret = 0;
 	int written = 0;
 	int transactions_flushed = 0;
-	unsigned long orig_trans_id = jl->j_trans_id;
+	unsigned int orig_trans_id = jl->j_trans_id;
 	struct buffer_chunk chunk;
 	struct list_head *entry;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
@@ -1833,7 +1832,7 @@ static int flush_used_journal_lists(struct super_block *s,
 	int limit = 256;
 	struct reiserfs_journal_list *tjl;
 	struct reiserfs_journal_list *flush_jl;
-	unsigned long trans_id;
+	unsigned int trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 
 	flush_jl = tjl = jl;
@@ -1909,22 +1908,22 @@ void remove_journal_hash(struct super_block *sb,
 	}
 }
 
-static void free_journal_ram(struct super_block *p_s_sb)
+static void free_journal_ram(struct super_block *sb)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	kfree(journal->j_current_jl);
 	journal->j_num_lists--;
 
 	vfree(journal->j_cnode_free_orig);
-	free_list_bitmaps(p_s_sb, journal->j_list_bitmap);
-	free_bitmap_nodes(p_s_sb);	/* must be after free_list_bitmaps */
+	free_list_bitmaps(sb, journal->j_list_bitmap);
+	free_bitmap_nodes(sb);	/* must be after free_list_bitmaps */
 	if (journal->j_header_bh) {
 		brelse(journal->j_header_bh);
 	}
 	/* j_header_bh is on the journal dev, make sure not to release the journal
 	 * dev until we brelse j_header_bh
 	 */
-	release_journal_dev(p_s_sb, journal);
+	release_journal_dev(sb, journal);
 	vfree(journal);
 }
 
@@ -1933,27 +1932,27 @@ static void free_journal_ram(struct super_block *p_s_sb)
 ** of read_super() yet.  Any other caller must keep error at 0.
 */
 static int do_journal_release(struct reiserfs_transaction_handle *th,
-			      struct super_block *p_s_sb, int error)
+			      struct super_block *sb, int error)
 {
 	struct reiserfs_transaction_handle myth;
 	int flushed = 0;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
 	/* we only want to flush out transactions if we were called with error == 0
 	 */
-	if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
+	if (!error && !(sb->s_flags & MS_RDONLY)) {
 		/* end the current trans */
 		BUG_ON(!th->t_trans_id);
-		do_journal_end(th, p_s_sb, 10, FLUSH_ALL);
+		do_journal_end(th, sb, 10, FLUSH_ALL);
 
 		/* make sure something gets logged to force our way into the flush code */
-		if (!journal_join(&myth, p_s_sb, 1)) {
-			reiserfs_prepare_for_journal(p_s_sb,
-						     SB_BUFFER_WITH_SB(p_s_sb),
+		if (!journal_join(&myth, sb, 1)) {
+			reiserfs_prepare_for_journal(sb,
+						     SB_BUFFER_WITH_SB(sb),
 						     1);
-			journal_mark_dirty(&myth, p_s_sb,
-					   SB_BUFFER_WITH_SB(p_s_sb));
-			do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL);
+			journal_mark_dirty(&myth, sb,
+					   SB_BUFFER_WITH_SB(sb));
+			do_journal_end(&myth, sb, 1, FLUSH_ALL);
 			flushed = 1;
 		}
 	}
@@ -1961,26 +1960,26 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
 	/* this also catches errors during the do_journal_end above */
 	if (!error && reiserfs_is_journal_aborted(journal)) {
 		memset(&myth, 0, sizeof(myth));
-		if (!journal_join_abort(&myth, p_s_sb, 1)) {
-			reiserfs_prepare_for_journal(p_s_sb,
-						     SB_BUFFER_WITH_SB(p_s_sb),
+		if (!journal_join_abort(&myth, sb, 1)) {
+			reiserfs_prepare_for_journal(sb,
+						     SB_BUFFER_WITH_SB(sb),
 						     1);
-			journal_mark_dirty(&myth, p_s_sb,
-					   SB_BUFFER_WITH_SB(p_s_sb));
-			do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL);
+			journal_mark_dirty(&myth, sb,
+					   SB_BUFFER_WITH_SB(sb));
+			do_journal_end(&myth, sb, 1, FLUSH_ALL);
 		}
 	}
 
 	reiserfs_mounted_fs_count--;
 	/* wait for all commits to finish */
-	cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work);
+	cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
 	flush_workqueue(commit_wq);
 	if (!reiserfs_mounted_fs_count) {
 		destroy_workqueue(commit_wq);
 		commit_wq = NULL;
 	}
 
-	free_journal_ram(p_s_sb);
+	free_journal_ram(sb);
 
 	return 0;
 }
@@ -1989,41 +1988,41 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
 ** call on unmount.  flush all journal trans, release all alloc'd ram
 */
 int journal_release(struct reiserfs_transaction_handle *th,
-		    struct super_block *p_s_sb)
+		    struct super_block *sb)
 {
-	return do_journal_release(th, p_s_sb, 0);
+	return do_journal_release(th, sb, 0);
 }
 
 /*
 ** only call from an error condition inside reiserfs_read_super!
 */
 int journal_release_error(struct reiserfs_transaction_handle *th,
-			  struct super_block *p_s_sb)
+			  struct super_block *sb)
 {
-	return do_journal_release(th, p_s_sb, 1);
+	return do_journal_release(th, sb, 1);
 }
 
 /* compares description block with commit block.  returns 1 if they differ, 0 if they are the same */
-static int journal_compare_desc_commit(struct super_block *p_s_sb,
+static int journal_compare_desc_commit(struct super_block *sb,
 				       struct reiserfs_journal_desc *desc,
 				       struct reiserfs_journal_commit *commit)
 {
 	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
 	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
-	    get_commit_trans_len(commit) > SB_JOURNAL(p_s_sb)->j_trans_max ||
+	    get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
 	    get_commit_trans_len(commit) <= 0) {
 		return 1;
 	}
 	return 0;
 }
 
-/* returns 0 if it did not find a description block  
+/* returns 0 if it did not find a description block
 ** returns -1 if it found a corrupt commit block
-** returns 1 if both desc and commit were valid 
+** returns 1 if both desc and commit were valid
 */
-static int journal_transaction_is_valid(struct super_block *p_s_sb,
+static int journal_transaction_is_valid(struct super_block *sb,
 					struct buffer_head *d_bh,
-					unsigned long *oldest_invalid_trans_id,
+					unsigned int *oldest_invalid_trans_id,
 					unsigned long *newest_mount_id)
 {
 	struct reiserfs_journal_desc *desc;
@@ -2039,7 +2038,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
 	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
 		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
 		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
-			reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 				       "journal-986: transaction "
 				       "is valid returning because trans_id %d is greater than "
 				       "oldest_invalid %lu",
@@ -2049,7 +2048,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
 		}
 		if (newest_mount_id
 		    && *newest_mount_id > get_desc_mount_id(desc)) {
-			reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 				       "journal-1087: transaction "
 				       "is valid returning because mount_id %d is less than "
 				       "newest_mount_id %lu",
@@ -2057,36 +2056,37 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
 				       *newest_mount_id);
 			return -1;
 		}
-		if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) {
-			reiserfs_warning(p_s_sb,
-					 "journal-2018: Bad transaction length %d encountered, ignoring transaction",
+		if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
+			reiserfs_warning(sb, "journal-2018",
+					 "Bad transaction length %d "
+					 "encountered, ignoring transaction",
 					 get_desc_trans_len(desc));
 			return -1;
 		}
-		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
+		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
 
 		/* ok, we have a journal description block, lets see if the transaction was valid */
 		c_bh =
-		    journal_bread(p_s_sb,
-				  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+		    journal_bread(sb,
+				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 				  ((offset + get_desc_trans_len(desc) +
-				    1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+				    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
 		if (!c_bh)
 			return 0;
 		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-		if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
-			reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+		if (journal_compare_desc_commit(sb, desc, commit)) {
+			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 				       "journal_transaction_is_valid, commit offset %ld had bad "
 				       "time %d or length %d",
 				       c_bh->b_blocknr -
-				       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+				       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 				       get_commit_trans_id(commit),
 				       get_commit_trans_len(commit));
 			brelse(c_bh);
 			if (oldest_invalid_trans_id) {
 				*oldest_invalid_trans_id =
 				    get_desc_trans_id(desc);
-				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 					       "journal-1004: "
 					       "transaction_is_valid setting oldest invalid trans_id "
 					       "to %d",
@@ -2095,11 +2095,11 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
 			return -1;
 		}
 		brelse(c_bh);
-		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 			       "journal-1006: found valid "
 			       "transaction start offset %llu, len %d id %d",
 			       d_bh->b_blocknr -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 			       get_desc_trans_len(desc),
 			       get_desc_trans_id(desc));
 		return 1;
@@ -2121,63 +2121,63 @@ static void brelse_array(struct buffer_head **heads, int num)
 ** this either reads in a replays a transaction, or returns because the transaction
 ** is invalid, or too old.
 */
-static int journal_read_transaction(struct super_block *p_s_sb,
+static int journal_read_transaction(struct super_block *sb,
 				    unsigned long cur_dblock,
 				    unsigned long oldest_start,
-				    unsigned long oldest_trans_id,
+				    unsigned int oldest_trans_id,
 				    unsigned long newest_mount_id)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_journal_desc *desc;
 	struct reiserfs_journal_commit *commit;
-	unsigned long trans_id = 0;
+	unsigned int trans_id = 0;
 	struct buffer_head *c_bh;
 	struct buffer_head *d_bh;
 	struct buffer_head **log_blocks = NULL;
 	struct buffer_head **real_blocks = NULL;
-	unsigned long trans_offset;
+	unsigned int trans_offset;
 	int i;
 	int trans_half;
 
-	d_bh = journal_bread(p_s_sb, cur_dblock);
+	d_bh = journal_bread(sb, cur_dblock);
 	if (!d_bh)
 		return 1;
 	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
-	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
-	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
+	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
 		       "journal_read_transaction, offset %llu, len %d mount_id %d",
-		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 		       get_desc_trans_len(desc), get_desc_mount_id(desc));
 	if (get_desc_trans_id(desc) < oldest_trans_id) {
-		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: "
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
 			       "journal_read_trans skipping because %lu is too old",
 			       cur_dblock -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb));
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
 		brelse(d_bh);
 		return 1;
 	}
 	if (get_desc_mount_id(desc) != newest_mount_id) {
-		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: "
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
 			       "journal_read_trans skipping because %d is != "
 			       "newest_mount_id %lu", get_desc_mount_id(desc),
 			       newest_mount_id);
 		brelse(d_bh);
 		return 1;
 	}
-	c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 			     ((trans_offset + get_desc_trans_len(desc) + 1) %
-			      SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+			      SB_ONDISK_JOURNAL_SIZE(sb)));
 	if (!c_bh) {
 		brelse(d_bh);
 		return 1;
 	}
 	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-	if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
-		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+	if (journal_compare_desc_commit(sb, desc, commit)) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 			       "journal_read_transaction, "
 			       "commit offset %llu had bad time %d or length %d",
 			       c_bh->b_blocknr -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 			       get_commit_trans_id(commit),
 			       get_commit_trans_len(commit));
 		brelse(c_bh);
@@ -2195,38 +2195,41 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 		brelse(d_bh);
 		kfree(log_blocks);
 		kfree(real_blocks);
-		reiserfs_warning(p_s_sb,
-				 "journal-1169: kmalloc failed, unable to mount FS");
+		reiserfs_warning(sb, "journal-1169",
+				 "kmalloc failed, unable to mount FS");
 		return -1;
 	}
 	/* get all the buffer heads */
-	trans_half = journal_trans_half(p_s_sb->s_blocksize);
+	trans_half = journal_trans_half(sb->s_blocksize);
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		log_blocks[i] =
-		    journal_getblk(p_s_sb,
-				   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+		    journal_getblk(sb,
+				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 				   (trans_offset + 1 +
-				    i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
 		if (i < trans_half) {
 			real_blocks[i] =
-			    sb_getblk(p_s_sb,
+			    sb_getblk(sb,
 				      le32_to_cpu(desc->j_realblock[i]));
 		} else {
 			real_blocks[i] =
-			    sb_getblk(p_s_sb,
+			    sb_getblk(sb,
 				      le32_to_cpu(commit->
 						  j_realblock[i - trans_half]));
 		}
-		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
-			reiserfs_warning(p_s_sb,
-					 "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem");
+		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
+			reiserfs_warning(sb, "journal-1207",
+					 "REPLAY FAILURE fsck required! "
+					 "Block to replay is outside of "
+					 "filesystem");
 			goto abort_replay;
 		}
 		/* make sure we don't try to replay onto log or reserved area */
 		if (is_block_in_log_or_reserved_area
-		    (p_s_sb, real_blocks[i]->b_blocknr)) {
-			reiserfs_warning(p_s_sb,
-					 "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block");
+		    (sb, real_blocks[i]->b_blocknr)) {
+			reiserfs_warning(sb, "journal-1204",
+					 "REPLAY FAILURE fsck required! "
+					 "Trying to replay onto a log block");
 		      abort_replay:
 			brelse_array(log_blocks, i);
 			brelse_array(real_blocks, i);
@@ -2242,8 +2245,9 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		wait_on_buffer(log_blocks[i]);
 		if (!buffer_uptodate(log_blocks[i])) {
-			reiserfs_warning(p_s_sb,
-					 "journal-1212: REPLAY FAILURE fsck required! buffer write failed");
+			reiserfs_warning(sb, "journal-1212",
+					 "REPLAY FAILURE fsck required! "
+					 "buffer write failed");
 			brelse_array(log_blocks + i,
 				     get_desc_trans_len(desc) - i);
 			brelse_array(real_blocks, get_desc_trans_len(desc));
@@ -2266,8 +2270,9 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		wait_on_buffer(real_blocks[i]);
 		if (!buffer_uptodate(real_blocks[i])) {
-			reiserfs_warning(p_s_sb,
-					 "journal-1226: REPLAY FAILURE, fsck required! buffer write failed");
+			reiserfs_warning(sb, "journal-1226",
+					 "REPLAY FAILURE, fsck required! "
+					 "buffer write failed");
 			brelse_array(real_blocks + i,
 				     get_desc_trans_len(desc) - i);
 			brelse(c_bh);
@@ -2279,15 +2284,15 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 		brelse(real_blocks[i]);
 	}
 	cur_dblock =
-	    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 	    ((trans_offset + get_desc_trans_len(desc) +
-	      2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
-	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+	      2) % SB_ONDISK_JOURNAL_SIZE(sb));
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 		       "journal-1095: setting journal " "start to offset %ld",
-		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb));
+		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
 
 	/* init starting values for the first transaction, in case this is the last transaction to be replayed. */
-	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
+	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
 	journal->j_last_flush_trans_id = trans_id;
 	journal->j_trans_id = trans_id + 1;
 	/* check for trans_id overflow */
@@ -2352,12 +2357,12 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
 **
 ** On exit, it sets things up so the first transaction will work correctly.
 */
-static int journal_read(struct super_block *p_s_sb)
+static int journal_read(struct super_block *sb)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_journal_desc *desc;
-	unsigned long oldest_trans_id = 0;
-	unsigned long oldest_invalid_trans_id = 0;
+	unsigned int oldest_trans_id = 0;
+	unsigned int oldest_invalid_trans_id = 0;
 	time_t start;
 	unsigned long oldest_start = 0;
 	unsigned long cur_dblock = 0;
@@ -2370,46 +2375,46 @@ static int journal_read(struct super_block *p_s_sb)
 	int ret;
 	char b[BDEVNAME_SIZE];
 
-	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
-	reiserfs_info(p_s_sb, "checking transaction log (%s)\n",
+	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	reiserfs_info(sb, "checking transaction log (%s)\n",
 		      bdevname(journal->j_dev_bd, b));
 	start = get_seconds();
 
-	/* step 1, read in the journal header block.  Check the transaction it says 
-	 ** is the first unflushed, and if that transaction is not valid, 
+	/* step 1, read in the journal header block.  Check the transaction it says
+	 ** is the first unflushed, and if that transaction is not valid,
 	 ** replay is done
 	 */
-	journal->j_header_bh = journal_bread(p_s_sb,
-					     SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)
-					     + SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+	journal->j_header_bh = journal_bread(sb,
+					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
+					     + SB_ONDISK_JOURNAL_SIZE(sb));
 	if (!journal->j_header_bh) {
 		return 1;
 	}
 	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
 	if (le32_to_cpu(jh->j_first_unflushed_offset) <
-	    SB_ONDISK_JOURNAL_SIZE(p_s_sb)
+	    SB_ONDISK_JOURNAL_SIZE(sb)
 	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
 		oldest_start =
-		    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 		    le32_to_cpu(jh->j_first_unflushed_offset);
 		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
 		newest_mount_id = le32_to_cpu(jh->j_mount_id);
-		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 			       "journal-1153: found in "
 			       "header: first_unflushed_offset %d, last_flushed_trans_id "
 			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
 			       le32_to_cpu(jh->j_last_flush_trans_id));
 		valid_journal_header = 1;
 
-		/* now, we try to read the first unflushed offset.  If it is not valid, 
-		 ** there is nothing more we can do, and it makes no sense to read 
+		/* now, we try to read the first unflushed offset.  If it is not valid,
+		 ** there is nothing more we can do, and it makes no sense to read
 		 ** through the whole log.
 		 */
 		d_bh =
-		    journal_bread(p_s_sb,
-				  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+		    journal_bread(sb,
+				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 				  le32_to_cpu(jh->j_first_unflushed_offset));
-		ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL);
+		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
 		if (!ret) {
 			continue_replay = 0;
 		}
@@ -2417,9 +2422,9 @@ static int journal_read(struct super_block *p_s_sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) {
-		reiserfs_warning(p_s_sb,
-				 "clm-2076: device is readonly, unable to replay log");
+	if (continue_replay && bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
 		return -1;
 	}
 
@@ -2428,17 +2433,17 @@ static int journal_read(struct super_block *p_s_sb)
 	 */
 	while (continue_replay
 	       && cur_dblock <
-	       (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
-		SB_ONDISK_JOURNAL_SIZE(p_s_sb))) {
+	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+		SB_ONDISK_JOURNAL_SIZE(sb))) {
 		/* Note that it is required for blocksize of primary fs device and journal
 		   device to be the same */
 		d_bh =
 		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
-				    p_s_sb->s_blocksize,
-				    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
-				    SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+				    sb->s_blocksize,
+				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				    SB_ONDISK_JOURNAL_SIZE(sb));
 		ret =
-		    journal_transaction_is_valid(p_s_sb, d_bh,
+		    journal_transaction_is_valid(sb, d_bh,
 						 &oldest_invalid_trans_id,
 						 &newest_mount_id);
 		if (ret == 1) {
@@ -2447,26 +2452,26 @@ static int journal_read(struct super_block *p_s_sb)
 				oldest_trans_id = get_desc_trans_id(desc);
 				oldest_start = d_bh->b_blocknr;
 				newest_mount_id = get_desc_mount_id(desc);
-				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 					       "journal-1179: Setting "
 					       "oldest_start to offset %llu, trans_id %lu",
 					       oldest_start -
 					       SB_ONDISK_JOURNAL_1st_BLOCK
-					       (p_s_sb), oldest_trans_id);
+					       (sb), oldest_trans_id);
 			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
 				/* one we just read was older */
 				oldest_trans_id = get_desc_trans_id(desc);
 				oldest_start = d_bh->b_blocknr;
-				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 					       "journal-1180: Resetting "
 					       "oldest_start to offset %lu, trans_id %lu",
 					       oldest_start -
 					       SB_ONDISK_JOURNAL_1st_BLOCK
-					       (p_s_sb), oldest_trans_id);
+					       (sb), oldest_trans_id);
 			}
 			if (newest_mount_id < get_desc_mount_id(desc)) {
 				newest_mount_id = get_desc_mount_id(desc);
-				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 					       "journal-1299: Setting "
 					       "newest_mount_id to %d",
 					       get_desc_mount_id(desc));
@@ -2481,17 +2486,17 @@ static int journal_read(struct super_block *p_s_sb)
       start_log_replay:
 	cur_dblock = oldest_start;
 	if (oldest_trans_id) {
-		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 			       "journal-1206: Starting replay "
 			       "from offset %llu, trans_id %lu",
-			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 			       oldest_trans_id);
 
 	}
 	replay_count = 0;
 	while (continue_replay && oldest_trans_id > 0) {
 		ret =
-		    journal_read_transaction(p_s_sb, cur_dblock, oldest_start,
+		    journal_read_transaction(sb, cur_dblock, oldest_start,
 					     oldest_trans_id, newest_mount_id);
 		if (ret < 0) {
 			return ret;
@@ -2499,14 +2504,14 @@ static int journal_read(struct super_block *p_s_sb)
 			break;
 		}
 		cur_dblock =
-		    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start;
+		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
 		replay_count++;
 		if (cur_dblock == oldest_start)
 			break;
 	}
 
 	if (oldest_trans_id == 0) {
-		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 			       "journal-1225: No valid " "transactions found");
 	}
 	/* j_start does not get set correctly if we don't replay any transactions.
@@ -2526,16 +2531,16 @@ static int journal_read(struct super_block *p_s_sb)
 	} else {
 		journal->j_mount_id = newest_mount_id + 1;
 	}
-	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
 		       "newest_mount_id to %lu", journal->j_mount_id);
 	journal->j_first_unflushed_offset = journal->j_start;
 	if (replay_count > 0) {
-		reiserfs_info(p_s_sb,
+		reiserfs_info(sb,
 			      "replayed %d transactions in %lu seconds\n",
 			      replay_count, get_seconds() - start);
 	}
-	if (!bdev_read_only(p_s_sb->s_bdev) &&
-	    _update_journal_header_block(p_s_sb, journal->j_start,
+	if (!bdev_read_only(sb->s_bdev) &&
+	    _update_journal_header_block(sb, journal->j_start,
 					 journal->j_last_flush_trans_id)) {
 		/* replay failed, caller must call free_journal_ram and abort
 		 ** the mount
@@ -2560,9 +2565,9 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 	return jl;
 }
 
-static void journal_list_init(struct super_block *p_s_sb)
+static void journal_list_init(struct super_block *sb)
 {
-	SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
+	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
 }
 
 static int release_journal_dev(struct super_block *super,
@@ -2580,9 +2585,8 @@ static int release_journal_dev(struct super_block *super,
 	}
 
 	if (result != 0) {
-		reiserfs_warning(super,
-				 "sh-457: release_journal_dev: Cannot release journal device: %i",
-				 result);
+		reiserfs_warning(super, "sh-457",
+				 "Cannot release journal device: %i", result);
 	}
 	return result;
 }
@@ -2612,7 +2616,7 @@ static int journal_init_dev(struct super_block *super,
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
 			journal->j_dev_bd = NULL;
-			reiserfs_warning(super, "sh-458: journal_init_dev: "
+			reiserfs_warning(super, "sh-458",
 					 "cannot init journal device '%s': %i",
 					 __bdevname(jdev, b), result);
 			return result;
@@ -2662,30 +2666,30 @@ static int journal_init_dev(struct super_block *super,
  */
 #define REISERFS_STANDARD_BLKSIZE (4096)
 
-static int check_advise_trans_params(struct super_block *p_s_sb,
+static int check_advise_trans_params(struct super_block *sb,
 				     struct reiserfs_journal *journal)
 {
         if (journal->j_trans_max) {
 	        /* Non-default journal params.
 		   Do sanity check for them. */
 	        int ratio = 1;
-		if (p_s_sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
-		        ratio = REISERFS_STANDARD_BLKSIZE / p_s_sb->s_blocksize;
+		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
+		        ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
 
 		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
 		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
-		    SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max <
+		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
 		    JOURNAL_MIN_RATIO) {
-		        reiserfs_warning(p_s_sb,
-				 "sh-462: bad transaction max size (%u). FSCK?",
-				 journal->j_trans_max);
+			reiserfs_warning(sb, "sh-462",
+					 "bad transaction max size (%u). "
+					 "FSCK?", journal->j_trans_max);
 			return 1;
 		}
 		if (journal->j_max_batch != (journal->j_trans_max) *
 		        JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
-		        reiserfs_warning(p_s_sb,
-				"sh-463: bad transaction max batch (%u). FSCK?",
-				journal->j_max_batch);
+			reiserfs_warning(sb, "sh-463",
+					 "bad transaction max batch (%u). "
+					 "FSCK?", journal->j_max_batch);
 			return 1;
 		}
 	} else {
@@ -2693,9 +2697,11 @@ static int check_advise_trans_params(struct super_block *p_s_sb,
                    The file system was created by old version
 		   of mkreiserfs, so some fields contain zeros,
 		   and we need to advise proper values for them */
-	        if (p_s_sb->s_blocksize != REISERFS_STANDARD_BLKSIZE)
-	                reiserfs_panic(p_s_sb, "sh-464: bad blocksize (%u)",
-				       p_s_sb->s_blocksize);
+		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
+			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
+					 sb->s_blocksize);
+			return 1;
+		}
 		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
 		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
 		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
@@ -2706,10 +2712,10 @@ static int check_advise_trans_params(struct super_block *p_s_sb,
 /*
 ** must be called once on fs mount.  calls journal_read for you
 */
-int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
+int journal_init(struct super_block *sb, const char *j_dev_name,
 		 int old_format, unsigned int commit_max_age)
 {
-	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2;
+	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
 	struct buffer_head *bhjh;
 	struct reiserfs_super_block *rs;
 	struct reiserfs_journal_header *jh;
@@ -2717,10 +2723,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 	struct reiserfs_journal_list *jl;
 	char b[BDEVNAME_SIZE];
 
-	journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal));
+	journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
 	if (!journal) {
-		reiserfs_warning(p_s_sb,
-				 "journal-1256: unable to get memory for journal structure");
+		reiserfs_warning(sb, "journal-1256",
+				 "unable to get memory for journal structure");
 		return 1;
 	}
 	memset(journal, 0, sizeof(struct reiserfs_journal));
@@ -2729,51 +2735,51 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 	INIT_LIST_HEAD(&journal->j_working_list);
 	INIT_LIST_HEAD(&journal->j_journal_list);
 	journal->j_persistent_trans = 0;
-	if (reiserfs_allocate_list_bitmaps(p_s_sb,
+	if (reiserfs_allocate_list_bitmaps(sb,
 					   journal->j_list_bitmap,
-					   reiserfs_bmap_count(p_s_sb)))
+					   reiserfs_bmap_count(sb)))
 		goto free_and_return;
-	allocate_bitmap_nodes(p_s_sb);
+	allocate_bitmap_nodes(sb);
 
 	/* reserved for journal area support */
-	SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ?
+	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
 						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
-						 / p_s_sb->s_blocksize +
-						 reiserfs_bmap_count(p_s_sb) +
+						 / sb->s_blocksize +
+						 reiserfs_bmap_count(sb) +
 						 1 :
 						 REISERFS_DISK_OFFSET_IN_BYTES /
-						 p_s_sb->s_blocksize + 2);
+						 sb->s_blocksize + 2);
 
 	/* Sanity check to see is the standard journal fitting withing first bitmap
 	   (actual for small blocksizes) */
-	if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) &&
-	    (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) +
-	     SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) {
-		reiserfs_warning(p_s_sb,
-				 "journal-1393: journal does not fit for area "
-				 "addressed by first of bitmap blocks. It starts at "
+	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
+	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
+	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
+		reiserfs_warning(sb, "journal-1393",
+				 "journal does not fit for area addressed "
+				 "by first of bitmap blocks. It starts at "
 				 "%u and its size is %u. Block size %ld",
-				 SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb),
-				 SB_ONDISK_JOURNAL_SIZE(p_s_sb),
-				 p_s_sb->s_blocksize);
+				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
+				 SB_ONDISK_JOURNAL_SIZE(sb),
+				 sb->s_blocksize);
 		goto free_and_return;
 	}
 
-	if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) {
-		reiserfs_warning(p_s_sb,
-				 "sh-462: unable to initialize jornal device");
+	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+		reiserfs_warning(sb, "sh-462",
+				 "unable to initialize jornal device");
 		goto free_and_return;
 	}
 
-	rs = SB_DISK_SUPER_BLOCK(p_s_sb);
+	rs = SB_DISK_SUPER_BLOCK(sb);
 
 	/* read journal header */
-	bhjh = journal_bread(p_s_sb,
-			     SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
-			     SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+	bhjh = journal_bread(sb,
+			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			     SB_ONDISK_JOURNAL_SIZE(sb));
 	if (!bhjh) {
-		reiserfs_warning(p_s_sb,
-				 "sh-459: unable to read journal header");
+		reiserfs_warning(sb, "sh-459",
+				 "unable to read journal header");
 		goto free_and_return;
 	}
 	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
@@ -2782,10 +2788,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 	if (is_reiserfs_jr(rs)
 	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
 		sb_jp_journal_magic(rs))) {
-		reiserfs_warning(p_s_sb,
-				 "sh-460: journal header magic %x "
-				 "(device %s) does not match to magic found in super "
-				 "block %x", jh->jh_journal.jp_journal_magic,
+		reiserfs_warning(sb, "sh-460",
+				 "journal header magic %x (device %s) does "
+				 "not match to magic found in super block %x",
+				 jh->jh_journal.jp_journal_magic,
 				 bdevname(journal->j_dev_bd, b),
 				 sb_jp_journal_magic(rs));
 		brelse(bhjh);
@@ -2798,7 +2804,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
 	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
 
-	if (check_advise_trans_params(p_s_sb, journal) != 0)
+	if (check_advise_trans_params(sb, journal) != 0)
 	        goto free_and_return;
 	journal->j_default_max_commit_age = journal->j_max_commit_age;
 
@@ -2807,12 +2813,12 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 		journal->j_max_trans_age = commit_max_age;
 	}
 
-	reiserfs_info(p_s_sb, "journal params: device %s, size %u, "
+	reiserfs_info(sb, "journal params: device %s, size %u, "
 		      "journal first block %u, max trans len %u, max batch %u, "
 		      "max commit age %u, max trans age %u\n",
 		      bdevname(journal->j_dev_bd, b),
-		      SB_ONDISK_JOURNAL_SIZE(p_s_sb),
-		      SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+		      SB_ONDISK_JOURNAL_SIZE(sb),
+		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 		      journal->j_trans_max,
 		      journal->j_max_batch,
 		      journal->j_max_commit_age, journal->j_max_trans_age);
@@ -2820,7 +2826,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 	brelse(bhjh);
 
 	journal->j_list_bitmap_index = 0;
-	journal_list_init(p_s_sb);
+	journal_list_init(sb);
 
 	memset(journal->j_list_hash_table, 0,
 	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
@@ -2852,7 +2858,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 	journal->j_must_wait = 0;
 
 	if (journal->j_cnode_free == 0) {
-        	reiserfs_warning(p_s_sb, "journal-2004: Journal cnode memory "
+		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
 		                 "allocation failed (%ld bytes). Journal is "
 		                 "too large for available memory. Usually "
 		                 "this is due to a journal that is too large.",
@@ -2860,16 +2866,17 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
         	goto free_and_return;
 	}
 
-	init_journal_hash(p_s_sb);
+	init_journal_hash(sb);
 	jl = journal->j_current_jl;
-	jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
+	jl->j_list_bitmap = get_list_bitmap(sb, jl);
 	if (!jl->j_list_bitmap) {
-		reiserfs_warning(p_s_sb,
-				 "journal-2005, get_list_bitmap failed for journal list 0");
+		reiserfs_warning(sb, "journal-2005",
+				 "get_list_bitmap failed for journal list 0");
 		goto free_and_return;
 	}
-	if (journal_read(p_s_sb) < 0) {
-		reiserfs_warning(p_s_sb, "Replay Failure, unable to mount");
+	if (journal_read(sb) < 0) {
+		reiserfs_warning(sb, "reiserfs-2006",
+				 "Replay Failure, unable to mount");
 		goto free_and_return;
 	}
 
@@ -2878,10 +2885,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 		commit_wq = create_workqueue("reiserfs");
 
 	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
-	journal->j_work_sb = p_s_sb;
+	journal->j_work_sb = sb;
 	return 0;
       free_and_return:
-	free_journal_ram(p_s_sb);
+	free_journal_ram(sb);
 	return 1;
 }
 
@@ -2912,7 +2919,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
 	return 0;
 }
 
-/* this must be called inside a transaction, and requires the 
+/* this must be called inside a transaction, and requires the
 ** kernel_lock to be held
 */
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
@@ -2970,7 +2977,7 @@ static void wake_queued_writers(struct super_block *s)
 		wake_up(&journal->j_join_wait);
 }
 
-static void let_transaction_grow(struct super_block *sb, unsigned long trans_id)
+static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
 {
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	unsigned long bcount = journal->j_bcount;
@@ -2997,43 +3004,43 @@ static void let_transaction_grow(struct super_block *sb, unsigned long trans_id)
 ** expect to use in nblocks.
 */
 static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
-			      struct super_block *p_s_sb, unsigned long nblocks,
+			      struct super_block *sb, unsigned long nblocks,
 			      int join)
 {
 	time_t now = get_seconds();
-	int old_trans_id;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	unsigned int old_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_transaction_handle myth;
 	int sched_count = 0;
 	int retval;
 
-	reiserfs_check_lock_depth(p_s_sb, "journal_begin");
+	reiserfs_check_lock_depth(sb, "journal_begin");
 	BUG_ON(nblocks > journal->j_trans_max);
 
-	PROC_INFO_INC(p_s_sb, journal.journal_being);
+	PROC_INFO_INC(sb, journal.journal_being);
 	/* set here for journal_join */
 	th->t_refcount = 1;
-	th->t_super = p_s_sb;
+	th->t_super = sb;
 
       relock:
-	lock_journal(p_s_sb);
+	lock_journal(sb);
 	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
-		unlock_journal(p_s_sb);
+		unlock_journal(sb);
 		retval = journal->j_errno;
 		goto out_fail;
 	}
 	journal->j_bcount++;
 
 	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
-		unlock_journal(p_s_sb);
-		reiserfs_wait_on_write_block(p_s_sb);
-		PROC_INFO_INC(p_s_sb, journal.journal_relock_writers);
+		unlock_journal(sb);
+		reiserfs_wait_on_write_block(sb);
+		PROC_INFO_INC(sb, journal.journal_relock_writers);
 		goto relock;
 	}
 	now = get_seconds();
 
 	/* if there is no room in the journal OR
-	 ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning 
+	 ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
 	 ** we don't sleep if there aren't other writers
 	 */
 
@@ -3048,7 +3055,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
 
 		old_trans_id = journal->j_trans_id;
-		unlock_journal(p_s_sb);	/* allow others to finish this transaction */
+		unlock_journal(sb);	/* allow others to finish this transaction */
 
 		if (!join && (journal->j_len_alloc + nblocks + 2) >=
 		    journal->j_max_batch &&
@@ -3056,7 +3063,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 		    (journal->j_len_alloc * 75)) {
 			if (atomic_read(&journal->j_wcount) > 10) {
 				sched_count++;
-				queue_log_writer(p_s_sb);
+				queue_log_writer(sb);
 				goto relock;
 			}
 		}
@@ -3066,25 +3073,25 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 		if (atomic_read(&journal->j_jlock)) {
 			while (journal->j_trans_id == old_trans_id &&
 			       atomic_read(&journal->j_jlock)) {
-				queue_log_writer(p_s_sb);
+				queue_log_writer(sb);
 			}
 			goto relock;
 		}
-		retval = journal_join(&myth, p_s_sb, 1);
+		retval = journal_join(&myth, sb, 1);
 		if (retval)
 			goto out_fail;
 
 		/* someone might have ended the transaction while we joined */
 		if (old_trans_id != journal->j_trans_id) {
-			retval = do_journal_end(&myth, p_s_sb, 1, 0);
+			retval = do_journal_end(&myth, sb, 1, 0);
 		} else {
-			retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW);
+			retval = do_journal_end(&myth, sb, 1, COMMIT_NOW);
 		}
 
 		if (retval)
 			goto out_fail;
 
-		PROC_INFO_INC(p_s_sb, journal.journal_relock_wcount);
+		PROC_INFO_INC(sb, journal.journal_relock_wcount);
 		goto relock;
 	}
 	/* we are the first writer, set trans_id */
@@ -3096,7 +3103,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 	th->t_blocks_logged = 0;
 	th->t_blocks_allocated = nblocks;
 	th->t_trans_id = journal->j_trans_id;
-	unlock_journal(p_s_sb);
+	unlock_journal(sb);
 	INIT_LIST_HEAD(&th->t_list);
 	get_fs_excl();
 	return 0;
@@ -3106,7 +3113,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 	/* Re-set th->t_super, so we can properly keep track of how many
 	 * persistent transactions there are. We need to do this so if this
 	 * call is part of a failed restart_transaction, we can free it later */
-	th->t_super = p_s_sb;
+	th->t_super = sb;
 	return retval;
 }
 
@@ -3157,7 +3164,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
 }
 
 static int journal_join(struct reiserfs_transaction_handle *th,
-			struct super_block *p_s_sb, unsigned long nblocks)
+			struct super_block *sb, unsigned long nblocks)
 {
 	struct reiserfs_transaction_handle *cur_th = current->journal_info;
 
@@ -3166,11 +3173,11 @@ static int journal_join(struct reiserfs_transaction_handle *th,
 	 */
 	th->t_handle_save = cur_th;
 	BUG_ON(cur_th && cur_th->t_refcount > 1);
-	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN);
+	return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN);
 }
 
 int journal_join_abort(struct reiserfs_transaction_handle *th,
-		       struct super_block *p_s_sb, unsigned long nblocks)
+		       struct super_block *sb, unsigned long nblocks)
 {
 	struct reiserfs_transaction_handle *cur_th = current->journal_info;
 
@@ -3179,11 +3186,11 @@ int journal_join_abort(struct reiserfs_transaction_handle *th,
 	 */
 	th->t_handle_save = cur_th;
 	BUG_ON(cur_th && cur_th->t_refcount > 1);
-	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT);
+	return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT);
 }
 
 int journal_begin(struct reiserfs_transaction_handle *th,
-		  struct super_block *p_s_sb, unsigned long nblocks)
+		  struct super_block *sb, unsigned long nblocks)
 {
 	struct reiserfs_transaction_handle *cur_th = current->journal_info;
 	int ret;
@@ -3191,28 +3198,29 @@ int journal_begin(struct reiserfs_transaction_handle *th,
 	th->t_handle_save = NULL;
 	if (cur_th) {
 		/* we are nesting into the current transaction */
-		if (cur_th->t_super == p_s_sb) {
+		if (cur_th->t_super == sb) {
 			BUG_ON(!cur_th->t_refcount);
 			cur_th->t_refcount++;
 			memcpy(th, cur_th, sizeof(*th));
 			if (th->t_refcount <= 1)
-				reiserfs_warning(p_s_sb,
-						 "BAD: refcount <= 1, but journal_info != 0");
+				reiserfs_warning(sb, "reiserfs-2005",
+						 "BAD: refcount <= 1, but "
+						 "journal_info != 0");
 			return 0;
 		} else {
 			/* we've ended up with a handle from a different filesystem.
 			 ** save it and restore on journal_end.  This should never
 			 ** really happen...
 			 */
-			reiserfs_warning(p_s_sb,
-					 "clm-2100: nesting info a different FS");
+			reiserfs_warning(sb, "clm-2100",
+					 "nesting info a different FS");
 			th->t_handle_save = current->journal_info;
 			current->journal_info = th;
 		}
 	} else {
 		current->journal_info = th;
 	}
-	ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG);
+	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
 	BUG_ON(current->journal_info != th);
 
 	/* I guess this boils down to being the reciprocal of clm-2100 above.
@@ -3232,32 +3240,32 @@ int journal_begin(struct reiserfs_transaction_handle *th,
 **
 ** if it was dirty, cleans and files onto the clean list.  I can't let it be dirty again until the
 ** transaction is committed.
-** 
+**
 ** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
 */
 int journal_mark_dirty(struct reiserfs_transaction_handle *th,
-		       struct super_block *p_s_sb, struct buffer_head *bh)
+		       struct super_block *sb, struct buffer_head *bh)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_journal_cnode *cn = NULL;
 	int count_already_incd = 0;
 	int prepared = 0;
 	BUG_ON(!th->t_trans_id);
 
-	PROC_INFO_INC(p_s_sb, journal.mark_dirty);
+	PROC_INFO_INC(sb, journal.mark_dirty);
 	if (th->t_trans_id != journal->j_trans_id) {
-		reiserfs_panic(th->t_super,
-			       "journal-1577: handle trans id %ld != current trans id %ld\n",
+		reiserfs_panic(th->t_super, "journal-1577",
+			       "handle trans id %ld != current trans id %ld",
 			       th->t_trans_id, journal->j_trans_id);
 	}
 
-	p_s_sb->s_dirt = 1;
+	sb->s_dirt = 1;
 
 	prepared = test_clear_buffer_journal_prepared(bh);
 	clear_buffer_journal_restore_dirty(bh);
 	/* already in this transaction, we are done */
 	if (buffer_journaled(bh)) {
-		PROC_INFO_INC(p_s_sb, journal.mark_dirty_already);
+		PROC_INFO_INC(sb, journal.mark_dirty_already);
 		return 0;
 	}
 
@@ -3266,7 +3274,8 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
 	 ** could get to disk too early.  NOT GOOD.
 	 */
 	if (!prepared || buffer_dirty(bh)) {
-		reiserfs_warning(p_s_sb, "journal-1777: buffer %llu bad state "
+		reiserfs_warning(sb, "journal-1777",
+				 "buffer %llu bad state "
 				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
 				 (unsigned long long)bh->b_blocknr,
 				 prepared ? ' ' : '!',
@@ -3276,23 +3285,23 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
 	}
 
 	if (atomic_read(&(journal->j_wcount)) <= 0) {
-		reiserfs_warning(p_s_sb,
-				 "journal-1409: journal_mark_dirty returning because j_wcount was %d",
+		reiserfs_warning(sb, "journal-1409",
+				 "returning because j_wcount was %d",
 				 atomic_read(&(journal->j_wcount)));
 		return 1;
 	}
-	/* this error means I've screwed up, and we've overflowed the transaction.  
+	/* this error means I've screwed up, and we've overflowed the transaction.
 	 ** Nothing can be done here, except make the FS readonly or panic.
 	 */
 	if (journal->j_len >= journal->j_trans_max) {
-		reiserfs_panic(th->t_super,
-			       "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n",
+		reiserfs_panic(th->t_super, "journal-1413",
+			       "j_len (%lu) is too big",
 			       journal->j_len);
 	}
 
 	if (buffer_journal_dirty(bh)) {
 		count_already_incd = 1;
-		PROC_INFO_INC(p_s_sb, journal.mark_dirty_notjournal);
+		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
 		clear_buffer_journal_dirty(bh);
 	}
 
@@ -3304,9 +3313,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
 
 	/* now put this guy on the end */
 	if (!cn) {
-		cn = get_cnode(p_s_sb);
+		cn = get_cnode(sb);
 		if (!cn) {
-			reiserfs_panic(p_s_sb, "get_cnode failed!\n");
+			reiserfs_panic(sb, "journal-4", "get_cnode failed!");
 		}
 
 		if (th->t_blocks_logged == th->t_blocks_allocated) {
@@ -3318,7 +3327,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
 
 		cn->bh = bh;
 		cn->blocknr = bh->b_blocknr;
-		cn->sb = p_s_sb;
+		cn->sb = sb;
 		cn->jlist = NULL;
 		insert_journal_hash(journal->j_hash_table, cn);
 		if (!count_already_incd) {
@@ -3339,11 +3348,11 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
 }
 
 int journal_end(struct reiserfs_transaction_handle *th,
-		struct super_block *p_s_sb, unsigned long nblocks)
+		struct super_block *sb, unsigned long nblocks)
 {
 	if (!current->journal_info && th->t_refcount > 1)
-		reiserfs_warning(p_s_sb, "REISER-NESTING: th NULL, refcount %d",
-				 th->t_refcount);
+		reiserfs_warning(sb, "REISER-NESTING",
+				 "th NULL, refcount %d", th->t_refcount);
 
 	if (!th->t_trans_id) {
 		WARN_ON(1);
@@ -3366,26 +3375,26 @@ int journal_end(struct reiserfs_transaction_handle *th,
 		}
 		return 0;
 	} else {
-		return do_journal_end(th, p_s_sb, nblocks, 0);
+		return do_journal_end(th, sb, nblocks, 0);
 	}
 }
 
-/* removes from the current transaction, relsing and descrementing any counters.  
+/* removes from the current transaction, relsing and descrementing any counters.
 ** also files the removed buffer directly onto the clean list
 **
 ** called by journal_mark_freed when a block has been deleted
 **
 ** returns 1 if it cleaned and relsed the buffer. 0 otherwise
 */
-static int remove_from_transaction(struct super_block *p_s_sb,
+static int remove_from_transaction(struct super_block *sb,
 				   b_blocknr_t blocknr, int already_cleaned)
 {
 	struct buffer_head *bh;
 	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	int ret = 0;
 
-	cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
+	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
 	if (!cn || !cn->bh) {
 		return ret;
 	}
@@ -3403,7 +3412,7 @@ static int remove_from_transaction(struct super_block *p_s_sb,
 		journal->j_last = cn->prev;
 	}
 	if (bh)
-		remove_journal_hash(p_s_sb, journal->j_hash_table, NULL,
+		remove_journal_hash(sb, journal->j_hash_table, NULL,
 				    bh->b_blocknr, 0);
 	clear_buffer_journaled(bh);	/* don't log this one */
 
@@ -3413,14 +3422,14 @@ static int remove_from_transaction(struct super_block *p_s_sb,
 		clear_buffer_journal_test(bh);
 		put_bh(bh);
 		if (atomic_read(&(bh->b_count)) < 0) {
-			reiserfs_warning(p_s_sb,
-					 "journal-1752: remove from trans, b_count < 0");
+			reiserfs_warning(sb, "journal-1752",
+					 "b_count < 0");
 		}
 		ret = 1;
 	}
 	journal->j_len--;
 	journal->j_len_alloc--;
-	free_cnode(p_s_sb, cn);
+	free_cnode(sb, cn);
 	return ret;
 }
 
@@ -3468,22 +3477,22 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
 }
 
 /* syncs the commit blocks, but does not force the real buffers to disk
-** will wait until the current transaction is done/committed before returning 
+** will wait until the current transaction is done/committed before returning
 */
 int journal_end_sync(struct reiserfs_transaction_handle *th,
-		     struct super_block *p_s_sb, unsigned long nblocks)
+		     struct super_block *sb, unsigned long nblocks)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
 	BUG_ON(!th->t_trans_id);
 	/* you can sync while nested, very, very bad */
 	BUG_ON(th->t_refcount > 1);
 	if (journal->j_len == 0) {
-		reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb),
+		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
 					     1);
-		journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb));
+		journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
 	}
-	return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT);
+	return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT);
 }
 
 /*
@@ -3493,7 +3502,7 @@ static void flush_async_commits(struct work_struct *work)
 {
 	struct reiserfs_journal *journal =
 		container_of(work, struct reiserfs_journal, j_work.work);
-	struct super_block *p_s_sb = journal->j_work_sb;
+	struct super_block *sb = journal->j_work_sb;
 	struct reiserfs_journal_list *jl;
 	struct list_head *entry;
 
@@ -3502,7 +3511,7 @@ static void flush_async_commits(struct work_struct *work)
 		/* last entry is the youngest, commit it and you get everything */
 		entry = journal->j_journal_list.prev;
 		jl = JOURNAL_LIST_ENTRY(entry);
-		flush_commit_list(p_s_sb, jl, 1);
+		flush_commit_list(sb, jl, 1);
 	}
 	unlock_kernel();
 }
@@ -3511,11 +3520,11 @@ static void flush_async_commits(struct work_struct *work)
 ** flushes any old transactions to disk
 ** ends the current transaction if it is too old
 */
-int reiserfs_flush_old_commits(struct super_block *p_s_sb)
+int reiserfs_flush_old_commits(struct super_block *sb)
 {
 	time_t now;
 	struct reiserfs_transaction_handle th;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
 	now = get_seconds();
 	/* safety check so we don't flush while we are replaying the log during
@@ -3532,35 +3541,35 @@ int reiserfs_flush_old_commits(struct super_block *p_s_sb)
 	    journal->j_trans_start_time > 0 &&
 	    journal->j_len > 0 &&
 	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
-		if (!journal_join(&th, p_s_sb, 1)) {
-			reiserfs_prepare_for_journal(p_s_sb,
-						     SB_BUFFER_WITH_SB(p_s_sb),
+		if (!journal_join(&th, sb, 1)) {
+			reiserfs_prepare_for_journal(sb,
+						     SB_BUFFER_WITH_SB(sb),
 						     1);
-			journal_mark_dirty(&th, p_s_sb,
-					   SB_BUFFER_WITH_SB(p_s_sb));
+			journal_mark_dirty(&th, sb,
+					   SB_BUFFER_WITH_SB(sb));
 
 			/* we're only being called from kreiserfsd, it makes no sense to do
 			 ** an async commit so that kreiserfsd can do it later
 			 */
-			do_journal_end(&th, p_s_sb, 1, COMMIT_NOW | WAIT);
+			do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
 		}
 	}
-	return p_s_sb->s_dirt;
+	return sb->s_dirt;
 }
 
 /*
 ** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
-** 
-** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all 
+**
+** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
 ** the writers are done.  By the time it wakes up, the transaction it was called has already ended, so it just
 ** flushes the commit list and returns 0.
 **
 ** Won't batch when flush or commit_now is set.  Also won't batch when others are waiting on j_join_wait.
-** 
+**
 ** Note, we can't allow the journal_end to proceed while there are still writers in the log.
 */
 static int check_journal_end(struct reiserfs_transaction_handle *th,
-			     struct super_block *p_s_sb, unsigned long nblocks,
+			     struct super_block *sb, unsigned long nblocks,
 			     int flags)
 {
 
@@ -3569,13 +3578,13 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 	int commit_now = flags & COMMIT_NOW;
 	int wait_on_commit = flags & WAIT;
 	struct reiserfs_journal_list *jl;
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
 	BUG_ON(!th->t_trans_id);
 
 	if (th->t_trans_id != journal->j_trans_id) {
-		reiserfs_panic(th->t_super,
-			       "journal-1577: handle trans id %ld != current trans id %ld\n",
+		reiserfs_panic(th->t_super, "journal-1577",
+			       "handle trans id %ld != current trans id %ld",
 			       th->t_trans_id, journal->j_trans_id);
 	}
 
@@ -3584,7 +3593,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 		atomic_dec(&(journal->j_wcount));
 	}
 
-	/* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released 
+	/* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released
 	 ** will be dealt with by next transaction that actually writes something, but should be taken
 	 ** care of in this trans
 	 */
@@ -3593,7 +3602,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 	/* if wcount > 0, and we are called to with flush or commit_now,
 	 ** we wait on j_join_wait.  We will wake up when the last writer has
 	 ** finished the transaction, and started it on its way to the disk.
-	 ** Then, we flush the commit or journal list, and just return 0 
+	 ** Then, we flush the commit or journal list, and just return 0
 	 ** because the rest of journal end was already done for this transaction.
 	 */
 	if (atomic_read(&(journal->j_wcount)) > 0) {
@@ -3608,31 +3617,31 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 			if (flush) {
 				journal->j_next_full_flush = 1;
 			}
-			unlock_journal(p_s_sb);
+			unlock_journal(sb);
 
 			/* sleep while the current transaction is still j_jlocked */
 			while (journal->j_trans_id == trans_id) {
 				if (atomic_read(&journal->j_jlock)) {
-					queue_log_writer(p_s_sb);
+					queue_log_writer(sb);
 				} else {
-					lock_journal(p_s_sb);
+					lock_journal(sb);
 					if (journal->j_trans_id == trans_id) {
 						atomic_set(&(journal->j_jlock),
 							   1);
 					}
-					unlock_journal(p_s_sb);
+					unlock_journal(sb);
 				}
 			}
 			BUG_ON(journal->j_trans_id == trans_id);
 			
 			if (commit_now
-			    && journal_list_still_alive(p_s_sb, trans_id)
+			    && journal_list_still_alive(sb, trans_id)
 			    && wait_on_commit) {
-				flush_commit_list(p_s_sb, jl, 1);
+				flush_commit_list(sb, jl, 1);
 			}
 			return 0;
 		}
-		unlock_journal(p_s_sb);
+		unlock_journal(sb);
 		return 0;
 	}
 
@@ -3649,13 +3658,13 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 	    && journal->j_len_alloc < journal->j_max_batch
 	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
 		journal->j_bcount++;
-		unlock_journal(p_s_sb);
+		unlock_journal(sb);
 		return 0;
 	}
 
-	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
-		reiserfs_panic(p_s_sb,
-			       "journal-003: journal_end: j_start (%ld) is too high\n",
+	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
+		reiserfs_panic(sb, "journal-003",
+			       "j_start (%ld) is too high",
 			       journal->j_start);
 	}
 	return 1;
@@ -3664,7 +3673,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 /*
 ** Does all the work that makes deleting blocks safe.
 ** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on.
-** 
+**
 ** otherwise:
 ** set a bit for the block in the journal bitmap.  That will prevent it from being allocated for unformatted nodes
 ** before this transaction has finished.
@@ -3676,16 +3685,16 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 ** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
 */
 int journal_mark_freed(struct reiserfs_transaction_handle *th,
-		       struct super_block *p_s_sb, b_blocknr_t blocknr)
+		       struct super_block *sb, b_blocknr_t blocknr)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_journal_cnode *cn = NULL;
 	struct buffer_head *bh = NULL;
 	struct reiserfs_list_bitmap *jb = NULL;
 	int cleaned = 0;
 	BUG_ON(!th->t_trans_id);
 
-	cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
+	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
 	if (cn && cn->bh) {
 		bh = cn->bh;
 		get_bh(bh);
@@ -3695,15 +3704,15 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
 		clear_buffer_journal_new(bh);
 		clear_prepared_bits(bh);
 		reiserfs_clean_and_file_buffer(bh);
-		cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned);
+		cleaned = remove_from_transaction(sb, blocknr, cleaned);
 	} else {
 		/* set the bit for this block in the journal bitmap for this transaction */
 		jb = journal->j_current_jl->j_list_bitmap;
 		if (!jb) {
-			reiserfs_panic(p_s_sb,
-				       "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n");
+			reiserfs_panic(sb, "journal-1702",
+				       "journal_list_bitmap is NULL");
 		}
-		set_bit_in_list_bitmap(p_s_sb, blocknr, jb);
+		set_bit_in_list_bitmap(sb, blocknr, jb);
 
 		/* Note, the entire while loop is not allowed to schedule.  */
 
@@ -3711,13 +3720,13 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
 			clear_prepared_bits(bh);
 			reiserfs_clean_and_file_buffer(bh);
 		}
-		cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned);
+		cleaned = remove_from_transaction(sb, blocknr, cleaned);
 
 		/* find all older transactions with this block, make sure they don't try to write it out */
-		cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table,
+		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
 					  blocknr);
 		while (cn) {
-			if (p_s_sb == cn->sb && blocknr == cn->blocknr) {
+			if (sb == cn->sb && blocknr == cn->blocknr) {
 				set_bit(BLOCK_FREED, &cn->state);
 				if (cn->bh) {
 					if (!cleaned) {
@@ -3733,8 +3742,9 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
 						put_bh(cn->bh);
 						if (atomic_read
 						    (&(cn->bh->b_count)) < 0) {
-							reiserfs_warning(p_s_sb,
-									 "journal-2138: cn->bh->b_count < 0");
+							reiserfs_warning(sb,
+								 "journal-2138",
+								 "cn->bh->b_count < 0");
 						}
 					}
 					if (cn->jlist) {	/* since we are clearing the bh, we MUST dec nonzerolen */
@@ -3824,7 +3834,7 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id,
 
 int reiserfs_commit_for_inode(struct inode *inode)
 {
-	unsigned long id = REISERFS_I(inode)->i_trans_id;
+	unsigned int id = REISERFS_I(inode)->i_trans_id;
 	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
 
 	/* for the whole inode, assume unset id means it was
@@ -3839,18 +3849,18 @@ int reiserfs_commit_for_inode(struct inode *inode)
 	return __commit_trans_jl(inode, id, jl);
 }
 
-void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
+void reiserfs_restore_prepared_buffer(struct super_block *sb,
 				      struct buffer_head *bh)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
-	PROC_INFO_INC(p_s_sb, journal.restore_prepared);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	PROC_INFO_INC(sb, journal.restore_prepared);
 	if (!bh) {
 		return;
 	}
 	if (test_clear_buffer_journal_restore_dirty(bh) &&
 	    buffer_journal_dirty(bh)) {
 		struct reiserfs_journal_cnode *cn;
-		cn = get_journal_hash_dev(p_s_sb,
+		cn = get_journal_hash_dev(sb,
 					  journal->j_list_hash_table,
 					  bh->b_blocknr);
 		if (cn && can_dirty(cn)) {
@@ -3867,12 +3877,12 @@ extern struct tree_balance *cur_tb;
 ** be written to disk while we are altering it.  So, we must:
 ** clean it
 ** wait on it.
-** 
+**
 */
-int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
+int reiserfs_prepare_for_journal(struct super_block *sb,
 				 struct buffer_head *bh, int wait)
 {
-	PROC_INFO_INC(p_s_sb, journal.prepare);
+	PROC_INFO_INC(sb, journal.prepare);
 
 	if (!trylock_buffer(bh)) {
 		if (!wait)
@@ -3909,7 +3919,7 @@ static void flush_old_journal_lists(struct super_block *s)
 	}
 }
 
-/* 
+/*
 ** long and ugly.  If flush, will not return until all commit
 ** blocks and all real buffers in the trans are on disk.
 ** If no_async, won't return until all commit blocks are on disk.
@@ -3920,10 +3930,10 @@ static void flush_old_journal_lists(struct super_block *s)
 ** journal lists, etc just won't happen.
 */
 static int do_journal_end(struct reiserfs_transaction_handle *th,
-			  struct super_block *p_s_sb, unsigned long nblocks,
+			  struct super_block *sb, unsigned long nblocks,
 			  int flags)
 {
-	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
 	struct reiserfs_journal_cnode *last_cn = NULL;
 	struct reiserfs_journal_desc *desc;
@@ -3938,7 +3948,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	struct reiserfs_journal_list *jl, *temp_jl;
 	struct list_head *entry, *safe;
 	unsigned long jindex;
-	unsigned long commit_trans_id;
+	unsigned int commit_trans_id;
 	int trans_half;
 
 	BUG_ON(th->t_refcount > 1);
@@ -3946,21 +3956,21 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 
 	/* protect flush_older_commits from doing mistakes if the
            transaction ID counter gets overflowed.  */
-	if (th->t_trans_id == ~0UL)
+	if (th->t_trans_id == ~0U)
 		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
 	flush = flags & FLUSH_ALL;
 	wait_on_commit = flags & WAIT;
 
 	put_fs_excl();
 	current->journal_info = th->t_handle_save;
-	reiserfs_check_lock_depth(p_s_sb, "journal end");
+	reiserfs_check_lock_depth(sb, "journal end");
 	if (journal->j_len == 0) {
-		reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb),
+		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
 					     1);
-		journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb));
+		journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
 	}
 
-	lock_journal(p_s_sb);
+	lock_journal(sb);
 	if (journal->j_next_full_flush) {
 		flags |= FLUSH_ALL;
 		flush = 1;
@@ -3970,13 +3980,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 		wait_on_commit = 1;
 	}
 
-	/* check_journal_end locks the journal, and unlocks if it does not return 1 
+	/* check_journal_end locks the journal, and unlocks if it does not return 1
 	 ** it tells us if we should continue with the journal_end, or just return
 	 */
-	if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
-		p_s_sb->s_dirt = 1;
-		wake_queued_writers(p_s_sb);
-		reiserfs_async_progress_wait(p_s_sb);
+	if (!check_journal_end(th, sb, nblocks, flags)) {
+		sb->s_dirt = 1;
+		wake_queued_writers(sb);
+		reiserfs_async_progress_wait(sb);
 		goto out;
 	}
 
@@ -4005,8 +4015,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 
 	/* setup description block */
 	d_bh =
-	    journal_getblk(p_s_sb,
-			   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+	    journal_getblk(sb,
+			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 			   journal->j_start);
 	set_buffer_uptodate(d_bh);
 	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
@@ -4015,9 +4025,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	set_desc_trans_id(desc, journal->j_trans_id);
 
 	/* setup commit block.  Don't write (keep it clean too) this one until after everyone else is written */
-	c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 			      ((journal->j_start + journal->j_len +
-				1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
 	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
 	memset(c_bh->b_data, 0, c_bh->b_size);
 	set_commit_trans_id(commit, journal->j_trans_id);
@@ -4050,13 +4060,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	 **  for each real block, add it to the journal list hash,
 	 ** copy into real block index array in the commit or desc block
 	 */
-	trans_half = journal_trans_half(p_s_sb->s_blocksize);
+	trans_half = journal_trans_half(sb->s_blocksize);
 	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
 		if (buffer_journaled(cn->bh)) {
-			jl_cn = get_cnode(p_s_sb);
+			jl_cn = get_cnode(sb);
 			if (!jl_cn) {
-				reiserfs_panic(p_s_sb,
-					       "journal-1676, get_cnode returned NULL\n");
+				reiserfs_panic(sb, "journal-1676",
+					       "get_cnode returned NULL");
 			}
 			if (i == 0) {
 				jl->j_realblock = jl_cn;
@@ -4067,18 +4077,19 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 				last_cn->next = jl_cn;
 			}
 			last_cn = jl_cn;
-			/* make sure the block we are trying to log is not a block 
+			/* make sure the block we are trying to log is not a block
 			   of journal or reserved area */
 
 			if (is_block_in_log_or_reserved_area
-			    (p_s_sb, cn->bh->b_blocknr)) {
-				reiserfs_panic(p_s_sb,
-					       "journal-2332: Trying to log block %lu, which is a log block\n",
+			    (sb, cn->bh->b_blocknr)) {
+				reiserfs_panic(sb, "journal-2332",
+					       "Trying to log block %lu, "
+					       "which is a log block",
 					       cn->bh->b_blocknr);
 			}
 			jl_cn->blocknr = cn->bh->b_blocknr;
 			jl_cn->state = 0;
-			jl_cn->sb = p_s_sb;
+			jl_cn->sb = sb;
 			jl_cn->bh = cn->bh;
 			jl_cn->jlist = jl;
 			insert_journal_hash(journal->j_list_hash_table, jl_cn);
@@ -4119,11 +4130,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 			char *addr;
 			struct page *page;
 			tmp_bh =
-			    journal_getblk(p_s_sb,
-					   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+			    journal_getblk(sb,
+					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 					   ((cur_write_start +
 					     jindex) %
-					    SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+					    SB_ONDISK_JOURNAL_SIZE(sb)));
 			set_buffer_uptodate(tmp_bh);
 			page = cn->bh->b_page;
 			addr = kmap(page);
@@ -4137,12 +4148,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 			clear_buffer_journaled(cn->bh);
 		} else {
 			/* JDirty cleared sometime during transaction.  don't log this one */
-			reiserfs_warning(p_s_sb,
-					 "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!");
+			reiserfs_warning(sb, "journal-2048",
+					 "BAD, buffer in journal hash, "
+					 "but not JDirty!");
 			brelse(cn->bh);
 		}
 		next = cn->next;
-		free_cnode(p_s_sb, cn);
+		free_cnode(sb, cn);
 		cn = next;
 		cond_resched();
 	}
@@ -4152,7 +4164,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	 ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
 	 */
 
-	journal->j_current_jl = alloc_journal_list(p_s_sb);
+	journal->j_current_jl = alloc_journal_list(sb);
 
 	/* now it is safe to insert this transaction on the main list */
 	list_add_tail(&jl->j_list, &journal->j_journal_list);
@@ -4163,7 +4175,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	old_start = journal->j_start;
 	journal->j_start =
 	    (journal->j_start + journal->j_len +
-	     2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb);
+	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
 	atomic_set(&(journal->j_wcount), 0);
 	journal->j_bcount = 0;
 	journal->j_last = NULL;
@@ -4178,7 +4190,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	journal->j_len_alloc = 0;
 	journal->j_next_full_flush = 0;
 	journal->j_next_async_flush = 0;
-	init_journal_hash(p_s_sb);
+	init_journal_hash(sb);
 
 	// make sure reiserfs_add_jh sees the new current_jl before we
 	// write out the tails
@@ -4207,14 +4219,14 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	 ** queue don't wait for this proc to flush journal lists and such.
 	 */
 	if (flush) {
-		flush_commit_list(p_s_sb, jl, 1);
-		flush_journal_list(p_s_sb, jl, 1);
+		flush_commit_list(sb, jl, 1);
+		flush_journal_list(sb, jl, 1);
 	} else if (!(jl->j_state & LIST_COMMIT_PENDING))
 		queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
 
-	/* if the next transaction has any chance of wrapping, flush 
-	 ** transactions that might get overwritten.  If any journal lists are very 
-	 ** old flush them as well.  
+	/* if the next transaction has any chance of wrapping, flush
+	 ** transactions that might get overwritten.  If any journal lists are very
+	 ** old flush them as well.
 	 */
       first_jl:
 	list_for_each_safe(entry, safe, &journal->j_journal_list) {
@@ -4222,11 +4234,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 		if (journal->j_start <= temp_jl->j_start) {
 			if ((journal->j_start + journal->j_trans_max + 1) >=
 			    temp_jl->j_start) {
-				flush_used_journal_lists(p_s_sb, temp_jl);
+				flush_used_journal_lists(sb, temp_jl);
 				goto first_jl;
 			} else if ((journal->j_start +
 				    journal->j_trans_max + 1) <
-				   SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
+				   SB_ONDISK_JOURNAL_SIZE(sb)) {
 				/* if we don't cross into the next transaction and we don't
 				 * wrap, there is no way we can overlap any later transactions
 				 * break now
@@ -4235,11 +4247,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 			}
 		} else if ((journal->j_start +
 			    journal->j_trans_max + 1) >
-			   SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
+			   SB_ONDISK_JOURNAL_SIZE(sb)) {
 			if (((journal->j_start + journal->j_trans_max + 1) %
-			     SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
+			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
 			    temp_jl->j_start) {
-				flush_used_journal_lists(p_s_sb, temp_jl);
+				flush_used_journal_lists(sb, temp_jl);
 				goto first_jl;
 			} else {
 				/* we don't overlap anything from out start to the end of the
@@ -4250,46 +4262,47 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 			}
 		}
 	}
-	flush_old_journal_lists(p_s_sb);
+	flush_old_journal_lists(sb);
 
 	journal->j_current_jl->j_list_bitmap =
-	    get_list_bitmap(p_s_sb, journal->j_current_jl);
+	    get_list_bitmap(sb, journal->j_current_jl);
 
 	if (!(journal->j_current_jl->j_list_bitmap)) {
-		reiserfs_panic(p_s_sb,
-			       "journal-1996: do_journal_end, could not get a list bitmap\n");
+		reiserfs_panic(sb, "journal-1996",
+			       "could not get a list bitmap");
 	}
 
 	atomic_set(&(journal->j_jlock), 0);
-	unlock_journal(p_s_sb);
+	unlock_journal(sb);
 	/* wake up any body waiting to join. */
 	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
 	wake_up(&(journal->j_join_wait));
 
 	if (!flush && wait_on_commit &&
-	    journal_list_still_alive(p_s_sb, commit_trans_id)) {
-		flush_commit_list(p_s_sb, jl, 1);
+	    journal_list_still_alive(sb, commit_trans_id)) {
+		flush_commit_list(sb, jl, 1);
 	}
       out:
-	reiserfs_check_lock_depth(p_s_sb, "journal end2");
+	reiserfs_check_lock_depth(sb, "journal end2");
 
 	memset(th, 0, sizeof(*th));
 	/* Re-set th->t_super, so we can properly keep track of how many
 	 * persistent transactions there are. We need to do this so if this
 	 * call is part of a failed restart_transaction, we can free it later */
-	th->t_super = p_s_sb;
+	th->t_super = sb;
 
 	return journal->j_errno;
 }
 
-static void __reiserfs_journal_abort_hard(struct super_block *sb)
+/* Send the file system read only and refuse new transactions */
+void reiserfs_abort_journal(struct super_block *sb, int errno)
 {
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	if (test_bit(J_ABORTED, &journal->j_state))
 		return;
 
-	printk(KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n",
-	       reiserfs_bdevname(sb));
+	if (!journal->j_errno)
+		journal->j_errno = errno;
 
 	sb->s_flags |= MS_RDONLY;
 	set_bit(J_ABORTED, &journal->j_state);
@@ -4299,19 +4312,3 @@ static void __reiserfs_journal_abort_hard(struct super_block *sb)
 #endif
 }
 
-static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	if (test_bit(J_ABORTED, &journal->j_state))
-		return;
-
-	if (!journal->j_errno)
-		journal->j_errno = errno;
-
-	__reiserfs_journal_abort_hard(sb);
-}
-
-void reiserfs_journal_abort(struct super_block *sb, int errno)
-{
-	__reiserfs_journal_abort_soft(sb, errno);
-}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 6de060a6aa7f..381750a155f6 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -111,7 +111,7 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
 	item_num_in_dest =
 	    (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
 
-	leaf_paste_entries(dest_bi->bi_bh, item_num_in_dest,
+	leaf_paste_entries(dest_bi, item_num_in_dest,
 			   (last_first ==
 			    FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest,
 									  item_num_in_dest))
@@ -119,8 +119,8 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
 			   DEH_SIZE * copy_count + copy_records_len);
 }
 
-/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or 
-   part of it or nothing (see the return 0 below) from SOURCE to the end 
+/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or
+   part of it or nothing (see the return 0 below) from SOURCE to the end
    (if last_first) or beginning (!last_first) of the DEST */
 /* returns 1 if anything was copied, else 0 */
 static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
@@ -168,10 +168,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
 			if (bytes_or_entries == ih_item_len(ih)
 			    && is_indirect_le_ih(ih))
 				if (get_ih_free_space(ih))
-					reiserfs_panic(NULL,
-						       "vs-10020: leaf_copy_boundary_item: "
-						       "last unformatted node must be filled entirely (%h)",
-						       ih);
+					reiserfs_panic(sb_from_bi(dest_bi),
+						       "vs-10020",
+						       "last unformatted node "
+						       "must be filled "
+						       "entirely (%h)", ih);
 		}
 #endif
 
@@ -395,7 +396,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
 		else {
 			struct item_head n_ih;
 
-			/* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST 
+			/* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST
 			   part defined by 'cpy_bytes'; create new item header; change old item_header (????);
 			   n_ih = new item_header;
 			 */
@@ -425,7 +426,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
 		else {
 			struct item_head n_ih;
 
-			/* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST 
+			/* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST
 			   part defined by 'cpy_bytes'; create new item header;
 			   n_ih = new item_header;
 			 */
@@ -622,9 +623,8 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
 		break;
 
 	default:
-		reiserfs_panic(NULL,
-			       "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)",
-			       shift_mode);
+		reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
+			       "shift type is unknown (%d)", shift_mode);
 	}
 	RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
 	       "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
@@ -674,9 +674,9 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
 #ifdef CONFIG_REISERFS_CHECK
 			if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
 				print_cur_tb("vs-10275");
-				reiserfs_panic(tb->tb_sb,
-					       "vs-10275: leaf_shift_left: balance condition corrupted (%c)",
-					       tb->tb_mode);
+				reiserfs_panic(tb->tb_sb, "vs-10275",
+					       "balance condition corrupted "
+					       "(%c)", tb->tb_mode);
 			}
 #endif
 
@@ -724,7 +724,7 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
 static void leaf_delete_items_entirely(struct buffer_info *bi,
 				       int first, int del_num);
 /*  If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR.
-    If not. 
+    If not.
     If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of
     the first item. Part defined by del_bytes. Don't delete first item header
     If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of
@@ -783,7 +783,7 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
 				/* len = body len of item */
 				len = ih_item_len(ih);
 
-			/* delete the part of the last item of the bh 
+			/* delete the part of the last item of the bh
 			   do not delete item header
 			 */
 			leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
@@ -865,7 +865,7 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
 	}
 }
 
-/* paste paste_size bytes to affected_item_num-th item. 
+/* paste paste_size bytes to affected_item_num-th item.
    When item is a directory, this only prepare space for new entries */
 void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
 			  int pos_in_item, int paste_size,
@@ -889,9 +889,12 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
 
 #ifdef CONFIG_REISERFS_CHECK
 	if (zeros_number > paste_size) {
+		struct super_block *sb = NULL;
+		if (bi && bi->tb)
+			sb = bi->tb->tb_sb;
 		print_cur_tb("10177");
-		reiserfs_panic(NULL,
-			       "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d",
+		reiserfs_panic(sb, "vs-10177",
+			       "zeros_number == %d, paste_size == %d",
 			       zeros_number, paste_size);
 	}
 #endif				/* CONFIG_REISERFS_CHECK */
@@ -1019,7 +1022,7 @@ static int leaf_cut_entries(struct buffer_head *bh,
 /*  when cut item is part of regular file
         pos_in_item - first byte that must be cut
         cut_size - number of bytes to be cut beginning from pos_in_item
- 
+
    when cut item is part of directory
         pos_in_item - number of first deleted entry
         cut_size - count of deleted entries
@@ -1191,7 +1194,7 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
 }
 
 /* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */
-void leaf_paste_entries(struct buffer_head *bh,
+void leaf_paste_entries(struct buffer_info *bi,
 			int item_num,
 			int before,
 			int new_entry_count,
@@ -1203,6 +1206,7 @@ void leaf_paste_entries(struct buffer_head *bh,
 	struct reiserfs_de_head *deh;
 	char *insert_point;
 	int i, old_entry_num;
+	struct buffer_head *bh = bi->bi_bh;
 
 	if (new_entry_count == 0)
 		return;
@@ -1271,7 +1275,7 @@ void leaf_paste_entries(struct buffer_head *bh,
 	/* change item key if necessary (when we paste before 0-th entry */
 	if (!before) {
 		set_le_ih_k_offset(ih, deh_offset(new_dehs));
-/*      memcpy (&ih->ih_key.k_offset, 
+/*      memcpy (&ih->ih_key.k_offset,
 		       &new_dehs->deh_offset, SHORT_KEY_SIZE);*/
 	}
 #ifdef CONFIG_REISERFS_CHECK
@@ -1287,13 +1291,17 @@ void leaf_paste_entries(struct buffer_head *bh,
 			prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0;
 
 			if (prev && prev <= deh_location(&(deh[i])))
-				reiserfs_warning(NULL,
-						 "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)",
-						 ih, deh + i - 1, i, deh + i);
+				reiserfs_error(sb_from_bi(bi), "vs-10240",
+					       "directory item (%h) "
+					       "corrupted (prev %a, "
+					       "cur(%d) %a)",
+					       ih, deh + i - 1, i, deh + i);
 			if (next && next >= deh_location(&(deh[i])))
-				reiserfs_warning(NULL,
-						 "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)",
-						 ih, i, deh + i, deh + i + 1);
+				reiserfs_error(sb_from_bi(bi), "vs-10250",
+					       "directory item (%h) "
+					       "corrupted (cur(%d) %a, "
+					       "next %a)",
+					       ih, i, deh + i, deh + i + 1);
 		}
 	}
 #endif
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 738967f6c8ee..efd4d720718e 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -106,7 +106,7 @@ key of the first directory entry in it.
 This function first calls search_by_key, then, if item whose first
 entry matches is not found it looks for the entry inside directory
 item found by search_by_key. Fills the path to the entry, and to the
-entry position in the item 
+entry position in the item
 
 */
 
@@ -120,8 +120,8 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
 	switch (retval) {
 	case ITEM_NOT_FOUND:
 		if (!PATH_LAST_POSITION(path)) {
-			reiserfs_warning(sb,
-					 "vs-7000: search_by_entry_key: search_by_key returned item position == 0");
+			reiserfs_error(sb, "vs-7000", "search_by_key "
+				       "returned item position == 0");
 			pathrelse(path);
 			return IO_ERROR;
 		}
@@ -135,8 +135,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
 
 	default:
 		pathrelse(path);
-		reiserfs_warning(sb,
-				 "vs-7002: search_by_entry_key: no path to here");
+		reiserfs_error(sb, "vs-7002", "no path to here");
 		return IO_ERROR;
 	}
 
@@ -146,10 +145,9 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
 	if (!is_direntry_le_ih(de->de_ih) ||
 	    COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) {
 		print_block(de->de_bh, 0, -1, -1);
-		reiserfs_panic(sb,
-			       "vs-7005: search_by_entry_key: found item %h is not directory item or "
-			       "does not belong to the same directory as key %K",
-			       de->de_ih, key);
+		reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
+			       "item or does not belong to the same directory "
+			       "as key %K", de->de_ih, key);
 	}
 #endif				/* CONFIG_REISERFS_CHECK */
 
@@ -300,8 +298,7 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
 		    search_by_entry_key(dir->i_sb, &key_to_search,
 					path_to_entry, de);
 		if (retval == IO_ERROR) {
-			reiserfs_warning(dir->i_sb, "zam-7001: io error in %s",
-					 __func__);
+			reiserfs_error(dir->i_sb, "zam-7001", "io error");
 			return IO_ERROR;
 		}
 
@@ -361,9 +358,10 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
 			return ERR_PTR(-EACCES);
 		}
 
-		/* Propogate the priv_object flag so we know we're in the priv tree */
-		if (is_reiserfs_priv_object(dir))
-			reiserfs_mark_inode_private(inode);
+		/* Propagate the private flag so we know we're
+		 * in the priv tree */
+		if (IS_PRIVATE(dir))
+			inode->i_flags |= S_PRIVATE;
 	}
 	reiserfs_write_unlock(dir->i_sb);
 	if (retval == IO_ERROR) {
@@ -373,7 +371,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
 	return d_splice_alias(inode, dentry);
 }
 
-/* 
+/*
 ** looks up the dentry of the parent directory for child.
 ** taken from ext2_get_parent
 */
@@ -403,7 +401,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
 	return d_obtain_alias(inode);
 }
 
-/* add entry to the directory (entry can be hidden). 
+/* add entry to the directory (entry can be hidden).
 
 insert definition of when hidden directories are used here -Hans
 
@@ -484,10 +482,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 		}
 
 		if (retval != NAME_FOUND) {
-			reiserfs_warning(dir->i_sb,
-					 "zam-7002:%s: \"reiserfs_find_entry\" "
-					 "has returned unexpected value (%d)",
-					 __func__, retval);
+			reiserfs_error(dir->i_sb, "zam-7002",
+				       "reiserfs_find_entry() returned "
+				       "unexpected value (%d)", retval);
 		}
 
 		return -EEXIST;
@@ -498,8 +495,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 				MAX_GENERATION_NUMBER + 1);
 	if (gen_number > MAX_GENERATION_NUMBER) {
 		/* there is no free generation number */
-		reiserfs_warning(dir->i_sb,
-				 "reiserfs_add_entry: Congratulations! we have got hash function screwed up");
+		reiserfs_warning(dir->i_sb, "reiserfs-7010",
+				 "Congratulations! we have got hash function "
+				 "screwed up");
 		if (buffer != small_buf)
 			kfree(buffer);
 		pathrelse(&path);
@@ -515,10 +513,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 	if (gen_number != 0) {	/* we need to re-search for the insertion point */
 		if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
 		    NAME_NOT_FOUND) {
-			reiserfs_warning(dir->i_sb,
-					 "vs-7032: reiserfs_add_entry: "
-					 "entry with this key (%K) already exists",
-					 &entry_key);
+			reiserfs_warning(dir->i_sb, "vs-7032",
+					 "entry with this key (%K) already "
+					 "exists", &entry_key);
 
 			if (buffer != small_buf)
 				kfree(buffer);
@@ -555,15 +552,15 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 */
 static int drop_new_inode(struct inode *inode)
 {
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	make_bad_inode(inode);
 	inode->i_flags |= S_NOQUOTA;
 	iput(inode);
 	return 0;
 }
 
-/* utility function that does setup for reiserfs_new_inode.  
-** DQUOT_INIT needs lots of credits so it's better to have it
+/* utility function that does setup for reiserfs_new_inode.
+** vfs_dq_init needs lots of credits so it's better to have it
 ** outside of a transaction, so we had to pull some bits of
 ** reiserfs_new_inode out into this func.
 */
@@ -586,7 +583,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
 	} else {
 		inode->i_gid = current_fsgid();
 	}
-	DQUOT_INIT(inode);
+	vfs_dq_init(inode);
 	return 0;
 }
 
@@ -601,20 +598,22 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
 		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
 	struct reiserfs_transaction_handle th;
-	int locked;
+	struct reiserfs_security_handle security;
 
 	if (!(inode = new_inode(dir->i_sb))) {
 		return -ENOMEM;
 	}
 	new_inode_init(inode, dir, mode);
 
-	locked = reiserfs_cache_default_acl(dir);
-
+	jbegin_count += reiserfs_cache_default_acl(dir);
+	retval = reiserfs_security_init(dir, inode, &security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
 	reiserfs_write_lock(dir->i_sb);
 
-	if (locked)
-		reiserfs_write_lock_xattrs(dir->i_sb);
-
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval) {
 		drop_new_inode(inode);
@@ -623,15 +622,10 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 
 	retval =
 	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
-			       inode);
+			       inode, &security);
 	if (retval)
 		goto out_failed;
 
-	if (locked) {
-		reiserfs_write_unlock_xattrs(dir->i_sb);
-		locked = 0;
-	}
-
 	inode->i_op = &reiserfs_file_inode_operations;
 	inode->i_fop = &reiserfs_file_operations;
 	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
@@ -658,8 +652,6 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
-	if (locked)
-		reiserfs_write_unlock_xattrs(dir->i_sb);
 	reiserfs_write_unlock(dir->i_sb);
 	return retval;
 }
@@ -670,12 +662,12 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	int retval;
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
+	struct reiserfs_security_handle security;
 	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
 	int jbegin_count =
 	    JOURNAL_PER_BALANCE_CNT * 3 +
 	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
 		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-	int locked;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -685,13 +677,15 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	}
 	new_inode_init(inode, dir, mode);
 
-	locked = reiserfs_cache_default_acl(dir);
-
+	jbegin_count += reiserfs_cache_default_acl(dir);
+	retval = reiserfs_security_init(dir, inode, &security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
 	reiserfs_write_lock(dir->i_sb);
 
-	if (locked)
-		reiserfs_write_lock_xattrs(dir->i_sb);
-
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval) {
 		drop_new_inode(inode);
@@ -700,16 +694,11 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 
 	retval =
 	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
-			       inode);
+			       inode, &security);
 	if (retval) {
 		goto out_failed;
 	}
 
-	if (locked) {
-		reiserfs_write_unlock_xattrs(dir->i_sb);
-		locked = 0;
-	}
-
 	inode->i_op = &reiserfs_special_inode_operations;
 	init_special_inode(inode, inode->i_mode, rdev);
 
@@ -739,8 +728,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
-	if (locked)
-		reiserfs_write_unlock_xattrs(dir->i_sb);
 	reiserfs_write_unlock(dir->i_sb);
 	return retval;
 }
@@ -750,12 +737,12 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int retval;
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
+	struct reiserfs_security_handle security;
 	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
 	int jbegin_count =
 	    JOURNAL_PER_BALANCE_CNT * 3 +
 	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
 		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-	int locked;
 
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
 	/* set flag that new packing locality created and new blocks for the content     * of that directory are not displaced yet */
@@ -767,11 +754,14 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	}
 	new_inode_init(inode, dir, mode);
 
-	locked = reiserfs_cache_default_acl(dir);
-
+	jbegin_count += reiserfs_cache_default_acl(dir);
+	retval = reiserfs_security_init(dir, inode, &security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
 	reiserfs_write_lock(dir->i_sb);
-	if (locked)
-		reiserfs_write_lock_xattrs(dir->i_sb);
 
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval) {
@@ -787,17 +777,12 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	    retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ ,
 					old_format_only(dir->i_sb) ?
 					EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
-					dentry, inode);
+					dentry, inode, &security);
 	if (retval) {
 		dir->i_nlink--;
 		goto out_failed;
 	}
 
-	if (locked) {
-		reiserfs_write_unlock_xattrs(dir->i_sb);
-		locked = 0;
-	}
-
 	reiserfs_update_inode_transaction(inode);
 	reiserfs_update_inode_transaction(dir);
 
@@ -827,8 +812,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
       out_failed:
-	if (locked)
-		reiserfs_write_unlock_xattrs(dir->i_sb);
 	reiserfs_write_unlock(dir->i_sb);
 	return retval;
 }
@@ -837,7 +820,7 @@ static inline int reiserfs_empty_dir(struct inode *inode)
 {
 	/* we can cheat because an old format dir cannot have
 	 ** EMPTY_DIR_SIZE, and a new format dir cannot have
-	 ** EMPTY_DIR_SIZE_V1.  So, if the inode is either size, 
+	 ** EMPTY_DIR_SIZE_V1.  So, if the inode is either size,
 	 ** regardless of disk format version, the directory is empty.
 	 */
 	if (inode->i_size != EMPTY_DIR_SIZE &&
@@ -903,8 +886,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
 		goto end_rmdir;
 
 	if (inode->i_nlink != 2 && inode->i_nlink != 1)
-		reiserfs_warning(inode->i_sb, "%s: empty directory has nlink "
-				 "!= 2 (%d)", __func__, inode->i_nlink);
+		reiserfs_error(inode->i_sb, "reiserfs-7040",
+			       "empty directory has nlink != 2 (%d)",
+			       inode->i_nlink);
 
 	clear_nlink(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -980,10 +964,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 
 	if (!inode->i_nlink) {
-		reiserfs_warning(inode->i_sb, "%s: deleting nonexistent file "
-				 "(%s:%lu), %d", __func__,
-				 reiserfs_bdevname(inode->i_sb), inode->i_ino,
-				 inode->i_nlink);
+		reiserfs_warning(inode->i_sb, "reiserfs-7042",
+				 "deleting nonexistent file (%lu), %d",
+				 inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
 	}
 
@@ -1037,6 +1020,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
 	char *name;
 	int item_len;
 	struct reiserfs_transaction_handle th;
+	struct reiserfs_security_handle security;
 	int mode = S_IFLNK | S_IRWXUGO;
 	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
 	int jbegin_count =
@@ -1049,6 +1033,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
 	}
 	new_inode_init(inode, parent_dir, mode);
 
+	retval = reiserfs_security_init(parent_dir, inode, &security);
+	if (retval < 0) {
+		drop_new_inode(inode);
+		return retval;
+	}
+	jbegin_count += retval;
+
 	reiserfs_write_lock(parent_dir->i_sb);
 	item_len = ROUND_UP(strlen(symname));
 	if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
@@ -1066,8 +1057,6 @@ static int reiserfs_symlink(struct inode *parent_dir,
 	memcpy(name, symname, strlen(symname));
 	padd_item(name, item_len, strlen(symname));
 
-	/* We would inherit the default ACL here, but symlinks don't get ACLs */
-
 	retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
 	if (retval) {
 		drop_new_inode(inode);
@@ -1077,7 +1066,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
 
 	retval =
 	    reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
-			       dentry, inode);
+			       dentry, inode, &security);
 	kfree(name);
 	if (retval) {		/* reiserfs_new_inode iputs for us */
 		goto out_failed;
@@ -1173,7 +1162,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	return retval;
 }
 
-// de contains information pointing to an entry which 
+/* de contains information pointing to an entry which */
 static int de_still_valid(const char *name, int len,
 			  struct reiserfs_dir_entry *de)
 {
@@ -1196,15 +1185,14 @@ static int entry_points_to_object(const char *name, int len,
 
 	if (inode) {
 		if (!de_visible(de->de_deh + de->de_entry_num))
-			reiserfs_panic(NULL,
-				       "vs-7042: entry_points_to_object: entry must be visible");
+			reiserfs_panic(inode->i_sb, "vs-7042",
+				       "entry must be visible");
 		return (de->de_objectid == inode->i_ino) ? 1 : 0;
 	}
 
 	/* this must be added hidden entry */
 	if (de_visible(de->de_deh + de->de_entry_num))
-		reiserfs_panic(NULL,
-			       "vs-7043: entry_points_to_object: entry must be visible");
+		reiserfs_panic(NULL, "vs-7043", "entry must be visible");
 
 	return 1;
 }
@@ -1218,10 +1206,10 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
 	de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
 }
 
-/* 
+/*
  * process, that is going to call fix_nodes/do_balance must hold only
  * one path. If it holds 2 or more, it can get into endless waiting in
- * get_empty_nodes or its clones 
+ * get_empty_nodes or its clones
  */
 static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry)
@@ -1275,7 +1263,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	old_inode_mode = old_inode->i_mode;
 	if (S_ISDIR(old_inode_mode)) {
-		// make sure, that directory being renamed has correct ".." 
+		// make sure, that directory being renamed has correct ".."
 		// and that its new parent directory has not too many links
 		// already
 
@@ -1286,8 +1274,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			}
 		}
 
-		/* directory is renamed, its parent directory will be changed, 
-		 ** so find ".." entry 
+		/* directory is renamed, its parent directory will be changed,
+		 ** so find ".." entry
 		 */
 		dot_dot_de.de_gen_number_bit_string = NULL;
 		retval =
@@ -1318,8 +1306,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			       new_dentry->d_name.len, old_inode, 0);
 	if (retval == -EEXIST) {
 		if (!new_dentry_inode) {
-			reiserfs_panic(old_dir->i_sb,
-				       "vs-7050: new entry is found, new inode == 0\n");
+			reiserfs_panic(old_dir->i_sb, "vs-7050",
+				       "new entry is found, new inode == 0");
 		}
 	} else if (retval) {
 		int err = journal_end(&th, old_dir->i_sb, jbegin_count);
@@ -1397,9 +1385,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		   this stuff, yes? Then, having
 		   gathered everything into RAM we
 		   should lock the buffers, yes?  -Hans */
-		/* probably.  our rename needs to hold more 
-		 ** than one path at once.  The seals would 
-		 ** have to be written to deal with multi-path 
+		/* probably.  our rename needs to hold more
+		 ** than one path at once.  The seals would
+		 ** have to be written to deal with multi-path
 		 ** issues -chris
 		 */
 		/* sanity checking before doing the rename - avoid races many
@@ -1477,7 +1465,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 
 	if (S_ISDIR(old_inode_mode)) {
-		// adjust ".." of renamed directory 
+		/* adjust ".." of renamed directory */
 		set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
 		journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh);
 
@@ -1499,8 +1487,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (reiserfs_cut_from_item
 	    (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL,
 	     0) < 0)
-		reiserfs_warning(old_dir->i_sb,
-				 "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?");
+		reiserfs_error(old_dir->i_sb, "vs-7060",
+			       "couldn't not cut old name. Fsck later?");
 
 	old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
 
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index ea0cf8c28a99..3a6de810bd61 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -18,8 +18,7 @@
 static void check_objectid_map(struct super_block *s, __le32 * map)
 {
 	if (le32_to_cpu(map[0]) != 1)
-		reiserfs_panic(s,
-			       "vs-15010: check_objectid_map: map corrupted: %lx",
+		reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
 			       (long unsigned int)le32_to_cpu(map[0]));
 
 	// FIXME: add something else here
@@ -61,7 +60,7 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
 	/* comment needed -Hans */
 	unused_objectid = le32_to_cpu(map[1]);
 	if (unused_objectid == U32_MAX) {
-		reiserfs_warning(s, "%s: no more object ids", __func__);
+		reiserfs_warning(s, "reiserfs-15100", "no more object ids");
 		reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
 		return 0;
 	}
@@ -160,9 +159,8 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
 		i += 2;
 	}
 
-	reiserfs_warning(s,
-			 "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)",
-			 (long unsigned)objectid_to_release);
+	reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)",
+		       (long unsigned)objectid_to_release);
 }
 
 int reiserfs_convert_objectid_map_v1(struct super_block *s)
@@ -182,7 +180,7 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s)
 
 	if (cur_size > new_size) {
 		/* mark everyone used that was listed as free at the end of the objectid
-		 ** map 
+		 ** map
 		 */
 		objectid_map[new_size - 1] = objectid_map[cur_size - 1];
 		set_sb_oid_cursize(disk_sb, new_size);
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 740bb8c0c1ae..536eacaeb710 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -157,19 +157,16 @@ static void sprintf_disk_child(char *buf, struct disk_child *dc)
 		dc_size(dc));
 }
 
-static char *is_there_reiserfs_struct(char *fmt, int *what, int *skip)
+static char *is_there_reiserfs_struct(char *fmt, int *what)
 {
 	char *k = fmt;
 
-	*skip = 0;
-
 	while ((k = strchr(k, '%')) != NULL) {
 		if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
 		    k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
 			*what = k[1];
 			break;
 		}
-		(*skip)++;
 		k++;
 	}
 	return k;
@@ -181,30 +178,29 @@ static char *is_there_reiserfs_struct(char *fmt, int *what, int *skip)
    appropriative printk. With this reiserfs_warning you can use format
    specification for complex structures like you used to do with
    printfs for integers, doubles and pointers. For instance, to print
-   out key structure you have to write just: 
-   reiserfs_warning ("bad key %k", key); 
-   instead of 
-   printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, 
-           key->k_offset, key->k_uniqueness); 
+   out key structure you have to write just:
+   reiserfs_warning ("bad key %k", key);
+   instead of
+   printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
+           key->k_offset, key->k_uniqueness);
 */
-
+static DEFINE_SPINLOCK(error_lock);
 static void prepare_error_buf(const char *fmt, va_list args)
 {
 	char *fmt1 = fmt_buf;
 	char *k;
 	char *p = error_buf;
-	int i, j, what, skip;
+	int what;
+
+	spin_lock(&error_lock);
 
 	strcpy(fmt1, fmt);
 
-	while ((k = is_there_reiserfs_struct(fmt1, &what, &skip)) != NULL) {
+	while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
 		*k = 0;
 
 		p += vsprintf(p, fmt1, args);
 
-		for (i = 0; i < skip; i++)
-			j = va_arg(args, int);
-
 		switch (what) {
 		case 'k':
 			sprintf_le_key(p, va_arg(args, struct reiserfs_key *));
@@ -243,15 +239,16 @@ static void prepare_error_buf(const char *fmt, va_list args)
 		fmt1 = k + 2;
 	}
 	vsprintf(p, fmt1, args);
+	spin_unlock(&error_lock);
 
 }
 
 /* in addition to usual conversion specifiers this accepts reiserfs
-   specific conversion specifiers: 
-   %k to print little endian key, 
-   %K to print cpu key, 
+   specific conversion specifiers:
+   %k to print little endian key,
+   %K to print cpu key,
    %h to print item_head,
-   %t to print directory entry 
+   %t to print directory entry
    %z to print block head (arg must be struct buffer_head *
    %b to print buffer_head
 */
@@ -264,14 +261,17 @@ static void prepare_error_buf(const char *fmt, va_list args)
     va_end( args );\
 }
 
-void reiserfs_warning(struct super_block *sb, const char *fmt, ...)
+void __reiserfs_warning(struct super_block *sb, const char *id,
+			 const char *function, const char *fmt, ...)
 {
 	do_reiserfs_warning(fmt);
 	if (sb)
-		printk(KERN_WARNING "ReiserFS: %s: warning: %s\n",
-		       reiserfs_bdevname(sb), error_buf);
+		printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
+		       "%s\n", sb->s_id, id ? id : "", id ? " " : "",
+		       function, error_buf);
 	else
-		printk(KERN_WARNING "ReiserFS: warning: %s\n", error_buf);
+		printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
+		       id ? id : "", id ? " " : "", function, error_buf);
 }
 
 /* No newline.. reiserfs_info calls can be followed by printk's */
@@ -279,10 +279,10 @@ void reiserfs_info(struct super_block *sb, const char *fmt, ...)
 {
 	do_reiserfs_warning(fmt);
 	if (sb)
-		printk(KERN_NOTICE "ReiserFS: %s: %s",
-		       reiserfs_bdevname(sb), error_buf);
+		printk(KERN_NOTICE "REISERFS (device %s): %s",
+		       sb->s_id, error_buf);
 	else
-		printk(KERN_NOTICE "ReiserFS: %s", error_buf);
+		printk(KERN_NOTICE "REISERFS %s:", error_buf);
 }
 
 /* No newline.. reiserfs_printk calls can be followed by printk's */
@@ -297,10 +297,10 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
 #ifdef CONFIG_REISERFS_CHECK
 	do_reiserfs_warning(fmt);
 	if (s)
-		printk(KERN_DEBUG "ReiserFS: %s: %s\n",
-		       reiserfs_bdevname(s), error_buf);
+		printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
+		       s->s_id, error_buf);
 	else
-		printk(KERN_DEBUG "ReiserFS: %s\n", error_buf);
+		printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
 #endif
 }
 
@@ -314,17 +314,17 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
     maintainer-errorid.  Don't bother with reusing errorids, there are
     lots of numbers out there.
 
-    Example: 
-    
+    Example:
+
     reiserfs_panic(
 	p_sb, "reiser-29: reiserfs_new_blocknrs: "
 	"one of search_start or rn(%d) is equal to MAX_B_NUM,"
-	"which means that we are optimizing location based on the bogus location of a temp buffer (%p).", 
+	"which means that we are optimizing location based on the bogus location of a temp buffer (%p).",
 	rn, bh
     );
 
     Regular panic()s sometimes clear the screen before the message can
-    be read, thus the need for the while loop.  
+    be read, thus the need for the while loop.
 
     Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it
     pointless complexity):
@@ -353,14 +353,46 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
 extern struct tree_balance *cur_tb;
 #endif
 
-void reiserfs_panic(struct super_block *sb, const char *fmt, ...)
+void __reiserfs_panic(struct super_block *sb, const char *id,
+		      const char *function, const char *fmt, ...)
 {
 	do_reiserfs_warning(fmt);
 
+#ifdef CONFIG_REISERFS_CHECK
 	dump_stack();
+#endif
+	if (sb)
+		panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
+		      sb->s_id, id ? id : "", id ? " " : "",
+		      function, error_buf);
+	else
+		panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
+		      id ? id : "", id ? " " : "", function, error_buf);
+}
+
+void __reiserfs_error(struct super_block *sb, const char *id,
+		      const char *function, const char *fmt, ...)
+{
+	do_reiserfs_warning(fmt);
 
-	panic(KERN_EMERG "REISERFS: panic (device %s): %s\n",
-	       reiserfs_bdevname(sb), error_buf);
+	BUG_ON(sb == NULL);
+
+	if (reiserfs_error_panic(sb))
+		__reiserfs_panic(sb, id, function, error_buf);
+
+	if (id && id[0])
+		printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
+		       sb->s_id, id, function, error_buf);
+	else
+		printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
+		       sb->s_id, function, error_buf);
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	reiserfs_info(sb, "Remounting filesystem read-only\n");
+	sb->s_flags |= MS_RDONLY;
+	reiserfs_abort_journal(sb, -EIO);
 }
 
 void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
@@ -368,18 +400,18 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
 	do_reiserfs_warning(fmt);
 
 	if (reiserfs_error_panic(sb)) {
-		panic(KERN_CRIT "REISERFS: panic (device %s): %s\n",
-		      reiserfs_bdevname(sb), error_buf);
+		panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
+		      error_buf);
 	}
 
-	if (sb->s_flags & MS_RDONLY)
+	if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
 		return;
 
-	printk(KERN_CRIT "REISERFS: abort (device %s): %s\n",
-	       reiserfs_bdevname(sb), error_buf);
+	printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
+	       error_buf);
 
 	sb->s_flags |= MS_RDONLY;
-	reiserfs_journal_abort(sb, errno);
+	reiserfs_abort_journal(sb, errno);
 }
 
 /* this prints internal nodes (4 keys/items in line) (dc_number,
@@ -681,12 +713,10 @@ static void check_leaf_block_head(struct buffer_head *bh)
 	blkh = B_BLK_HEAD(bh);
 	nr = blkh_nr_item(blkh);
 	if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-		reiserfs_panic(NULL,
-			       "vs-6010: check_leaf_block_head: invalid item number %z",
+		reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
 			       bh);
 	if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
-		reiserfs_panic(NULL,
-			       "vs-6020: check_leaf_block_head: invalid free space %z",
+		reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
 			       bh);
 
 }
@@ -697,21 +727,15 @@ static void check_internal_block_head(struct buffer_head *bh)
 
 	blkh = B_BLK_HEAD(bh);
 	if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
-		reiserfs_panic(NULL,
-			       "vs-6025: check_internal_block_head: invalid level %z",
-			       bh);
+		reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
 
 	if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-		reiserfs_panic(NULL,
-			       "vs-6030: check_internal_block_head: invalid item number %z",
-			       bh);
+		reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
 
 	if (B_FREE_SPACE(bh) !=
 	    bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
 	    DC_SIZE * (B_NR_ITEMS(bh) + 1))
-		reiserfs_panic(NULL,
-			       "vs-6040: check_internal_block_head: invalid free space %z",
-			       bh);
+		reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
 
 }
 
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 37173fa07d15..9229e5514a4e 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -321,7 +321,7 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 		   /* incore fields */
 		   "j_1st_reserved_block: \t%i\n"
 		   "j_state: \t%li\n"
-		   "j_trans_id: \t%lu\n"
+		   "j_trans_id: \t%u\n"
 		   "j_mount_id: \t%lu\n"
 		   "j_start: \t%lu\n"
 		   "j_len: \t%lu\n"
@@ -329,7 +329,7 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 		   "j_wcount: \t%i\n"
 		   "j_bcount: \t%lu\n"
 		   "j_first_unflushed_offset: \t%lu\n"
-		   "j_last_flush_trans_id: \t%lu\n"
+		   "j_last_flush_trans_id: \t%u\n"
 		   "j_trans_start_time: \t%li\n"
 		   "j_list_bitmap_index: \t%i\n"
 		   "j_must_wait: \t%i\n"
@@ -492,7 +492,6 @@ int reiserfs_proc_info_init(struct super_block *sb)
 	spin_lock_init(&__PINFO(sb).lock);
 	REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root);
 	if (REISERFS_SB(sb)->procdir) {
-		REISERFS_SB(sb)->procdir->owner = THIS_MODULE;
 		REISERFS_SB(sb)->procdir->data = sb;
 		add_file(sb, "version", show_version);
 		add_file(sb, "super", show_super);
@@ -503,7 +502,7 @@ int reiserfs_proc_info_init(struct super_block *sb)
 		add_file(sb, "journal", show_journal);
 		return 0;
 	}
-	reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s",
+	reiserfs_warning(sb, "cannot create /proc/%s/%s",
 			 proc_info_root_name, b);
 	return 1;
 }
@@ -556,11 +555,8 @@ int reiserfs_proc_info_global_init(void)
 {
 	if (proc_info_root == NULL) {
 		proc_info_root = proc_mkdir(proc_info_root_name, NULL);
-		if (proc_info_root) {
-			proc_info_root->owner = THIS_MODULE;
-		} else {
-			reiserfs_warning(NULL,
-					 "reiserfs: cannot create /proc/%s",
+		if (!proc_info_root) {
+			reiserfs_warning(NULL, "cannot create /proc/%s",
 					 proc_info_root_name);
 			return 1;
 		}
@@ -634,7 +630,7 @@ int reiserfs_global_version_in_proc(char *buffer, char **start,
  *
  */
 
-/* 
+/*
  * Make Linus happy.
  * Local variables:
  * c-indentation-style: "K&R"
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index f71c3948edef..238e9d9b31e0 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -1,8 +1,8 @@
-/* 
+/*
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-/* 
+/*
  * Written by Alexander Zarochentcev.
  *
  * The kernel part of the (on-line) reiserfs resizer.
@@ -101,7 +101,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 			memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
 
 			/* just in case vfree schedules on us, copy the new
-			 ** pointer into the journal struct before freeing the 
+			 ** pointer into the journal struct before freeing the
 			 ** old one
 			 */
 			node_tmp = jb->bitmaps;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index abbc64dcc8d4..d036ee5b1c81 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -23,7 +23,6 @@
  * get_rkey
  * key_in_buffer
  * decrement_bcount
- * decrement_counters_in_path
  * reiserfs_check_path
  * pathrelse_and_restore
  * pathrelse
@@ -57,28 +56,28 @@
 #include <linux/quotaops.h>
 
 /* Does the buffer contain a disk block which is in the tree. */
-inline int B_IS_IN_TREE(const struct buffer_head *p_s_bh)
+inline int B_IS_IN_TREE(const struct buffer_head *bh)
 {
 
-	RFALSE(B_LEVEL(p_s_bh) > MAX_HEIGHT,
-	       "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh);
+	RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
+	       "PAP-1010: block (%b) has too big level (%z)", bh, bh);
 
-	return (B_LEVEL(p_s_bh) != FREE_LEVEL);
+	return (B_LEVEL(bh) != FREE_LEVEL);
 }
 
 //
 // to gets item head in le form
 //
-inline void copy_item_head(struct item_head *p_v_to,
-			   const struct item_head *p_v_from)
+inline void copy_item_head(struct item_head *to,
+			   const struct item_head *from)
 {
-	memcpy(p_v_to, p_v_from, IH_SIZE);
+	memcpy(to, from, IH_SIZE);
 }
 
 /* k1 is pointer to on-disk structure which is stored in little-endian
    form. k2 is pointer to cpu variable. For key of items of the same
    object this returns 0.
-   Returns: -1 if key1 < key2 
+   Returns: -1 if key1 < key2
    0 if key1 == key2
    1 if key1 > key2 */
 inline int comp_short_keys(const struct reiserfs_key *le_key,
@@ -136,15 +135,15 @@ static inline int comp_keys(const struct reiserfs_key *le_key,
 inline int comp_short_le_keys(const struct reiserfs_key *key1,
 			      const struct reiserfs_key *key2)
 {
-	__u32 *p_s_1_u32, *p_s_2_u32;
-	int n_key_length = REISERFS_SHORT_KEY_LEN;
+	__u32 *k1_u32, *k2_u32;
+	int key_length = REISERFS_SHORT_KEY_LEN;
 
-	p_s_1_u32 = (__u32 *) key1;
-	p_s_2_u32 = (__u32 *) key2;
-	for (; n_key_length--; ++p_s_1_u32, ++p_s_2_u32) {
-		if (le32_to_cpu(*p_s_1_u32) < le32_to_cpu(*p_s_2_u32))
+	k1_u32 = (__u32 *) key1;
+	k2_u32 = (__u32 *) key2;
+	for (; key_length--; ++k1_u32, ++k2_u32) {
+		if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
 			return -1;
-		if (le32_to_cpu(*p_s_1_u32) > le32_to_cpu(*p_s_2_u32))
+		if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
 			return 1;
 	}
 	return 0;
@@ -175,52 +174,51 @@ inline int comp_le_keys(const struct reiserfs_key *k1,
  *  Binary search toolkit function                                        *
  *  Search for an item in the array by the item key                       *
  *  Returns:    1 if found,  0 if not found;                              *
- *        *p_n_pos = number of the searched element if found, else the    *
- *        number of the first element that is larger than p_v_key.        *
+ *        *pos = number of the searched element if found, else the        *
+ *        number of the first element that is larger than key.            *
  **************************************************************************/
-/* For those not familiar with binary search: n_lbound is the leftmost item that it
- could be, n_rbound the rightmost item that it could be.  We examine the item
- halfway between n_lbound and n_rbound, and that tells us either that we can increase
- n_lbound, or decrease n_rbound, or that we have found it, or if n_lbound <= n_rbound that
+/* For those not familiar with binary search: lbound is the leftmost item that it
+ could be, rbound the rightmost item that it could be.  We examine the item
+ halfway between lbound and rbound, and that tells us either that we can increase
+ lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that
  there are no possible items, and we have not found it. With each examination we
  cut the number of possible items it could be by one more than half rounded down,
  or we find it. */
-static inline int bin_search(const void *p_v_key,	/* Key to search for.                   */
-			     const void *p_v_base,	/* First item in the array.             */
-			     int p_n_num,	/* Number of items in the array.        */
-			     int p_n_width,	/* Item size in the array.
-						   searched. Lest the reader be
-						   confused, note that this is crafted
-						   as a general function, and when it
-						   is applied specifically to the array
-						   of item headers in a node, p_n_width
-						   is actually the item header size not
-						   the item size.                      */
-			     int *p_n_pos	/* Number of the searched for element. */
+static inline int bin_search(const void *key,	/* Key to search for. */
+			     const void *base,	/* First item in the array. */
+			     int num,	/* Number of items in the array. */
+			     int width,	/* Item size in the array.
+					   searched. Lest the reader be
+					   confused, note that this is crafted
+					   as a general function, and when it
+					   is applied specifically to the array
+					   of item headers in a node, width
+					   is actually the item header size not
+					   the item size. */
+			     int *pos /* Number of the searched for element. */
     )
 {
-	int n_rbound, n_lbound, n_j;
+	int rbound, lbound, j;
 
-	for (n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0)) / 2;
-	     n_lbound <= n_rbound; n_j = (n_rbound + n_lbound) / 2)
+	for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
+	     lbound <= rbound; j = (rbound + lbound) / 2)
 		switch (comp_keys
-			((struct reiserfs_key *)((char *)p_v_base +
-						 n_j * p_n_width),
-			 (struct cpu_key *)p_v_key)) {
+			((struct reiserfs_key *)((char *)base + j * width),
+			 (struct cpu_key *)key)) {
 		case -1:
-			n_lbound = n_j + 1;
+			lbound = j + 1;
 			continue;
 		case 1:
-			n_rbound = n_j - 1;
+			rbound = j - 1;
 			continue;
 		case 0:
-			*p_n_pos = n_j;
+			*pos = j;
 			return ITEM_FOUND;	/* Key found in the array.  */
 		}
 
 	/* bin_search did not find given key, it returns position of key,
 	   that is minimal and greater than the given one. */
-	*p_n_pos = n_lbound;
+	*pos = lbound;
 	return ITEM_NOT_FOUND;
 }
 
@@ -243,90 +241,88 @@ static const struct reiserfs_key MAX_KEY = {
    of the path, and going upwards.  We must check the path's validity at each step.  If the key is not in
    the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
    case we return a special key, either MIN_KEY or MAX_KEY. */
-static inline const struct reiserfs_key *get_lkey(const struct treepath
-						  *p_s_chk_path,
-						  const struct super_block
-						  *p_s_sb)
+static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
+						  const struct super_block *sb)
 {
-	int n_position, n_path_offset = p_s_chk_path->path_length;
-	struct buffer_head *p_s_parent;
+	int position, path_offset = chk_path->path_length;
+	struct buffer_head *parent;
 
-	RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
+	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
 	       "PAP-5010: invalid offset in the path");
 
 	/* While not higher in path than first element. */
-	while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
+	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
 
 		RFALSE(!buffer_uptodate
-		       (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
+		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
 		       "PAP-5020: parent is not uptodate");
 
 		/* Parent at the path is not in the tree now. */
 		if (!B_IS_IN_TREE
-		    (p_s_parent =
-		     PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)))
+		    (parent =
+		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
 			return &MAX_KEY;
 		/* Check whether position in the parent is correct. */
-		if ((n_position =
-		     PATH_OFFSET_POSITION(p_s_chk_path,
-					  n_path_offset)) >
-		    B_NR_ITEMS(p_s_parent))
+		if ((position =
+		     PATH_OFFSET_POSITION(chk_path,
+					  path_offset)) >
+		    B_NR_ITEMS(parent))
 			return &MAX_KEY;
 		/* Check whether parent at the path really points to the child. */
-		if (B_N_CHILD_NUM(p_s_parent, n_position) !=
-		    PATH_OFFSET_PBUFFER(p_s_chk_path,
-					n_path_offset + 1)->b_blocknr)
+		if (B_N_CHILD_NUM(parent, position) !=
+		    PATH_OFFSET_PBUFFER(chk_path,
+					path_offset + 1)->b_blocknr)
 			return &MAX_KEY;
 		/* Return delimiting key if position in the parent is not equal to zero. */
-		if (n_position)
-			return B_N_PDELIM_KEY(p_s_parent, n_position - 1);
+		if (position)
+			return B_N_PDELIM_KEY(parent, position - 1);
 	}
 	/* Return MIN_KEY if we are in the root of the buffer tree. */
-	if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->
-	    b_blocknr == SB_ROOT_BLOCK(p_s_sb))
+	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
+	    b_blocknr == SB_ROOT_BLOCK(sb))
 		return &MIN_KEY;
 	return &MAX_KEY;
 }
 
 /* Get delimiting key of the buffer at the path and its right neighbor. */
-inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path,
-					   const struct super_block *p_s_sb)
+inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
+					   const struct super_block *sb)
 {
-	int n_position, n_path_offset = p_s_chk_path->path_length;
-	struct buffer_head *p_s_parent;
+	int position, path_offset = chk_path->path_length;
+	struct buffer_head *parent;
 
-	RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
+	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
 	       "PAP-5030: invalid offset in the path");
 
-	while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
+	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
 
 		RFALSE(!buffer_uptodate
-		       (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
+		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
 		       "PAP-5040: parent is not uptodate");
 
 		/* Parent at the path is not in the tree now. */
 		if (!B_IS_IN_TREE
-		    (p_s_parent =
-		     PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)))
+		    (parent =
+		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
 			return &MIN_KEY;
 		/* Check whether position in the parent is correct. */
-		if ((n_position =
-		     PATH_OFFSET_POSITION(p_s_chk_path,
-					  n_path_offset)) >
-		    B_NR_ITEMS(p_s_parent))
+		if ((position =
+		     PATH_OFFSET_POSITION(chk_path,
+					  path_offset)) >
+		    B_NR_ITEMS(parent))
 			return &MIN_KEY;
 		/* Check whether parent at the path really points to the child. */
-		if (B_N_CHILD_NUM(p_s_parent, n_position) !=
-		    PATH_OFFSET_PBUFFER(p_s_chk_path,
-					n_path_offset + 1)->b_blocknr)
+		if (B_N_CHILD_NUM(parent, position) !=
+		    PATH_OFFSET_PBUFFER(chk_path,
+					path_offset + 1)->b_blocknr)
 			return &MIN_KEY;
 		/* Return delimiting key if position in the parent is not the last one. */
-		if (n_position != B_NR_ITEMS(p_s_parent))
-			return B_N_PDELIM_KEY(p_s_parent, n_position);
+		if (position != B_NR_ITEMS(parent))
+			return B_N_PDELIM_KEY(parent, position);
 	}
 	/* Return MAX_KEY if we are in the root of the buffer tree. */
-	if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->
-	    b_blocknr == SB_ROOT_BLOCK(p_s_sb))
+	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
+	    b_blocknr == SB_ROOT_BLOCK(sb))
 		return &MAX_KEY;
 	return &MIN_KEY;
 }
@@ -336,60 +332,29 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path,
    the path.  These delimiting keys are stored at least one level above that buffer in the tree. If the
    buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
    this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
-static inline int key_in_buffer(struct treepath *p_s_chk_path,	/* Path which should be checked.  */
-				const struct cpu_key *p_s_key,	/* Key which should be checked.   */
-				struct super_block *p_s_sb	/* Super block pointer.           */
+static inline int key_in_buffer(struct treepath *chk_path,	/* Path which should be checked.  */
+				const struct cpu_key *key,	/* Key which should be checked.   */
+				struct super_block *sb
     )
 {
 
-	RFALSE(!p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
-	       || p_s_chk_path->path_length > MAX_HEIGHT,
+	RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
+	       || chk_path->path_length > MAX_HEIGHT,
 	       "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
-	       p_s_key, p_s_chk_path->path_length);
-	RFALSE(!PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev,
+	       key, chk_path->path_length);
+	RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
 	       "PAP-5060: device must not be NODEV");
 
-	if (comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1)
+	if (comp_keys(get_lkey(chk_path, sb), key) == 1)
 		/* left delimiting key is bigger, that the key we look for */
 		return 0;
-	//  if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 )
-	if (comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1)
-		/* p_s_key must be less than right delimitiing key */
+	/*  if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
+	if (comp_keys(get_rkey(chk_path, sb), key) != 1)
+		/* key must be less than right delimitiing key */
 		return 0;
 	return 1;
 }
 
-inline void decrement_bcount(struct buffer_head *p_s_bh)
-{
-	if (p_s_bh) {
-		if (atomic_read(&(p_s_bh->b_count))) {
-			put_bh(p_s_bh);
-			return;
-		}
-		reiserfs_panic(NULL,
-			       "PAP-5070: decrement_bcount: trying to free free buffer %b",
-			       p_s_bh);
-	}
-}
-
-/* Decrement b_count field of the all buffers in the path. */
-void decrement_counters_in_path(struct treepath *p_s_search_path)
-{
-	int n_path_offset = p_s_search_path->path_length;
-
-	RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ||
-	       n_path_offset > EXTENDED_MAX_HEIGHT - 1,
-	       "PAP-5080: invalid path offset of %d", n_path_offset);
-
-	while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
-		struct buffer_head *bh;
-
-		bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--);
-		decrement_bcount(bh);
-	}
-	p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
-
 int reiserfs_check_path(struct treepath *p)
 {
 	RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
@@ -397,40 +362,38 @@ int reiserfs_check_path(struct treepath *p)
 	return 0;
 }
 
-/* Release all buffers in the path. Restore dirty bits clean
-** when preparing the buffer for the log
-**
-** only called from fix_nodes()
-*/
-void pathrelse_and_restore(struct super_block *s, struct treepath *p_s_search_path)
+/* Drop the reference to each buffer in a path and restore
+ * dirty bits clean when preparing the buffer for the log.
+ * This version should only be called from fix_nodes() */
+void pathrelse_and_restore(struct super_block *sb,
+			   struct treepath *search_path)
 {
-	int n_path_offset = p_s_search_path->path_length;
+	int path_offset = search_path->path_length;
 
-	RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
+	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
 	       "clm-4000: invalid path offset");
 
-	while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
-		reiserfs_restore_prepared_buffer(s,
-						 PATH_OFFSET_PBUFFER
-						 (p_s_search_path,
-						  n_path_offset));
-		brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
+	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
+		struct buffer_head *bh;
+		bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
+		reiserfs_restore_prepared_buffer(sb, bh);
+		brelse(bh);
 	}
-	p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
+	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
 }
 
-/* Release all buffers in the path. */
-void pathrelse(struct treepath *p_s_search_path)
+/* Drop the reference to each buffer in a path */
+void pathrelse(struct treepath *search_path)
 {
-	int n_path_offset = p_s_search_path->path_length;
+	int path_offset = search_path->path_length;
 
-	RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
+	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
 	       "PAP-5090: invalid path offset");
 
-	while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
-		brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
+	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
+		brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
 
-	p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
+	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
 }
 
 static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
@@ -444,23 +407,24 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
 
 	blkh = (struct block_head *)buf;
 	if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
-		reiserfs_warning(NULL,
-				 "is_leaf: this should be caught earlier");
+		reiserfs_warning(NULL, "reiserfs-5080",
+				 "this should be caught earlier");
 		return 0;
 	}
 
 	nr = blkh_nr_item(blkh);
 	if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
 		/* item number is too big or too small */
-		reiserfs_warning(NULL, "is_leaf: nr_item seems wrong: %z", bh);
+		reiserfs_warning(NULL, "reiserfs-5081",
+				 "nr_item seems wrong: %z", bh);
 		return 0;
 	}
 	ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
 	used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
 	if (used_space != blocksize - blkh_free_space(blkh)) {
 		/* free space does not match to calculated amount of use space */
-		reiserfs_warning(NULL, "is_leaf: free space seems wrong: %z",
-				 bh);
+		reiserfs_warning(NULL, "reiserfs-5082",
+				 "free space seems wrong: %z", bh);
 		return 0;
 	}
 	// FIXME: it is_leaf will hit performance too much - we may have
@@ -471,29 +435,29 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
 	prev_location = blocksize;
 	for (i = 0; i < nr; i++, ih++) {
 		if (le_ih_k_type(ih) == TYPE_ANY) {
-			reiserfs_warning(NULL,
-					 "is_leaf: wrong item type for item %h",
+			reiserfs_warning(NULL, "reiserfs-5083",
+					 "wrong item type for item %h",
 					 ih);
 			return 0;
 		}
 		if (ih_location(ih) >= blocksize
 		    || ih_location(ih) < IH_SIZE * nr) {
-			reiserfs_warning(NULL,
-					 "is_leaf: item location seems wrong: %h",
+			reiserfs_warning(NULL, "reiserfs-5084",
+					 "item location seems wrong: %h",
 					 ih);
 			return 0;
 		}
 		if (ih_item_len(ih) < 1
 		    || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
-			reiserfs_warning(NULL,
-					 "is_leaf: item length seems wrong: %h",
+			reiserfs_warning(NULL, "reiserfs-5085",
+					 "item length seems wrong: %h",
 					 ih);
 			return 0;
 		}
 		if (prev_location - ih_location(ih) != ih_item_len(ih)) {
-			reiserfs_warning(NULL,
-					 "is_leaf: item location seems wrong (second one): %h",
-					 ih);
+			reiserfs_warning(NULL, "reiserfs-5086",
+					 "item location seems wrong "
+					 "(second one): %h", ih);
 			return 0;
 		}
 		prev_location = ih_location(ih);
@@ -514,24 +478,23 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
 	nr = blkh_level(blkh);
 	if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
 		/* this level is not possible for internal nodes */
-		reiserfs_warning(NULL,
-				 "is_internal: this should be caught earlier");
+		reiserfs_warning(NULL, "reiserfs-5087",
+				 "this should be caught earlier");
 		return 0;
 	}
 
 	nr = blkh_nr_item(blkh);
 	if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
 		/* for internal which is not root we might check min number of keys */
-		reiserfs_warning(NULL,
-				 "is_internal: number of key seems wrong: %z",
-				 bh);
+		reiserfs_warning(NULL, "reiserfs-5088",
+				 "number of key seems wrong: %z", bh);
 		return 0;
 	}
 
 	used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
 	if (used_space != blocksize - blkh_free_space(blkh)) {
-		reiserfs_warning(NULL,
-				 "is_internal: free space seems wrong: %z", bh);
+		reiserfs_warning(NULL, "reiserfs-5089",
+				 "free space seems wrong: %z", bh);
 		return 0;
 	}
 	// one may imagine much more checks
@@ -543,8 +506,8 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
 static int is_tree_node(struct buffer_head *bh, int level)
 {
 	if (B_LEVEL(bh) != level) {
-		reiserfs_warning(NULL,
-				 "is_tree_node: node level %d does not match to the expected one %d",
+		reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
+				 "not match to the expected one %d",
 				 B_LEVEL(bh), level);
 		return 0;
 	}
@@ -580,10 +543,10 @@ static void search_by_key_reada(struct super_block *s,
 /**************************************************************************
  * Algorithm   SearchByKey                                                *
  *             look for item in the Disk S+Tree by its key                *
- * Input:  p_s_sb   -  super block                                        *
- *         p_s_key  - pointer to the key to search                        *
+ * Input:  sb   -  super block                                            *
+ *         key  - pointer to the key to search                            *
  * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR                         *
- *         p_s_search_path - path from the root to the needed leaf        *
+ *         search_path - path from the root to the needed leaf            *
  **************************************************************************/
 
 /* This function fills up the path from the root to the leaf as it
@@ -600,22 +563,22 @@ static void search_by_key_reada(struct super_block *s,
    correctness of the top of the path but need not be checked for the
    correctness of the bottom of the path */
 /* The function is NOT SCHEDULE-SAFE! */
-int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/* Key to search. */
-		  struct treepath *p_s_search_path,/* This structure was
+int search_by_key(struct super_block *sb, const struct cpu_key *key,	/* Key to search. */
+		  struct treepath *search_path,/* This structure was
 						   allocated and initialized
 						   by the calling
 						   function. It is filled up
 						   by this function.  */
-		  int n_stop_level	/* How far down the tree to search. To
+		  int stop_level	/* How far down the tree to search. To
 					   stop at leaf level - set to
 					   DISK_LEAF_NODE_LEVEL */
     )
 {
-	b_blocknr_t n_block_number;
+	b_blocknr_t block_number;
 	int expected_level;
-	struct buffer_head *p_s_bh;
-	struct path_element *p_s_last_element;
-	int n_node_level, n_retval;
+	struct buffer_head *bh;
+	struct path_element *last_element;
+	int node_level, retval;
 	int right_neighbor_of_leaf_node;
 	int fs_gen;
 	struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
@@ -623,80 +586,79 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/*
 	int reada_count = 0;
 
 #ifdef CONFIG_REISERFS_CHECK
-	int n_repeat_counter = 0;
+	int repeat_counter = 0;
 #endif
 
-	PROC_INFO_INC(p_s_sb, search_by_key);
+	PROC_INFO_INC(sb, search_by_key);
 
 	/* As we add each node to a path we increase its count.  This means that
 	   we must be careful to release all nodes in a path before we either
 	   discard the path struct or re-use the path struct, as we do here. */
 
-	decrement_counters_in_path(p_s_search_path);
+	pathrelse(search_path);
 
 	right_neighbor_of_leaf_node = 0;
 
 	/* With each iteration of this loop we search through the items in the
 	   current node, and calculate the next current node(next path element)
 	   for the next iteration of this loop.. */
-	n_block_number = SB_ROOT_BLOCK(p_s_sb);
+	block_number = SB_ROOT_BLOCK(sb);
 	expected_level = -1;
 	while (1) {
 
 #ifdef CONFIG_REISERFS_CHECK
-		if (!(++n_repeat_counter % 50000))
-			reiserfs_warning(p_s_sb, "PAP-5100: search_by_key: %s:"
-					 "there were %d iterations of while loop "
-					 "looking for key %K",
-					 current->comm, n_repeat_counter,
-					 p_s_key);
+		if (!(++repeat_counter % 50000))
+			reiserfs_warning(sb, "PAP-5100",
+					 "%s: there were %d iterations of "
+					 "while loop looking for key %K",
+					 current->comm, repeat_counter,
+					 key);
 #endif
 
 		/* prep path to have another element added to it. */
-		p_s_last_element =
-		    PATH_OFFSET_PELEMENT(p_s_search_path,
-					 ++p_s_search_path->path_length);
-		fs_gen = get_generation(p_s_sb);
+		last_element =
+		    PATH_OFFSET_PELEMENT(search_path,
+					 ++search_path->path_length);
+		fs_gen = get_generation(sb);
 
 		/* Read the next tree node, and set the last element in the path to
 		   have a pointer to it. */
-		if ((p_s_bh = p_s_last_element->pe_buffer =
-		     sb_getblk(p_s_sb, n_block_number))) {
-			if (!buffer_uptodate(p_s_bh) && reada_count > 1) {
-				search_by_key_reada(p_s_sb, reada_bh,
+		if ((bh = last_element->pe_buffer =
+		     sb_getblk(sb, block_number))) {
+			if (!buffer_uptodate(bh) && reada_count > 1)
+				search_by_key_reada(sb, reada_bh,
 						    reada_blocks, reada_count);
-			}
-			ll_rw_block(READ, 1, &p_s_bh);
-			wait_on_buffer(p_s_bh);
-			if (!buffer_uptodate(p_s_bh))
+			ll_rw_block(READ, 1, &bh);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
 				goto io_error;
 		} else {
 		      io_error:
-			p_s_search_path->path_length--;
-			pathrelse(p_s_search_path);
+			search_path->path_length--;
+			pathrelse(search_path);
 			return IO_ERROR;
 		}
 		reada_count = 0;
 		if (expected_level == -1)
-			expected_level = SB_TREE_HEIGHT(p_s_sb);
+			expected_level = SB_TREE_HEIGHT(sb);
 		expected_level--;
 
 		/* It is possible that schedule occurred. We must check whether the key
 		   to search is still in the tree rooted from the current buffer. If
 		   not then repeat search from the root. */
-		if (fs_changed(fs_gen, p_s_sb) &&
-		    (!B_IS_IN_TREE(p_s_bh) ||
-		     B_LEVEL(p_s_bh) != expected_level ||
-		     !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) {
-			PROC_INFO_INC(p_s_sb, search_by_key_fs_changed);
-			PROC_INFO_INC(p_s_sb, search_by_key_restarted);
-			PROC_INFO_INC(p_s_sb,
+		if (fs_changed(fs_gen, sb) &&
+		    (!B_IS_IN_TREE(bh) ||
+		     B_LEVEL(bh) != expected_level ||
+		     !key_in_buffer(search_path, key, sb))) {
+			PROC_INFO_INC(sb, search_by_key_fs_changed);
+			PROC_INFO_INC(sb, search_by_key_restarted);
+			PROC_INFO_INC(sb,
 				      sbk_restarted[expected_level - 1]);
-			decrement_counters_in_path(p_s_search_path);
+			pathrelse(search_path);
 
 			/* Get the root block number so that we can repeat the search
 			   starting from the root. */
-			n_block_number = SB_ROOT_BLOCK(p_s_sb);
+			block_number = SB_ROOT_BLOCK(sb);
 			expected_level = -1;
 			right_neighbor_of_leaf_node = 0;
 
@@ -704,53 +666,53 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/*
 			continue;
 		}
 
-		/* only check that the key is in the buffer if p_s_key is not
+		/* only check that the key is in the buffer if key is not
 		   equal to the MAX_KEY. Latter case is only possible in
 		   "finish_unfinished()" processing during mount. */
-		RFALSE(comp_keys(&MAX_KEY, p_s_key) &&
-		       !key_in_buffer(p_s_search_path, p_s_key, p_s_sb),
+		RFALSE(comp_keys(&MAX_KEY, key) &&
+		       !key_in_buffer(search_path, key, sb),
 		       "PAP-5130: key is not in the buffer");
 #ifdef CONFIG_REISERFS_CHECK
 		if (cur_tb) {
 			print_cur_tb("5140");
-			reiserfs_panic(p_s_sb,
-				       "PAP-5140: search_by_key: schedule occurred in do_balance!");
+			reiserfs_panic(sb, "PAP-5140",
+				       "schedule occurred in do_balance!");
 		}
 #endif
 
 		// make sure, that the node contents look like a node of
 		// certain level
-		if (!is_tree_node(p_s_bh, expected_level)) {
-			reiserfs_warning(p_s_sb, "vs-5150: search_by_key: "
-					 "invalid format found in block %ld. Fsck?",
-					 p_s_bh->b_blocknr);
-			pathrelse(p_s_search_path);
+		if (!is_tree_node(bh, expected_level)) {
+			reiserfs_error(sb, "vs-5150",
+				       "invalid format found in block %ld. "
+				       "Fsck?", bh->b_blocknr);
+			pathrelse(search_path);
 			return IO_ERROR;
 		}
 
 		/* ok, we have acquired next formatted node in the tree */
-		n_node_level = B_LEVEL(p_s_bh);
+		node_level = B_LEVEL(bh);
 
-		PROC_INFO_BH_STAT(p_s_sb, p_s_bh, n_node_level - 1);
+		PROC_INFO_BH_STAT(sb, bh, node_level - 1);
 
-		RFALSE(n_node_level < n_stop_level,
+		RFALSE(node_level < stop_level,
 		       "vs-5152: tree level (%d) is less than stop level (%d)",
-		       n_node_level, n_stop_level);
+		       node_level, stop_level);
 
-		n_retval = bin_search(p_s_key, B_N_PITEM_HEAD(p_s_bh, 0),
-				      B_NR_ITEMS(p_s_bh),
-				      (n_node_level ==
+		retval = bin_search(key, B_N_PITEM_HEAD(bh, 0),
+				      B_NR_ITEMS(bh),
+				      (node_level ==
 				       DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
 				      KEY_SIZE,
-				      &(p_s_last_element->pe_position));
-		if (n_node_level == n_stop_level) {
-			return n_retval;
+				      &(last_element->pe_position));
+		if (node_level == stop_level) {
+			return retval;
 		}
 
 		/* we are not in the stop level */
-		if (n_retval == ITEM_FOUND)
+		if (retval == ITEM_FOUND)
 			/* item has been found, so we choose the pointer which is to the right of the found one */
-			p_s_last_element->pe_position++;
+			last_element->pe_position++;
 
 		/* if item was not found we choose the position which is to
 		   the left of the found item. This requires no code,
@@ -759,24 +721,24 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/*
 		/* So we have chosen a position in the current node which is
 		   an internal node.  Now we calculate child block number by
 		   position in the node. */
-		n_block_number =
-		    B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position);
+		block_number =
+		    B_N_CHILD_NUM(bh, last_element->pe_position);
 
 		/* if we are going to read leaf nodes, try for read ahead as well */
-		if ((p_s_search_path->reada & PATH_READA) &&
-		    n_node_level == DISK_LEAF_NODE_LEVEL + 1) {
-			int pos = p_s_last_element->pe_position;
-			int limit = B_NR_ITEMS(p_s_bh);
+		if ((search_path->reada & PATH_READA) &&
+		    node_level == DISK_LEAF_NODE_LEVEL + 1) {
+			int pos = last_element->pe_position;
+			int limit = B_NR_ITEMS(bh);
 			struct reiserfs_key *le_key;
 
-			if (p_s_search_path->reada & PATH_READA_BACK)
+			if (search_path->reada & PATH_READA_BACK)
 				limit = 0;
 			while (reada_count < SEARCH_BY_KEY_READA) {
 				if (pos == limit)
 					break;
 				reada_blocks[reada_count++] =
-				    B_N_CHILD_NUM(p_s_bh, pos);
-				if (p_s_search_path->reada & PATH_READA_BACK)
+				    B_N_CHILD_NUM(bh, pos);
+				if (search_path->reada & PATH_READA_BACK)
 					pos--;
 				else
 					pos++;
@@ -784,9 +746,9 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/*
 				/*
 				 * check to make sure we're in the same object
 				 */
-				le_key = B_N_PDELIM_KEY(p_s_bh, pos);
+				le_key = B_N_PDELIM_KEY(bh, pos);
 				if (le32_to_cpu(le_key->k_objectid) !=
-				    p_s_key->on_disk_key.k_objectid) {
+				    key->on_disk_key.k_objectid) {
 					break;
 				}
 			}
@@ -795,11 +757,11 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/*
 }
 
 /* Form the path to an item and position in this item which contains
-   file byte defined by p_s_key. If there is no such item
+   file byte defined by key. If there is no such item
    corresponding to the key, we point the path to the item with
-   maximal key less than p_s_key, and *p_n_pos_in_item is set to one
+   maximal key less than key, and *pos_in_item is set to one
    past the last entry/byte in the item.  If searching for entry in a
-   directory item, and it is not found, *p_n_pos_in_item is set to one
+   directory item, and it is not found, *pos_in_item is set to one
    entry more than the entry with maximal key which is less than the
    sought key.
 
@@ -810,48 +772,48 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/*
    units of directory entries.  */
 
 /* The function is NOT SCHEDULE-SAFE! */
-int search_for_position_by_key(struct super_block *p_s_sb,	/* Pointer to the super block.          */
+int search_for_position_by_key(struct super_block *sb,	/* Pointer to the super block.          */
 			       const struct cpu_key *p_cpu_key,	/* Key to search (cpu variable)         */
-			       struct treepath *p_s_search_path	/* Filled up by this function.          */
+			       struct treepath *search_path	/* Filled up by this function.          */
     )
 {
 	struct item_head *p_le_ih;	/* pointer to on-disk structure */
-	int n_blk_size;
+	int blk_size;
 	loff_t item_offset, offset;
 	struct reiserfs_dir_entry de;
 	int retval;
 
 	/* If searching for directory entry. */
 	if (is_direntry_cpu_key(p_cpu_key))
-		return search_by_entry_key(p_s_sb, p_cpu_key, p_s_search_path,
+		return search_by_entry_key(sb, p_cpu_key, search_path,
 					   &de);
 
 	/* If not searching for directory entry. */
 
 	/* If item is found. */
-	retval = search_item(p_s_sb, p_cpu_key, p_s_search_path);
+	retval = search_item(sb, p_cpu_key, search_path);
 	if (retval == IO_ERROR)
 		return retval;
 	if (retval == ITEM_FOUND) {
 
 		RFALSE(!ih_item_len
 		       (B_N_PITEM_HEAD
-			(PATH_PLAST_BUFFER(p_s_search_path),
-			 PATH_LAST_POSITION(p_s_search_path))),
+			(PATH_PLAST_BUFFER(search_path),
+			 PATH_LAST_POSITION(search_path))),
 		       "PAP-5165: item length equals zero");
 
-		pos_in_item(p_s_search_path) = 0;
+		pos_in_item(search_path) = 0;
 		return POSITION_FOUND;
 	}
 
-	RFALSE(!PATH_LAST_POSITION(p_s_search_path),
+	RFALSE(!PATH_LAST_POSITION(search_path),
 	       "PAP-5170: position equals zero");
 
 	/* Item is not found. Set path to the previous item. */
 	p_le_ih =
-	    B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path),
-			   --PATH_LAST_POSITION(p_s_search_path));
-	n_blk_size = p_s_sb->s_blocksize;
+	    B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path),
+			   --PATH_LAST_POSITION(search_path));
+	blk_size = sb->s_blocksize;
 
 	if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) {
 		return FILE_NOT_FOUND;
@@ -863,10 +825,10 @@ int search_for_position_by_key(struct super_block *p_s_sb,	/* Pointer to the sup
 
 	/* Needed byte is contained in the item pointed to by the path. */
 	if (item_offset <= offset &&
-	    item_offset + op_bytes_number(p_le_ih, n_blk_size) > offset) {
-		pos_in_item(p_s_search_path) = offset - item_offset;
+	    item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
+		pos_in_item(search_path) = offset - item_offset;
 		if (is_indirect_le_ih(p_le_ih)) {
-			pos_in_item(p_s_search_path) /= n_blk_size;
+			pos_in_item(search_path) /= blk_size;
 		}
 		return POSITION_FOUND;
 	}
@@ -874,30 +836,30 @@ int search_for_position_by_key(struct super_block *p_s_sb,	/* Pointer to the sup
 	/* Needed byte is not contained in the item pointed to by the
 	   path. Set pos_in_item out of the item. */
 	if (is_indirect_le_ih(p_le_ih))
-		pos_in_item(p_s_search_path) =
+		pos_in_item(search_path) =
 		    ih_item_len(p_le_ih) / UNFM_P_SIZE;
 	else
-		pos_in_item(p_s_search_path) = ih_item_len(p_le_ih);
+		pos_in_item(search_path) = ih_item_len(p_le_ih);
 
 	return POSITION_NOT_FOUND;
 }
 
 /* Compare given item and item pointed to by the path. */
-int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_path)
+int comp_items(const struct item_head *stored_ih, const struct treepath *path)
 {
-	struct buffer_head *p_s_bh;
+	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
 	struct item_head *ih;
 
 	/* Last buffer at the path is not in the tree. */
-	if (!B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)))
+	if (!B_IS_IN_TREE(bh))
 		return 1;
 
 	/* Last path position is invalid. */
-	if (PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh))
+	if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
 		return 1;
 
 	/* we need only to know, whether it is the same item */
-	ih = get_ih(p_s_path);
+	ih = get_ih(path);
 	return memcmp(stored_ih, ih, IH_SIZE);
 }
 
@@ -924,9 +886,9 @@ static inline int prepare_for_direct_item(struct treepath *path,
 	}
 	// new file gets truncated
 	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
-		// 
+		//
 		round_len = ROUND_UP(new_file_length);
-		/* this was n_new_file_length < le_ih ... */
+		/* this was new_file_length < le_ih ... */
 		if (round_len < le_ih_k_offset(le_ih)) {
 			*cut_size = -(IH_SIZE + ih_item_len(le_ih));
 			return M_DELETE;	/* Delete this item. */
@@ -986,96 +948,95 @@ static inline int prepare_for_direntry_item(struct treepath *path,
     In case of file truncate calculate whether this item must be deleted/truncated or last
     unformatted node of this item will be converted to a direct item.
     This function returns a determination of what balance mode the calling function should employ. */
-static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *p_s_path, const struct cpu_key *p_s_item_key, int *p_n_removed,	/* Number of unformatted nodes which were removed
+static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed,	/* Number of unformatted nodes which were removed
 																						   from end of the file. */
-				      int *p_n_cut_size, unsigned long long n_new_file_length	/* MAX_KEY_OFFSET in case of delete. */
+				      int *cut_size, unsigned long long new_file_length	/* MAX_KEY_OFFSET in case of delete. */
     )
 {
-	struct super_block *p_s_sb = inode->i_sb;
-	struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_path);
-	struct buffer_head *p_s_bh = PATH_PLAST_BUFFER(p_s_path);
+	struct super_block *sb = inode->i_sb;
+	struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
+	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
 
 	BUG_ON(!th->t_trans_id);
 
 	/* Stat_data item. */
 	if (is_statdata_le_ih(p_le_ih)) {
 
-		RFALSE(n_new_file_length != max_reiserfs_offset(inode),
+		RFALSE(new_file_length != max_reiserfs_offset(inode),
 		       "PAP-5210: mode must be M_DELETE");
 
-		*p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
+		*cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
 		return M_DELETE;
 	}
 
 	/* Directory item. */
 	if (is_direntry_le_ih(p_le_ih))
-		return prepare_for_direntry_item(p_s_path, p_le_ih, inode,
-						 n_new_file_length,
-						 p_n_cut_size);
+		return prepare_for_direntry_item(path, p_le_ih, inode,
+						 new_file_length,
+						 cut_size);
 
 	/* Direct item. */
 	if (is_direct_le_ih(p_le_ih))
-		return prepare_for_direct_item(p_s_path, p_le_ih, inode,
-					       n_new_file_length, p_n_cut_size);
+		return prepare_for_direct_item(path, p_le_ih, inode,
+					       new_file_length, cut_size);
 
 	/* Case of an indirect item. */
 	{
-	    int blk_size = p_s_sb->s_blocksize;
+	    int blk_size = sb->s_blocksize;
 	    struct item_head s_ih;
 	    int need_re_search;
 	    int delete = 0;
 	    int result = M_CUT;
 	    int pos = 0;
 
-	    if ( n_new_file_length == max_reiserfs_offset (inode) ) {
+	    if ( new_file_length == max_reiserfs_offset (inode) ) {
 		/* prepare_for_delete_or_cut() is called by
 		 * reiserfs_delete_item() */
-		n_new_file_length = 0;
+		new_file_length = 0;
 		delete = 1;
 	    }
 
 	    do {
 		need_re_search = 0;
-		*p_n_cut_size = 0;
-		p_s_bh = PATH_PLAST_BUFFER(p_s_path);
-		copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
+		*cut_size = 0;
+		bh = PATH_PLAST_BUFFER(path);
+		copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
 		pos = I_UNFM_NUM(&s_ih);
 
-		while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > n_new_file_length) {
+		while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
 		    __le32 *unfm;
 		    __u32 block;
 
 		    /* Each unformatted block deletion may involve one additional
 		     * bitmap block into the transaction, thereby the initial
 		     * journal space reservation might not be enough. */
-		    if (!delete && (*p_n_cut_size) != 0 &&
-			reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
+		    if (!delete && (*cut_size) != 0 &&
+			reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
 			break;
-		    }
 
-		    unfm = (__le32 *)B_I_PITEM(p_s_bh, &s_ih) + pos - 1;
+		    unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1;
 		    block = get_block_num(unfm, 0);
 
 		    if (block != 0) {
-			reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1);
+			reiserfs_prepare_for_journal(sb, bh, 1);
 			put_block_num(unfm, 0, 0);
-			journal_mark_dirty (th, p_s_sb, p_s_bh);
+			journal_mark_dirty(th, sb, bh);
 			reiserfs_free_block(th, inode, block, 1);
 		    }
 
 		    cond_resched();
 
-		    if (item_moved (&s_ih, p_s_path))  {
+		    if (item_moved (&s_ih, path))  {
 			need_re_search = 1;
 			break;
 		    }
 
 		    pos --;
-		    (*p_n_removed) ++;
-		    (*p_n_cut_size) -= UNFM_P_SIZE;
+		    (*removed)++;
+		    (*cut_size) -= UNFM_P_SIZE;
 
 		    if (pos == 0) {
-			(*p_n_cut_size) -= IH_SIZE;
+			(*cut_size) -= IH_SIZE;
 			result = M_DELETE;
 			break;
 		    }
@@ -1083,12 +1044,12 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
 		/* a trick.  If the buffer has been logged, this will do nothing.  If
 		** we've broken the loop without logging it, it will restore the
 		** buffer */
-		reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh);
+		reiserfs_restore_prepared_buffer(sb, bh);
 	    } while (need_re_search &&
-		     search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND);
-	    pos_in_item(p_s_path) = pos * UNFM_P_SIZE;
+		     search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
+	    pos_in_item(path) = pos * UNFM_P_SIZE;
 
-	    if (*p_n_cut_size == 0) {
+	    if (*cut_size == 0) {
 		/* Nothing were cut. maybe convert last unformatted node to the
 		 * direct item? */
 		result = M_CONVERT;
@@ -1098,45 +1059,45 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
 }
 
 /* Calculate number of bytes which will be deleted or cut during balance */
-static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode)
+static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
 {
-	int n_del_size;
-	struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path);
+	int del_size;
+	struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path);
 
 	if (is_statdata_le_ih(p_le_ih))
 		return 0;
 
-	n_del_size =
-	    (c_mode ==
-	     M_DELETE) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
+	del_size =
+	    (mode ==
+	     M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
 	if (is_direntry_le_ih(p_le_ih)) {
-		// return EMPTY_DIR_SIZE; /* We delete emty directoris only. */
-		// we can't use EMPTY_DIR_SIZE, as old format dirs have a different
-		// empty size.  ick. FIXME, is this right?
-		//
-		return n_del_size;
+		/* return EMPTY_DIR_SIZE; We delete emty directoris only.
+		 * we can't use EMPTY_DIR_SIZE, as old format dirs have a different
+		 * empty size.  ick. FIXME, is this right? */
+		return del_size;
 	}
 
 	if (is_indirect_le_ih(p_le_ih))
-		n_del_size = (n_del_size / UNFM_P_SIZE) * (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);	// - get_ih_free_space (p_le_ih);
-	return n_del_size;
+		del_size = (del_size / UNFM_P_SIZE) *
+				(PATH_PLAST_BUFFER(tb->tb_path)->b_size);
+	return del_size;
 }
 
 static void init_tb_struct(struct reiserfs_transaction_handle *th,
-			   struct tree_balance *p_s_tb,
-			   struct super_block *p_s_sb,
-			   struct treepath *p_s_path, int n_size)
+			   struct tree_balance *tb,
+			   struct super_block *sb,
+			   struct treepath *path, int size)
 {
 
 	BUG_ON(!th->t_trans_id);
 
-	memset(p_s_tb, '\0', sizeof(struct tree_balance));
-	p_s_tb->transaction_handle = th;
-	p_s_tb->tb_sb = p_s_sb;
-	p_s_tb->tb_path = p_s_path;
-	PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
-	PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
-	p_s_tb->insert_size[0] = n_size;
+	memset(tb, '\0', sizeof(struct tree_balance));
+	tb->transaction_handle = th;
+	tb->tb_sb = sb;
+	tb->tb_path = path;
+	PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
+	PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
+	tb->insert_size[0] = size;
 }
 
 void padd_item(char *item, int total_length, int length)
@@ -1175,73 +1136,77 @@ char head2type(struct item_head *ih)
 }
 #endif
 
-/* Delete object item. */
-int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path,	/* Path to the deleted item. */
-			 const struct cpu_key *p_s_item_key,	/* Key to search for the deleted item.  */
-			 struct inode *p_s_inode,	/* inode is here just to update i_blocks and quotas */
-			 struct buffer_head *p_s_un_bh)
-{				/* NULL or unformatted node pointer.    */
-	struct super_block *p_s_sb = p_s_inode->i_sb;
+/* Delete object item.
+ * th       - active transaction handle
+ * path     - path to the deleted item
+ * item_key - key to search for the deleted item
+ * indode   - used for updating i_blocks and quotas
+ * un_bh    - NULL or unformatted node pointer
+ */
+int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
+			 struct treepath *path, const struct cpu_key *item_key,
+			 struct inode *inode, struct buffer_head *un_bh)
+{
+	struct super_block *sb = inode->i_sb;
 	struct tree_balance s_del_balance;
 	struct item_head s_ih;
 	struct item_head *q_ih;
 	int quota_cut_bytes;
-	int n_ret_value, n_del_size, n_removed;
+	int ret_value, del_size, removed;
 
 #ifdef CONFIG_REISERFS_CHECK
-	char c_mode;
-	int n_iter = 0;
+	char mode;
+	int iter = 0;
 #endif
 
 	BUG_ON(!th->t_trans_id);
 
-	init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path,
+	init_tb_struct(th, &s_del_balance, sb, path,
 		       0 /*size is unknown */ );
 
 	while (1) {
-		n_removed = 0;
+		removed = 0;
 
 #ifdef CONFIG_REISERFS_CHECK
-		n_iter++;
-		c_mode =
+		iter++;
+		mode =
 #endif
-		    prepare_for_delete_or_cut(th, p_s_inode, p_s_path,
-					      p_s_item_key, &n_removed,
-					      &n_del_size,
-					      max_reiserfs_offset(p_s_inode));
+		    prepare_for_delete_or_cut(th, inode, path,
+					      item_key, &removed,
+					      &del_size,
+					      max_reiserfs_offset(inode));
 
-		RFALSE(c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
+		RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
 
-		copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
-		s_del_balance.insert_size[0] = n_del_size;
+		copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+		s_del_balance.insert_size[0] = del_size;
 
-		n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
-		if (n_ret_value != REPEAT_SEARCH)
+		ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
+		if (ret_value != REPEAT_SEARCH)
 			break;
 
-		PROC_INFO_INC(p_s_sb, delete_item_restarted);
+		PROC_INFO_INC(sb, delete_item_restarted);
 
 		// file system changed, repeat search
-		n_ret_value =
-		    search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
-		if (n_ret_value == IO_ERROR)
+		ret_value =
+		    search_for_position_by_key(sb, item_key, path);
+		if (ret_value == IO_ERROR)
 			break;
-		if (n_ret_value == FILE_NOT_FOUND) {
-			reiserfs_warning(p_s_sb,
-					 "vs-5340: reiserfs_delete_item: "
+		if (ret_value == FILE_NOT_FOUND) {
+			reiserfs_warning(sb, "vs-5340",
 					 "no items of the file %K found",
-					 p_s_item_key);
+					 item_key);
 			break;
 		}
 	}			/* while (1) */
 
-	if (n_ret_value != CARRY_ON) {
+	if (ret_value != CARRY_ON) {
 		unfix_nodes(&s_del_balance);
 		return 0;
 	}
 	// reiserfs_delete_item returns item length when success
-	n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
-	q_ih = get_ih(p_s_path);
+	ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
+	q_ih = get_ih(path);
 	quota_cut_bytes = ih_item_len(q_ih);
 
 	/* hack so the quota code doesn't have to guess if the file
@@ -1250,15 +1215,15 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath
 	 ** split into multiple items, and we only want to decrement for
 	 ** the unfm node once
 	 */
-	if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
-		if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
-			quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
+	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
+		if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
+			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
 		} else {
 			quota_cut_bytes = 0;
 		}
 	}
 
-	if (p_s_un_bh) {
+	if (un_bh) {
 		int off;
 		char *data;
 
@@ -1276,31 +1241,31 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath
 		 ** The unformatted node must be dirtied later on.  We can't be
 		 ** sure here if the entire tail has been deleted yet.
 		 **
-		 ** p_s_un_bh is from the page cache (all unformatted nodes are
+		 ** un_bh is from the page cache (all unformatted nodes are
 		 ** from the page cache) and might be a highmem page.  So, we
-		 ** can't use p_s_un_bh->b_data.
+		 ** can't use un_bh->b_data.
 		 ** -clm
 		 */
 
-		data = kmap_atomic(p_s_un_bh->b_page, KM_USER0);
+		data = kmap_atomic(un_bh->b_page, KM_USER0);
 		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
 		memcpy(data + off,
-		       B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih),
-		       n_ret_value);
+		       B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih),
+		       ret_value);
 		kunmap_atomic(data, KM_USER0);
 	}
 	/* Perform balancing after all resources have been collected at once. */
 	do_balance(&s_del_balance, NULL, NULL, M_DELETE);
 
 #ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
 		       "reiserquota delete_item(): freeing %u, id=%u type=%c",
-		       quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
+		       quota_cut_bytes, inode->i_uid, head2type(&s_ih));
 #endif
-	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
+	vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
 
 	/* Return deleted body length */
-	return n_ret_value;
+	return ret_value;
 }
 
 /* Summary Of Mechanisms For Handling Collisions Between Processes:
@@ -1338,10 +1303,9 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
 	while (1) {
 		retval = search_item(th->t_super, &cpu_key, &path);
 		if (retval == IO_ERROR) {
-			reiserfs_warning(th->t_super,
-					 "vs-5350: reiserfs_delete_solid_item: "
-					 "i/o failure occurred trying to delete %K",
-					 &cpu_key);
+			reiserfs_error(th->t_super, "vs-5350",
+				       "i/o failure occurred trying "
+				       "to delete %K", &cpu_key);
 			break;
 		}
 		if (retval != ITEM_FOUND) {
@@ -1355,9 +1319,8 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
 			     GET_GENERATION_NUMBER(le_key_k_offset
 						   (le_key_version(key),
 						    key)) == 1))
-				reiserfs_warning(th->t_super,
-						 "vs-5355: reiserfs_delete_solid_item: %k not found",
-						 key);
+				reiserfs_warning(th->t_super, "vs-5355",
+						 "%k not found", key);
 			break;
 		}
 		if (!tb_init) {
@@ -1383,14 +1346,13 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
 					       quota_cut_bytes, inode->i_uid,
 					       key2type(key));
 #endif
-				DQUOT_FREE_SPACE_NODIRTY(inode,
+				vfs_dq_free_space_nodirty(inode,
 							 quota_cut_bytes);
 			}
 			break;
 		}
 		// IO_ERROR, NO_DISK_SPACE, etc
-		reiserfs_warning(th->t_super,
-				 "vs-5360: reiserfs_delete_solid_item: "
+		reiserfs_warning(th->t_super, "vs-5360",
 				 "could not delete %K due to fix_nodes failure",
 				 &cpu_key);
 		unfix_nodes(&tb);
@@ -1462,36 +1424,37 @@ static void unmap_buffers(struct page *page, loff_t pos)
 }
 
 static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
-				    struct inode *p_s_inode,
+				    struct inode *inode,
 				    struct page *page,
-				    struct treepath *p_s_path,
-				    const struct cpu_key *p_s_item_key,
-				    loff_t n_new_file_size, char *p_c_mode)
+				    struct treepath *path,
+				    const struct cpu_key *item_key,
+				    loff_t new_file_size, char *mode)
 {
-	struct super_block *p_s_sb = p_s_inode->i_sb;
-	int n_block_size = p_s_sb->s_blocksize;
+	struct super_block *sb = inode->i_sb;
+	int block_size = sb->s_blocksize;
 	int cut_bytes;
 	BUG_ON(!th->t_trans_id);
-	BUG_ON(n_new_file_size != p_s_inode->i_size);
+	BUG_ON(new_file_size != inode->i_size);
 
 	/* the page being sent in could be NULL if there was an i/o error
 	 ** reading in the last block.  The user will hit problems trying to
 	 ** read the file, but for now we just skip the indirect2direct
 	 */
-	if (atomic_read(&p_s_inode->i_count) > 1 ||
-	    !tail_has_to_be_packed(p_s_inode) ||
-	    !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) {
-		// leave tail in an unformatted node    
-		*p_c_mode = M_SKIP_BALANCING;
+	if (atomic_read(&inode->i_count) > 1 ||
+	    !tail_has_to_be_packed(inode) ||
+	    !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
+		/* leave tail in an unformatted node */
+		*mode = M_SKIP_BALANCING;
 		cut_bytes =
-		    n_block_size - (n_new_file_size & (n_block_size - 1));
-		pathrelse(p_s_path);
+		    block_size - (new_file_size & (block_size - 1));
+		pathrelse(path);
 		return cut_bytes;
 	}
-	/* Permorm the conversion to a direct_item. */
-	/*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); */
-	return indirect2direct(th, p_s_inode, page, p_s_path, p_s_item_key,
-			       n_new_file_size, p_c_mode);
+	/* Perform the conversion to a direct_item. */
+	/* return indirect_to_direct(inode, path, item_key,
+				  new_file_size, mode); */
+	return indirect2direct(th, inode, page, path, item_key,
+			       new_file_size, mode);
 }
 
 /* we did indirect_to_direct conversion. And we have inserted direct
@@ -1515,8 +1478,8 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
 		/* look for the last byte of the tail */
 		if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
 		    POSITION_NOT_FOUND)
-			reiserfs_panic(inode->i_sb,
-				       "vs-5615: indirect_to_direct_roll_back: found invalid item");
+			reiserfs_panic(inode->i_sb, "vs-5615",
+				       "found invalid item");
 		RFALSE(path->pos_in_item !=
 		       ih_item_len(PATH_PITEM_HEAD(path)) - 1,
 		       "vs-5616: appended bytes found");
@@ -1533,38 +1496,39 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
 		set_cpu_key_k_offset(&tail_key,
 				     cpu_key_k_offset(&tail_key) - removed);
 	}
-	reiserfs_warning(inode->i_sb,
-			 "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space");
+	reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
+			 "conversion has been rolled back due to "
+			 "lack of disk space");
 	//mark_file_without_tail (inode);
 	mark_inode_dirty(inode);
 }
 
 /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
 int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
-			   struct treepath *p_s_path,
-			   struct cpu_key *p_s_item_key,
-			   struct inode *p_s_inode,
-			   struct page *page, loff_t n_new_file_size)
+			   struct treepath *path,
+			   struct cpu_key *item_key,
+			   struct inode *inode,
+			   struct page *page, loff_t new_file_size)
 {
-	struct super_block *p_s_sb = p_s_inode->i_sb;
+	struct super_block *sb = inode->i_sb;
 	/* Every function which is going to call do_balance must first
 	   create a tree_balance structure.  Then it must fill up this
 	   structure by using the init_tb_struct and fix_nodes functions.
 	   After that we can make tree balancing. */
 	struct tree_balance s_cut_balance;
 	struct item_head *p_le_ih;
-	int n_cut_size = 0,	/* Amount to be cut. */
-	    n_ret_value = CARRY_ON, n_removed = 0,	/* Number of the removed unformatted nodes. */
-	    n_is_inode_locked = 0;
-	char c_mode;		/* Mode of the balance. */
+	int cut_size = 0,	/* Amount to be cut. */
+	    ret_value = CARRY_ON, removed = 0,	/* Number of the removed unformatted nodes. */
+	    is_inode_locked = 0;
+	char mode;		/* Mode of the balance. */
 	int retval2 = -1;
 	int quota_cut_bytes;
 	loff_t tail_pos = 0;
 
 	BUG_ON(!th->t_trans_id);
 
-	init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path,
-		       n_cut_size);
+	init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
+		       cut_size);
 
 	/* Repeat this loop until we either cut the item without needing
 	   to balance, or we fix_nodes without schedule occurring */
@@ -1574,144 +1538,142 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
 		   free unformatted nodes which are pointed to by the cut
 		   pointers. */
 
-		c_mode =
-		    prepare_for_delete_or_cut(th, p_s_inode, p_s_path,
-					      p_s_item_key, &n_removed,
-					      &n_cut_size, n_new_file_size);
-		if (c_mode == M_CONVERT) {
+		mode =
+		    prepare_for_delete_or_cut(th, inode, path,
+					      item_key, &removed,
+					      &cut_size, new_file_size);
+		if (mode == M_CONVERT) {
 			/* convert last unformatted node to direct item or leave
 			   tail in the unformatted node */
-			RFALSE(n_ret_value != CARRY_ON,
+			RFALSE(ret_value != CARRY_ON,
 			       "PAP-5570: can not convert twice");
 
-			n_ret_value =
-			    maybe_indirect_to_direct(th, p_s_inode, page,
-						     p_s_path, p_s_item_key,
-						     n_new_file_size, &c_mode);
-			if (c_mode == M_SKIP_BALANCING)
+			ret_value =
+			    maybe_indirect_to_direct(th, inode, page,
+						     path, item_key,
+						     new_file_size, &mode);
+			if (mode == M_SKIP_BALANCING)
 				/* tail has been left in the unformatted node */
-				return n_ret_value;
+				return ret_value;
 
-			n_is_inode_locked = 1;
+			is_inode_locked = 1;
 
 			/* removing of last unformatted node will change value we
 			   have to return to truncate. Save it */
-			retval2 = n_ret_value;
-			/*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1)); */
+			retval2 = ret_value;
+			/*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */
 
 			/* So, we have performed the first part of the conversion:
 			   inserting the new direct item.  Now we are removing the
 			   last unformatted node pointer. Set key to search for
 			   it. */
-			set_cpu_key_k_type(p_s_item_key, TYPE_INDIRECT);
-			p_s_item_key->key_length = 4;
-			n_new_file_size -=
-			    (n_new_file_size & (p_s_sb->s_blocksize - 1));
-			tail_pos = n_new_file_size;
-			set_cpu_key_k_offset(p_s_item_key, n_new_file_size + 1);
+			set_cpu_key_k_type(item_key, TYPE_INDIRECT);
+			item_key->key_length = 4;
+			new_file_size -=
+			    (new_file_size & (sb->s_blocksize - 1));
+			tail_pos = new_file_size;
+			set_cpu_key_k_offset(item_key, new_file_size + 1);
 			if (search_for_position_by_key
-			    (p_s_sb, p_s_item_key,
-			     p_s_path) == POSITION_NOT_FOUND) {
-				print_block(PATH_PLAST_BUFFER(p_s_path), 3,
-					    PATH_LAST_POSITION(p_s_path) - 1,
-					    PATH_LAST_POSITION(p_s_path) + 1);
-				reiserfs_panic(p_s_sb,
-					       "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)",
-					       p_s_item_key);
+			    (sb, item_key,
+			     path) == POSITION_NOT_FOUND) {
+				print_block(PATH_PLAST_BUFFER(path), 3,
+					    PATH_LAST_POSITION(path) - 1,
+					    PATH_LAST_POSITION(path) + 1);
+				reiserfs_panic(sb, "PAP-5580", "item to "
+					       "convert does not exist (%K)",
+					       item_key);
 			}
 			continue;
 		}
-		if (n_cut_size == 0) {
-			pathrelse(p_s_path);
+		if (cut_size == 0) {
+			pathrelse(path);
 			return 0;
 		}
 
-		s_cut_balance.insert_size[0] = n_cut_size;
+		s_cut_balance.insert_size[0] = cut_size;
 
-		n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL);
-		if (n_ret_value != REPEAT_SEARCH)
+		ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
+		if (ret_value != REPEAT_SEARCH)
 			break;
 
-		PROC_INFO_INC(p_s_sb, cut_from_item_restarted);
+		PROC_INFO_INC(sb, cut_from_item_restarted);
 
-		n_ret_value =
-		    search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
-		if (n_ret_value == POSITION_FOUND)
+		ret_value =
+		    search_for_position_by_key(sb, item_key, path);
+		if (ret_value == POSITION_FOUND)
 			continue;
 
-		reiserfs_warning(p_s_sb,
-				 "PAP-5610: reiserfs_cut_from_item: item %K not found",
-				 p_s_item_key);
+		reiserfs_warning(sb, "PAP-5610", "item %K not found",
+				 item_key);
 		unfix_nodes(&s_cut_balance);
-		return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT;
+		return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
 	}			/* while */
 
 	// check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
-	if (n_ret_value != CARRY_ON) {
-		if (n_is_inode_locked) {
+	if (ret_value != CARRY_ON) {
+		if (is_inode_locked) {
 			// FIXME: this seems to be not needed: we are always able
 			// to cut item
-			indirect_to_direct_roll_back(th, p_s_inode, p_s_path);
+			indirect_to_direct_roll_back(th, inode, path);
 		}
-		if (n_ret_value == NO_DISK_SPACE)
-			reiserfs_warning(p_s_sb, "NO_DISK_SPACE");
+		if (ret_value == NO_DISK_SPACE)
+			reiserfs_warning(sb, "reiserfs-5092",
+					 "NO_DISK_SPACE");
 		unfix_nodes(&s_cut_balance);
 		return -EIO;
 	}
 
 	/* go ahead and perform balancing */
 
-	RFALSE(c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode");
+	RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
 
 	/* Calculate number of bytes that need to be cut from the item. */
 	quota_cut_bytes =
-	    (c_mode ==
-	     M_DELETE) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.
+	    (mode ==
+	     M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance.
 	    insert_size[0];
 	if (retval2 == -1)
-		n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
+		ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
 	else
-		n_ret_value = retval2;
+		ret_value = retval2;
 
 	/* For direct items, we only change the quota when deleting the last
 	 ** item.
 	 */
 	p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path);
-	if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
-		if (c_mode == M_DELETE &&
-		    (le_ih_k_offset(p_le_ih) & (p_s_sb->s_blocksize - 1)) ==
+	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
+		if (mode == M_DELETE &&
+		    (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
 		    1) {
 			// FIXME: this is to keep 3.5 happy
-			REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX;
-			quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
+			REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
+			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
 		} else {
 			quota_cut_bytes = 0;
 		}
 	}
 #ifdef CONFIG_REISERFS_CHECK
-	if (n_is_inode_locked) {
+	if (is_inode_locked) {
 		struct item_head *le_ih =
 		    PATH_PITEM_HEAD(s_cut_balance.tb_path);
 		/* we are going to complete indirect2direct conversion. Make
 		   sure, that we exactly remove last unformatted node pointer
 		   of the item */
 		if (!is_indirect_le_ih(le_ih))
-			reiserfs_panic(p_s_sb,
-				       "vs-5652: reiserfs_cut_from_item: "
+			reiserfs_panic(sb, "vs-5652",
 				       "item must be indirect %h", le_ih);
 
-		if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
-			reiserfs_panic(p_s_sb,
-				       "vs-5653: reiserfs_cut_from_item: "
-				       "completing indirect2direct conversion indirect item %h "
-				       "being deleted must be of 4 byte long",
-				       le_ih);
+		if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
+			reiserfs_panic(sb, "vs-5653", "completing "
+				       "indirect2direct conversion indirect "
+				       "item %h being deleted must be of "
+				       "4 byte long", le_ih);
 
-		if (c_mode == M_CUT
+		if (mode == M_CUT
 		    && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
-			reiserfs_panic(p_s_sb,
-				       "vs-5654: reiserfs_cut_from_item: "
-				       "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)",
+			reiserfs_panic(sb, "vs-5654", "can not complete "
+				       "indirect2direct conversion of %h "
+				       "(CUT, insert_size==%d)",
 				       le_ih, s_cut_balance.insert_size[0]);
 		}
 		/* it would be useful to make sure, that right neighboring
@@ -1719,23 +1681,23 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
 	}
 #endif
 
-	do_balance(&s_cut_balance, NULL, NULL, c_mode);
-	if (n_is_inode_locked) {
+	do_balance(&s_cut_balance, NULL, NULL, mode);
+	if (is_inode_locked) {
 		/* we've done an indirect->direct conversion.  when the data block
 		 ** was freed, it was removed from the list of blocks that must
 		 ** be flushed before the transaction commits, make sure to
 		 ** unmap and invalidate it
 		 */
 		unmap_buffers(page, tail_pos);
-		REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask;
+		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 	}
 #ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(p_s_inode->i_sb, REISERFS_DEBUG_CODE,
+	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
 		       "reiserquota cut_from_item(): freeing %u id=%u type=%c",
-		       quota_cut_bytes, p_s_inode->i_uid, '?');
+		       quota_cut_bytes, inode->i_uid, '?');
 #endif
-	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
-	return n_ret_value;
+	vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
+	return ret_value;
 }
 
 static void truncate_directory(struct reiserfs_transaction_handle *th,
@@ -1743,8 +1705,7 @@ static void truncate_directory(struct reiserfs_transaction_handle *th,
 {
 	BUG_ON(!th->t_trans_id);
 	if (inode->i_nlink)
-		reiserfs_warning(inode->i_sb,
-				 "vs-5655: truncate_directory: link count != 0");
+		reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
 
 	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
 	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
@@ -1756,8 +1717,8 @@ static void truncate_directory(struct reiserfs_transaction_handle *th,
 
 /* Truncate file to the new size. Note, this must be called with a transaction
    already started */
-int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p_s_inode,	/* ->i_size contains new
-												   size */
+int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
+			  struct inode *inode,	/* ->i_size contains new size */
 			 struct page *page,	/* up to date for last block */
 			 int update_timestamps	/* when it is called by
 						   file_release to convert
@@ -1768,47 +1729,45 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
 	INITIALIZE_PATH(s_search_path);	/* Path to the current object item. */
 	struct item_head *p_le_ih;	/* Pointer to an item header. */
 	struct cpu_key s_item_key;	/* Key to search for a previous file item. */
-	loff_t n_file_size,	/* Old file size. */
-	 n_new_file_size;	/* New file size. */
-	int n_deleted;		/* Number of deleted or truncated bytes. */
+	loff_t file_size,	/* Old file size. */
+	 new_file_size;	/* New file size. */
+	int deleted;		/* Number of deleted or truncated bytes. */
 	int retval;
 	int err = 0;
 
 	BUG_ON(!th->t_trans_id);
 	if (!
-	    (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode)
-	     || S_ISLNK(p_s_inode->i_mode)))
+	    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+	     || S_ISLNK(inode->i_mode)))
 		return 0;
 
-	if (S_ISDIR(p_s_inode->i_mode)) {
+	if (S_ISDIR(inode->i_mode)) {
 		// deletion of directory - no need to update timestamps
-		truncate_directory(th, p_s_inode);
+		truncate_directory(th, inode);
 		return 0;
 	}
 
 	/* Get new file size. */
-	n_new_file_size = p_s_inode->i_size;
+	new_file_size = inode->i_size;
 
 	// FIXME: note, that key type is unimportant here
-	make_cpu_key(&s_item_key, p_s_inode, max_reiserfs_offset(p_s_inode),
+	make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
 		     TYPE_DIRECT, 3);
 
 	retval =
-	    search_for_position_by_key(p_s_inode->i_sb, &s_item_key,
+	    search_for_position_by_key(inode->i_sb, &s_item_key,
 				       &s_search_path);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(p_s_inode->i_sb,
-				 "vs-5657: reiserfs_do_truncate: "
-				 "i/o failure occurred trying to truncate %K",
-				 &s_item_key);
+		reiserfs_error(inode->i_sb, "vs-5657",
+			       "i/o failure occurred trying to truncate %K",
+			       &s_item_key);
 		err = -EIO;
 		goto out;
 	}
 	if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
-		reiserfs_warning(p_s_inode->i_sb,
-				 "PAP-5660: reiserfs_do_truncate: "
-				 "wrong result %d of search for %K", retval,
-				 &s_item_key);
+		reiserfs_error(inode->i_sb, "PAP-5660",
+			       "wrong result %d of search for %K", retval,
+			       &s_item_key);
 
 		err = -EIO;
 		goto out;
@@ -1819,56 +1778,56 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
 	/* Get real file size (total length of all file items) */
 	p_le_ih = PATH_PITEM_HEAD(&s_search_path);
 	if (is_statdata_le_ih(p_le_ih))
-		n_file_size = 0;
+		file_size = 0;
 	else {
 		loff_t offset = le_ih_k_offset(p_le_ih);
 		int bytes =
-		    op_bytes_number(p_le_ih, p_s_inode->i_sb->s_blocksize);
+		    op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
 
 		/* this may mismatch with real file size: if last direct item
 		   had no padding zeros and last unformatted node had no free
 		   space, this file would have this file size */
-		n_file_size = offset + bytes - 1;
+		file_size = offset + bytes - 1;
 	}
 	/*
 	 * are we doing a full truncate or delete, if so
 	 * kick in the reada code
 	 */
-	if (n_new_file_size == 0)
+	if (new_file_size == 0)
 		s_search_path.reada = PATH_READA | PATH_READA_BACK;
 
-	if (n_file_size == 0 || n_file_size < n_new_file_size) {
+	if (file_size == 0 || file_size < new_file_size) {
 		goto update_and_out;
 	}
 
 	/* Update key to search for the last file item. */
-	set_cpu_key_k_offset(&s_item_key, n_file_size);
+	set_cpu_key_k_offset(&s_item_key, file_size);
 
 	do {
 		/* Cut or delete file item. */
-		n_deleted =
+		deleted =
 		    reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
-					   p_s_inode, page, n_new_file_size);
-		if (n_deleted < 0) {
-			reiserfs_warning(p_s_inode->i_sb,
-					 "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed");
+					   inode, page, new_file_size);
+		if (deleted < 0) {
+			reiserfs_warning(inode->i_sb, "vs-5665",
+					 "reiserfs_cut_from_item failed");
 			reiserfs_check_path(&s_search_path);
 			return 0;
 		}
 
-		RFALSE(n_deleted > n_file_size,
+		RFALSE(deleted > file_size,
 		       "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
-		       n_deleted, n_file_size, &s_item_key);
+		       deleted, file_size, &s_item_key);
 
 		/* Change key to search the last file item. */
-		n_file_size -= n_deleted;
+		file_size -= deleted;
 
-		set_cpu_key_k_offset(&s_item_key, n_file_size);
+		set_cpu_key_k_offset(&s_item_key, file_size);
 
 		/* While there are bytes to truncate and previous file item is presented in the tree. */
 
 		/*
-		 ** This loop could take a really long time, and could log 
+		 ** This loop could take a really long time, and could log
 		 ** many more blocks than a transaction can hold.  So, we do a polite
 		 ** journal end here, and if the transaction needs ending, we make
 		 ** sure the file is consistent before ending the current trans
@@ -1877,37 +1836,38 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
 		if (journal_transaction_should_end(th, 0) ||
 		    reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
 			int orig_len_alloc = th->t_blocks_allocated;
-			decrement_counters_in_path(&s_search_path);
+			pathrelse(&s_search_path);
 
 			if (update_timestamps) {
-				p_s_inode->i_mtime = p_s_inode->i_ctime =
-				    CURRENT_TIME_SEC;
+				inode->i_mtime = CURRENT_TIME_SEC;
+				inode->i_ctime = CURRENT_TIME_SEC;
 			}
-			reiserfs_update_sd(th, p_s_inode);
+			reiserfs_update_sd(th, inode);
 
-			err = journal_end(th, p_s_inode->i_sb, orig_len_alloc);
+			err = journal_end(th, inode->i_sb, orig_len_alloc);
 			if (err)
 				goto out;
-			err = journal_begin(th, p_s_inode->i_sb,
+			err = journal_begin(th, inode->i_sb,
 					    JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
 			if (err)
 				goto out;
-			reiserfs_update_inode_transaction(p_s_inode);
+			reiserfs_update_inode_transaction(inode);
 		}
-	} while (n_file_size > ROUND_UP(n_new_file_size) &&
-		 search_for_position_by_key(p_s_inode->i_sb, &s_item_key,
+	} while (file_size > ROUND_UP(new_file_size) &&
+		 search_for_position_by_key(inode->i_sb, &s_item_key,
 					    &s_search_path) == POSITION_FOUND);
 
-	RFALSE(n_file_size > ROUND_UP(n_new_file_size),
+	RFALSE(file_size > ROUND_UP(new_file_size),
 	       "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
-	       n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid);
+	       new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
 
       update_and_out:
 	if (update_timestamps) {
 		// this is truncate, not file closing
-		p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_mtime = CURRENT_TIME_SEC;
+		inode->i_ctime = CURRENT_TIME_SEC;
 	}
-	reiserfs_update_sd(th, p_s_inode);
+	reiserfs_update_sd(th, inode);
 
       out:
 	pathrelse(&s_search_path);
@@ -1917,7 +1877,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
 #ifdef CONFIG_REISERFS_CHECK
 // this makes sure, that we __append__, not overwrite or add holes
 static void check_research_for_paste(struct treepath *path,
-				     const struct cpu_key *p_s_key)
+				     const struct cpu_key *key)
 {
 	struct item_head *found_ih = get_ih(path);
 
@@ -1925,36 +1885,36 @@ static void check_research_for_paste(struct treepath *path,
 		if (le_ih_k_offset(found_ih) +
 		    op_bytes_number(found_ih,
 				    get_last_bh(path)->b_size) !=
-		    cpu_key_k_offset(p_s_key)
+		    cpu_key_k_offset(key)
 		    || op_bytes_number(found_ih,
 				       get_last_bh(path)->b_size) !=
 		    pos_in_item(path))
-			reiserfs_panic(NULL,
-				       "PAP-5720: check_research_for_paste: "
-				       "found direct item %h or position (%d) does not match to key %K",
-				       found_ih, pos_in_item(path), p_s_key);
+			reiserfs_panic(NULL, "PAP-5720", "found direct item "
+				       "%h or position (%d) does not match "
+				       "to key %K", found_ih,
+				       pos_in_item(path), key);
 	}
 	if (is_indirect_le_ih(found_ih)) {
 		if (le_ih_k_offset(found_ih) +
 		    op_bytes_number(found_ih,
 				    get_last_bh(path)->b_size) !=
-		    cpu_key_k_offset(p_s_key)
+		    cpu_key_k_offset(key)
 		    || I_UNFM_NUM(found_ih) != pos_in_item(path)
 		    || get_ih_free_space(found_ih) != 0)
-			reiserfs_panic(NULL,
-				       "PAP-5730: check_research_for_paste: "
-				       "found indirect item (%h) or position (%d) does not match to key (%K)",
-				       found_ih, pos_in_item(path), p_s_key);
+			reiserfs_panic(NULL, "PAP-5730", "found indirect "
+				       "item (%h) or position (%d) does not "
+				       "match to key (%K)",
+				       found_ih, pos_in_item(path), key);
 	}
 }
 #endif				/* config reiserfs check */
 
 /* Paste bytes to the existing item. Returns bytes number pasted into the item. */
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_search_path,	/* Path to the pasted item.          */
-			     const struct cpu_key *p_s_key,	/* Key to search for the needed item. */
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path,	/* Path to the pasted item.	  */
+			     const struct cpu_key *key,	/* Key to search for the needed item. */
 			     struct inode *inode,	/* Inode item belongs to */
-			     const char *p_c_body,	/* Pointer to the bytes to paste.    */
-			     int n_pasted_size)
+			     const char *body,	/* Pointer to the bytes to paste.    */
+			     int pasted_size)
 {				/* Size of pasted bytes.             */
 	struct tree_balance s_paste_balance;
 	int retval;
@@ -1967,18 +1927,18 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 #ifdef REISERQUOTA_DEBUG
 	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
 		       "reiserquota paste_into_item(): allocating %u id=%u type=%c",
-		       n_pasted_size, inode->i_uid,
-		       key2type(&(p_s_key->on_disk_key)));
+		       pasted_size, inode->i_uid,
+		       key2type(&(key->on_disk_key)));
 #endif
 
-	if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
-		pathrelse(p_s_search_path);
+	if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
+		pathrelse(search_path);
 		return -EDQUOT;
 	}
-	init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path,
-		       n_pasted_size);
+	init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
+		       pasted_size);
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	s_paste_balance.key = p_s_key->on_disk_key;
+	s_paste_balance.key = key->on_disk_key;
 #endif
 
 	/* DQUOT_* can schedule, must check before the fix_nodes */
@@ -1988,33 +1948,33 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 
 	while ((retval =
 		fix_nodes(M_PASTE, &s_paste_balance, NULL,
-			  p_c_body)) == REPEAT_SEARCH) {
+			  body)) == REPEAT_SEARCH) {
 	      search_again:
 		/* file system changed while we were in the fix_nodes */
 		PROC_INFO_INC(th->t_super, paste_into_item_restarted);
 		retval =
-		    search_for_position_by_key(th->t_super, p_s_key,
-					       p_s_search_path);
+		    search_for_position_by_key(th->t_super, key,
+					       search_path);
 		if (retval == IO_ERROR) {
 			retval = -EIO;
 			goto error_out;
 		}
 		if (retval == POSITION_FOUND) {
-			reiserfs_warning(inode->i_sb,
-					 "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists",
-					 p_s_key);
+			reiserfs_warning(inode->i_sb, "PAP-5710",
+					 "entry or pasted byte (%K) exists",
+					 key);
 			retval = -EEXIST;
 			goto error_out;
 		}
 #ifdef CONFIG_REISERFS_CHECK
-		check_research_for_paste(p_s_search_path, p_s_key);
+		check_research_for_paste(search_path, key);
 #endif
 	}
 
 	/* Perform balancing after all resources are collected by fix_nodes, and
 	   accessing them will not risk triggering schedule. */
 	if (retval == CARRY_ON) {
-		do_balance(&s_paste_balance, NULL /*ih */ , p_c_body, M_PASTE);
+		do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
 		return 0;
 	}
 	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
@@ -2024,18 +1984,24 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 #ifdef REISERQUOTA_DEBUG
 	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
 		       "reiserquota paste_into_item(): freeing %u id=%u type=%c",
-		       n_pasted_size, inode->i_uid,
-		       key2type(&(p_s_key->on_disk_key)));
+		       pasted_size, inode->i_uid,
+		       key2type(&(key->on_disk_key)));
 #endif
-	DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
+	vfs_dq_free_space_nodirty(inode, pasted_size);
 	return retval;
 }
 
-/* Insert new item into the buffer at the path. */
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path,	/* Path to the inserteded item.         */
-			 const struct cpu_key *key, struct item_head *p_s_ih,	/* Pointer to the item header to insert. */
-			 struct inode *inode, const char *p_c_body)
-{				/* Pointer to the bytes to insert.      */
+/* Insert new item into the buffer at the path.
+ * th   - active transaction handle
+ * path - path to the inserted item
+ * ih   - pointer to the item header to insert
+ * body - pointer to the bytes to insert
+ */
+int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
+			 struct treepath *path, const struct cpu_key *key,
+			 struct item_head *ih, struct inode *inode,
+			 const char *body)
+{
 	struct tree_balance s_ins_balance;
 	int retval;
 	int fs_gen = 0;
@@ -2045,28 +2011,27 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
 
 	if (inode) {		/* Do we count quotas for item? */
 		fs_gen = get_generation(inode->i_sb);
-		quota_bytes = ih_item_len(p_s_ih);
+		quota_bytes = ih_item_len(ih);
 
 		/* hack so the quota code doesn't have to guess if the file has
 		 ** a tail, links are always tails, so there's no guessing needed
 		 */
-		if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_s_ih)) {
+		if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
 			quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
-		}
 #ifdef REISERQUOTA_DEBUG
 		reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
 			       "reiserquota insert_item(): allocating %u id=%u type=%c",
-			       quota_bytes, inode->i_uid, head2type(p_s_ih));
+			       quota_bytes, inode->i_uid, head2type(ih));
 #endif
 		/* We can't dirty inode here. It would be immediately written but
 		 * appropriate stat item isn't inserted yet... */
-		if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
-			pathrelse(p_s_path);
+		if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) {
+			pathrelse(path);
 			return -EDQUOT;
 		}
 	}
-	init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path,
-		       IH_SIZE + ih_item_len(p_s_ih));
+	init_tb_struct(th, &s_ins_balance, th->t_super, path,
+		       IH_SIZE + ih_item_len(ih));
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
 	s_ins_balance.key = key->on_disk_key;
 #endif
@@ -2076,19 +2041,18 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
 	}
 
 	while ((retval =
-		fix_nodes(M_INSERT, &s_ins_balance, p_s_ih,
-			  p_c_body)) == REPEAT_SEARCH) {
+		fix_nodes(M_INSERT, &s_ins_balance, ih,
+			  body)) == REPEAT_SEARCH) {
 	      search_again:
 		/* file system changed while we were in the fix_nodes */
 		PROC_INFO_INC(th->t_super, insert_item_restarted);
-		retval = search_item(th->t_super, key, p_s_path);
+		retval = search_item(th->t_super, key, path);
 		if (retval == IO_ERROR) {
 			retval = -EIO;
 			goto error_out;
 		}
 		if (retval == ITEM_FOUND) {
-			reiserfs_warning(th->t_super,
-					 "PAP-5760: reiserfs_insert_item: "
+			reiserfs_warning(th->t_super, "PAP-5760",
 					 "key %K already exists in the tree",
 					 key);
 			retval = -EEXIST;
@@ -2098,7 +2062,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
 
 	/* make balancing after all resources will be collected at a time */
 	if (retval == CARRY_ON) {
-		do_balance(&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
+		do_balance(&s_ins_balance, ih, body, M_INSERT);
 		return 0;
 	}
 
@@ -2109,9 +2073,9 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
 #ifdef REISERQUOTA_DEBUG
 	reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
 		       "reiserquota insert_item(): freeing %u id=%u type=%c",
-		       quota_bytes, inode->i_uid, head2type(p_s_ih));
+		       quota_bytes, inode->i_uid, head2type(ih));
 #endif
 	if (inode)
-		DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes);
+		vfs_dq_free_space_nodirty(inode, quota_bytes);
 	return retval;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f3c820b75829..0ae6486d9046 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/crc32.h>
 
 struct file_system_type reiserfs_fs_type;
 
@@ -183,9 +184,9 @@ static int finish_unfinished(struct super_block *s)
 		if (REISERFS_SB(s)->s_qf_names[i]) {
 			int ret = reiserfs_quota_on_mount(s, i);
 			if (ret < 0)
-				reiserfs_warning(s,
-						 "reiserfs: cannot turn on journaled quota: error %d",
-						 ret);
+				reiserfs_warning(s, "reiserfs-2500",
+						 "cannot turn on journaled "
+						 "quota: error %d", ret);
 		}
 	}
 #endif
@@ -195,17 +196,16 @@ static int finish_unfinished(struct super_block *s)
 	while (!retval) {
 		retval = search_item(s, &max_cpu_key, &path);
 		if (retval != ITEM_NOT_FOUND) {
-			reiserfs_warning(s,
-					 "vs-2140: finish_unfinished: search_by_key returned %d",
-					 retval);
+			reiserfs_error(s, "vs-2140",
+				       "search_by_key returned %d", retval);
 			break;
 		}
 
 		bh = get_last_bh(&path);
 		item_pos = get_item_pos(&path);
 		if (item_pos != B_NR_ITEMS(bh)) {
-			reiserfs_warning(s,
-					 "vs-2060: finish_unfinished: wrong position found");
+			reiserfs_warning(s, "vs-2060",
+					 "wrong position found");
 			break;
 		}
 		item_pos--;
@@ -235,8 +235,7 @@ static int finish_unfinished(struct super_block *s)
 		if (!inode) {
 			/* the unlink almost completed, it just did not manage to remove
 			   "save" link and release objectid */
-			reiserfs_warning(s,
-					 "vs-2180: finish_unfinished: iget failed for %K",
+			reiserfs_warning(s, "vs-2180", "iget failed for %K",
 					 &obj_key);
 			retval = remove_save_link_only(s, &save_link_key, 1);
 			continue;
@@ -244,21 +243,22 @@ static int finish_unfinished(struct super_block *s)
 
 		if (!truncate && inode->i_nlink) {
 			/* file is not unlinked */
-			reiserfs_warning(s,
-					 "vs-2185: finish_unfinished: file %K is not unlinked",
+			reiserfs_warning(s, "vs-2185",
+					 "file %K is not unlinked",
 					 &obj_key);
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
 			   The only imaginable way is to execute unfinished truncate request
 			   then boot into old kernel, remove the file and create dir with
 			   the same key. */
-			reiserfs_warning(s,
-					 "green-2101: impossible truncate on a directory %k. Please report",
+			reiserfs_warning(s, "green-2101",
+					 "impossible truncate on a "
+					 "directory %k. Please report",
 					 INODE_PKEY(inode));
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			truncate = 0;
@@ -288,9 +288,10 @@ static int finish_unfinished(struct super_block *s)
 				/* removal gets completed in iput */
 				retval = 0;
 			} else {
-				reiserfs_warning(s, "Dead loop in "
-						"finish_unfinished detected, "
-						"just remove save link\n");
+				reiserfs_warning(s, "super-2189", "Dead loop "
+						 "in finish_unfinished "
+						 "detected, just remove "
+						 "save link\n");
 				retval = remove_save_link_only(s,
 							&save_link_key, 0);
 			}
@@ -360,8 +361,9 @@ void add_save_link(struct reiserfs_transaction_handle *th,
 	} else {
 		/* truncate */
 		if (S_ISDIR(inode->i_mode))
-			reiserfs_warning(inode->i_sb,
-					 "green-2102: Adding a truncate savelink for a directory %k! Please report",
+			reiserfs_warning(inode->i_sb, "green-2102",
+					 "Adding a truncate savelink for "
+					 "a directory %k! Please report",
 					 INODE_PKEY(inode));
 		set_cpu_key_k_offset(&key, 1);
 		set_cpu_key_k_type(&key, TYPE_INDIRECT);
@@ -376,9 +378,9 @@ void add_save_link(struct reiserfs_transaction_handle *th,
 	retval = search_item(inode->i_sb, &key, &path);
 	if (retval != ITEM_NOT_FOUND) {
 		if (retval != -ENOSPC)
-			reiserfs_warning(inode->i_sb, "vs-2100: add_save_link:"
-					 "search_by_key (%K) returned %d", &key,
-					 retval);
+			reiserfs_error(inode->i_sb, "vs-2100",
+				       "search_by_key (%K) returned %d", &key,
+				       retval);
 		pathrelse(&path);
 		return;
 	}
@@ -391,9 +393,8 @@ void add_save_link(struct reiserfs_transaction_handle *th,
 	    reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
 	if (retval) {
 		if (retval != -ENOSPC)
-			reiserfs_warning(inode->i_sb,
-					 "vs-2120: add_save_link: insert_item returned %d",
-					 retval);
+			reiserfs_error(inode->i_sb, "vs-2120",
+				       "insert_item returned %d", retval);
 	} else {
 		if (truncate)
 			REISERFS_I(inode)->i_flags |=
@@ -492,8 +493,7 @@ static void reiserfs_put_super(struct super_block *s)
 	print_statistics(s);
 
 	if (REISERFS_SB(s)->reserved_blocks != 0) {
-		reiserfs_warning(s,
-				 "green-2005: reiserfs_put_super: reserved blocks left %d",
+		reiserfs_warning(s, "green-2005", "reserved blocks left %d",
 				 REISERFS_SB(s)->reserved_blocks);
 	}
 
@@ -559,8 +559,8 @@ static void reiserfs_dirty_inode(struct inode *inode)
 
 	int err = 0;
 	if (inode->i_sb->s_flags & MS_RDONLY) {
-		reiserfs_warning(inode->i_sb,
-				 "clm-6006: writing inode %lu on readonly FS",
+		reiserfs_warning(inode->i_sb, "clm-6006",
+				 "writing inode %lu on readonly FS",
 				 inode->i_ino);
 		return;
 	}
@@ -629,8 +629,6 @@ static const struct super_operations reiserfs_sops = {
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 
-static int reiserfs_dquot_initialize(struct inode *, int);
-static int reiserfs_dquot_drop(struct inode *);
 static int reiserfs_write_dquot(struct dquot *);
 static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
@@ -639,8 +637,8 @@ static int reiserfs_write_info(struct super_block *, int);
 static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
 
 static struct dquot_operations reiserfs_quota_operations = {
-	.initialize = reiserfs_dquot_initialize,
-	.drop = reiserfs_dquot_drop,
+	.initialize = dquot_initialize,
+	.drop = dquot_drop,
 	.alloc_space = dquot_alloc_space,
 	.alloc_inode = dquot_alloc_inode,
 	.free_space = dquot_free_space,
@@ -759,7 +757,7 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
 			   char **opt_arg, unsigned long *bit_flags)
 {
 	char *p;
-	/* foo=bar, 
+	/* foo=bar,
 	   ^   ^  ^
 	   |   |  +-- option_end
 	   |   +-- arg_start
@@ -794,13 +792,15 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
 			if (bit_flags) {
 				if (opt->clrmask ==
 				    (1 << REISERFS_UNSUPPORTED_OPT))
-					reiserfs_warning(s, "%s not supported.",
+					reiserfs_warning(s, "super-6500",
+							 "%s not supported.\n",
 							 p);
 				else
 					*bit_flags &= ~opt->clrmask;
 				if (opt->setmask ==
 				    (1 << REISERFS_UNSUPPORTED_OPT))
-					reiserfs_warning(s, "%s not supported.",
+					reiserfs_warning(s, "super-6501",
+							 "%s not supported.\n",
 							 p);
 				else
 					*bit_flags |= opt->setmask;
@@ -809,7 +809,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
 		}
 	}
 	if (!opt->option_name) {
-		reiserfs_warning(s, "unknown mount option \"%s\"", p);
+		reiserfs_warning(s, "super-6502",
+				 "unknown mount option \"%s\"", p);
 		return -1;
 	}
 
@@ -817,8 +818,9 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
 	switch (*p) {
 	case '=':
 		if (!opt->arg_required) {
-			reiserfs_warning(s,
-					 "the option \"%s\" does not require an argument",
+			reiserfs_warning(s, "super-6503",
+					 "the option \"%s\" does not "
+					 "require an argument\n",
 					 opt->option_name);
 			return -1;
 		}
@@ -826,14 +828,15 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
 
 	case 0:
 		if (opt->arg_required) {
-			reiserfs_warning(s,
-					 "the option \"%s\" requires an argument",
-					 opt->option_name);
+			reiserfs_warning(s, "super-6504",
+					 "the option \"%s\" requires an "
+					 "argument\n", opt->option_name);
 			return -1;
 		}
 		break;
 	default:
-		reiserfs_warning(s, "head of option \"%s\" is only correct",
+		reiserfs_warning(s, "super-6505",
+				 "head of option \"%s\" is only correct\n",
 				 opt->option_name);
 		return -1;
 	}
@@ -845,7 +848,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
 	    && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
 	    && !strlen(p)) {
 		/* this catches "option=," if not allowed */
-		reiserfs_warning(s, "empty argument for \"%s\"",
+		reiserfs_warning(s, "super-6506",
+				 "empty argument for \"%s\"\n",
 				 opt->option_name);
 		return -1;
 	}
@@ -867,7 +871,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
 		}
 	}
 
-	reiserfs_warning(s, "bad value \"%s\" for option \"%s\"", p,
+	reiserfs_warning(s, "super-6506",
+			 "bad value \"%s\" for option \"%s\"\n", p,
 			 opt->option_name);
 	return -1;
 }
@@ -957,9 +962,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 				*blocks = simple_strtoul(arg, &p, 0);
 				if (*p != '\0') {
 					/* NNN does not look like a number */
-					reiserfs_warning(s,
-							 "reiserfs_parse_options: bad value %s",
-							 arg);
+					reiserfs_warning(s, "super-6507",
+							 "bad value %s for "
+							 "-oresize\n", arg);
 					return 0;
 				}
 			}
@@ -970,8 +975,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 			unsigned long val = simple_strtoul(arg, &p, 0);
 			/* commit=NNN (time in seconds) */
 			if (*p != '\0' || val >= (unsigned int)-1) {
-				reiserfs_warning(s,
-						 "reiserfs_parse_options: bad value %s",
+				reiserfs_warning(s, "super-6508",
+						 "bad value %s for -ocommit\n",
 						 arg);
 				return 0;
 			}
@@ -979,16 +984,18 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		}
 
 		if (c == 'w') {
-			reiserfs_warning(s, "reiserfs: nolargeio option is no longer supported");
+			reiserfs_warning(s, "super-6509", "nolargeio option "
+					 "is no longer supported");
 			return 0;
 		}
 
 		if (c == 'j') {
 			if (arg && *arg && jdev_name) {
 				if (*jdev_name) {	//Hm, already assigned?
-					reiserfs_warning(s,
-							 "reiserfs_parse_options: journal device was already  specified to be %s",
-							 *jdev_name);
+					reiserfs_warning(s, "super-6510",
+							 "journal device was "
+							 "already specified to "
+							 "be %s", *jdev_name);
 					return 0;
 				}
 				*jdev_name = arg;
@@ -1000,29 +1007,35 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 
 			if (sb_any_quota_loaded(s) &&
 			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
-				reiserfs_warning(s,
-						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
+				reiserfs_warning(s, "super-6511",
+						 "cannot change journaled "
+						 "quota options when quota "
+						 "turned on.");
 				return 0;
 			}
 			if (*arg) {	/* Some filename specified? */
 				if (REISERFS_SB(s)->s_qf_names[qtype]
 				    && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
 					      arg)) {
-					reiserfs_warning(s,
-							 "reiserfs_parse_options: %s quota file already specified.",
+					reiserfs_warning(s, "super-6512",
+							 "%s quota file "
+							 "already specified.",
 							 QTYPE2NAME(qtype));
 					return 0;
 				}
 				if (strchr(arg, '/')) {
-					reiserfs_warning(s,
-							 "reiserfs_parse_options: quotafile must be on filesystem root.");
+					reiserfs_warning(s, "super-6513",
+							 "quotafile must be "
+							 "on filesystem root.");
 					return 0;
 				}
 				qf_names[qtype] =
 				    kmalloc(strlen(arg) + 1, GFP_KERNEL);
 				if (!qf_names[qtype]) {
-					reiserfs_warning(s,
-							 "reiserfs_parse_options: not enough memory for storing quotafile name.");
+					reiserfs_warning(s, "reiserfs-2502",
+							 "not enough memory "
+							 "for storing "
+							 "quotafile name.");
 					return 0;
 				}
 				strcpy(qf_names[qtype], arg);
@@ -1040,21 +1053,24 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 			else if (!strcmp(arg, "vfsv0"))
 				*qfmt = QFMT_VFS_V0;
 			else {
-				reiserfs_warning(s,
-						 "reiserfs_parse_options: unknown quota format specified.");
+				reiserfs_warning(s, "super-6514",
+						 "unknown quota format "
+						 "specified.");
 				return 0;
 			}
 			if (sb_any_quota_loaded(s) &&
 			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
-				reiserfs_warning(s,
-						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
+				reiserfs_warning(s, "super-6515",
+						 "cannot change journaled "
+						 "quota options when quota "
+						 "turned on.");
 				return 0;
 			}
 		}
 #else
 		if (c == 'u' || c == 'g' || c == 'f') {
-			reiserfs_warning(s,
-					 "reiserfs_parse_options: journaled quota options not supported.");
+			reiserfs_warning(s, "reiserfs-2503", "journaled "
+					 "quota options not supported.");
 			return 0;
 		}
 #endif
@@ -1063,15 +1079,15 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 #ifdef CONFIG_QUOTA
 	if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
 	    && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
-		reiserfs_warning(s,
-				 "reiserfs_parse_options: journaled quota format not specified.");
+		reiserfs_warning(s, "super-6515",
+				 "journaled quota format not specified.");
 		return 0;
 	}
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
 	if (!(*mount_options & (1 << REISERFS_QUOTA))
 	    && sb_any_quota_loaded(s)) {
-		reiserfs_warning(s,
-				 "reiserfs_parse_options: quota options must be present when quota is turned on.");
+		reiserfs_warning(s, "super-6516", "quota options must "
+				 "be present when quota is turned on.");
 		return 0;
 	}
 #endif
@@ -1131,14 +1147,15 @@ static void handle_attrs(struct super_block *s)
 
 	if (reiserfs_attrs(s)) {
 		if (old_format_only(s)) {
-			reiserfs_warning(s,
-					 "reiserfs: cannot support attributes on 3.5.x disk format");
+			reiserfs_warning(s, "super-6517", "cannot support "
+					 "attributes on 3.5.x disk format");
 			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
 			return;
 		}
 		if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
-			reiserfs_warning(s,
-					 "reiserfs: cannot support attributes until flag is set in super-block");
+			reiserfs_warning(s, "super-6518", "cannot support "
+					 "attributes until flag is set in "
+					 "super-block");
 			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
 		}
 	}
@@ -1280,6 +1297,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
 		s->s_flags &= ~MS_RDONLY;
 		set_sb_umount_state(rs, REISERFS_ERROR_FS);
+		if (!old_format_only(s))
+			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
 		/* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
 		journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
 		REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
@@ -1314,7 +1333,7 @@ static int read_super_block(struct super_block *s, int offset)
 
 	bh = sb_bread(s, offset / s->s_blocksize);
 	if (!bh) {
-		reiserfs_warning(s, "sh-2006: read_super_block: "
+		reiserfs_warning(s, "sh-2006",
 				 "bread failed (dev %s, block %lu, size %lu)",
 				 reiserfs_bdevname(s), offset / s->s_blocksize,
 				 s->s_blocksize);
@@ -1328,15 +1347,15 @@ static int read_super_block(struct super_block *s, int offset)
 	}
 	//
 	// ok, reiserfs signature (old or new) found in at the given offset
-	//    
+	//
 	fs_blocksize = sb_blocksize(rs);
 	brelse(bh);
 	sb_set_blocksize(s, fs_blocksize);
 
 	bh = sb_bread(s, offset / s->s_blocksize);
 	if (!bh) {
-		reiserfs_warning(s, "sh-2007: read_super_block: "
-				 "bread failed (dev %s, block %lu, size %lu)\n",
+		reiserfs_warning(s, "sh-2007",
+				 "bread failed (dev %s, block %lu, size %lu)",
 				 reiserfs_bdevname(s), offset / s->s_blocksize,
 				 s->s_blocksize);
 		return 1;
@@ -1344,8 +1363,8 @@ static int read_super_block(struct super_block *s, int offset)
 
 	rs = (struct reiserfs_super_block *)bh->b_data;
 	if (sb_blocksize(rs) != s->s_blocksize) {
-		reiserfs_warning(s, "sh-2011: read_super_block: "
-				 "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n",
+		reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
+				 "filesystem on (dev %s, block %Lu, size %lu)",
 				 reiserfs_bdevname(s),
 				 (unsigned long long)bh->b_blocknr,
 				 s->s_blocksize);
@@ -1355,9 +1374,10 @@ static int read_super_block(struct super_block *s, int offset)
 
 	if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
 		brelse(bh);
-		reiserfs_warning(s,
-				 "Unfinished reiserfsck --rebuild-tree run detected. Please run\n"
-				 "reiserfsck --rebuild-tree and wait for a completion. If that fails\n"
+		reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
+				 "--rebuild-tree run detected. Please run\n"
+				 "reiserfsck --rebuild-tree and wait for a "
+				 "completion. If that fails\n"
 				 "get newer reiserfsprogs package");
 		return 1;
 	}
@@ -1369,18 +1389,15 @@ static int read_super_block(struct super_block *s, int offset)
 		/* magic is of non-standard journal filesystem, look at s_version to
 		   find which format is in use */
 		if (sb_version(rs) == REISERFS_VERSION_2)
-			reiserfs_warning(s,
-					 "read_super_block: found reiserfs format \"3.6\""
-					 " with non-standard journal");
+			reiserfs_info(s, "found reiserfs format \"3.6\""
+				      " with non-standard journal\n");
 		else if (sb_version(rs) == REISERFS_VERSION_1)
-			reiserfs_warning(s,
-					 "read_super_block: found reiserfs format \"3.5\""
-					 " with non-standard journal");
+			reiserfs_info(s, "found reiserfs format \"3.5\""
+				      " with non-standard journal\n");
 		else {
-			reiserfs_warning(s,
-					 "sh-2012: read_super_block: found unknown "
-					 "format \"%u\" of reiserfs with non-standard magic",
-					 sb_version(rs));
+			reiserfs_warning(s, "sh-2012", "found unknown "
+					 "format \"%u\" of reiserfs with "
+					 "non-standard magic", sb_version(rs));
 			return 1;
 		}
 	} else
@@ -1410,8 +1427,7 @@ static int reread_meta_blocks(struct super_block *s)
 	ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
 	wait_on_buffer(SB_BUFFER_WITH_SB(s));
 	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
-		reiserfs_warning(s,
-				 "reread_meta_blocks, error reading the super");
+		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
 		return 1;
 	}
 
@@ -1454,8 +1470,8 @@ static __u32 find_hash_out(struct super_block *s)
 			if (reiserfs_rupasov_hash(s)) {
 				hash = YURA_HASH;
 			}
-			reiserfs_warning(s, "FS seems to be empty, autodetect "
-					 "is using the default hash");
+			reiserfs_info(s, "FS seems to be empty, autodetect "
+					 "is using the default hash\n");
 			break;
 		}
 		r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
@@ -1475,10 +1491,10 @@ static __u32 find_hash_out(struct super_block *s)
 			&& (yurahash ==
 			    GET_HASH_VALUE(deh_offset
 					   (&(de.de_deh[de.de_entry_num])))))) {
-			reiserfs_warning(s,
-					 "Unable to automatically detect hash function. "
-					 "Please mount with -o hash={tea,rupasov,r5}",
-					 reiserfs_bdevname(s));
+			reiserfs_warning(s, "reiserfs-2506", "Unable to "
+					 "automatically detect hash function. "
+					 "Please mount with -o "
+					 "hash={tea,rupasov,r5}");
 			hash = UNSET_HASH;
 			break;
 		}
@@ -1492,7 +1508,8 @@ static __u32 find_hash_out(struct super_block *s)
 			 (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash)
 			hash = R5_HASH;
 		else {
-			reiserfs_warning(s, "Unrecognised hash function");
+			reiserfs_warning(s, "reiserfs-2506",
+					 "Unrecognised hash function");
 			hash = UNSET_HASH;
 		}
 	} while (0);
@@ -1516,21 +1533,24 @@ static int what_hash(struct super_block *s)
 		code = find_hash_out(s);
 
 	if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
-		/* detection has found the hash, and we must check against the 
-		 ** mount options 
+		/* detection has found the hash, and we must check against the
+		 ** mount options
 		 */
 		if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
-			reiserfs_warning(s, "Error, %s hash detected, "
+			reiserfs_warning(s, "reiserfs-2507",
+					 "Error, %s hash detected, "
 					 "unable to force rupasov hash",
 					 reiserfs_hashname(code));
 			code = UNSET_HASH;
 		} else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
-			reiserfs_warning(s, "Error, %s hash detected, "
+			reiserfs_warning(s, "reiserfs-2508",
+					 "Error, %s hash detected, "
 					 "unable to force tea hash",
 					 reiserfs_hashname(code));
 			code = UNSET_HASH;
 		} else if (reiserfs_r5_hash(s) && code != R5_HASH) {
-			reiserfs_warning(s, "Error, %s hash detected, "
+			reiserfs_warning(s, "reiserfs-2509",
+					 "Error, %s hash detected, "
 					 "unable to force r5 hash",
 					 reiserfs_hashname(code));
 			code = UNSET_HASH;
@@ -1546,7 +1566,7 @@ static int what_hash(struct super_block *s)
 		}
 	}
 
-	/* if we are mounted RW, and we have a new valid hash code, update 
+	/* if we are mounted RW, and we have a new valid hash code, update
 	 ** the super
 	 */
 	if (code != UNSET_HASH &&
@@ -1589,9 +1609,9 @@ static int function2code(hashf_t func)
 	return 0;
 }
 
-#define SWARN(silent, s, ...)			\
+#define SWARN(silent, s, id, ...)			\
 	if (!(silent))				\
-		reiserfs_warning (s, __VA_ARGS__)
+		reiserfs_warning(s, id, __VA_ARGS__)
 
 static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 {
@@ -1625,10 +1645,6 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
 	/* Preallocate by 16 blocks (17-1) at once */
 	REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
-#ifdef CONFIG_REISERFS_FS_XATTR
-	/* Initialize the rwsem for xattr dir */
-	init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
-#endif
 	/* setup default block allocator options */
 	reiserfs_init_alloc_options(s);
 
@@ -1643,8 +1659,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 #endif
 
 	if (blocks) {
-		SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option "
-		      "for remount only");
+		SWARN(silent, s, "jmacd-7", "resize option for remount only");
 		goto error;
 	}
 
@@ -1653,8 +1668,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 		old_format = 1;
 	/* try new format (64-th 1k block), which can contain reiserfs super block */
 	else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
-		SWARN(silent, s,
-		      "sh-2021: reiserfs_fill_super: can not find reiserfs on %s",
+		SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
 		      reiserfs_bdevname(s));
 		goto error;
 	}
@@ -1666,13 +1680,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	if (s->s_bdev && s->s_bdev->bd_inode
 	    && i_size_read(s->s_bdev->bd_inode) <
 	    sb_block_count(rs) * sb_blocksize(rs)) {
-		SWARN(silent, s,
-		      "Filesystem on %s cannot be mounted because it is bigger than the device",
-		      reiserfs_bdevname(s));
-		SWARN(silent, s,
-		      "You may need to run fsck or increase size of your LVM partition");
-		SWARN(silent, s,
-		      "Or may be you forgot to reboot after fdisk when it told you to");
+		SWARN(silent, s, "", "Filesystem cannot be "
+		      "mounted because it is bigger than the device");
+		SWARN(silent, s, "", "You may need to run fsck "
+		      "or increase size of your LVM partition");
+		SWARN(silent, s, "", "Or may be you forgot to "
+		      "reboot after fdisk when it told you to");
 		goto error;
 	}
 
@@ -1680,14 +1693,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	sbi->s_mount_state = REISERFS_VALID_FS;
 
 	if ((errval = reiserfs_init_bitmap_cache(s))) {
-		SWARN(silent, s,
-		      "jmacd-8: reiserfs_fill_super: unable to read bitmap");
+		SWARN(silent, s, "jmacd-8", "unable to read bitmap");
 		goto error;
 	}
 	errval = -EINVAL;
 #ifdef CONFIG_REISERFS_CHECK
-	SWARN(silent, s, "CONFIG_REISERFS_CHECK is set ON");
-	SWARN(silent, s, "- it is slow mode for debugging.");
+	SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
+	SWARN(silent, s, "", "- it is slow mode for debugging.");
 #endif
 
 	/* make data=ordered the default */
@@ -1708,8 +1720,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	}
 	// set_device_ro(s->s_dev, 1) ;
 	if (journal_init(s, jdev_name, old_format, commit_max_age)) {
-		SWARN(silent, s,
-		      "sh-2022: reiserfs_fill_super: unable to initialize journal space");
+		SWARN(silent, s, "sh-2022",
+		      "unable to initialize journal space");
 		goto error;
 	} else {
 		jinit_done = 1;	/* once this is set, journal_release must be called
@@ -1717,8 +1729,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 				 */
 	}
 	if (reread_meta_blocks(s)) {
-		SWARN(silent, s,
-		      "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init");
+		SWARN(silent, s, "jmacd-9",
+		      "unable to reread meta blocks after journal init");
 		goto error;
 	}
 
@@ -1726,8 +1738,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 		goto error;
 
 	if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
-		SWARN(silent, s,
-		      "clm-7000: Detected readonly device, marking FS readonly");
+		SWARN(silent, s, "clm-7000",
+		      "Detected readonly device, marking FS readonly");
 		s->s_flags |= MS_RDONLY;
 	}
 	args.objectid = REISERFS_ROOT_OBJECTID;
@@ -1736,8 +1748,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	    iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
 			 reiserfs_init_locked_inode, (void *)(&args));
 	if (!root_inode) {
-		SWARN(silent, s,
-		      "jmacd-10: reiserfs_fill_super: get root inode failed");
+		SWARN(silent, s, "jmacd-10", "get root inode failed");
 		goto error;
 	}
 
@@ -1786,7 +1797,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 		 * avoiding corruption. -jeffm */
 		if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
 		    sb_bmap_nr(rs) != 0) {
-			reiserfs_warning(s, "super-2030: This file system "
+			reiserfs_warning(s, "super-2030", "This file system "
 					"claims to use %u bitmap blocks in "
 					"its super block, but requires %u. "
 					"Clearing to zero.", sb_bmap_nr(rs),
@@ -1819,7 +1830,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 			} else if (!silent) {
 				reiserfs_info(s, "using 3.5.x disk format\n");
 			}
-		}
+		} else
+			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
+
 
 		journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
 		errval = journal_end(&th, s, 1);
@@ -1892,62 +1905,14 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	/* changed to accommodate gcc folks. */
 	buf->f_type = REISERFS_SUPER_MAGIC;
+	buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
+	buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
+				sizeof(rs->s_uuid)/2);
+
 	return 0;
 }
 
 #ifdef CONFIG_QUOTA
-static int reiserfs_dquot_initialize(struct inode *inode, int type)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-
-	/* We may create quota structure so we need to reserve enough blocks */
-	reiserfs_write_lock(inode->i_sb);
-	ret =
-	    journal_begin(&th, inode->i_sb,
-			  2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (ret)
-		goto out;
-	ret = dquot_initialize(inode, type);
-	err =
-	    journal_end(&th, inode->i_sb,
-			2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (!ret && err)
-		ret = err;
-      out:
-	reiserfs_write_unlock(inode->i_sb);
-	return ret;
-}
-
-static int reiserfs_dquot_drop(struct inode *inode)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-
-	/* We may delete quota structure so we need to reserve enough blocks */
-	reiserfs_write_lock(inode->i_sb);
-	ret =
-	    journal_begin(&th, inode->i_sb,
-			  2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (ret) {
-		/*
-		 * We call dquot_drop() anyway to at least release references
-		 * to quota structures so that umount does not hang.
-		 */
-		dquot_drop(inode);
-		goto out;
-	}
-	ret = dquot_drop(inode);
-	err =
-	    journal_end(&th, inode->i_sb,
-			2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (!ret && err)
-		ret = err;
-      out:
-	reiserfs_write_unlock(inode->i_sb);
-	return ret;
-}
-
 static int reiserfs_write_dquot(struct dquot *dquot)
 {
 	struct reiserfs_transaction_handle th;
@@ -2085,8 +2050,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
 		err = reiserfs_unpack(inode, NULL);
 		if (err) {
-			reiserfs_warning(sb,
-				"reiserfs: Unpacking tail of quota file failed"
+			reiserfs_warning(sb, "super-6520",
+				"Unpacking tail of quota file failed"
 				" (%d). Cannot turn on quotas.", err);
 			err = -EINVAL;
 			goto out;
@@ -2097,8 +2062,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	if (REISERFS_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
 		if (path.dentry->d_parent != sb->s_root)
-			reiserfs_warning(sb,
-				 "reiserfs: Quota file not on filesystem root. "
+			reiserfs_warning(sb, "super-6521",
+				 "Quota file not on filesystem root. "
 				 "Journalled quota will not work.");
 	}
 
@@ -2249,9 +2214,6 @@ static int __init init_reiserfs_fs(void)
 		return ret;
 	}
 
-	if ((ret = reiserfs_xattr_register_handlers()))
-		goto failed_reiserfs_xattr_register_handlers;
-
 	reiserfs_proc_info_global_init();
 	reiserfs_proc_register_global("version",
 				      reiserfs_global_version_in_proc);
@@ -2262,9 +2224,6 @@ static int __init init_reiserfs_fs(void)
 		return 0;
 	}
 
-	reiserfs_xattr_unregister_handlers();
-
-      failed_reiserfs_xattr_register_handlers:
 	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	destroy_inodecache();
@@ -2274,7 +2233,6 @@ static int __init init_reiserfs_fs(void)
 
 static void __exit exit_reiserfs_fs(void)
 {
-	reiserfs_xattr_unregister_handlers();
 	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	unregister_filesystem(&reiserfs_fs_type);
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index f8121a1147e8..d7f6e51bef2a 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -26,7 +26,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 				   converted item. */
 	struct item_head ind_ih;	/* new indirect item to be inserted or
 					   key of unfm pointer to be pasted */
-	int n_blk_size, n_retval;	/* returned value for reiserfs_insert_item and clones */
+	int blk_size, retval;	/* returned value for reiserfs_insert_item and clones */
 	unp_t unfm_ptr;		/* Handle on an unformatted node
 				   that will be inserted in the
 				   tree. */
@@ -35,7 +35,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 
 	REISERFS_SB(sb)->s_direct2indirect++;
 
-	n_blk_size = sb->s_blocksize;
+	blk_size = sb->s_blocksize;
 
 	/* and key to search for append or insert pointer to the new
 	   unformatted node. */
@@ -46,11 +46,11 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 	/* Set the key to search for the place for new unfm pointer */
 	make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
 
-	// FIXME: we could avoid this 
+	/* FIXME: we could avoid this */
 	if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
-		reiserfs_warning(sb, "PAP-14030: direct2indirect: "
-				 "pasted or inserted byte exists in the tree %K. "
-				 "Use fsck to repair.", &end_key);
+		reiserfs_error(sb, "PAP-14030",
+			       "pasted or inserted byte exists in "
+			       "the tree %K. Use fsck to repair.", &end_key);
 		pathrelse(path);
 		return -EIO;
 	}
@@ -64,17 +64,17 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 		set_ih_free_space(&ind_ih, 0);	/* delete at nearest future */
 		put_ih_item_len(&ind_ih, UNFM_P_SIZE);
 		PATH_LAST_POSITION(path)++;
-		n_retval =
+		retval =
 		    reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
 					 (char *)&unfm_ptr);
 	} else {
 		/* Paste into last indirect item of an object. */
-		n_retval = reiserfs_paste_into_item(th, path, &end_key, inode,
+		retval = reiserfs_paste_into_item(th, path, &end_key, inode,
 						    (char *)&unfm_ptr,
 						    UNFM_P_SIZE);
 	}
-	if (n_retval) {
-		return n_retval;
+	if (retval) {
+		return retval;
 	}
 	// note: from here there are two keys which have matching first
 	// three key components. They only differ by the fourth one.
@@ -92,14 +92,13 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 		   last item of the file */
 		if (search_for_position_by_key(sb, &end_key, path) ==
 		    POSITION_FOUND)
-			reiserfs_panic(sb,
-				       "PAP-14050: direct2indirect: "
+			reiserfs_panic(sb, "PAP-14050",
 				       "direct item (%K) not found", &end_key);
 		p_le_ih = PATH_PITEM_HEAD(path);
 		RFALSE(!is_direct_le_ih(p_le_ih),
 		       "vs-14055: direct item expected(%K), found %h",
 		       &end_key, p_le_ih);
-		tail_size = (le_ih_k_offset(p_le_ih) & (n_blk_size - 1))
+		tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
 		    + ih_item_len(p_le_ih) - 1;
 
 		/* we only send the unbh pointer if the buffer is not up to date.
@@ -114,11 +113,11 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 		} else {
 			up_to_date_bh = unbh;
 		}
-		n_retval = reiserfs_delete_item(th, path, &end_key, inode,
+		retval = reiserfs_delete_item(th, path, &end_key, inode,
 						up_to_date_bh);
 
-		total_tail += n_retval;
-		if (tail_size == n_retval)
+		total_tail += retval;
+		if (tail_size == retval)
 			// done: file does not have direct items anymore
 			break;
 
@@ -130,7 +129,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 		unsigned pgoff =
 		    (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
 		char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0);
-		memset(kaddr + pgoff, 0, n_blk_size - total_tail);
+		memset(kaddr + pgoff, 0, blk_size - total_tail);
 		kunmap_atomic(kaddr, KM_USER0);
 	}
 
@@ -171,14 +170,18 @@ void reiserfs_unmap_buffer(struct buffer_head *bh)
    what we expect from it (number of cut bytes). But when tail remains
    in the unformatted node, we set mode to SKIP_BALANCING and unlock
    inode */
-int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, struct treepath *p_s_path,	/* path to the indirect item. */
-		    const struct cpu_key *p_s_item_key,	/* Key to look for unformatted node pointer to be cut. */
+int indirect2direct(struct reiserfs_transaction_handle *th,
+		    struct inode *inode, struct page *page,
+		    struct treepath *path,	/* path to the indirect item. */
+		    const struct cpu_key *item_key,	/* Key to look for
+							 * unformatted node
+							 * pointer to be cut. */
 		    loff_t n_new_file_size,	/* New file size. */
-		    char *p_c_mode)
+		    char *mode)
 {
-	struct super_block *p_s_sb = p_s_inode->i_sb;
+	struct super_block *sb = inode->i_sb;
 	struct item_head s_ih;
-	unsigned long n_block_size = p_s_sb->s_blocksize;
+	unsigned long block_size = sb->s_blocksize;
 	char *tail;
 	int tail_len, round_tail_len;
 	loff_t pos, pos1;	/* position of first byte of the tail */
@@ -186,22 +189,22 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
 
 	BUG_ON(!th->t_trans_id);
 
-	REISERFS_SB(p_s_sb)->s_indirect2direct++;
+	REISERFS_SB(sb)->s_indirect2direct++;
 
-	*p_c_mode = M_SKIP_BALANCING;
+	*mode = M_SKIP_BALANCING;
 
 	/* store item head path points to. */
-	copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
+	copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
 
-	tail_len = (n_new_file_size & (n_block_size - 1));
-	if (get_inode_sd_version(p_s_inode) == STAT_DATA_V2)
+	tail_len = (n_new_file_size & (block_size - 1));
+	if (get_inode_sd_version(inode) == STAT_DATA_V2)
 		round_tail_len = ROUND_UP(tail_len);
 	else
 		round_tail_len = tail_len;
 
 	pos =
 	    le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
-					 1) * p_s_sb->s_blocksize;
+					 1) * sb->s_blocksize;
 	pos1 = pos;
 
 	// we are protected by i_mutex. The tail can not disapper, not
@@ -210,27 +213,26 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
 
 	tail = (char *)kmap(page);	/* this can schedule */
 
-	if (path_changed(&s_ih, p_s_path)) {
+	if (path_changed(&s_ih, path)) {
 		/* re-search indirect item */
-		if (search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path)
+		if (search_for_position_by_key(sb, item_key, path)
 		    == POSITION_NOT_FOUND)
-			reiserfs_panic(p_s_sb,
-				       "PAP-5520: indirect2direct: "
+			reiserfs_panic(sb, "PAP-5520",
 				       "item to be converted %K does not exist",
-				       p_s_item_key);
-		copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
+				       item_key);
+		copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
 #ifdef CONFIG_REISERFS_CHECK
 		pos = le_ih_k_offset(&s_ih) - 1 +
 		    (ih_item_len(&s_ih) / UNFM_P_SIZE -
-		     1) * p_s_sb->s_blocksize;
+		     1) * sb->s_blocksize;
 		if (pos != pos1)
-			reiserfs_panic(p_s_sb, "vs-5530: indirect2direct: "
-				       "tail position changed while we were reading it");
+			reiserfs_panic(sb, "vs-5530", "tail position "
+				       "changed while we were reading it");
 #endif
 	}
 
 	/* Set direct item header to insert. */
-	make_le_item_head(&s_ih, NULL, get_inode_item_key_version(p_s_inode),
+	make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
 			  pos1 + 1, TYPE_DIRECT, round_tail_len,
 			  0xffff /*ih_free_space */ );
 
@@ -240,13 +242,13 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
 	 */
 	tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
 
-	PATH_LAST_POSITION(p_s_path)++;
+	PATH_LAST_POSITION(path)++;
 
-	key = *p_s_item_key;
+	key = *item_key;
 	set_cpu_key_k_type(&key, TYPE_DIRECT);
 	key.key_length = 4;
 	/* Insert tail as new direct item in the tree */
-	if (reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode,
+	if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
 				 tail ? tail : NULL) < 0) {
 		/* No disk memory. So we can not convert last unformatted node
 		   to the direct item.  In this case we used to adjust
@@ -255,12 +257,12 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
 		   unformatted node. For now i_size is considered as guard for
 		   going out of file size */
 		kunmap(page);
-		return n_block_size - round_tail_len;
+		return block_size - round_tail_len;
 	}
 	kunmap(page);
 
 	/* make sure to get the i_blocks changes from reiserfs_insert_item */
-	reiserfs_update_sd(th, p_s_inode);
+	reiserfs_update_sd(th, inode);
 
 	// note: we have now the same as in above direct2indirect
 	// conversion: there are two keys which have matching first three
@@ -268,11 +270,11 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
 
 	/* We have inserted new direct item and must remove last
 	   unformatted node. */
-	*p_c_mode = M_CUT;
+	*mode = M_CUT;
 
 	/* we store position of first direct item in the in-core inode */
-	//mark_file_with_tail (p_s_inode, pos1 + 1);
-	REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1;
+	/* mark_file_with_tail (inode, pos1 + 1); */
+	REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
 
-	return n_block_size - round_tail_len;
+	return block_size - round_tail_len;
 }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ad92461cbfc3..f83f52bae390 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -27,6 +27,10 @@
  * these are special cases for filesystem ACLs, they are interpreted by the
  * kernel, in addition, they are negatively and positively cached and attached
  * to the inode so that unnecessary lookups are avoided.
+ *
+ * Locking works like so:
+ * Directory components (xattr root, xattr dir) are protectd by their i_mutex.
+ * The xattrs themselves are protected by the xattr_sem.
  */
 
 #include <linux/reiserfs_fs.h>
@@ -44,328 +48,334 @@
 #include <net/checksum.h>
 #include <linux/smp_lock.h>
 #include <linux/stat.h>
+#include <linux/quotaops.h>
 
-#define FL_READONLY 128
-#define FL_DIR_SEM_HELD 256
 #define PRIVROOT_NAME ".reiserfs_priv"
 #define XAROOT_NAME   "xattrs"
 
-static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
-								*prefix);
 
-/* Returns the dentry referring to the root of the extended attribute
- * directory tree. If it has already been retrieved, it is used. If it
- * hasn't been created and the flags indicate creation is allowed, we
- * attempt to create it. On error, we return a pointer-encoded error.
- */
-static struct dentry *get_xa_root(struct super_block *sb, int flags)
+/* Helpers for inode ops. We do this so that we don't have all the VFS
+ * overhead and also for proper i_mutex annotation.
+ * dir->i_mutex must be held for all of them. */
+#ifdef CONFIG_REISERFS_FS_XATTR
+static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
-	struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root);
-	struct dentry *xaroot;
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	vfs_dq_init(dir);
+	return dir->i_op->create(dir, dentry, mode, NULL);
+}
+#endif
 
-	/* This needs to be created at mount-time */
-	if (!privroot)
-		return ERR_PTR(-ENODATA);
+static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	vfs_dq_init(dir);
+	return dir->i_op->mkdir(dir, dentry, mode);
+}
 
-	mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
-	if (REISERFS_SB(sb)->xattr_root) {
-		xaroot = dget(REISERFS_SB(sb)->xattr_root);
-		goto out;
-	}
+/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
+ * mutation ops aren't called during rename or splace, which are the
+ * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
+ * better than allocating another subclass just for this code. */
+static int xattr_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	vfs_dq_init(dir);
 
-	xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME));
-	if (IS_ERR(xaroot)) {
-		goto out;
-	} else if (!xaroot->d_inode) {
+	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	error = dir->i_op->unlink(dir, dentry);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+
+	if (!error)
+		d_delete(dentry);
+	return error;
+}
+
+static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	vfs_dq_init(dir);
+
+	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	dentry_unhash(dentry);
+	error = dir->i_op->rmdir(dir, dentry);
+	if (!error)
+		dentry->d_inode->i_flags |= S_DEAD;
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	if (!error)
+		d_delete(dentry);
+	dput(dentry);
+
+	return error;
+}
+
+#define xattr_may_create(flags)	(!flags || flags & XATTR_CREATE)
+
+/* Returns and possibly creates the xattr dir. */
+static struct dentry *lookup_or_create_dir(struct dentry *parent,
+					    const char *name, int flags)
+{
+	struct dentry *dentry;
+	BUG_ON(!parent);
+
+	dentry = lookup_one_len(name, parent, strlen(name));
+	if (IS_ERR(dentry))
+		return dentry;
+	else if (!dentry->d_inode) {
 		int err = -ENODATA;
-		if (flags == 0 || flags & XATTR_CREATE)
-			err = privroot->d_inode->i_op->mkdir(privroot->d_inode,
-			                                     xaroot, 0700);
+
+		if (xattr_may_create(flags)) {
+			mutex_lock_nested(&parent->d_inode->i_mutex,
+					  I_MUTEX_XATTR);
+			err = xattr_mkdir(parent->d_inode, dentry, 0700);
+			mutex_unlock(&parent->d_inode->i_mutex);
+		}
+
 		if (err) {
-			dput(xaroot);
-			xaroot = ERR_PTR(err);
-			goto out;
+			dput(dentry);
+			dentry = ERR_PTR(err);
 		}
 	}
-	REISERFS_SB(sb)->xattr_root = dget(xaroot);
 
-      out:
-	mutex_unlock(&privroot->d_inode->i_mutex);
-	dput(privroot);
-	return xaroot;
+	return dentry;
+}
+
+static struct dentry *open_xa_root(struct super_block *sb, int flags)
+{
+	struct dentry *privroot = REISERFS_SB(sb)->priv_root;
+	if (!privroot)
+		return ERR_PTR(-ENODATA);
+	return lookup_or_create_dir(privroot, XAROOT_NAME, flags);
 }
 
-/* Opens the directory corresponding to the inode's extended attribute store.
- * If flags allow, the tree to the directory may be created. If creation is
- * prohibited, -ENODATA is returned. */
 static struct dentry *open_xa_dir(const struct inode *inode, int flags)
 {
 	struct dentry *xaroot, *xadir;
 	char namebuf[17];
 
-	xaroot = get_xa_root(inode->i_sb, flags);
+	xaroot = open_xa_root(inode->i_sb, flags);
 	if (IS_ERR(xaroot))
 		return xaroot;
 
-	/* ok, we have xaroot open */
 	snprintf(namebuf, sizeof(namebuf), "%X.%X",
 		 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
 		 inode->i_generation);
-	xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
-	if (IS_ERR(xadir)) {
-		dput(xaroot);
-		return xadir;
-	}
-
-	if (!xadir->d_inode) {
-		int err;
-		if (flags == 0 || flags & XATTR_CREATE) {
-			/* Although there is nothing else trying to create this directory,
-			 * another directory with the same hash may be created, so we need
-			 * to protect against that */
-			err =
-			    xaroot->d_inode->i_op->mkdir(xaroot->d_inode, xadir,
-							 0700);
-			if (err) {
-				dput(xaroot);
-				dput(xadir);
-				return ERR_PTR(err);
-			}
-		}
-		if (!xadir->d_inode) {
-			dput(xaroot);
-			dput(xadir);
-			return ERR_PTR(-ENODATA);
-		}
-	}
 
+	xadir = lookup_or_create_dir(xaroot, namebuf, flags);
 	dput(xaroot);
 	return xadir;
+
 }
 
-/* Returns a dentry corresponding to a specific extended attribute file
- * for the inode. If flags allow, the file is created. Otherwise, a
- * valid or negative dentry, or an error is returned. */
-static struct dentry *get_xa_file_dentry(const struct inode *inode,
-					 const char *name, int flags)
-{
-	struct dentry *xadir, *xafile;
-	int err = 0;
+/* The following are side effects of other operations that aren't explicitly
+ * modifying extended attributes. This includes operations such as permissions
+ * or ownership changes, object deletions, etc. */
+struct reiserfs_dentry_buf {
+	struct dentry *xadir;
+	int count;
+	struct dentry *dentries[8];
+};
 
-	xadir = open_xa_dir(inode, flags);
-	if (IS_ERR(xadir)) {
-		return ERR_CAST(xadir);
-	} else if (!xadir->d_inode) {
-		dput(xadir);
-		return ERR_PTR(-ENODATA);
-	}
+static int
+fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
+		    u64 ino, unsigned int d_type)
+{
+	struct reiserfs_dentry_buf *dbuf = buf;
+	struct dentry *dentry;
 
-	xafile = lookup_one_len(name, xadir, strlen(name));
-	if (IS_ERR(xafile)) {
-		dput(xadir);
-		return ERR_CAST(xafile);
-	}
+	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
+		return -ENOSPC;
 
-	if (xafile->d_inode) {	/* file exists */
-		if (flags & XATTR_CREATE) {
-			err = -EEXIST;
-			dput(xafile);
-			goto out;
-		}
-	} else if (flags & XATTR_REPLACE || flags & FL_READONLY) {
-		goto out;
-	} else {
-		/* inode->i_mutex is down, so nothing else can try to create
-		 * the same xattr */
-		err = xadir->d_inode->i_op->create(xadir->d_inode, xafile,
-						   0700 | S_IFREG, NULL);
+	if (name[0] == '.' && (name[1] == '\0' ||
+			       (name[1] == '.' && name[2] == '\0')))
+		return 0;
 
-		if (err) {
-			dput(xafile);
-			goto out;
-		}
+	dentry = lookup_one_len(name, dbuf->xadir, namelen);
+	if (IS_ERR(dentry)) {
+		return PTR_ERR(dentry);
+	} else if (!dentry->d_inode) {
+		/* A directory entry exists, but no file? */
+		reiserfs_error(dentry->d_sb, "xattr-20003",
+			       "Corrupted directory: xattr %s listed but "
+			       "not found for file %s.\n",
+			       dentry->d_name.name, dbuf->xadir->d_name.name);
+		dput(dentry);
+		return -EIO;
 	}
 
-      out:
-	dput(xadir);
-	if (err)
-		xafile = ERR_PTR(err);
-	else if (!xafile->d_inode) {
-		dput(xafile);
-		xafile = ERR_PTR(-ENODATA);
-	}
-	return xafile;
+	dbuf->dentries[dbuf->count++] = dentry;
+	return 0;
 }
 
-/*
- * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but
- * we need to drop the path before calling the filldir struct.  That
- * would be a big performance hit to the non-xattr case, so I've copied
- * the whole thing for now. --clm
- *
- * the big difference is that I go backwards through the directory,
- * and don't mess with f->f_pos, but the idea is the same.  Do some
- * action on each and every entry in the directory.
- *
- * we're called with i_mutex held, so there are no worries about the directory
- * changing underneath us.
- */
-static int __xattr_readdir(struct inode *inode, void *dirent, filldir_t filldir)
+static void
+cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
 {
-	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
-	INITIALIZE_PATH(path_to_entry);
-	struct buffer_head *bh;
-	int entry_num;
-	struct item_head *ih, tmp_ih;
-	int search_res;
-	char *local_buf;
-	loff_t next_pos;
-	char small_buf[32];	/* avoid kmalloc if we can */
-	struct reiserfs_de_head *deh;
-	int d_reclen;
-	char *d_name;
-	off_t d_off;
-	ino_t d_ino;
-	struct reiserfs_dir_entry de;
-
-	/* form key for search the next directory entry using f_pos field of
-	   file structure */
-	next_pos = max_reiserfs_offset(inode);
-
-	while (1) {
-	      research:
-		if (next_pos <= DOT_DOT_OFFSET)
-			break;
-		make_cpu_key(&pos_key, inode, next_pos, TYPE_DIRENTRY, 3);
-
-		search_res =
-		    search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
-					&de);
-		if (search_res == IO_ERROR) {
-			// FIXME: we could just skip part of directory which could
-			// not be read
-			pathrelse(&path_to_entry);
-			return -EIO;
-		}
+	int i;
+	for (i = 0; i < buf->count; i++)
+		if (buf->dentries[i])
+			dput(buf->dentries[i]);
+}
+
+static int reiserfs_for_each_xattr(struct inode *inode,
+				   int (*action)(struct dentry *, void *),
+				   void *data)
+{
+	struct dentry *dir;
+	int i, err = 0;
+	loff_t pos = 0;
+	struct reiserfs_dentry_buf buf = {
+		.count = 0,
+	};
 
-		if (search_res == NAME_NOT_FOUND)
-			de.de_entry_num--;
+	/* Skip out, an xattr has no xattrs associated with it */
+	if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
+		return 0;
 
-		set_de_name_and_namelen(&de);
-		entry_num = de.de_entry_num;
-		deh = &(de.de_deh[entry_num]);
+	dir = open_xa_dir(inode, XATTR_REPLACE);
+	if (IS_ERR(dir)) {
+		err = PTR_ERR(dir);
+		goto out;
+	} else if (!dir->d_inode) {
+		err = 0;
+		goto out_dir;
+	}
 
-		bh = de.de_bh;
-		ih = de.de_ih;
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
+	buf.xadir = dir;
+	err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
+	while ((err == 0 || err == -ENOSPC) && buf.count) {
+		err = 0;
 
-		if (!is_direntry_le_ih(ih)) {
-			reiserfs_warning(inode->i_sb, "not direntry %h", ih);
-			break;
-		}
-		copy_item_head(&tmp_ih, ih);
+		for (i = 0; i < buf.count && buf.dentries[i]; i++) {
+			int lerr = 0;
+			struct dentry *dentry = buf.dentries[i];
 
-		/* we must have found item, that is item of this directory, */
-		RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key),
-		       "vs-9000: found item %h does not match to dir we readdir %K",
-		       ih, &pos_key);
+			if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
+				lerr = action(dentry, data);
 
-		if (deh_offset(deh) <= DOT_DOT_OFFSET) {
-			break;
+			dput(dentry);
+			buf.dentries[i] = NULL;
+			err = lerr ?: err;
 		}
+		buf.count = 0;
+		if (!err)
+			err = reiserfs_readdir_dentry(dir, &buf,
+						      fill_with_dentries, &pos);
+	}
+	mutex_unlock(&dir->d_inode->i_mutex);
 
-		/* look for the previous entry in the directory */
-		next_pos = deh_offset(deh) - 1;
-
-		if (!de_visible(deh))
-			/* it is hidden entry */
-			continue;
+	/* Clean up after a failed readdir */
+	cleanup_dentry_buf(&buf);
 
-		d_reclen = entry_length(bh, ih, entry_num);
-		d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
-		d_off = deh_offset(deh);
-		d_ino = deh_objectid(deh);
+	if (!err) {
+		/* We start a transaction here to avoid a ABBA situation
+		 * between the xattr root's i_mutex and the journal lock.
+		 * This doesn't incur much additional overhead since the
+		 * new transaction will just nest inside the
+		 * outer transaction. */
+		int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
+			     4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
+		struct reiserfs_transaction_handle th;
+		err = journal_begin(&th, inode->i_sb, blocks);
+		if (!err) {
+			int jerror;
+			mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
+					  I_MUTEX_XATTR);
+			err = action(dir, data);
+			jerror = journal_end(&th, inode->i_sb, blocks);
+			mutex_unlock(&dir->d_parent->d_inode->i_mutex);
+			err = jerror ?: err;
+		}
+	}
+out_dir:
+	dput(dir);
+out:
+	/* -ENODATA isn't an error */
+	if (err == -ENODATA)
+		err = 0;
+	return err;
+}
 
-		if (!d_name[d_reclen - 1])
-			d_reclen = strlen(d_name);
+static int delete_one_xattr(struct dentry *dentry, void *data)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
 
-		if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)) {
-			/* too big to send back to VFS */
-			continue;
-		}
+	/* This is the xattr dir, handle specially. */
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		return xattr_rmdir(dir, dentry);
 
-		/* Ignore the .reiserfs_priv entry */
-		if (reiserfs_xattrs(inode->i_sb) &&
-		    !old_format_only(inode->i_sb) &&
-		    deh_objectid(deh) ==
-		    le32_to_cpu(INODE_PKEY
-				(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->
-				k_objectid))
-			continue;
-
-		if (d_reclen <= 32) {
-			local_buf = small_buf;
-		} else {
-			local_buf = kmalloc(d_reclen, GFP_NOFS);
-			if (!local_buf) {
-				pathrelse(&path_to_entry);
-				return -ENOMEM;
-			}
-			if (item_moved(&tmp_ih, &path_to_entry)) {
-				kfree(local_buf);
+	return xattr_unlink(dir, dentry);
+}
 
-				/* sigh, must retry.  Do this same offset again */
-				next_pos = d_off;
-				goto research;
-			}
-		}
+static int chown_one_xattr(struct dentry *dentry, void *data)
+{
+	struct iattr *attrs = data;
+	return reiserfs_setattr(dentry, attrs);
+}
 
-		// Note, that we copy name to user space via temporary
-		// buffer (local_buf) because filldir will block if
-		// user space buffer is swapped out. At that time
-		// entry can move to somewhere else
-		memcpy(local_buf, d_name, d_reclen);
-
-		/* the filldir function might need to start transactions,
-		 * or do who knows what.  Release the path now that we've
-		 * copied all the important stuff out of the deh
-		 */
-		pathrelse(&path_to_entry);
-
-		if (filldir(dirent, local_buf, d_reclen, d_off, d_ino,
-			    DT_UNKNOWN) < 0) {
-			if (local_buf != small_buf) {
-				kfree(local_buf);
-			}
-			goto end;
-		}
-		if (local_buf != small_buf) {
-			kfree(local_buf);
-		}
-	}			/* while */
+/* No i_mutex, but the inode is unconnected. */
+int reiserfs_delete_xattrs(struct inode *inode)
+{
+	int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
+	if (err)
+		reiserfs_warning(inode->i_sb, "jdm-20004",
+				 "Couldn't delete all xattrs (%d)\n", err);
+	return err;
+}
 
-      end:
-	pathrelse(&path_to_entry);
-	return 0;
+/* inode->i_mutex: down */
+int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
+{
+	int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
+	if (err)
+		reiserfs_warning(inode->i_sb, "jdm-20007",
+				 "Couldn't chown all xattrs (%d)\n", err);
+	return err;
 }
 
-/*
- * this could be done with dedicated readdir ops for the xattr files,
- * but I want to get something working asap
- * this is stolen from vfs_readdir
- *
- */
-static
-int xattr_readdir(struct inode *inode, filldir_t filler, void *buf)
+#ifdef CONFIG_REISERFS_FS_XATTR
+/* Returns a dentry corresponding to a specific extended attribute file
+ * for the inode. If flags allow, the file is created. Otherwise, a
+ * valid or negative dentry, or an error is returned. */
+static struct dentry *xattr_lookup(struct inode *inode, const char *name,
+				    int flags)
 {
-	int res = -ENOENT;
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR);
-	if (!IS_DEADDIR(inode)) {
-		lock_kernel();
-		res = __xattr_readdir(inode, buf, filler);
-		unlock_kernel();
+	struct dentry *xadir, *xafile;
+	int err = 0;
+
+	xadir = open_xa_dir(inode, flags);
+	if (IS_ERR(xadir))
+		return ERR_CAST(xadir);
+
+	xafile = lookup_one_len(name, xadir, strlen(name));
+	if (IS_ERR(xafile)) {
+		err = PTR_ERR(xafile);
+		goto out;
 	}
-	mutex_unlock(&inode->i_mutex);
-	return res;
+
+	if (xafile->d_inode && (flags & XATTR_CREATE))
+		err = -EEXIST;
+
+	if (!xafile->d_inode) {
+		err = -ENODATA;
+		if (xattr_may_create(flags)) {
+			mutex_lock_nested(&xadir->d_inode->i_mutex,
+					  I_MUTEX_XATTR);
+			err = xattr_create(xadir->d_inode, xafile,
+					      0700|S_IFREG);
+			mutex_unlock(&xadir->d_inode->i_mutex);
+		}
+	}
+
+	if (err)
+		dput(xafile);
+out:
+	dput(xadir);
+	if (err)
+		return ERR_PTR(err);
+	return xafile;
 }
 
 /* Internal operations on file data */
@@ -375,14 +385,14 @@ static inline void reiserfs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
+static struct page *reiserfs_get_page(struct inode *dir, size_t n)
 {
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page;
 	/* We can deadlock if we try to free dentries,
 	   and an unlink/rmdir has just occured - GFP_NOFS avoids this */
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_mapping_page(mapping, n, NULL);
+	page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
 	if (!IS_ERR(page)) {
 		kmap(page);
 		if (PageError(page))
@@ -405,6 +415,45 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 int reiserfs_prepare_write(struct file *f, struct page *page,
 			   unsigned from, unsigned to);
 
+static void update_ctime(struct inode *inode)
+{
+	struct timespec now = current_fs_time(inode->i_sb);
+	if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+	    timespec_equal(&inode->i_ctime, &now))
+		return;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+}
+
+static int lookup_and_delete_xattr(struct inode *inode, const char *name)
+{
+	int err = 0;
+	struct dentry *dentry, *xadir;
+
+	xadir = open_xa_dir(inode, XATTR_REPLACE);
+	if (IS_ERR(xadir))
+		return PTR_ERR(xadir);
+
+	dentry = lookup_one_len(name, xadir, strlen(name));
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_dput;
+	}
+
+	if (dentry->d_inode) {
+		mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
+		err = xattr_unlink(xadir->d_inode, dentry);
+		mutex_unlock(&xadir->d_inode->i_mutex);
+		update_ctime(inode);
+	}
+
+	dput(dentry);
+out_dput:
+	dput(xadir);
+	return err;
+}
+
 
 /* Generic extended attribute operations that can be used by xa plugins */
 
@@ -412,58 +461,32 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
  * inode->i_mutex: down
  */
 int
-reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
-		   size_t buffer_size, int flags)
+reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
+			  struct inode *inode, const char *name,
+			  const void *buffer, size_t buffer_size, int flags)
 {
 	int err = 0;
 	struct dentry *dentry;
 	struct page *page;
 	char *data;
-	struct address_space *mapping;
 	size_t file_pos = 0;
 	size_t buffer_pos = 0;
-	struct inode *xinode;
-	struct iattr newattrs;
+	size_t new_size;
 	__u32 xahash = 0;
 
 	if (get_inode_sd_version(inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	/* Empty xattrs are ok, they're just empty files, no hash */
-	if (buffer && buffer_size)
-		xahash = xattr_hash(buffer, buffer_size);
+	if (!buffer)
+		return lookup_and_delete_xattr(inode, name);
 
-      open_file:
-	dentry = get_xa_file_dentry(inode, name, flags);
-	if (IS_ERR(dentry)) {
-		err = PTR_ERR(dentry);
-		goto out;
-	}
-
-	xinode = dentry->d_inode;
-	REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+	dentry = xattr_lookup(inode, name, flags);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
-	/* we need to copy it off.. */
-	if (xinode->i_nlink > 1) {
-		dput(dentry);
-		err = reiserfs_xattr_del(inode, name);
-		if (err < 0)
-			goto out;
-		/* We just killed the old one, we're not replacing anymore */
-		if (flags & XATTR_REPLACE)
-			flags &= ~XATTR_REPLACE;
-		goto open_file;
-	}
+	down_write(&REISERFS_I(inode)->i_xattr_sem);
 
-	/* Resize it so we're ok to write there */
-	newattrs.ia_size = buffer_size;
-	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-	mutex_lock_nested(&xinode->i_mutex, I_MUTEX_XATTR);
-	err = notify_change(dentry, &newattrs);
-	if (err)
-		goto out_filp;
-
-	mapping = xinode->i_mapping;
+	xahash = xattr_hash(buffer, buffer_size);
 	while (buffer_pos < buffer_size || buffer_pos == 0) {
 		size_t chunk;
 		size_t skip = 0;
@@ -473,10 +496,10 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
 		else
 			chunk = buffer_size - buffer_pos;
 
-		page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT);
+		page = reiserfs_get_page(dentry->d_inode, file_pos);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
-			goto out_filp;
+			goto out_unlock;
 		}
 
 		lock_page(page);
@@ -510,28 +533,61 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
 			break;
 	}
 
-	/* We can't mark the inode dirty if it's not hashed. This is the case
-	 * when we're inheriting the default ACL. If we dirty it, the inode
-	 * gets marked dirty, but won't (ever) make it onto the dirty list until
-	 * it's synced explicitly to clear I_DIRTY. This is bad. */
-	if (!hlist_unhashed(&inode->i_hash)) {
-		inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(inode);
+	new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
+	if (!err && new_size < i_size_read(dentry->d_inode)) {
+		struct iattr newattrs = {
+			.ia_ctime = current_fs_time(inode->i_sb),
+			.ia_size = buffer_size,
+			.ia_valid = ATTR_SIZE | ATTR_CTIME,
+		};
+		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
+		down_write(&dentry->d_inode->i_alloc_sem);
+		err = reiserfs_setattr(dentry, &newattrs);
+		up_write(&dentry->d_inode->i_alloc_sem);
+		mutex_unlock(&dentry->d_inode->i_mutex);
+	} else
+		update_ctime(inode);
+out_unlock:
+	up_write(&REISERFS_I(inode)->i_xattr_sem);
+	dput(dentry);
+	return err;
+}
+
+/* We need to start a transaction to maintain lock ordering */
+int reiserfs_xattr_set(struct inode *inode, const char *name,
+		       const void *buffer, size_t buffer_size, int flags)
+{
+
+	struct reiserfs_transaction_handle th;
+	int error, error2;
+	size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
+
+	if (!(flags & XATTR_REPLACE))
+		jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
+
+	reiserfs_write_lock(inode->i_sb);
+	error = journal_begin(&th, inode->i_sb, jbegin_count);
+	if (error) {
+		reiserfs_write_unlock(inode->i_sb);
+		return error;
 	}
 
-      out_filp:
-	mutex_unlock(&xinode->i_mutex);
-	dput(dentry);
+	error = reiserfs_xattr_set_handle(&th, inode, name,
+					  buffer, buffer_size, flags);
 
-      out:
-	return err;
+	error2 = journal_end(&th, inode->i_sb, jbegin_count);
+	if (error == 0)
+		error = error2;
+	reiserfs_write_unlock(inode->i_sb);
+
+	return error;
 }
 
 /*
  * inode->i_mutex: down
  */
 int
-reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
+reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
 		   size_t buffer_size)
 {
 	ssize_t err = 0;
@@ -540,7 +596,6 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
 	size_t file_pos = 0;
 	size_t buffer_pos = 0;
 	struct page *page;
-	struct inode *xinode;
 	__u32 hash = 0;
 
 	if (name == NULL)
@@ -551,25 +606,25 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
 	if (get_inode_sd_version(inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	dentry = get_xa_file_dentry(inode, name, FL_READONLY);
+	dentry = xattr_lookup(inode, name, XATTR_REPLACE);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
 		goto out;
 	}
 
-	xinode = dentry->d_inode;
-	isize = xinode->i_size;
-	REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+	down_read(&REISERFS_I(inode)->i_xattr_sem);
+
+	isize = i_size_read(dentry->d_inode);
 
 	/* Just return the size needed */
 	if (buffer == NULL) {
 		err = isize - sizeof(struct reiserfs_xattr_header);
-		goto out_dput;
+		goto out_unlock;
 	}
 
 	if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
 		err = -ERANGE;
-		goto out_dput;
+		goto out_unlock;
 	}
 
 	while (file_pos < isize) {
@@ -581,10 +636,10 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
 		else
 			chunk = isize - file_pos;
 
-		page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT);
+		page = reiserfs_get_page(dentry->d_inode, file_pos);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
-			goto out_dput;
+			goto out_unlock;
 		}
 
 		lock_page(page);
@@ -598,12 +653,12 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
 			if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
 				unlock_page(page);
 				reiserfs_put_page(page);
-				reiserfs_warning(inode->i_sb,
+				reiserfs_warning(inode->i_sb, "jdm-20001",
 						 "Invalid magic for xattr (%s) "
 						 "associated with %k", name,
 						 INODE_PKEY(inode));
 				err = -EIO;
-				goto out_dput;
+				goto out_unlock;
 			}
 			hash = le32_to_cpu(rxh->h_hash);
 		}
@@ -618,256 +673,83 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
 
 	if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
 	    hash) {
-		reiserfs_warning(inode->i_sb,
+		reiserfs_warning(inode->i_sb, "jdm-20002",
 				 "Invalid hash for xattr (%s) associated "
 				 "with %k", name, INODE_PKEY(inode));
 		err = -EIO;
 	}
 
-      out_dput:
+out_unlock:
+	up_read(&REISERFS_I(inode)->i_xattr_sem);
 	dput(dentry);
 
-      out:
+out:
 	return err;
 }
 
-static int
-__reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen)
-{
-	struct dentry *dentry;
-	struct inode *dir = xadir->d_inode;
-	int err = 0;
-
-	dentry = lookup_one_len(name, xadir, namelen);
-	if (IS_ERR(dentry)) {
-		err = PTR_ERR(dentry);
-		goto out;
-	} else if (!dentry->d_inode) {
-		err = -ENODATA;
-		goto out_file;
-	}
-
-	/* Skip directories.. */
-	if (S_ISDIR(dentry->d_inode->i_mode))
-		goto out_file;
-
-	if (!is_reiserfs_priv_object(dentry->d_inode)) {
-		reiserfs_warning(dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have "
-				 "priv flag set [parent is %sset].",
-				 le32_to_cpu(INODE_PKEY(dentry->d_inode)->
-					     k_objectid), xadir->d_name.len,
-				 xadir->d_name.name, namelen, name,
-				 is_reiserfs_priv_object(xadir->
-							 d_inode) ? "" :
-				 "not ");
-		dput(dentry);
-		return -EIO;
-	}
-
-	err = dir->i_op->unlink(dir, dentry);
-	if (!err)
-		d_delete(dentry);
-
-      out_file:
-	dput(dentry);
-
-      out:
-	return err;
-}
-
-int reiserfs_xattr_del(struct inode *inode, const char *name)
-{
-	struct dentry *dir;
-	int err;
-
-	dir = open_xa_dir(inode, FL_READONLY);
-	if (IS_ERR(dir)) {
-		err = PTR_ERR(dir);
-		goto out;
-	}
-
-	err = __reiserfs_xattr_del(dir, name, strlen(name));
-	dput(dir);
-
-	if (!err) {
-		inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(inode);
-	}
-
-      out:
-	return err;
-}
-
-/* The following are side effects of other operations that aren't explicitly
- * modifying extended attributes. This includes operations such as permissions
- * or ownership changes, object deletions, etc. */
-
-static int
-reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen,
-			      loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct dentry *xadir = (struct dentry *)buf;
-
-	return __reiserfs_xattr_del(xadir, name, namelen);
-
-}
-
-/* This is called w/ inode->i_mutex downed */
-int reiserfs_delete_xattrs(struct inode *inode)
-{
-	struct dentry *dir, *root;
-	int err = 0;
-
-	/* Skip out, an xattr has no xattrs associated with it */
-	if (is_reiserfs_priv_object(inode) ||
-	    get_inode_sd_version(inode) == STAT_DATA_V1 ||
-	    !reiserfs_xattrs(inode->i_sb)) {
-		return 0;
-	}
-	reiserfs_read_lock_xattrs(inode->i_sb);
-	dir = open_xa_dir(inode, FL_READONLY);
-	reiserfs_read_unlock_xattrs(inode->i_sb);
-	if (IS_ERR(dir)) {
-		err = PTR_ERR(dir);
-		goto out;
-	} else if (!dir->d_inode) {
-		dput(dir);
-		return 0;
-	}
-
-	lock_kernel();
-	err = xattr_readdir(dir->d_inode, reiserfs_delete_xattrs_filler, dir);
-	if (err) {
-		unlock_kernel();
-		goto out_dir;
-	}
-
-	/* Leftovers besides . and .. -- that's not good. */
-	if (dir->d_inode->i_nlink <= 2) {
-		root = get_xa_root(inode->i_sb, XATTR_REPLACE);
-		reiserfs_write_lock_xattrs(inode->i_sb);
-		err = vfs_rmdir(root->d_inode, dir);
-		reiserfs_write_unlock_xattrs(inode->i_sb);
-		dput(root);
-	} else {
-		reiserfs_warning(inode->i_sb,
-				 "Couldn't remove all entries in directory");
-	}
-	unlock_kernel();
-
-      out_dir:
-	dput(dir);
-
-      out:
-	if (!err)
-		REISERFS_I(inode)->i_flags =
-		    REISERFS_I(inode)->i_flags & ~i_has_xattr_dir;
-	return err;
-}
-
-struct reiserfs_chown_buf {
-	struct inode *inode;
-	struct dentry *xadir;
-	struct iattr *attrs;
+/* Actual operations that are exported to VFS-land */
+struct xattr_handler *reiserfs_xattr_handlers[] = {
+	&reiserfs_xattr_user_handler,
+	&reiserfs_xattr_trusted_handler,
+#ifdef CONFIG_REISERFS_FS_SECURITY
+	&reiserfs_xattr_security_handler,
+#endif
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+	&reiserfs_posix_acl_access_handler,
+	&reiserfs_posix_acl_default_handler,
+#endif
+	NULL
 };
 
-/* XXX: If there is a better way to do this, I'd love to hear about it */
-static int
-reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen,
-			     loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf;
-	struct dentry *xafile, *xadir = chown_buf->xadir;
-	struct iattr *attrs = chown_buf->attrs;
-	int err = 0;
-
-	xafile = lookup_one_len(name, xadir, namelen);
-	if (IS_ERR(xafile))
-		return PTR_ERR(xafile);
-	else if (!xafile->d_inode) {
-		dput(xafile);
-		return -ENODATA;
-	}
-
-	if (!S_ISDIR(xafile->d_inode->i_mode))
-		err = notify_change(xafile, attrs);
-	dput(xafile);
-
-	return err;
-}
+/*
+ * In order to implement different sets of xattr operations for each xattr
+ * prefix with the generic xattr API, a filesystem should create a
+ * null-terminated array of struct xattr_handler (one for each prefix) and
+ * hang a pointer to it off of the s_xattr field of the superblock.
+ *
+ * The generic_fooxattr() functions will use this list to dispatch xattr
+ * operations to the correct xattr_handler.
+ */
+#define for_each_xattr_handler(handlers, handler)		\
+		for ((handler) = *(handlers)++;			\
+			(handler) != NULL;			\
+			(handler) = *(handlers)++)
 
-int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
+/* This is the implementation for the xattr plugin infrastructure */
+static inline struct xattr_handler *
+find_xattr_handler_prefix(struct xattr_handler **handlers,
+			   const char *name)
 {
-	struct dentry *dir;
-	int err = 0;
-	struct reiserfs_chown_buf buf;
-	unsigned int ia_valid = attrs->ia_valid;
+	struct xattr_handler *xah;
 
-	/* Skip out, an xattr has no xattrs associated with it */
-	if (is_reiserfs_priv_object(inode) ||
-	    get_inode_sd_version(inode) == STAT_DATA_V1 ||
-	    !reiserfs_xattrs(inode->i_sb)) {
-		return 0;
-	}
-	reiserfs_read_lock_xattrs(inode->i_sb);
-	dir = open_xa_dir(inode, FL_READONLY);
-	reiserfs_read_unlock_xattrs(inode->i_sb);
-	if (IS_ERR(dir)) {
-		if (PTR_ERR(dir) != -ENODATA)
-			err = PTR_ERR(dir);
-		goto out;
-	} else if (!dir->d_inode) {
-		dput(dir);
-		goto out;
-	}
+	if (!handlers)
+		return NULL;
 
-	lock_kernel();
-
-	attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME);
-	buf.xadir = dir;
-	buf.attrs = attrs;
-	buf.inode = inode;
-
-	err = xattr_readdir(dir->d_inode, reiserfs_chown_xattrs_filler, &buf);
-	if (err) {
-		unlock_kernel();
-		goto out_dir;
+	for_each_xattr_handler(handlers, xah) {
+		if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
+			break;
 	}
 
-	err = notify_change(dir, attrs);
-	unlock_kernel();
-
-      out_dir:
-	dput(dir);
-
-      out:
-	attrs->ia_valid = ia_valid;
-	return err;
+	return xah;
 }
 
-/* Actual operations that are exported to VFS-land */
 
 /*
  * Inode operation getxattr()
- * Preliminary locking: we down dentry->d_inode->i_mutex
  */
 ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
 		  size_t size)
 {
-	struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name);
-	int err;
+	struct inode *inode = dentry->d_inode;
+	struct xattr_handler *handler;
 
-	if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
-	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+
+	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	reiserfs_read_lock_xattr_i(dentry->d_inode);
-	reiserfs_read_lock_xattrs(dentry->d_sb);
-	err = xah->get(dentry->d_inode, name, buffer, size);
-	reiserfs_read_unlock_xattrs(dentry->d_sb);
-	reiserfs_read_unlock_xattr_i(dentry->d_inode);
-	return err;
+	return handler->get(inode, name, buffer, size);
 }
 
 /*
@@ -879,27 +761,15 @@ int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags)
 {
-	struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name);
-	int err;
-	int lock;
+	struct inode *inode = dentry->d_inode;
+	struct xattr_handler *handler;
 
-	if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
-	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+
+	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	reiserfs_write_lock_xattr_i(dentry->d_inode);
-	lock = !has_xattr_dir(dentry->d_inode);
-	if (lock)
-		reiserfs_write_lock_xattrs(dentry->d_sb);
-	else
-		reiserfs_read_lock_xattrs(dentry->d_sb);
-	err = xah->set(dentry->d_inode, name, value, size, flags);
-	if (lock)
-		reiserfs_write_unlock_xattrs(dentry->d_sb);
-	else
-		reiserfs_read_unlock_xattrs(dentry->d_sb);
-	reiserfs_write_unlock_xattr_i(dentry->d_inode);
-	return err;
+	return handler->set(inode, name, value, size, flags);
 }
 
 /*
@@ -909,86 +779,66 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
  */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-	int err;
-	struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name);
+	struct inode *inode = dentry->d_inode;
+	struct xattr_handler *handler;
+	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
 
-	if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
-	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+	if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	reiserfs_write_lock_xattr_i(dentry->d_inode);
-	reiserfs_read_lock_xattrs(dentry->d_sb);
-
-	/* Deletion pre-operation */
-	if (xah->del) {
-		err = xah->del(dentry->d_inode, name);
-		if (err)
-			goto out;
-	}
-
-	err = reiserfs_xattr_del(dentry->d_inode, name);
-
-	dentry->d_inode->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(dentry->d_inode);
-
-      out:
-	reiserfs_read_unlock_xattrs(dentry->d_sb);
-	reiserfs_write_unlock_xattr_i(dentry->d_inode);
-	return err;
+	return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
 }
 
-/* This is what filldir will use:
- * r_pos will always contain the amount of space required for the entire
- * list. If r_pos becomes larger than r_size, we need more space and we
- * return an error indicating this. If r_pos is less than r_size, then we've
- * filled the buffer successfully and we return success */
-struct reiserfs_listxattr_buf {
-	int r_pos;
-	int r_size;
-	char *r_buf;
-	struct inode *r_inode;
+struct listxattr_buf {
+	size_t size;
+	size_t pos;
+	char *buf;
+	struct inode *inode;
 };
 
-static int
-reiserfs_listxattr_filler(void *buf, const char *name, int namelen,
-			  loff_t offset, u64 ino, unsigned int d_type)
+static int listxattr_filler(void *buf, const char *name, int namelen,
+			    loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf;
-	int len = 0;
-	if (name[0] != '.'
-	    || (namelen != 1 && (name[1] != '.' || namelen != 2))) {
-		struct reiserfs_xattr_handler *xah =
-		    find_xattr_handler_prefix(name);
-		if (!xah)
-			return 0;	/* Unsupported xattr name, skip it */
-
-		/* We call ->list() twice because the operation isn't required to just
-		 * return the name back - we want to make sure we have enough space */
-		len += xah->list(b->r_inode, name, namelen, NULL);
-
-		if (len) {
-			if (b->r_pos + len + 1 <= b->r_size) {
-				char *p = b->r_buf + b->r_pos;
-				p += xah->list(b->r_inode, name, namelen, p);
-				*p++ = '\0';
-			}
-			b->r_pos += len + 1;
+	struct listxattr_buf *b = (struct listxattr_buf *)buf;
+	size_t size;
+	if (name[0] != '.' ||
+	    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
+		struct xattr_handler *handler;
+		handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr,
+						    name);
+		if (!handler)	/* Unsupported xattr name */
+			return 0;
+		if (b->buf) {
+			size = handler->list(b->inode, b->buf + b->pos,
+					 b->size, name, namelen);
+			if (size > b->size)
+				return -ERANGE;
+		} else {
+			size = handler->list(b->inode, NULL, 0, name, namelen);
 		}
-	}
 
+		b->pos += size;
+	}
 	return 0;
 }
 
 /*
  * Inode operation listxattr()
  *
- * Preliminary locking: we down dentry->d_inode->i_mutex
+ * We totally ignore the generic listxattr here because it would be stupid
+ * not to. Since the xattrs are organized in a directory, we can just
+ * readdir to find them.
  */
 ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 {
 	struct dentry *dir;
 	int err = 0;
-	struct reiserfs_listxattr_buf buf;
+	loff_t pos = 0;
+	struct listxattr_buf buf = {
+		.inode = dentry->d_inode,
+		.buf = buffer,
+		.size = buffer ? size : 0,
+	};
 
 	if (!dentry->d_inode)
 		return -EINVAL;
@@ -997,130 +847,104 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
 		return -EOPNOTSUPP;
 
-	reiserfs_read_lock_xattr_i(dentry->d_inode);
-	reiserfs_read_lock_xattrs(dentry->d_sb);
-	dir = open_xa_dir(dentry->d_inode, FL_READONLY);
-	reiserfs_read_unlock_xattrs(dentry->d_sb);
+	dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE);
 	if (IS_ERR(dir)) {
 		err = PTR_ERR(dir);
 		if (err == -ENODATA)
-			err = 0;	/* Not an error if there aren't any xattrs */
+			err = 0;  /* Not an error if there aren't any xattrs */
 		goto out;
 	}
 
-	buf.r_buf = buffer;
-	buf.r_size = buffer ? size : 0;
-	buf.r_pos = 0;
-	buf.r_inode = dentry->d_inode;
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
+	err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
+	mutex_unlock(&dir->d_inode->i_mutex);
 
-	REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir;
-
-	err = xattr_readdir(dir->d_inode, reiserfs_listxattr_filler, &buf);
-	if (err)
-		goto out_dir;
-
-	if (buf.r_pos > buf.r_size && buffer != NULL)
-		err = -ERANGE;
-	else
-		err = buf.r_pos;
+	if (!err)
+		err = buf.pos;
 
-      out_dir:
 	dput(dir);
-
-      out:
-	reiserfs_read_unlock_xattr_i(dentry->d_inode);
+out:
 	return err;
 }
 
-/* This is the implementation for the xattr plugin infrastructure */
-static LIST_HEAD(xattr_handlers);
-static DEFINE_RWLOCK(handler_lock);
-
-static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
-								*prefix)
+static int reiserfs_check_acl(struct inode *inode, int mask)
 {
-	struct reiserfs_xattr_handler *xah = NULL;
-	struct list_head *p;
+	struct posix_acl *acl;
+	int error = -EAGAIN; /* do regular unix permission checks by default */
 
-	read_lock(&handler_lock);
-	list_for_each(p, &xattr_handlers) {
-		xah = list_entry(p, struct reiserfs_xattr_handler, handlers);
-		if (strncmp(xah->prefix, prefix, strlen(xah->prefix)) == 0)
-			break;
-		xah = NULL;
+	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (acl) {
+		if (!IS_ERR(acl)) {
+			error = posix_acl_permission(inode, acl, mask);
+			posix_acl_release(acl);
+		} else if (PTR_ERR(acl) != -ENODATA)
+			error = PTR_ERR(acl);
 	}
 
-	read_unlock(&handler_lock);
-	return xah;
+	return error;
 }
 
-static void __unregister_handlers(void)
+int reiserfs_permission(struct inode *inode, int mask)
 {
-	struct reiserfs_xattr_handler *xah;
-	struct list_head *p, *tmp;
-
-	list_for_each_safe(p, tmp, &xattr_handlers) {
-		xah = list_entry(p, struct reiserfs_xattr_handler, handlers);
-		if (xah->exit)
-			xah->exit();
-
-		list_del_init(p);
-	}
-	INIT_LIST_HEAD(&xattr_handlers);
+	/*
+	 * We don't do permission checks on the internal objects.
+	 * Permissions are determined by the "owning" object.
+	 */
+	if (IS_PRIVATE(inode))
+		return 0;
+	/*
+	 * Stat data v1 doesn't support ACLs.
+	 */
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		return generic_permission(inode, mask, NULL);
+	else
+		return generic_permission(inode, mask, reiserfs_check_acl);
 }
 
-int __init reiserfs_xattr_register_handlers(void)
+static int create_privroot(struct dentry *dentry)
 {
-	int err = 0;
-	struct reiserfs_xattr_handler *xah;
-	struct list_head *p;
-
-	write_lock(&handler_lock);
-
-	/* If we're already initialized, nothing to do */
-	if (!list_empty(&xattr_handlers)) {
-		write_unlock(&handler_lock);
-		return 0;
-	}
-
-	/* Add the handlers */
-	list_add_tail(&user_handler.handlers, &xattr_handlers);
-	list_add_tail(&trusted_handler.handlers, &xattr_handlers);
-#ifdef CONFIG_REISERFS_FS_SECURITY
-	list_add_tail(&security_handler.handlers, &xattr_handlers);
-#endif
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	list_add_tail(&posix_acl_access_handler.handlers, &xattr_handlers);
-	list_add_tail(&posix_acl_default_handler.handlers, &xattr_handlers);
-#endif
-
-	/* Run initializers, if available */
-	list_for_each(p, &xattr_handlers) {
-		xah = list_entry(p, struct reiserfs_xattr_handler, handlers);
-		if (xah->init) {
-			err = xah->init();
-			if (err) {
-				list_del_init(p);
-				break;
-			}
-		}
+	int err;
+	struct inode *inode = dentry->d_parent->d_inode;
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR);
+	err = xattr_mkdir(inode, dentry, 0700);
+	mutex_unlock(&inode->i_mutex);
+	if (err) {
+		dput(dentry);
+		dentry = NULL;
 	}
 
-	/* Clean up other handlers, if any failed */
-	if (err)
-		__unregister_handlers();
+	if (dentry && dentry->d_inode)
+		reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
+			      "storage.\n", PRIVROOT_NAME);
 
-	write_unlock(&handler_lock);
 	return err;
 }
 
-void reiserfs_xattr_unregister_handlers(void)
+static int xattr_mount_check(struct super_block *s)
 {
-	write_lock(&handler_lock);
-	__unregister_handlers();
-	write_unlock(&handler_lock);
+	/* We need generation numbers to ensure that the oid mapping is correct
+	 * v3.5 filesystems don't have them. */
+	if (old_format_only(s)) {
+		if (reiserfs_xattrs_optional(s)) {
+			/* Old format filesystem, but optional xattrs have
+			 * been enabled. Error out. */
+			reiserfs_warning(s, "jdm-2005",
+					 "xattrs/ACLs not supported "
+					 "on pre-v3.6 format filesystems. "
+					 "Failing mount.");
+			return -EOPNOTSUPP;
+		}
+	}
+
+	return 0;
 }
 
+#else
+int __init reiserfs_xattr_register_handlers(void) { return 0; }
+void reiserfs_xattr_unregister_handlers(void) {}
+#endif
+
 /* This will catch lookups from the fs root to .reiserfs_priv */
 static int
 xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
@@ -1136,7 +960,7 @@ xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
 	return 1;
 }
 
-static struct dentry_operations xattr_lookup_poison_ops = {
+static const struct dentry_operations xattr_lookup_poison_ops = {
 	.d_compare = xattr_lookup_poison,
 };
 
@@ -1147,48 +971,23 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 {
 	int err = 0;
 
-	/* We need generation numbers to ensure that the oid mapping is correct
-	 * v3.5 filesystems don't have them. */
-	if (!old_format_only(s)) {
-		set_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
-	} else if (reiserfs_xattrs_optional(s)) {
-		/* Old format filesystem, but optional xattrs have been enabled
-		 * at mount time. Error out. */
-		reiserfs_warning(s, "xattrs/ACLs not supported on pre v3.6 "
-				 "format filesystem. Failing mount.");
-		err = -EOPNOTSUPP;
+#ifdef CONFIG_REISERFS_FS_XATTR
+	err = xattr_mount_check(s);
+	if (err)
 		goto error;
-	} else {
-		/* Old format filesystem, but no optional xattrs have been enabled. This
-		 * means we silently disable xattrs on the filesystem. */
-		clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
-	}
+#endif
 
 	/* If we don't have the privroot located yet - go find it */
-	if (reiserfs_xattrs(s) && !REISERFS_SB(s)->priv_root) {
+	if (!REISERFS_SB(s)->priv_root) {
 		struct dentry *dentry;
 		dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
 					strlen(PRIVROOT_NAME));
 		if (!IS_ERR(dentry)) {
-			if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) {
-				struct inode *inode = dentry->d_parent->d_inode;
-				mutex_lock_nested(&inode->i_mutex,
-						  I_MUTEX_XATTR);
-				err = inode->i_op->mkdir(inode, dentry, 0700);
-				mutex_unlock(&inode->i_mutex);
-				if (err) {
-					dput(dentry);
-					dentry = NULL;
-				}
-
-				if (dentry && dentry->d_inode)
-					reiserfs_warning(s,
-							 "Created %s on %s - reserved for "
-							 "xattr storage.",
-							 PRIVROOT_NAME,
-							 reiserfs_bdevname
-							 (inode->i_sb));
-			} else if (!dentry->d_inode) {
+#ifdef CONFIG_REISERFS_FS_XATTR
+			if (!(mount_flags & MS_RDONLY) && !dentry->d_inode)
+				err = create_privroot(dentry);
+#endif
+			if (!dentry->d_inode) {
 				dput(dentry);
 				dentry = NULL;
 			}
@@ -1197,73 +996,41 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 
 		if (!err && dentry) {
 			s->s_root->d_op = &xattr_lookup_poison_ops;
-			reiserfs_mark_inode_private(dentry->d_inode);
+			dentry->d_inode->i_flags |= S_PRIVATE;
 			REISERFS_SB(s)->priv_root = dentry;
-		} else if (!(mount_flags & MS_RDONLY)) {	/* xattrs are unavailable */
-			/* If we're read-only it just means that the dir hasn't been
-			 * created. Not an error -- just no xattrs on the fs. We'll
-			 * check again if we go read-write */
-			reiserfs_warning(s, "xattrs/ACLs enabled and couldn't "
-					 "find/create .reiserfs_priv. Failing mount.");
+#ifdef CONFIG_REISERFS_FS_XATTR
+		/* xattrs are unavailable */
+		} else if (!(mount_flags & MS_RDONLY)) {
+			/* If we're read-only it just means that the dir
+			 * hasn't been created. Not an error -- just no
+			 * xattrs on the fs. We'll check again if we
+			 * go read-write */
+			reiserfs_warning(s, "jdm-20006",
+					 "xattrs/ACLs enabled and couldn't "
+					 "find/create .reiserfs_priv. "
+					 "Failing mount.");
 			err = -EOPNOTSUPP;
+#endif
 		}
 	}
 
-      error:
-	/* This is only nonzero if there was an error initializing the xattr
-	 * directory or if there is a condition where we don't support them. */
+#ifdef CONFIG_REISERFS_FS_XATTR
+	if (!err)
+		s->s_xattr = reiserfs_xattr_handlers;
+
+error:
 	if (err) {
-		clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
 		clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
 		clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
 	}
+#endif
 
 	/* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
 	s->s_flags = s->s_flags & ~MS_POSIXACL;
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
 	if (reiserfs_posixacl(s))
 		s->s_flags |= MS_POSIXACL;
+#endif
 
 	return err;
 }
-
-static int reiserfs_check_acl(struct inode *inode, int mask)
-{
-	struct posix_acl *acl;
-	int error = -EAGAIN; /* do regular unix permission checks by default */
-
-	reiserfs_read_lock_xattr_i(inode);
-	reiserfs_read_lock_xattrs(inode->i_sb);
-
-	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
-
-	reiserfs_read_unlock_xattrs(inode->i_sb);
-	reiserfs_read_unlock_xattr_i(inode);
-
-	if (acl) {
-		if (!IS_ERR(acl)) {
-			error = posix_acl_permission(inode, acl, mask);
-			posix_acl_release(acl);
-		} else if (PTR_ERR(acl) != -ENODATA)
-			error = PTR_ERR(acl);
-	}
-
-	return error;
-}
-
-int reiserfs_permission(struct inode *inode, int mask)
-{
-	/*
-	 * We don't do permission checks on the internal objects.
-	 * Permissions are determined by the "owning" object.
-	 */
-	if (is_reiserfs_priv_object(inode))
-		return 0;
-
-	/*
-	 * Stat data v1 doesn't support ACLs.
-	 */
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		return generic_permission(inode, mask, NULL);
-	else
-		return generic_permission(inode, mask, reiserfs_check_acl);
-}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index b7e4fa4539de..c303c426fe2b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -10,15 +10,17 @@
 #include <linux/reiserfs_acl.h>
 #include <asm/uaccess.h>
 
-static int reiserfs_set_acl(struct inode *inode, int type,
+static int reiserfs_set_acl(struct reiserfs_transaction_handle *th,
+			    struct inode *inode, int type,
 			    struct posix_acl *acl);
 
 static int
 xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
 {
 	struct posix_acl *acl;
-	int error;
-
+	int error, error2;
+	struct reiserfs_transaction_handle th;
+	size_t jcreate_blocks;
 	if (!reiserfs_posixacl(inode->i_sb))
 		return -EOPNOTSUPP;
 	if (!is_owner_or_cap(inode))
@@ -36,7 +38,21 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
 	} else
 		acl = NULL;
 
-	error = reiserfs_set_acl(inode, type, acl);
+	/* Pessimism: We can't assume that anything from the xattr root up
+	 * has been created. */
+
+	jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
+			 reiserfs_xattr_nblocks(inode, size) * 2;
+
+	reiserfs_write_lock(inode->i_sb);
+	error = journal_begin(&th, inode->i_sb, jcreate_blocks);
+	if (error == 0) {
+		error = reiserfs_set_acl(&th, inode, type, acl);
+		error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
+		if (error2)
+			error = error2;
+	}
+	reiserfs_write_unlock(inode->i_sb);
 
       release_and_out:
 	posix_acl_release(acl);
@@ -172,6 +188,29 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
 	return ERR_PTR(-EINVAL);
 }
 
+static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl,
+			    struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*i_acl != ERR_PTR(-ENODATA))
+		posix_acl_release(*i_acl);
+	*i_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+static inline struct posix_acl *iget_acl(struct inode *inode,
+					 struct posix_acl **i_acl)
+{
+	struct posix_acl *acl = ERR_PTR(-ENODATA);
+
+	spin_lock(&inode->i_lock);
+	if (*i_acl != ERR_PTR(-ENODATA))
+		acl = posix_acl_dup(*i_acl);
+	spin_unlock(&inode->i_lock);
+
+	return acl;
+}
+
 /*
  * Inode operation get_posix_acl().
  *
@@ -199,11 +238,11 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (IS_ERR(*p_acl)) {
-		if (PTR_ERR(*p_acl) == -ENODATA)
-			return NULL;
-	} else if (*p_acl != NULL)
-		return posix_acl_dup(*p_acl);
+	acl = iget_acl(inode, p_acl);
+	if (acl && !IS_ERR(acl))
+		return acl;
+	else if (PTR_ERR(acl) == -ENODATA)
+		return NULL;
 
 	size = reiserfs_xattr_get(inode, name, NULL, 0);
 	if (size < 0) {
@@ -229,7 +268,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 	} else {
 		acl = posix_acl_from_disk(value, retval);
 		if (!IS_ERR(acl))
-			*p_acl = posix_acl_dup(acl);
+			iset_acl(inode, p_acl, acl);
 	}
 
 	kfree(value);
@@ -243,12 +282,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
  * BKL held [before 2.5.x]
  */
 static int
-reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
+		 int type, struct posix_acl *acl)
 {
 	char *name;
 	void *value = NULL;
 	struct posix_acl **p_acl;
-	size_t size;
+	size_t size = 0;
 	int error;
 	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
 
@@ -285,31 +325,28 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		value = posix_acl_to_disk(acl, &size);
 		if (IS_ERR(value))
 			return (int)PTR_ERR(value);
-		error = reiserfs_xattr_set(inode, name, value, size, 0);
-	} else {
-		error = reiserfs_xattr_del(inode, name);
-		if (error == -ENODATA) {
-			/* This may seem odd here, but it means that the ACL was set
-			 * with a value representable with mode bits. If there was
-			 * an ACL before, reiserfs_xattr_del already dirtied the inode.
-			 */
+	}
+
+	error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
+
+	/*
+	 * Ensure that the inode gets dirtied if we're only using
+	 * the mode bits and an old ACL didn't exist. We don't need
+	 * to check if the inode is hashed here since we won't get
+	 * called by reiserfs_inherit_default_acl().
+	 */
+	if (error == -ENODATA) {
+		error = 0;
+		if (type == ACL_TYPE_ACCESS) {
+			inode->i_ctime = CURRENT_TIME_SEC;
 			mark_inode_dirty(inode);
-			error = 0;
 		}
 	}
 
 	kfree(value);
 
-	if (!error) {
-		/* Release the old one */
-		if (!IS_ERR(*p_acl) && *p_acl)
-			posix_acl_release(*p_acl);
-
-		if (acl == NULL)
-			*p_acl = ERR_PTR(-ENODATA);
-		else
-			*p_acl = posix_acl_dup(acl);
-	}
+	if (!error)
+		iset_acl(inode, p_acl, acl);
 
 	return error;
 }
@@ -317,7 +354,8 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 /* dir->i_mutex: locked,
  * inode is new and not released into the wild yet */
 int
-reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
+reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
+			     struct inode *dir, struct dentry *dentry,
 			     struct inode *inode)
 {
 	struct posix_acl *acl;
@@ -335,8 +373,8 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
 	/* Don't apply ACLs to objects in the .reiserfs_priv tree.. This
 	 * would be useless since permissions are ignored, and a pain because
 	 * it introduces locking cycles */
-	if (is_reiserfs_priv_object(dir)) {
-		reiserfs_mark_inode_private(inode);
+	if (IS_PRIVATE(dir)) {
+		inode->i_flags |= S_PRIVATE;
 		goto apply_umask;
 	}
 
@@ -354,7 +392,8 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
 
 		/* Copy the default ACL to the default ACL of a new directory */
 		if (S_ISDIR(inode->i_mode)) {
-			err = reiserfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+			err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
+					       acl);
 			if (err)
 				goto cleanup;
 		}
@@ -375,9 +414,9 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
 
 			/* If we need an ACL.. */
 			if (need_acl > 0) {
-				err =
-				    reiserfs_set_acl(inode, ACL_TYPE_ACCESS,
-						     acl_copy);
+				err = reiserfs_set_acl(th, inode,
+						       ACL_TYPE_ACCESS,
+						       acl_copy);
 				if (err)
 					goto cleanup_copy;
 			}
@@ -389,31 +428,51 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
 	} else {
 	      apply_umask:
 		/* no ACL, apply umask */
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 	}
 
 	return err;
 }
 
-/* Looks up and caches the result of the default ACL.
- * We do this so that we don't need to carry the xattr_sem into
- * reiserfs_new_inode if we don't need to */
+/* This is used to cache the default acl before a new object is created.
+ * The biggest reason for this is to get an idea of how many blocks will
+ * actually be required for the create operation if we must inherit an ACL.
+ * An ACL write can add up to 3 object creations and an additional file write
+ * so we'd prefer not to reserve that many blocks in the journal if we can.
+ * It also has the advantage of not loading the ACL with a transaction open,
+ * this may seem silly, but if the owner of the directory is doing the
+ * creation, the ACL may not be loaded since the permissions wouldn't require
+ * it.
+ * We return the number of blocks required for the transaction.
+ */
 int reiserfs_cache_default_acl(struct inode *inode)
 {
-	int ret = 0;
-	if (reiserfs_posixacl(inode->i_sb) && !is_reiserfs_priv_object(inode)) {
-		struct posix_acl *acl;
-		reiserfs_read_lock_xattr_i(inode);
-		reiserfs_read_lock_xattrs(inode->i_sb);
-		acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
-		reiserfs_read_unlock_xattrs(inode->i_sb);
-		reiserfs_read_unlock_xattr_i(inode);
-		ret = (acl && !IS_ERR(acl));
-		if (ret)
-			posix_acl_release(acl);
+	struct posix_acl *acl;
+	int nblocks = 0;
+
+	if (IS_PRIVATE(inode))
+		return 0;
+
+	acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
+
+	if (acl && !IS_ERR(acl)) {
+		int size = reiserfs_acl_size(acl->a_count);
+
+		/* Other xattrs can be created during inode creation. We don't
+		 * want to claim too many blocks, so we check to see if we
+		 * we need to create the tree to the xattrs, and then we
+		 * just want two files. */
+		nblocks = reiserfs_xattr_jcreate_nblocks(inode);
+		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
+
+		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+
+		/* We need to account for writes + bitmaps for two files */
+		nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
+		posix_acl_release(acl);
 	}
 
-	return ret;
+	return nblocks;
 }
 
 int reiserfs_acl_chmod(struct inode *inode)
@@ -429,9 +488,7 @@ int reiserfs_acl_chmod(struct inode *inode)
 		return 0;
 	}
 
-	reiserfs_read_lock_xattrs(inode->i_sb);
 	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
-	reiserfs_read_unlock_xattrs(inode->i_sb);
 	if (!acl)
 		return 0;
 	if (IS_ERR(acl))
@@ -442,18 +499,20 @@ int reiserfs_acl_chmod(struct inode *inode)
 		return -ENOMEM;
 	error = posix_acl_chmod_masq(clone, inode->i_mode);
 	if (!error) {
-		int lock = !has_xattr_dir(inode);
-		reiserfs_write_lock_xattr_i(inode);
-		if (lock)
-			reiserfs_write_lock_xattrs(inode->i_sb);
-		else
-			reiserfs_read_lock_xattrs(inode->i_sb);
-		error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
-		if (lock)
-			reiserfs_write_unlock_xattrs(inode->i_sb);
-		else
-			reiserfs_read_unlock_xattrs(inode->i_sb);
-		reiserfs_write_unlock_xattr_i(inode);
+		struct reiserfs_transaction_handle th;
+		size_t size = reiserfs_xattr_nblocks(inode,
+					     reiserfs_acl_size(clone->a_count));
+		reiserfs_write_lock(inode->i_sb);
+		error = journal_begin(&th, inode->i_sb, size * 2);
+		if (!error) {
+			int error2;
+			error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS,
+						 clone);
+			error2 = journal_end(&th, inode->i_sb, size * 2);
+			if (error2)
+				error = error2;
+		}
+		reiserfs_write_unlock(inode->i_sb);
 	}
 	posix_acl_release(clone);
 	return error;
@@ -477,38 +536,22 @@ posix_acl_access_set(struct inode *inode, const char *name,
 	return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
 }
 
-static int posix_acl_access_del(struct inode *inode, const char *name)
+static size_t posix_acl_access_list(struct inode *inode, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len)
 {
-	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
-	struct posix_acl **acl = &reiserfs_i->i_acl_access;
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
-		return -EINVAL;
-	if (!IS_ERR(*acl) && *acl) {
-		posix_acl_release(*acl);
-		*acl = ERR_PTR(-ENODATA);
-	}
-
-	return 0;
-}
-
-static int
-posix_acl_access_list(struct inode *inode, const char *name, int namelen,
-		      char *out)
-{
-	int len = namelen;
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
 	if (!reiserfs_posixacl(inode->i_sb))
 		return 0;
-	if (out)
-		memcpy(out, name, len);
-
-	return len;
+	if (list && size <= list_size)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
 }
 
-struct reiserfs_xattr_handler posix_acl_access_handler = {
+struct xattr_handler reiserfs_posix_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.get = posix_acl_access_get,
 	.set = posix_acl_access_set,
-	.del = posix_acl_access_del,
 	.list = posix_acl_access_list,
 };
 
@@ -530,37 +573,21 @@ posix_acl_default_set(struct inode *inode, const char *name,
 	return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
 
-static int posix_acl_default_del(struct inode *inode, const char *name)
+static size_t posix_acl_default_list(struct inode *inode, char *list,
+				     size_t list_size, const char *name,
+				     size_t name_len)
 {
-	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
-	struct posix_acl **acl = &reiserfs_i->i_acl_default;
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
-		return -EINVAL;
-	if (!IS_ERR(*acl) && *acl) {
-		posix_acl_release(*acl);
-		*acl = ERR_PTR(-ENODATA);
-	}
-
-	return 0;
-}
-
-static int
-posix_acl_default_list(struct inode *inode, const char *name, int namelen,
-		       char *out)
-{
-	int len = namelen;
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
 	if (!reiserfs_posixacl(inode->i_sb))
 		return 0;
-	if (out)
-		memcpy(out, name, len);
-
-	return len;
+	if (list && size <= list_size)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
 }
 
-struct reiserfs_xattr_handler posix_acl_default_handler = {
+struct xattr_handler reiserfs_posix_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.get = posix_acl_default_get,
 	.set = posix_acl_default_set,
-	.del = posix_acl_default_del,
 	.list = posix_acl_default_list,
 };
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 056008db1377..4d3c20e787c3 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -4,6 +4,7 @@
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
 #include <linux/reiserfs_xattr.h>
+#include <linux/security.h>
 #include <asm/uaccess.h>
 
 static int
@@ -12,7 +13,7 @@ security_get(struct inode *inode, const char *name, void *buffer, size_t size)
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
 
-	if (is_reiserfs_priv_object(inode))
+	if (IS_PRIVATE(inode))
 		return -EPERM;
 
 	return reiserfs_xattr_get(inode, name, buffer, size);
@@ -25,41 +26,84 @@ security_set(struct inode *inode, const char *name, const void *buffer,
 	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
 		return -EINVAL;
 
-	if (is_reiserfs_priv_object(inode))
+	if (IS_PRIVATE(inode))
 		return -EPERM;
 
 	return reiserfs_xattr_set(inode, name, buffer, size, flags);
 }
 
-static int security_del(struct inode *inode, const char *name)
+static size_t security_list(struct inode *inode, char *list, size_t list_len,
+			    const char *name, size_t namelen)
 {
-	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
-		return -EINVAL;
+	const size_t len = namelen + 1;
 
-	if (is_reiserfs_priv_object(inode))
-		return -EPERM;
+	if (IS_PRIVATE(inode))
+		return 0;
+
+	if (list && len <= list_len) {
+		memcpy(list, name, namelen);
+		list[namelen] = '\0';
+	}
 
-	return 0;
+	return len;
 }
 
-static int
-security_list(struct inode *inode, const char *name, int namelen, char *out)
+/* Initializes the security context for a new inode and returns the number
+ * of blocks needed for the transaction. If successful, reiserfs_security
+ * must be released using reiserfs_security_free when the caller is done. */
+int reiserfs_security_init(struct inode *dir, struct inode *inode,
+			   struct reiserfs_security_handle *sec)
 {
-	int len = namelen;
+	int blocks = 0;
+	int error = security_inode_init_security(inode, dir, &sec->name,
+						 &sec->value, &sec->length);
+	if (error) {
+		if (error == -EOPNOTSUPP)
+			error = 0;
 
-	if (is_reiserfs_priv_object(inode))
-		return 0;
+		sec->name = NULL;
+		sec->value = NULL;
+		sec->length = 0;
+		return error;
+	}
 
-	if (out)
-		memcpy(out, name, len);
+	if (sec->length) {
+		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
+			 reiserfs_xattr_nblocks(inode, sec->length);
+		/* We don't want to count the directories twice if we have
+		 * a default ACL. */
+		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+	}
+	return blocks;
+}
 
-	return len;
+int reiserfs_security_write(struct reiserfs_transaction_handle *th,
+			    struct inode *inode,
+			    struct reiserfs_security_handle *sec)
+{
+	int error;
+	if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX))
+		return -EINVAL;
+
+	error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value,
+					  sec->length, XATTR_CREATE);
+	if (error == -ENODATA || error == -EOPNOTSUPP)
+		error = 0;
+
+	return error;
+}
+
+void reiserfs_security_free(struct reiserfs_security_handle *sec)
+{
+	kfree(sec->name);
+	kfree(sec->value);
+	sec->name = NULL;
+	sec->value = NULL;
 }
 
-struct reiserfs_xattr_handler security_handler = {
+struct xattr_handler reiserfs_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.get = security_get,
 	.set = security_set,
-	.del = security_del,
 	.list = security_list,
 };
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 60abe2bb1f98..a865042f75e2 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -13,10 +13,7 @@ trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
 
-	if (!reiserfs_xattrs(inode->i_sb))
-		return -EOPNOTSUPP;
-
-	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
 		return -EPERM;
 
 	return reiserfs_xattr_get(inode, name, buffer, size);
@@ -29,50 +26,30 @@ trusted_set(struct inode *inode, const char *name, const void *buffer,
 	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
 		return -EINVAL;
 
-	if (!reiserfs_xattrs(inode->i_sb))
-		return -EOPNOTSUPP;
-
-	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
 		return -EPERM;
 
 	return reiserfs_xattr_set(inode, name, buffer, size, flags);
 }
 
-static int trusted_del(struct inode *inode, const char *name)
+static size_t trusted_list(struct inode *inode, char *list, size_t list_size,
+			   const char *name, size_t name_len)
 {
-	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
-		return -EINVAL;
+	const size_t len = name_len + 1;
 
-	if (!reiserfs_xattrs(inode->i_sb))
-		return -EOPNOTSUPP;
-
-	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
-		return -EPERM;
-
-	return 0;
-}
-
-static int
-trusted_list(struct inode *inode, const char *name, int namelen, char *out)
-{
-	int len = namelen;
-
-	if (!reiserfs_xattrs(inode->i_sb))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
 		return 0;
 
-	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
-		return 0;
-
-	if (out)
-		memcpy(out, name, len);
-
+	if (list && len <= list_size) {
+		memcpy(list, name, name_len);
+		list[name_len] = '\0';
+	}
 	return len;
 }
 
-struct reiserfs_xattr_handler trusted_handler = {
+struct xattr_handler reiserfs_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.get = trusted_get,
 	.set = trusted_set,
-	.del = trusted_del,
 	.list = trusted_list,
 };
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 1384efcb938e..e3238dc4f3db 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -6,10 +6,6 @@
 #include <linux/reiserfs_xattr.h>
 #include <asm/uaccess.h>
 
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-# include <linux/reiserfs_acl.h>
-#endif
-
 static int
 user_get(struct inode *inode, const char *name, void *buffer, size_t size)
 {
@@ -25,7 +21,6 @@ static int
 user_set(struct inode *inode, const char *name, const void *buffer,
 	 size_t size, int flags)
 {
-
 	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
 		return -EINVAL;
 
@@ -34,33 +29,23 @@ user_set(struct inode *inode, const char *name, const void *buffer,
 	return reiserfs_xattr_set(inode, name, buffer, size, flags);
 }
 
-static int user_del(struct inode *inode, const char *name)
+static size_t user_list(struct inode *inode, char *list, size_t list_size,
+			const char *name, size_t name_len)
 {
-	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
-		return -EINVAL;
-
-	if (!reiserfs_xattrs_user(inode->i_sb))
-		return -EOPNOTSUPP;
-	return 0;
-}
+	const size_t len = name_len + 1;
 
-static int
-user_list(struct inode *inode, const char *name, int namelen, char *out)
-{
-	int len = namelen;
 	if (!reiserfs_xattrs_user(inode->i_sb))
 		return 0;
-
-	if (out)
-		memcpy(out, name, len);
-
+	if (list && len <= list_size) {
+		memcpy(list, name, name_len);
+		list[name_len] = '\0';
+	}
 	return len;
 }
 
-struct reiserfs_xattr_handler user_handler = {
+struct xattr_handler reiserfs_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.get = user_get,
 	.set = user_set,
-	.del = user_del,
 	.list = user_list,
 };
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
 config ROMFS_FS
 	tristate "ROM file system support"
-	depends on BLOCK
+	depends on BLOCK || MTD
 	---help---
 	  This is a very small read-only file system mainly intended for
 	  initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
 
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
+
+#
+# Select the backing stores to be supported
+#
+choice
+	prompt "RomFS backing stores"
+	depends on ROMFS_FS
+	default ROMFS_BACKED_BY_BLOCK
+	help
+	  Select the backing stores to be supported.
+
+config ROMFS_BACKED_BY_BLOCK
+	bool "Block device-backed ROM file system support"
+	depends on BLOCK
+	help
+	  This permits ROMFS to use block devices buffered through the page
+	  cache as the medium from which to retrieve data.  It does not allow
+	  direct mapping of the medium.
+
+	  If unsure, answer Y.
+
+config ROMFS_BACKED_BY_MTD
+	bool "MTD-backed ROM file system support"
+	depends on MTD=y || (ROMFS_FS=m && MTD)
+	help
+	  This permits ROMFS to use MTD based devices directly, without the
+	  intercession of the block layer (which may have been disabled).  It
+	  also allows direct mapping of MTD devices through romfs files under
+	  NOMMU conditions if the underlying device is directly addressable by
+	  the CPU.
+
+	  If unsure, answer Y.
+
+config ROMFS_BACKED_BY_BOTH
+	bool "Both the above"
+	depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
+endchoice
+
+
+config ROMFS_ON_BLOCK
+	bool
+	default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
+
+config ROMFS_ON_MTD
+	bool
+	default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
 #
-# Makefile for the linux romfs filesystem routines.
+# Makefile for the linux RomFS filesystem routines.
 #
 
 obj-$(CONFIG_ROMFS_FS) += romfs.o
 
-romfs-objs := inode.o
+romfs-y := storage.o super.o
+
+ifneq ($(CONFIG_MMU),y)
+romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
+endif
+
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
-/*
- * ROMFS file system, Linux implementation
- *
- * Copyright (C) 1997-1999  Janos Farkas <chexum@shadow.banki.hu>
- *
- * Using parts of the minix filesystem
- * Copyright (C) 1991, 1992  Linus Torvalds
- *
- * and parts of the affs filesystem additionally
- * Copyright (C) 1993  Ray Burr
- * Copyright (C) 1996  Hans-Joachim Widmaier
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes
- *					Changed for 2.1.19 modules
- *	Jan 1997			Initial release
- *	Jun 1997			2.1.43+ changes
- *					Proper page locking in readpage
- *					Changed to work with 2.1.45+ fs
- *	Jul 1997			Fixed follow_link
- *			2.1.47
- *					lookup shouldn't return -ENOENT
- *					from Horst von Brand:
- *					  fail on wrong checksum
- *					  double unlock_super was possible
- *					  correct namelen for statfs
- *					spotted by Bill Hawes:
- *					  readlink shouldn't iput()
- *	Jun 1998	2.1.106		from Avery Pennarun: glibc scandir()
- *					  exposed a problem in readdir
- *			2.1.107		code-freeze spellchecker run
- *	Aug 1998			2.1.118+ VFS changes
- *	Sep 1998	2.1.122		another VFS change (follow_link)
- *	Apr 1999	2.2.7		no more EBADF checking in
- *					  lookup/readdir, use ERR_PTR
- *	Jun 1999	2.3.6		d_alloc_root use changed
- *			2.3.9		clean up usage of ENOENT/negative
- *					  dentries in lookup
- *					clean up page flags setting
- *					  (error, uptodate, locking) in
- *					  in readpage
- *					use init_special_inode for
- *					  fifos/sockets (and streamline) in
- *					  read_inode, fix _ops table order
- *	Aug 1999	2.3.16		__initfunc() => __init change
- *	Oct 1999	2.3.24		page->owner hack obsoleted
- *	Nov 1999	2.3.27		2.3.25+ page->offset => index change
- */
-
-/* todo:
- *	- see Documentation/filesystems/romfs.txt
- *	- use allocated, not stack memory for file names?
- *	- considering write access...
- *	- network (tftp) files?
- *	- merge back some _op tables
- */
-
-/*
- * Sorry about some optimizations and for some goto's.  I just wanted
- * to squeeze some more bytes out of this code.. :)
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/romfs_fs.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-
-#include <asm/uaccess.h>
-
-struct romfs_inode_info {
-	unsigned long i_metasize;	/* size of non-data area */
-	unsigned long i_dataoffset;	/* from the start of fs */
-	struct inode vfs_inode;
-};
-
-static struct inode *romfs_iget(struct super_block *, unsigned long);
-
-/* instead of private superblock data */
-static inline unsigned long romfs_maxsize(struct super_block *sb)
-{
-	return (unsigned long)sb->s_fs_info;
-}
-
-static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
-{
-	return container_of(inode, struct romfs_inode_info, vfs_inode);
-}
-
-static __u32
-romfs_checksum(void *data, int size)
-{
-	__u32 sum;
-	__be32 *ptr;
-
-	sum = 0; ptr = data;
-	size>>=2;
-	while (size>0) {
-		sum += be32_to_cpu(*ptr++);
-		size--;
-	}
-	return sum;
-}
-
-static const struct super_operations romfs_ops;
-
-static int romfs_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct buffer_head *bh;
-	struct romfs_super_block *rsb;
-	struct inode *root;
-	int sz, ret = -EINVAL;
-
-	/* I would parse the options here, but there are none.. :) */
-
-	sb_set_blocksize(s, ROMBSIZE);
-	s->s_maxbytes = 0xFFFFFFFF;
-
-	bh = sb_bread(s, 0);
-	if (!bh) {
-		/* XXX merge with other printk? */
-                printk ("romfs: unable to read superblock\n");
-		goto outnobh;
-	}
-
-	rsb = (struct romfs_super_block *)bh->b_data;
-	sz = be32_to_cpu(rsb->size);
-	if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
-	   || sz < ROMFH_SIZE) {
-		if (!silent)
-			printk ("VFS: Can't find a romfs filesystem on dev "
-				"%s.\n", s->s_id);
-		goto out;
-	}
-	if (romfs_checksum(rsb, min_t(int, sz, 512))) {
-		printk ("romfs: bad initial checksum on dev "
-			"%s.\n", s->s_id);
-		goto out;
-	}
-
-	s->s_magic = ROMFS_MAGIC;
-	s->s_fs_info = (void *)(long)sz;
-
-	s->s_flags |= MS_RDONLY;
-
-	/* Find the start of the fs */
-	sz = (ROMFH_SIZE +
-	      strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
-	     & ROMFH_MASK;
-
-	s->s_op	= &romfs_ops;
-	root = romfs_iget(s, sz);
-	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		goto out;
-	}
-
-	ret = -ENOMEM;
-	s->s_root = d_alloc_root(root);
-	if (!s->s_root)
-		goto outiput;
-
-	brelse(bh);
-	return 0;
-
-outiput:
-	iput(root);
-out:
-	brelse(bh);
-outnobh:
-	return ret;
-}
-
-/* That's simple too. */
-
-static int
-romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	buf->f_type = ROMFS_MAGIC;
-	buf->f_bsize = ROMBSIZE;
-	buf->f_bfree = buf->f_bavail = buf->f_ffree;
-	buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
-	buf->f_namelen = ROMFS_MAXFN;
-	return 0;
-}
-
-/* some helper routines */
-
-static int
-romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
-{
-	struct buffer_head *bh;
-	unsigned long avail, maxsize, res;
-
-	maxsize = romfs_maxsize(i->i_sb);
-	if (offset >= maxsize)
-		return -1;
-
-	/* strnlen is almost always valid */
-	if (count > maxsize || offset+count > maxsize)
-		count = maxsize-offset;
-
-	bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-	if (!bh)
-		return -1;		/* error */
-
-	avail = ROMBSIZE - (offset & ROMBMASK);
-	maxsize = min_t(unsigned long, count, avail);
-	res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
-	brelse(bh);
-
-	if (res < maxsize)
-		return res;		/* found all of it */
-
-	while (res < count) {
-		offset += maxsize;
-
-		bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-		if (!bh)
-			return -1;
-		maxsize = min_t(unsigned long, count - res, ROMBSIZE);
-		avail = strnlen(bh->b_data, maxsize);
-		res += avail;
-		brelse(bh);
-		if (avail < maxsize)
-			return res;
-	}
-	return res;
-}
-
-static int
-romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
-{
-	struct buffer_head *bh;
-	unsigned long avail, maxsize, res;
-
-	maxsize = romfs_maxsize(i->i_sb);
-	if (offset >= maxsize || count > maxsize || offset+count>maxsize)
-		return -1;
-
-	bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-	if (!bh)
-		return -1;		/* error */
-
-	avail = ROMBSIZE - (offset & ROMBMASK);
-	maxsize = min_t(unsigned long, count, avail);
-	memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
-	brelse(bh);
-
-	res = maxsize;			/* all of it */
-
-	while (res < count) {
-		offset += maxsize;
-		dest += maxsize;
-
-		bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-		if (!bh)
-			return -1;
-		maxsize = min_t(unsigned long, count - res, ROMBSIZE);
-		memcpy(dest, bh->b_data, maxsize);
-		brelse(bh);
-		res += maxsize;
-	}
-	return res;
-}
-
-static unsigned char romfs_dtype_table[] = {
-	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
-};
-
-static int
-romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct inode *i = filp->f_path.dentry->d_inode;
-	struct romfs_inode ri;
-	unsigned long offset, maxoff;
-	int j, ino, nextfh;
-	int stored = 0;
-	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
-
-	lock_kernel();
-
-	maxoff = romfs_maxsize(i->i_sb);
-
-	offset = filp->f_pos;
-	if (!offset) {
-		offset = i->i_ino & ROMFH_MASK;
-		if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
-			goto out;
-		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-	}
-
-	/* Not really failsafe, but we are read-only... */
-	for(;;) {
-		if (!offset || offset >= maxoff) {
-			offset = maxoff;
-			filp->f_pos = offset;
-			goto out;
-		}
-		filp->f_pos = offset;
-
-		/* Fetch inode info */
-		if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
-			goto out;
-
-		j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
-		if (j < 0)
-			goto out;
-
-		fsname[j]=0;
-		romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
-
-		ino = offset;
-		nextfh = be32_to_cpu(ri.next);
-		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
-			ino = be32_to_cpu(ri.spec);
-		if (filldir(dirent, fsname, j, offset, ino,
-			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
-			goto out;
-		}
-		stored++;
-		offset = nextfh & ROMFH_MASK;
-	}
-out:
-	unlock_kernel();
-	return stored;
-}
-
-static struct dentry *
-romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-	unsigned long offset, maxoff;
-	long res;
-	int fslen;
-	struct inode *inode = NULL;
-	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
-	struct romfs_inode ri;
-	const char *name;		/* got from dentry */
-	int len;
-
-	res = -EACCES;			/* placeholder for "no data here" */
-	offset = dir->i_ino & ROMFH_MASK;
-	lock_kernel();
-	if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
-		goto error;
-
-	maxoff = romfs_maxsize(dir->i_sb);
-	offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
-	/* OK, now find the file whose name is in "dentry" in the
-	 * directory specified by "dir".  */
-
-	name = dentry->d_name.name;
-	len = dentry->d_name.len;
-
-	for(;;) {
-		if (!offset || offset >= maxoff)
-			goto success; /* negative success */
-		if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
-			goto error;
-
-		/* try to match the first 16 bytes of name */
-		fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
-		if (len < ROMFH_SIZE) {
-			if (len == fslen) {
-				/* both are shorter, and same size */
-				romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
-				if (strncmp (name, fsname, len) == 0)
-					break;
-			}
-		} else if (fslen >= ROMFH_SIZE) {
-			/* both are longer; XXX optimize max size */
-			fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
-			if (len == fslen) {
-				romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
-				if (strncmp(name, fsname, len) == 0)
-					break;
-			}
-		}
-		/* next entry */
-		offset = be32_to_cpu(ri.next) & ROMFH_MASK;
-	}
-
-	/* Hard link handling */
-	if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
-		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
-	inode = romfs_iget(dir->i_sb, offset);
-	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto error;
-	}
-
-success:
-	d_add(dentry, inode);
-	res = 0;
-error:
-	unlock_kernel();
-	return ERR_PTR(res);
-}
-
-/*
- * Ok, we do readpage, to be able to execute programs.  Unfortunately,
- * we can't use bmap, since we may have looser alignments.
- */
-
-static int
-romfs_readpage(struct file *file, struct page * page)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t offset, size;
-	unsigned long filled;
-	void *buf;
-	int result = -EIO;
-
-	page_cache_get(page);
-	lock_kernel();
-	buf = kmap(page);
-	if (!buf)
-		goto err_out;
-
-	/* 32 bit warning -- but not for us :) */
-	offset = page_offset(page);
-	size = i_size_read(inode);
-	filled = 0;
-	result = 0;
-	if (offset < size) {
-		unsigned long readlen;
-
-		size -= offset;
-		readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
-
-		filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
-
-		if (filled != readlen) {
-			SetPageError(page);
-			filled = 0;
-			result = -EIO;
-		}
-	}
-
-	if (filled < PAGE_SIZE)
-		memset(buf + filled, 0, PAGE_SIZE-filled);
-
-	if (!result)
-		SetPageUptodate(page);
-	flush_dcache_page(page);
-
-	unlock_page(page);
-
-	kunmap(page);
-err_out:
-	page_cache_release(page);
-	unlock_kernel();
-
-	return result;
-}
-
-/* Mapping from our types to the kernel */
-
-static const struct address_space_operations romfs_aops = {
-	.readpage = romfs_readpage
-};
-
-static const struct file_operations romfs_dir_operations = {
-	.read		= generic_read_dir,
-	.readdir	= romfs_readdir,
-};
-
-static const struct inode_operations romfs_dir_inode_operations = {
-	.lookup		= romfs_lookup,
-};
-
-static mode_t romfs_modemap[] =
-{
-	0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
-	S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
-};
-
-static struct inode *
-romfs_iget(struct super_block *sb, unsigned long ino)
-{
-	int nextfh, ret;
-	struct romfs_inode ri;
-	struct inode *i;
-
-	ino &= ROMFH_MASK;
-	i = iget_locked(sb, ino);
-	if (!i)
-		return ERR_PTR(-ENOMEM);
-	if (!(i->i_state & I_NEW))
-		return i;
-
-	i->i_mode = 0;
-
-	/* Loop for finding the real hard link */
-	for(;;) {
-		if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
-			printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
-				ino);
-			iget_failed(i);
-			return ERR_PTR(-EIO);
-		}
-		/* XXX: do romfs_checksum here too (with name) */
-
-		nextfh = be32_to_cpu(ri.next);
-		if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
-			break;
-
-		ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
-	}
-
-	i->i_nlink = 1;		/* Hard to decide.. */
-	i->i_size = be32_to_cpu(ri.size);
-	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
-	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-
-        /* Precalculate the data offset */
-	ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
-	if (ret >= 0)
-		ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
-	else
-		ino = 0;
-
-        ROMFS_I(i)->i_metasize = ino;
-        ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
-
-        /* Compute permissions */
-        ino = romfs_modemap[nextfh & ROMFH_TYPE];
-	/* only "normal" files have ops */
-	switch (nextfh & ROMFH_TYPE) {
-		case 1:
-			i->i_size = ROMFS_I(i)->i_metasize;
-			i->i_op = &romfs_dir_inode_operations;
-			i->i_fop = &romfs_dir_operations;
-			if (nextfh & ROMFH_EXEC)
-				ino |= S_IXUGO;
-			i->i_mode = ino;
-			break;
-		case 2:
-			i->i_fop = &generic_ro_fops;
-			i->i_data.a_ops = &romfs_aops;
-			if (nextfh & ROMFH_EXEC)
-				ino |= S_IXUGO;
-			i->i_mode = ino;
-			break;
-		case 3:
-			i->i_op = &page_symlink_inode_operations;
-			i->i_data.a_ops = &romfs_aops;
-			i->i_mode = ino | S_IRWXUGO;
-			break;
-		default:
-			/* depending on MBZ for sock/fifos */
-			nextfh = be32_to_cpu(ri.spec);
-			init_special_inode(i, ino,
-					MKDEV(nextfh>>16,nextfh&0xffff));
-	}
-	unlock_new_inode(i);
-	return i;
-}
-
-static struct kmem_cache * romfs_inode_cachep;
-
-static struct inode *romfs_alloc_inode(struct super_block *sb)
-{
-	struct romfs_inode_info *ei;
-	ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
-	if (!ei)
-		return NULL;
-	return &ei->vfs_inode;
-}
-
-static void romfs_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
-}
-
-static void init_once(void *foo)
-{
-	struct romfs_inode_info *ei = foo;
-
-	inode_init_once(&ei->vfs_inode);
-}
-
-static int init_inodecache(void)
-{
-	romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
-					     sizeof(struct romfs_inode_info),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     init_once);
-	if (romfs_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-static void destroy_inodecache(void)
-{
-	kmem_cache_destroy(romfs_inode_cachep);
-}
-
-static int romfs_remount(struct super_block *sb, int *flags, char *data)
-{
-	*flags |= MS_RDONLY;
-	return 0;
-}
-
-static const struct super_operations romfs_ops = {
-	.alloc_inode	= romfs_alloc_inode,
-	.destroy_inode	= romfs_destroy_inode,
-	.statfs		= romfs_statfs,
-	.remount_fs	= romfs_remount,
-};
-
-static int romfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
-			   mnt);
-}
-
-static struct file_system_type romfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "romfs",
-	.get_sb		= romfs_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_romfs_fs(void)
-{
-	int err = init_inodecache();
-	if (err)
-		goto out1;
-        err = register_filesystem(&romfs_fs_type);
-	if (err)
-		goto out;
-	return 0;
-out:
-	destroy_inodecache();
-out1:
-	return err;
-}
-
-static void __exit exit_romfs_fs(void)
-{
-	unregister_filesystem(&romfs_fs_type);
-	destroy_inodecache();
-}
-
-/* Yes, works even as a module... :) */
-
-module_init(init_romfs_fs)
-module_exit(exit_romfs_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..06044a9dc62d
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
+/* RomFS internal definitions
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/romfs_fs.h>
+
+struct romfs_inode_info {
+	struct inode	vfs_inode;
+	unsigned long	i_metasize;	/* size of non-data area */
+	unsigned long	i_dataoffset;	/* from the start of fs */
+};
+
+static inline size_t romfs_maxsize(struct super_block *sb)
+{
+	return (size_t) (unsigned long) sb->s_fs_info;
+}
+
+static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
+{
+	return container_of(inode, struct romfs_inode_info, vfs_inode);
+}
+
+/*
+ * mmap-nommu.c
+ */
+#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
+extern const struct file_operations romfs_ro_fops;
+#else
+#define romfs_ro_fops	generic_ro_fops
+#endif
+
+/*
+ * storage.c
+ */
+extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen);
+extern ssize_t romfs_dev_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t maxlen);
+extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
+			     const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
+/* NOMMU mmap support for RomFS on MTD devices
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/mtd/super.h>
+#include "internal.h"
+
+/*
+ * try to determine where a shared mapping can be made
+ * - only supported for NOMMU at the moment (MMU can't doesn't copy private
+ *   mappings)
+ * - attempts to map through to the underlying MTD device
+ */
+static unsigned long romfs_get_unmapped_area(struct file *file,
+					     unsigned long addr,
+					     unsigned long len,
+					     unsigned long pgoff,
+					     unsigned long flags)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct mtd_info *mtd = inode->i_sb->s_mtd;
+	unsigned long isize, offset;
+
+	if (!mtd)
+		goto cant_map_directly;
+
+	isize = i_size_read(inode);
+	offset = pgoff << PAGE_SHIFT;
+	if (offset > isize || len > isize || offset > isize - len)
+		return (unsigned long) -EINVAL;
+
+	/* we need to call down to the MTD layer to do the actual mapping */
+	if (mtd->get_unmapped_area) {
+		if (addr != 0)
+			return (unsigned long) -EINVAL;
+
+		if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
+			return (unsigned long) -EINVAL;
+
+		offset += ROMFS_I(inode)->i_dataoffset;
+		if (offset > mtd->size - len)
+			return (unsigned long) -EINVAL;
+
+		return mtd->get_unmapped_area(mtd, len, offset, flags);
+	}
+
+cant_map_directly:
+	return (unsigned long) -ENOSYS;
+}
+
+/*
+ * permit a R/O mapping to be made directly through onto an MTD device if
+ * possible
+ */
+static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
+}
+
+const struct file_operations romfs_ro_fops = {
+	.llseek			= generic_file_llseek,
+	.read			= do_sync_read,
+	.aio_read		= generic_file_aio_read,
+	.splice_read		= generic_file_splice_read,
+	.mmap			= romfs_mmap,
+	.get_unmapped_area	= romfs_get_unmapped_area,
+};
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..7e3e1e12a081
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
+/* RomFS storage access routines
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/mtd/super.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
+#error no ROMFS backing store interface configured
+#endif
+
+#ifdef CONFIG_ROMFS_ON_MTD
+#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
+
+/*
+ * read data from an romfs image on an MTD device
+ */
+static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	size_t rlen;
+	int ret;
+
+	ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
+	return (ret < 0 || rlen != buflen) ? -EIO : 0;
+}
+
+/*
+ * determine the length of a string in a romfs image on an MTD device
+ */
+static ssize_t romfs_mtd_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t maxlen)
+{
+	ssize_t n = 0;
+	size_t segment;
+	u_char buf[16], *p;
+	size_t len;
+	int ret;
+
+	/* scan the string up to 16 bytes at a time */
+	while (maxlen > 0) {
+		segment = min_t(size_t, maxlen, 16);
+		ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+		if (ret < 0)
+			return ret;
+		p = memchr(buf, 0, len);
+		if (p)
+			return n + (p - buf);
+		maxlen -= len;
+		pos += len;
+		n += len;
+	}
+
+	return n;
+}
+
+/*
+ * compare a string to one in a romfs image on MTD
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
+			     const char *str, size_t size)
+{
+	u_char buf[16];
+	size_t len, segment;
+	int ret;
+
+	/* scan the string up to 16 bytes at a time */
+	while (size > 0) {
+		segment = min_t(size_t, size, 16);
+		ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+		if (ret < 0)
+			return ret;
+		if (memcmp(buf, str, len) != 0)
+			return 0;
+		size -= len;
+		pos += len;
+		str += len;
+	}
+
+	return 1;
+}
+#endif /* CONFIG_ROMFS_ON_MTD */
+
+#ifdef CONFIG_ROMFS_ON_BLOCK
+/*
+ * read data from an romfs image on a block device
+ */
+static int romfs_blk_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	size_t segment;
+
+	/* copy the string up to blocksize bytes at a time */
+	while (buflen > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, buflen, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		memcpy(buf, bh->b_data + offset, segment);
+		brelse(bh);
+		buflen -= segment;
+		pos += segment;
+	}
+
+	return 0;
+}
+
+/*
+ * determine the length of a string in romfs on a block device
+ */
+static ssize_t romfs_blk_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t limit)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	ssize_t n = 0;
+	size_t segment;
+	u_char *buf, *p;
+
+	/* scan the string up to blocksize bytes at a time */
+	while (limit > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, limit, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		buf = bh->b_data + offset;
+		p = memchr(buf, 0, segment);
+		brelse(bh);
+		if (p)
+			return n + (p - buf);
+		limit -= segment;
+		pos += segment;
+		n += segment;
+	}
+
+	return n;
+}
+
+/*
+ * compare a string to one in a romfs image on a block device
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
+			     const char *str, size_t size)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	size_t segment;
+	bool x;
+
+	/* scan the string up to 16 bytes at a time */
+	while (size > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, size, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		x = (memcmp(bh->b_data + offset, str, segment) != 0);
+		brelse(bh);
+		if (x)
+			return 0;
+		size -= segment;
+		pos += segment;
+		str += segment;
+	}
+
+	return 1;
+}
+#endif /* CONFIG_ROMFS_ON_BLOCK */
+
+/*
+ * read data from the romfs image
+ */
+int romfs_dev_read(struct super_block *sb, unsigned long pos,
+		   void *buf, size_t buflen)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (buflen > limit - pos)
+		buflen = limit - pos;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_read(sb, pos, buf, buflen);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_read(sb, pos, buf, buflen);
+#endif
+	return -EIO;
+}
+
+/*
+ * determine the length of a string in romfs
+ */
+ssize_t romfs_dev_strnlen(struct super_block *sb,
+			  unsigned long pos, size_t maxlen)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (maxlen > limit - pos)
+		maxlen = limit - pos;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_strnlen(sb, pos, limit);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_strnlen(sb, pos, limit);
+#endif
+	return -EIO;
+}
+
+/*
+ * compare a string to one in romfs
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
+		      const char *str, size_t size)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (size > ROMFS_MAXFN)
+		return -ENAMETOOLONG;
+	if (size > limit - pos)
+		return -EIO;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_strncmp(sb, pos, str, size);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_strncmp(sb, pos, str, size);
+#endif
+	return -EIO;
+}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..1e548a4975ba
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,648 @@
+/* Block- or MTD-based romfs
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * Derived from: ROMFS file system, Linux implementation
+ *
+ * Copyright © 1997-1999  Janos Farkas <chexum@shadow.banki.hu>
+ *
+ * Using parts of the minix filesystem
+ * Copyright © 1991, 1992  Linus Torvalds
+ *
+ * and parts of the affs filesystem additionally
+ * Copyright © 1993  Ray Burr
+ * Copyright © 1996  Hans-Joachim Widmaier
+ *
+ * Changes
+ *					Changed for 2.1.19 modules
+ *	Jan 1997			Initial release
+ *	Jun 1997			2.1.43+ changes
+ *					Proper page locking in readpage
+ *					Changed to work with 2.1.45+ fs
+ *	Jul 1997			Fixed follow_link
+ *			2.1.47
+ *					lookup shouldn't return -ENOENT
+ *					from Horst von Brand:
+ *					  fail on wrong checksum
+ *					  double unlock_super was possible
+ *					  correct namelen for statfs
+ *					spotted by Bill Hawes:
+ *					  readlink shouldn't iput()
+ *	Jun 1998	2.1.106		from Avery Pennarun: glibc scandir()
+ *					  exposed a problem in readdir
+ *			2.1.107		code-freeze spellchecker run
+ *	Aug 1998			2.1.118+ VFS changes
+ *	Sep 1998	2.1.122		another VFS change (follow_link)
+ *	Apr 1999	2.2.7		no more EBADF checking in
+ *					  lookup/readdir, use ERR_PTR
+ *	Jun 1999	2.3.6		d_alloc_root use changed
+ *			2.3.9		clean up usage of ENOENT/negative
+ *					  dentries in lookup
+ *					clean up page flags setting
+ *					  (error, uptodate, locking) in
+ *					  in readpage
+ *					use init_special_inode for
+ *					  fifos/sockets (and streamline) in
+ *					  read_inode, fix _ops table order
+ *	Aug 1999	2.3.16		__initfunc() => __init change
+ *	Oct 1999	2.3.24		page->owner hack obsoleted
+ *	Nov 1999	2.3.27		2.3.25+ page->offset => index change
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/mtd/super.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include "internal.h"
+
+static struct kmem_cache *romfs_inode_cachep;
+
+static const umode_t romfs_modemap[8] = {
+	0,			/* hard link */
+	S_IFDIR  | 0644,	/* directory */
+	S_IFREG  | 0644,	/* regular file */
+	S_IFLNK  | 0777,	/* symlink */
+	S_IFBLK  | 0600,	/* blockdev */
+	S_IFCHR  | 0600,	/* chardev */
+	S_IFSOCK | 0644,	/* socket */
+	S_IFIFO  | 0644		/* FIFO */
+};
+
+static const unsigned char romfs_dtype_table[] = {
+	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
+};
+
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
+
+/*
+ * read a page worth of data from the image
+ */
+static int romfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t offset, size;
+	unsigned long fillsize, pos;
+	void *buf;
+	int ret;
+
+	buf = kmap(page);
+	if (!buf)
+		return -ENOMEM;
+
+	/* 32 bit warning -- but not for us :) */
+	offset = page_offset(page);
+	size = i_size_read(inode);
+	fillsize = 0;
+	ret = 0;
+	if (offset < size) {
+		size -= offset;
+		fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+		pos = ROMFS_I(inode)->i_dataoffset + offset;
+
+		ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
+		if (ret < 0) {
+			SetPageError(page);
+			fillsize = 0;
+			ret = -EIO;
+		}
+	}
+
+	if (fillsize < PAGE_SIZE)
+		memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
+	if (ret == 0)
+		SetPageUptodate(page);
+
+	flush_dcache_page(page);
+	kunmap(page);
+	unlock_page(page);
+	return ret;
+}
+
+static const struct address_space_operations romfs_aops = {
+	.readpage	= romfs_readpage
+};
+
+/*
+ * read the entries from a directory
+ */
+static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *i = filp->f_dentry->d_inode;
+	struct romfs_inode ri;
+	unsigned long offset, maxoff;
+	int j, ino, nextfh;
+	int stored = 0;
+	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
+	int ret;
+
+	maxoff = romfs_maxsize(i->i_sb);
+
+	offset = filp->f_pos;
+	if (!offset) {
+		offset = i->i_ino & ROMFH_MASK;
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* Not really failsafe, but we are read-only... */
+	for (;;) {
+		if (!offset || offset >= maxoff) {
+			offset = maxoff;
+			filp->f_pos = offset;
+			goto out;
+		}
+		filp->f_pos = offset;
+
+		/* Fetch inode info */
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+
+		j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
+				      sizeof(fsname) - 1);
+		if (j < 0)
+			goto out;
+
+		ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
+		if (ret < 0)
+			goto out;
+		fsname[j] = '\0';
+
+		ino = offset;
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
+			ino = be32_to_cpu(ri.spec);
+		if (filldir(dirent, fsname, j, offset, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+			goto out;
+
+		stored++;
+		offset = nextfh & ROMFH_MASK;
+	}
+
+out:
+	return stored;
+}
+
+/*
+ * look up an entry in a directory
+ */
+static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	unsigned long offset, maxoff;
+	struct inode *inode;
+	struct romfs_inode ri;
+	const char *name;		/* got from dentry */
+	int len, ret;
+
+	offset = dir->i_ino & ROMFH_MASK;
+	ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
+	if (ret < 0)
+		goto error;
+
+	/* search all the file entries in the list starting from the one
+	 * pointed to by the directory's special data */
+	maxoff = romfs_maxsize(dir->i_sb);
+	offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	name = dentry->d_name.name;
+	len = dentry->d_name.len;
+
+	for (;;) {
+		if (!offset || offset >= maxoff)
+			goto out0;
+
+		ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* try to match the first 16 bytes of name */
+		ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
+					len);
+		if (ret < 0)
+			goto error;
+		if (ret == 1)
+			break;
+
+		/* next entry */
+		offset = be32_to_cpu(ri.next) & ROMFH_MASK;
+	}
+
+	/* Hard link handling */
+	if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	inode = romfs_iget(dir->i_sb, offset);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto error;
+	}
+	goto outi;
+
+	/*
+	 * it's a bit funky, _lookup needs to return an error code
+	 * (negative) or a NULL, both as a dentry.  ENOENT should not
+	 * be returned, instead we need to create a negative dentry by
+	 * d_add(dentry, NULL); and return 0 as no error.
+	 * (Although as I see, it only matters on writable file
+	 * systems).
+	 */
+out0:
+	inode = NULL;
+outi:
+	d_add(dentry, inode);
+	ret = 0;
+error:
+	return ERR_PTR(ret);
+}
+
+static const struct file_operations romfs_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= romfs_readdir,
+};
+
+static struct inode_operations romfs_dir_inode_operations = {
+	.lookup		= romfs_lookup,
+};
+
+/*
+ * get a romfs inode based on its position in the image (which doubles as the
+ * inode number)
+ */
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
+{
+	struct romfs_inode_info *inode;
+	struct romfs_inode ri;
+	struct inode *i;
+	unsigned long nlen;
+	unsigned nextfh, ret;
+	umode_t mode;
+
+	/* we might have to traverse a chain of "hard link" file entries to get
+	 * to the actual file */
+	for (;;) {
+		ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* XXX: do romfs_checksum here too (with name) */
+
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
+			break;
+
+		pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* determine the length of the filename */
+	nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
+	if (IS_ERR_VALUE(nlen))
+		goto eio;
+
+	/* get an inode for this image position */
+	i = iget_locked(sb, pos);
+	if (!i)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(i->i_state & I_NEW))
+		return i;
+
+	/* precalculate the data offset */
+	inode = ROMFS_I(i);
+	inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
+	inode->i_dataoffset = pos + inode->i_metasize;
+
+	i->i_nlink = 1;		/* Hard to decide.. */
+	i->i_size = be32_to_cpu(ri.size);
+	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
+	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
+
+	/* set up mode and ops */
+	mode = romfs_modemap[nextfh & ROMFH_TYPE];
+
+	switch (nextfh & ROMFH_TYPE) {
+	case ROMFH_DIR:
+		i->i_size = ROMFS_I(i)->i_metasize;
+		i->i_op = &romfs_dir_inode_operations;
+		i->i_fop = &romfs_dir_operations;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_REG:
+		i->i_fop = &romfs_ro_fops;
+		i->i_data.a_ops = &romfs_aops;
+		if (i->i_sb->s_mtd)
+			i->i_data.backing_dev_info =
+				i->i_sb->s_mtd->backing_dev_info;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_SYM:
+		i->i_op = &page_symlink_inode_operations;
+		i->i_data.a_ops = &romfs_aops;
+		mode |= S_IRWXUGO;
+		break;
+	default:
+		/* depending on MBZ for sock/fifos */
+		nextfh = be32_to_cpu(ri.spec);
+		init_special_inode(i, mode, MKDEV(nextfh >> 16,
+						  nextfh & 0xffff));
+		break;
+	}
+
+	i->i_mode = mode;
+
+	unlock_new_inode(i);
+	return i;
+
+eio:
+	ret = -EIO;
+error:
+	printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
+	return ERR_PTR(ret);
+}
+
+/*
+ * allocate a new inode
+ */
+static struct inode *romfs_alloc_inode(struct super_block *sb)
+{
+	struct romfs_inode_info *inode;
+	inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
+	return inode ? &inode->vfs_inode : NULL;
+}
+
+/*
+ * return a spent inode to the slab cache
+ */
+static void romfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
+}
+
+/*
+ * get filesystem statistics
+ */
+static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	buf->f_type = ROMFS_MAGIC;
+	buf->f_namelen = ROMFS_MAXFN;
+	buf->f_bsize = ROMBSIZE;
+	buf->f_bfree = buf->f_bavail = buf->f_ffree;
+	buf->f_blocks =
+		(romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
+	return 0;
+}
+
+/*
+ * remounting must involve read-only
+ */
+static int romfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+static const struct super_operations romfs_super_ops = {
+	.alloc_inode	= romfs_alloc_inode,
+	.destroy_inode	= romfs_destroy_inode,
+	.statfs		= romfs_statfs,
+	.remount_fs	= romfs_remount,
+};
+
+/*
+ * checksum check on part of a romfs filesystem
+ */
+static __u32 romfs_checksum(const void *data, int size)
+{
+	const __be32 *ptr = data;
+	__u32 sum;
+
+	sum = 0;
+	size >>= 2;
+	while (size > 0) {
+		sum += be32_to_cpu(*ptr++);
+		size--;
+	}
+	return sum;
+}
+
+/*
+ * fill in the superblock
+ */
+static int romfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct romfs_super_block *rsb;
+	struct inode *root;
+	unsigned long pos, img_size;
+	const char *storage;
+	size_t len;
+	int ret;
+
+#ifdef CONFIG_BLOCK
+	if (!sb->s_mtd) {
+		sb_set_blocksize(sb, ROMBSIZE);
+	} else {
+		sb->s_blocksize = ROMBSIZE;
+		sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
+	}
+#endif
+
+	sb->s_maxbytes = 0xFFFFFFFF;
+	sb->s_magic = ROMFS_MAGIC;
+	sb->s_flags |= MS_RDONLY | MS_NOATIME;
+	sb->s_op = &romfs_super_ops;
+
+	/* read the image superblock and check it */
+	rsb = kmalloc(512, GFP_KERNEL);
+	if (!rsb)
+		return -ENOMEM;
+
+	sb->s_fs_info = (void *) 512;
+	ret = romfs_dev_read(sb, 0, rsb, 512);
+	if (ret < 0)
+		goto error_rsb;
+
+	img_size = be32_to_cpu(rsb->size);
+
+	if (sb->s_mtd && img_size > sb->s_mtd->size)
+		goto error_rsb_inval;
+
+	sb->s_fs_info = (void *) img_size;
+
+	if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
+	    img_size < ROMFH_SIZE) {
+		if (!silent)
+			printk(KERN_WARNING "VFS:"
+			       " Can't find a romfs filesystem on dev %s.\n",
+			       sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
+		printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
+		       sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	storage = sb->s_mtd ? "MTD" : "the block layer";
+
+	len = strnlen(rsb->name, ROMFS_MAXFN);
+	if (!silent)
+		printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
+		       (unsigned) len, (unsigned) len, rsb->name, storage);
+
+	kfree(rsb);
+	rsb = NULL;
+
+	/* find the root directory */
+	pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
+
+	root = romfs_iget(sb, pos);
+	if (!root)
+		goto error;
+
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root)
+		goto error_i;
+
+	return 0;
+
+error_i:
+	iput(root);
+error:
+	return -EINVAL;
+error_rsb_inval:
+	ret = -EINVAL;
+error_rsb:
+	return ret;
+}
+
+/*
+ * get a superblock for mounting
+ */
+static int romfs_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
+{
+	int ret = -EINVAL;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
+			 mnt);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (ret == -EINVAL)
+		ret = get_sb_bdev(fs_type, flags, dev_name, data,
+				  romfs_fill_super, mnt);
+#endif
+	return ret;
+}
+
+/*
+ * destroy a romfs superblock in the appropriate manner
+ */
+static void romfs_kill_sb(struct super_block *sb)
+{
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd) {
+		kill_mtd_super(sb);
+		return;
+	}
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev) {
+		kill_block_super(sb);
+		return;
+	}
+#endif
+}
+
+static struct file_system_type romfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "romfs",
+	.get_sb		= romfs_get_sb,
+	.kill_sb	= romfs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+/*
+ * inode storage initialiser
+ */
+static void romfs_i_init_once(void *_inode)
+{
+	struct romfs_inode_info *inode = _inode;
+
+	inode_init_once(&inode->vfs_inode);
+}
+
+/*
+ * romfs module initialisation
+ */
+static int __init init_romfs_fs(void)
+{
+	int ret;
+
+	printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
+
+	romfs_inode_cachep =
+		kmem_cache_create("romfs_i",
+				  sizeof(struct romfs_inode_info), 0,
+				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				  romfs_i_init_once);
+
+	if (!romfs_inode_cachep) {
+		printk(KERN_ERR
+		       "ROMFS error: Failed to initialise inode cache\n");
+		return -ENOMEM;
+	}
+	ret = register_filesystem(&romfs_fs_type);
+	if (ret) {
+		printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
+		goto error_register;
+	}
+	return 0;
+
+error_register:
+	kmem_cache_destroy(romfs_inode_cachep);
+	return ret;
+}
+
+/*
+ * romfs module removal
+ */
+static void __exit exit_romfs_fs(void)
+{
+	unregister_filesystem(&romfs_fs_type);
+	kmem_cache_destroy(romfs_inode_cachep);
+}
+
+module_init(init_romfs_fs);
+module_exit(exit_romfs_fs);
+
+MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5267098532bf..7f40f30c55c5 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op)
 	 */
 	file->f_version = 0;
 
-	/* SEQ files support lseek, but not pread/pwrite */
-	file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
+	/*
+	 * seq_files support lseek() and pread().  They do not implement
+	 * write() at all, but we clear FMODE_PWRITE here for historical
+	 * reasons.
+	 *
+	 * If a client of seq_files a) implements file.write() and b) wishes to
+	 * support pwrite() then that client will need to implement its own
+	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+	 */
+	file->f_mode &= ~FMODE_PWRITE;
 	return 0;
 }
 EXPORT_SYMBOL(seq_open);
@@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 	int err = 0;
 
 	mutex_lock(&m->lock);
+
+	/* Don't assume *ppos is where we left it */
+	if (unlikely(*ppos != m->read_pos)) {
+		m->read_pos = *ppos;
+		while ((err = traverse(m, *ppos)) == -EAGAIN)
+			;
+		if (err) {
+			/* With prejudice... */
+			m->read_pos = 0;
+			m->version = 0;
+			m->index = 0;
+			m->count = 0;
+			goto Done;
+		}
+	}
+
 	/*
 	 * seq_file->op->..m_start/m_stop/m_next may do special actions
 	 * or optimisations based on the file->f_version, so we want to
@@ -230,8 +254,10 @@ Fill:
 Done:
 	if (!copied)
 		copied = err;
-	else
+	else {
 		*ppos += copied;
+		m->read_pos += copied;
+	}
 	file->f_version = m->version;
 	mutex_unlock(&m->lock);
 	return copied;
@@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 			if (offset < 0)
 				break;
 			retval = offset;
-			if (offset != file->f_pos) {
+			if (offset != m->read_pos) {
 				while ((retval=traverse(m, offset)) == -EAGAIN)
 					;
 				if (retval) {
 					/* with extreme prejudice... */
 					file->f_pos = 0;
+					m->read_pos = 0;
 					m->version = 0;
 					m->index = 0;
 					m->count = 0;
 				} else {
+					m->read_pos = offset;
 					retval = file->f_pos = offset;
 				}
 			}
@@ -485,7 +513,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
 }
 EXPORT_SYMBOL(seq_bitmap);
 
-int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
 		unsigned int nr_bits)
 {
 	if (m->count < m->size) {
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index e7ddd0328ddc..3e4803b4427e 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -277,7 +277,7 @@ static int smb_hash_dentry(struct dentry *, struct qstr *);
 static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
 static int smb_delete_dentry(struct dentry *);
 
-static struct dentry_operations smbfs_dentry_operations =
+static const struct dentry_operations smbfs_dentry_operations =
 {
 	.d_revalidate	= smb_lookup_validate,
 	.d_hash		= smb_hash_dentry,
@@ -285,7 +285,7 @@ static struct dentry_operations smbfs_dentry_operations =
 	.d_delete	= smb_delete_dentry,
 };
 
-static struct dentry_operations smbfs_dentry_operations_case =
+static const struct dentry_operations smbfs_dentry_operations_case =
 {
 	.d_revalidate	= smb_lookup_validate,
 	.d_delete	= smb_delete_dentry,
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a966..dd727d43e5b7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 		 */
 		wait_on_page_writeback(page);
 
-		if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+		if (page_has_private(page) &&
+		    !try_to_release_page(page, GFP_KERNEL))
 			goto out_unlock;
 
 		/*
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index c837dfc2b3c6..2a7960310349 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb,
  * generated a larger block - this does occasionally happen with zlib).
  */
 int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
-			int length, u64 *next_index, int srclength)
+			int length, u64 *next_index, int srclength, int pages)
 {
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	struct buffer_head **bh;
@@ -184,7 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 				offset = 0;
 			}
 
-			if (msblk->stream.avail_out == 0) {
+			if (msblk->stream.avail_out == 0 && page < pages) {
 				msblk->stream.next_out = buffer[page++];
 				msblk->stream.avail_out = PAGE_CACHE_SIZE;
 			}
@@ -201,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 				zlib_init = 1;
 			}
 
-			zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
+			zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
 
 			if (msblk->stream.avail_in == 0 && k < b)
 				put_bh(bh[k++]);
 		} while (zlib_err == Z_OK);
 
 		if (zlib_err != Z_STREAM_END) {
-			ERROR("zlib_inflate returned unexpected result"
-				" 0x%x, srclength %d, avail_in %d,"
-				" avail_out %d\n", zlib_err, srclength,
-				msblk->stream.avail_in,
-				msblk->stream.avail_out);
+			ERROR("zlib_inflate error, data probably corrupt\n");
 			goto release_mutex;
 		}
 
 		zlib_err = zlib_inflateEnd(&msblk->stream);
 		if (zlib_err != Z_OK) {
-			ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
-				" srclength %d\n", zlib_err, srclength);
+			ERROR("zlib_inflate error, data probably corrupt\n");
 			goto release_mutex;
 		}
 		length = msblk->stream.total_out;
@@ -268,7 +263,8 @@ block_release:
 		put_bh(bh[k]);
 
 read_failure:
-	ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+	ERROR("squashfs_read_data failed to read block 0x%llx\n",
+					(unsigned long long) index);
 	kfree(bh);
 	return -EIO;
 }
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f29eda16d25e..1c4739e33af6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
 
 			entry->length = squashfs_read_data(sb, entry->data,
 				block, length, &entry->next_index,
-				cache->block_size);
+				cache->block_size, cache->pages);
 
 			spin_lock(&cache->lock);
 
@@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
 	for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
 		data[i] = buffer;
 	res = squashfs_read_data(sb, data, block, length |
-		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
+		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
 	kfree(data);
 	return res;
 }
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
 #include <linux/zlib.h>
+#include <linux/slab.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 7a63398bb855..9101dbde39ec 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 	type = le16_to_cpu(sqshb_ino->inode_type);
 	switch (type) {
 	case SQUASHFS_REG_TYPE: {
-		unsigned int frag_offset, frag_size, frag;
+		unsigned int frag_offset, frag;
+		int frag_size;
 		u64 frag_blk;
 		struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
 
@@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		break;
 	}
 	case SQUASHFS_LREG_TYPE: {
-		unsigned int frag_offset, frag_size, frag;
+		unsigned int frag_offset, frag;
+		int frag_size;
 		u64 frag_blk;
 		struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6b2515d027d5..0e9feb6adf7e 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
 
 /* block.c */
 extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
-				int);
+				int, int);
 
 /* cache.c */
 extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 071df5b5b491..ffa6edcd2d0c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -301,6 +301,7 @@ failure:
 static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
+	u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
 
 	TRACE("Entered squashfs_statfs\n");
 
@@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = msblk->inodes;
 	buf->f_ffree = 0;
 	buf->f_namelen = SQUASHFS_NAME_LEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
@@ -389,7 +392,7 @@ static int __init init_squashfs_fs(void)
 		return err;
 	}
 
-	printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
+	printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
 		"Phillip Lougher\n");
 
 	return 0;
diff --git a/fs/super.c b/fs/super.c
index 61dce001dd57..77cb4ec919b9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		 * lock ordering than usbfs:
 		 */
 		lockdep_set_class(&s->s_lock, &type->s_lock_key);
-		down_write(&s->s_umount);
+		/*
+		 * sget() can have s_umount recursion.
+		 *
+		 * When it cannot find a suitable sb, it allocates a new
+		 * one (this one), and tries again to find a suitable old
+		 * one.
+		 *
+		 * In case that succeeds, it will acquire the s_umount
+		 * lock of the old one. Since these are clearly distrinct
+		 * locks, and this object isn't exposed yet, there's no
+		 * risk of deadlocks.
+		 *
+		 * Annotate this by putting this lock in a different
+		 * subclass.
+		 */
+		down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
 		s->s_count = S_BIAS;
 		atomic_set(&s->s_active, 1);
 		mutex_init(&s->s_vfs_rename_mutex);
@@ -182,7 +197,7 @@ void deactivate_super(struct super_block *s)
 	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
 		s->s_count -= S_BIAS-1;
 		spin_unlock(&sb_lock);
-		DQUOT_OFF(s, 0);
+		vfs_dq_off(s, 0);
 		down_write(&s->s_umount);
 		fs->kill_sb(s);
 		put_filesystem(fs);
@@ -251,7 +266,7 @@ EXPORT_SYMBOL(unlock_super);
 void __fsync_super(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
-	DQUOT_SYNC(sb);
+	vfs_dq_sync(sb);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
@@ -272,6 +287,7 @@ int fsync_super(struct super_block *sb)
 	__fsync_super(sb);
 	return sync_blockdev(sb->s_bdev);
 }
+EXPORT_SYMBOL_GPL(fsync_super);
 
 /**
  *	generic_shutdown_super	-	common helper for ->kill_sb()
@@ -356,8 +372,10 @@ retry:
 				continue;
 			if (!grab_super(old))
 				goto retry;
-			if (s)
+			if (s) {
+				up_write(&s->s_umount);
 				destroy_super(s);
+			}
 			return old;
 		}
 	}
@@ -372,6 +390,7 @@ retry:
 	err = set(s, data);
 	if (err) {
 		spin_unlock(&sb_lock);
+		up_write(&s->s_umount);
 		destroy_super(s);
 		return ERR_PTR(err);
 	}
@@ -637,7 +656,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 			mark_files_ro(sb);
 		else if (!fs_may_remount_ro(sb))
 			return -EBUSY;
-		retval = DQUOT_OFF(sb, 1);
+		retval = vfs_dq_off(sb, 1);
 		if (retval < 0 && retval != -ENOSYS)
 			return -EBUSY;
 	}
@@ -652,11 +671,11 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 	if (remount_rw)
-		DQUOT_ON_REMOUNT(sb);
+		vfs_dq_quota_on_remount(sb);
 	return 0;
 }
 
-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(struct work_struct *work)
 {
 	struct super_block *sb;
 
@@ -679,12 +698,19 @@ static void do_emergency_remount(unsigned long foo)
 		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
+	kfree(work);
 	printk("Emergency Remount complete\n");
 }
 
 void emergency_remount(void)
 {
-	pdflush_operation(do_emergency_remount, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_emergency_remount);
+		schedule_work(work);
+	}
 }
 
 /*
@@ -813,7 +839,8 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		bdev->bd_super = s;
 	}
 
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 
 error_s:
 	error = PTR_ERR(s);
@@ -859,7 +886,8 @@ int get_sb_nodev(struct file_system_type *fs_type,
 		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_nodev);
@@ -891,7 +919,8 @@ int get_sb_single(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_single);
diff --git a/fs/sync.c b/fs/sync.c
index a16d53e5fe9d..7abc65fbf21d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -25,7 +25,7 @@ static void do_sync(unsigned long wait)
 {
 	wakeup_pdflush(0);
 	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	DQUOT_SYNC(NULL);
+	vfs_dq_sync(NULL);
 	sync_supers();		/* Write the superblocks */
 	sync_filesystems(0);	/* Start syncing the filesystems */
 	sync_filesystems(wait);	/* Waitingly sync the filesystems */
@@ -42,9 +42,21 @@ SYSCALL_DEFINE0(sync)
 	return 0;
 }
 
+static void do_sync_work(struct work_struct *work)
+{
+	do_sync(0);
+	kfree(work);
+}
+
 void emergency_sync(void)
 {
-	pdflush_operation(do_sync, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_sync_work);
+		schedule_work(work);
+	}
 }
 
 /*
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index f2c478c3424e..93e0c0281d45 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -21,15 +21,28 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
 
 #include <asm/uaccess.h>
 
 #include "sysfs.h"
 
+/*
+ * There's one bin_buffer for each open file.
+ *
+ * filp->private_data points to bin_buffer and
+ * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s
+ * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock
+ */
+static DEFINE_MUTEX(sysfs_bin_lock);
+
 struct bin_buffer {
-	struct mutex	mutex;
-	void		*buffer;
-	int		mmapped;
+	struct mutex			mutex;
+	void				*buffer;
+	int				mmapped;
+	struct vm_operations_struct 	*vm_ops;
+	struct file			*file;
+	struct hlist_node		list;
 };
 
 static int
@@ -168,6 +181,175 @@ out_free:
 	return count;
 }
 
+static void bin_vma_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+
+	if (!bb->vm_ops || !bb->vm_ops->open)
+		return;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return;
+
+	bb->vm_ops->open(vma);
+
+	sysfs_put_active_two(attr_sd);
+}
+
+static void bin_vma_close(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+
+	if (!bb->vm_ops || !bb->vm_ops->close)
+		return;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return;
+
+	bb->vm_ops->close(vma);
+
+	sysfs_put_active_two(attr_sd);
+}
+
+static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->fault)
+		return VM_FAULT_SIGBUS;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return VM_FAULT_SIGBUS;
+
+	ret = bb->vm_ops->fault(vma, vmf);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!bb->vm_ops->page_mkwrite)
+		return 0;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return VM_FAULT_SIGBUS;
+
+	ret = bb->vm_ops->page_mkwrite(vma, vmf);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+static int bin_access(struct vm_area_struct *vma, unsigned long addr,
+		  void *buf, int len, int write)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->access)
+		return -EINVAL;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return -EINVAL;
+
+	ret = bb->vm_ops->access(vma, addr, buf, len, write);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->set_policy)
+		return 0;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return -EINVAL;
+
+	ret = bb->vm_ops->set_policy(vma, new);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
+					unsigned long addr)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	struct mempolicy *pol;
+
+	if (!bb->vm_ops || !bb->vm_ops->get_policy)
+		return vma->vm_policy;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return vma->vm_policy;
+
+	pol = bb->vm_ops->get_policy(vma, addr);
+
+	sysfs_put_active_two(attr_sd);
+	return pol;
+}
+
+static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
+			const nodemask_t *to, unsigned long flags)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->migrate)
+		return 0;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return 0;
+
+	ret = bb->vm_ops->migrate(vma, from, to, flags);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+#endif
+
+static struct vm_operations_struct bin_vm_ops = {
+	.open		= bin_vma_open,
+	.close		= bin_vma_close,
+	.fault		= bin_fault,
+	.page_mkwrite	= bin_page_mkwrite,
+	.access		= bin_access,
+#ifdef CONFIG_NUMA
+	.set_policy	= bin_set_policy,
+	.get_policy	= bin_get_policy,
+	.migrate	= bin_migrate,
+#endif
+};
+
 static int mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct bin_buffer *bb = file->private_data;
@@ -179,18 +361,37 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	mutex_lock(&bb->mutex);
 
 	/* need attr_sd for attr, its parent for kobj */
+	rc = -ENODEV;
 	if (!sysfs_get_active_two(attr_sd))
-		return -ENODEV;
+		goto out_unlock;
 
 	rc = -EINVAL;
-	if (attr->mmap)
-		rc = attr->mmap(kobj, attr, vma);
+	if (!attr->mmap)
+		goto out_put;
+
+	rc = attr->mmap(kobj, attr, vma);
+	if (rc)
+		goto out_put;
 
-	if (rc == 0 && !bb->mmapped)
-		bb->mmapped = 1;
-	else
-		sysfs_put_active_two(attr_sd);
+	/*
+	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+	 * to satisfy versions of X which crash if the mmap fails: that
+	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
+	 */
+	if (vma->vm_file != file)
+		goto out_put;
 
+	rc = -EINVAL;
+	if (bb->mmapped && bb->vm_ops != vma->vm_ops)
+		goto out_put;
+
+	rc = 0;
+	bb->mmapped = 1;
+	bb->vm_ops = vma->vm_ops;
+	vma->vm_ops = &bin_vm_ops;
+out_put:
+	sysfs_put_active_two(attr_sd);
+out_unlock:
 	mutex_unlock(&bb->mutex);
 
 	return rc;
@@ -223,8 +424,13 @@ static int open(struct inode * inode, struct file * file)
 		goto err_out;
 
 	mutex_init(&bb->mutex);
+	bb->file = file;
 	file->private_data = bb;
 
+	mutex_lock(&sysfs_bin_lock);
+	hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers);
+	mutex_unlock(&sysfs_bin_lock);
+
 	/* open succeeded, put active references */
 	sysfs_put_active_two(attr_sd);
 	return 0;
@@ -237,11 +443,12 @@ static int open(struct inode * inode, struct file * file)
 
 static int release(struct inode * inode, struct file * file)
 {
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_buffer *bb = file->private_data;
 
-	if (bb->mmapped)
-		sysfs_put_active_two(attr_sd);
+	mutex_lock(&sysfs_bin_lock);
+	hlist_del(&bb->list);
+	mutex_unlock(&sysfs_bin_lock);
+
 	kfree(bb->buffer);
 	kfree(bb);
 	return 0;
@@ -256,6 +463,26 @@ const struct file_operations bin_fops = {
 	.release	= release,
 };
 
+
+void unmap_bin_file(struct sysfs_dirent *attr_sd)
+{
+	struct bin_buffer *bb;
+	struct hlist_node *tmp;
+
+	if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
+		return;
+
+	mutex_lock(&sysfs_bin_lock);
+
+	hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) {
+		struct inode *inode = bb->file->f_path.dentry->d_inode;
+
+		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+	}
+
+	mutex_unlock(&sysfs_bin_lock);
+}
+
 /**
  *	sysfs_create_bin_file - create binary file for object.
  *	@kobj:	object.
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 82d3b79d0e08..d88d0fac9fa5 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -302,7 +302,7 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
 	iput(inode);
 }
 
-static struct dentry_operations sysfs_dentry_ops = {
+static const struct dentry_operations sysfs_dentry_ops = {
 	.d_iput		= sysfs_d_iput,
 };
 
@@ -434,6 +434,26 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 }
 
 /**
+ *	sysfs_pathname - return full path to sysfs dirent
+ *	@sd: sysfs_dirent whose path we want
+ *	@path: caller allocated buffer
+ *
+ *	Gives the name "/" to the sysfs_root entry; any path returned
+ *	is relative to wherever sysfs is mounted.
+ *
+ *	XXX: does no error checking on @path size
+ */
+static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
+{
+	if (sd->s_parent) {
+		sysfs_pathname(sd->s_parent, path);
+		strcat(path, "/");
+	}
+	strcat(path, sd->s_name);
+	return path;
+}
+
+/**
  *	sysfs_add_one - add sysfs_dirent to parent
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be added
@@ -458,8 +478,16 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 	int ret;
 
 	ret = __sysfs_add_one(acxt, sd);
-	WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' "
-		       "can not be created\n", sd->s_name);
+	if (ret == -EEXIST) {
+		char *path = kzalloc(PATH_MAX, GFP_KERNEL);
+		WARN(1, KERN_WARNING
+		     "sysfs: cannot create duplicate filename '%s'\n",
+		     (path == NULL) ? sd->s_name :
+		     strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"),
+		            sd->s_name));
+		kfree(path);
+	}
+
 	return ret;
 }
 
@@ -581,6 +609,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 
 		sysfs_drop_dentry(sd);
 		sysfs_deactivate(sd);
+		unmap_bin_file(sd);
 		sysfs_put(sd);
 	}
 }
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1f4a3f877262..289c43a47263 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -659,13 +659,16 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
 
 struct sysfs_schedule_callback_struct {
-	struct kobject 		*kobj;
+	struct list_head	workq_list;
+	struct kobject		*kobj;
 	void			(*func)(void *);
 	void			*data;
 	struct module		*owner;
 	struct work_struct	work;
 };
 
+static DEFINE_MUTEX(sysfs_workq_mutex);
+static LIST_HEAD(sysfs_workq);
 static void sysfs_schedule_callback_work(struct work_struct *work)
 {
 	struct sysfs_schedule_callback_struct *ss = container_of(work,
@@ -674,6 +677,9 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
 	(ss->func)(ss->data);
 	kobject_put(ss->kobj);
 	module_put(ss->owner);
+	mutex_lock(&sysfs_workq_mutex);
+	list_del(&ss->workq_list);
+	mutex_unlock(&sysfs_workq_mutex);
 	kfree(ss);
 }
 
@@ -695,15 +701,25 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
  * until @func returns.
  *
  * Returns 0 if the request was submitted, -ENOMEM if storage could not
- * be allocated, -ENODEV if a reference to @owner isn't available.
+ * be allocated, -ENODEV if a reference to @owner isn't available,
+ * -EAGAIN if a callback has already been scheduled for @kobj.
  */
 int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 		void *data, struct module *owner)
 {
-	struct sysfs_schedule_callback_struct *ss;
+	struct sysfs_schedule_callback_struct *ss, *tmp;
 
 	if (!try_module_get(owner))
 		return -ENODEV;
+
+	mutex_lock(&sysfs_workq_mutex);
+	list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
+		if (ss->kobj == kobj) {
+			mutex_unlock(&sysfs_workq_mutex);
+			return -EAGAIN;
+		}
+	mutex_unlock(&sysfs_workq_mutex);
+
 	ss = kmalloc(sizeof(*ss), GFP_KERNEL);
 	if (!ss) {
 		module_put(owner);
@@ -715,6 +731,10 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	ss->data = data;
 	ss->owner = owner;
 	INIT_WORK(&ss->work, sysfs_schedule_callback_work);
+	INIT_LIST_HEAD(&ss->workq_list);
+	mutex_lock(&sysfs_workq_mutex);
+	list_add_tail(&ss->workq_list, &sysfs_workq);
+	mutex_unlock(&sysfs_workq_mutex);
 	schedule_work(&ss->work);
 	return 0;
 }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index dfa3d94cfc74..555f0ff988df 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -147,6 +147,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
 
+	inode->i_private = sysfs_get(sd);
 	inode->i_mapping->a_ops = &sysfs_aops;
 	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
 	inode->i_op = &sysfs_inode_operations;
@@ -214,6 +215,22 @@ struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
 	return inode;
 }
 
+/*
+ * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
+ * To prevent the sysfs inode numbers from being freed prematurely we take a
+ * reference to sysfs_dirent from the sysfs inode.  A
+ * super_operations.delete_inode() implementation is needed to drop that
+ * reference upon inode destruction.
+ */
+void sysfs_delete_inode(struct inode *inode)
+{
+	struct sysfs_dirent *sd  = inode->i_private;
+
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+	sysfs_put(sd);
+}
+
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 {
 	struct sysfs_addrm_cxt acxt;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index ab343e371d64..49749955ccaf 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -17,11 +17,10 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/magic.h>
 
 #include "sysfs.h"
 
-/* Random magic number */
-#define SYSFS_MAGIC 0x62656572
 
 static struct vfsmount *sysfs_mount;
 struct super_block * sysfs_sb = NULL;
@@ -30,6 +29,7 @@ struct kmem_cache *sysfs_dir_cachep;
 static const struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
+	.delete_inode	= sysfs_delete_inode,
 };
 
 struct sysfs_dirent sysfs_root = {
@@ -53,7 +53,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	sysfs_sb = sb;
 
 	/* get root inode, initialize and unlock it */
+	mutex_lock(&sysfs_mutex);
 	inode = sysfs_get_inode(&sysfs_root);
+	mutex_unlock(&sysfs_mutex);
 	if (!inode) {
 		pr_debug("sysfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 93c6d6b27c4d..3fa0d98481e2 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -28,6 +28,7 @@ struct sysfs_elem_attr {
 
 struct sysfs_elem_bin_attr {
 	struct bin_attribute	*bin_attr;
+	struct hlist_head	buffers;
 };
 
 /*
@@ -145,6 +146,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
  * inode.c
  */
 struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
+void sysfs_delete_inode(struct inode *inode);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
 int sysfs_inode_init(void);
@@ -163,6 +165,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
  * bin.c
  */
 extern const struct file_operations bin_fops;
+void unmap_bin_file(struct sysfs_dirent *attr_sd);
 
 /*
  * symlink.c
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae2..da20b48d350f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = sb->s_magic;
 	buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = sbi->s_ninodes;
 	buf->f_ffree = sysv_count_free_inodes(sb);
 	buf->f_namelen = SYSV_NAMELEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	return 0;
 }
 
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index a1f1ef33e81c..33e047b59b8d 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -38,7 +38,7 @@ static int sysv_hash(struct dentry *dentry, struct qstr *qstr)
 	return 0;
 }
 
-struct dentry_operations sysv_dentry_operations = {
+const struct dentry_operations sysv_dentry_operations = {
 	.d_hash		= sysv_hash,
 };
 
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 38ebe3f85b3d..5784a318c883 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -170,7 +170,7 @@ extern const struct file_operations sysv_file_operations;
 extern const struct file_operations sysv_dir_operations;
 extern const struct address_space_operations sysv_aops;
 extern const struct super_operations sysv_sops;
-extern struct dentry_operations sysv_dentry_operations;
+extern const struct dentry_operations sysv_dentry_operations;
 
 
 enum {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f5..b042bd7034b1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
 
-	if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
-		return -EINVAL;
-	if (clockid != CLOCK_MONOTONIC &&
-	    clockid != CLOCK_REALTIME)
+	if ((flags & ~TFD_CREATE_FLAGS) ||
+	    (clockid != CLOCK_MONOTONIC &&
+	     clockid != CLOCK_REALTIME))
 		return -EINVAL;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
 
 	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
-			       flags & (O_CLOEXEC | O_NONBLOCK));
+			       flags & TFD_SHARED_FCNTL_FLAGS);
 	if (ufd < 0)
 		kfree(ctx);
 
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
 
-	if (!timespec_valid(&ktmr.it_value) ||
+	if ((flags & ~TFD_SETTIME_FLAGS) ||
+	    !timespec_valid(&ktmr.it_value) ||
 	    !timespec_valid(&ktmr.it_interval))
 		return -EINVAL;
 
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
 	depends on UBIFS_FS
 	help
 	  This option allows to explicitly choose which compressions, if any,
-	  are enabled in UBIFS. Removing compressors means inbility to read
+	  are enabled in UBIFS. Removing compressors means inability to read
 	  existing file systems.
 
 	  If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
 	depends on UBIFS_FS
 	default y
 	help
-	   LZO compressor is generally faster then zlib but compresses worse.
+	   LZO compressor is generally faster than zlib but compresses worse.
 	   Say 'Y' if unsure.
 
 config UBIFS_FS_ZLIB
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
 }
 
 /**
- * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
+ * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
  * @c: UBIFS file-system description object
  *
- * This function calculates and returns the number of eraseblocks which should
- * be kept for index usage.
+ * This function calculates and returns the number of LEBs which should be kept
+ * for index usage.
  */
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 {
-	int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
+	int idx_lebs;
 	long long idx_size;
 
 	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
-
 	/* And make sure we have thrice the index size of space reserved */
-	idx_size = idx_size + (idx_size << 1);
-
+	idx_size += idx_size << 1;
 	/*
 	 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
 	 * pair, nor similarly the two variables for the new index size, so we
 	 * have to do this costly 64-bit division on fast-path.
 	 */
-	idx_size += eff_leb_size - 1;
-	idx_lebs = div_u64(idx_size, eff_leb_size);
+	idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
 	/*
 	 * The index head is not available for the in-the-gaps method, so add an
 	 * extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
  * do_budget_space - reserve flash space for index and data growth.
  * @c: UBIFS file-system description object
  *
- * This function makes sure UBIFS has enough free eraseblocks for index growth
- * and data.
+ * This function makes sure UBIFS has enough free LEBs for index growth and
+ * data.
  *
  * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
  * would take if it was consolidated and written to the flash. This guarantees
  * that the "in-the-gaps" commit method always succeeds and UBIFS will always
  * be able to commit dirty index. So this function basically adds amount of
  * budgeted index space to the size of the current index, multiplies this by 3,
- * and makes sure this does not exceed the amount of free eraseblocks.
+ * and makes sure this does not exceed the amount of free LEBs.
  *
  * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
  * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
  *    be large, because UBIFS does not do any index consolidation as long as
  *    there is free space. IOW, the index may take a lot of LEBs, but the LEBs
  *    will contain a lot of dirt.
- * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be
- *   consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ *    the index may be consolidated to take up to @c->min_idx_lebs LEBs.
  *
  * This function returns zero in case of success, and %-ENOSPC in case of
  * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
  * This function calculates amount of free space to report to user-space.
  *
  * Because UBIFS may introduce substantial overhead (the index, node headers,
- * alignment, wastage at the end of eraseblocks, etc), it cannot report real
- * amount of free flash space it has (well, because not all dirty space is
- * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectations about what free space is. Users seem to
- * accustomed to assume that if the file-system reports N bytes of free space,
- * they would be able to fit a file of N bytes to the FS. This almost works for
+ * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
+ * free flash space it has (well, because not all dirty space is reclaimable,
+ * UBIFS does not actually know the real amount). If UBIFS did so, it would
+ * bread user expectations about what free space is. Users seem to accustomed
+ * to assume that if the file-system reports N bytes of free space, they would
+ * be able to fit a file of N bytes to the FS. This almost works for
  * traditional file-systems, because they have way less overhead than UBIFS.
  * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 					  "bad or corrupted node)");
 		else {
 			for (i = 0; i < nlen && dent->name[i]; i++)
-				printk("%c", dent->name[i]);
+				printk(KERN_CONT "%c", dent->name[i]);
 		}
-		printk("\n");
+		printk(KERN_CONT "\n");
 
 		break;
 	}
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
 
 			/*
 			 * Make sure the last key in our znode is less or
-			 * equivalent than the the key in zbranch which goes
+			 * equivalent than the key in the zbranch which goes
 			 * after our pointing zbranch.
 			 */
 			cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f261..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	int skipped_read = 0;
 	struct page *page;
 
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		/* The page is not loaded from the flash */
-		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
 			/*
 			 * We change whole page so no need to load it. But we
 			 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			 * the media.
 			 */
 			SetPageChecked(page);
-		else {
+			skipped_read = 1;
+		} else {
 			err = do_readpage(page);
 			if (err) {
 				unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err)) {
 		ubifs_assert(err == -ENOSPC);
 		/*
+		 * If we skipped reading the page because we were going to
+		 * write all of it, then it is not up to date.
+		 */
+		if (skipped_read) {
+			ClearPageChecked(page);
+			ClearPageUptodate(page);
+		}
+		/*
 		 * Budgeting failed which means it would have to force
 		 * write-back but didn't, because we set the @fast flag in the
 		 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
  * whole index and correct all inode sizes, which is long an unacceptable.
  *
  * To prevent situations like this, UBIFS writes pages back only if they are
- * within last synchronized inode size, i.e. the the size which has been
+ * within the last synchronized inode size, i.e. the size which has been
  * written to the flash media last time. Otherwise, UBIFS forces inode
  * write-back, thus making sure the on-flash inode contains current inode size,
  * and then keeps writing pages back.
@@ -1434,8 +1444,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
  * mmap()d file has taken write protection fault and is being made
  * writable. UBIFS must ensure page is budgeted for.
  */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1458,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
 
 	if (unlikely(c->ro_media))
-		return -EROFS;
+		return VM_FAULT_SIGBUS; /* -EROFS */
 
 	/*
 	 * We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1491,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		if (err == -ENOSPC)
 			ubifs_warn("out of space for mmapped file "
 				   "(inode number %lu)", inode->i_ino);
-		return err;
+		return VM_FAULT_SIGBUS;
 	}
 
 	lock_page(page);
@@ -1520,6 +1531,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 out_unlock:
 	unlock_page(page);
 	ubifs_release_budget(c, &req);
+	if (err)
+		err = VM_FAULT_SIGBUS;
 	return err;
 }
 
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
  * ubifs_find_free_space - find a data LEB with free space.
  * @c: the UBIFS file-system description object
  * @min_space: minimum amount of required free space
- * @free: contains amount of free space in the LEB on exit
+ * @offs: contains offset of where free space starts on exit
  * @squeeze: whether to try to find space in a non-empty LEB first
  *
  * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
  * failed to find a LEB with @min_space bytes of free space and other a negative
  * error codes in case of failure.
  */
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
 			  int squeeze)
 {
 	const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 		spin_unlock(&c->space_lock);
 	}
 
-	*free = lprops->free;
+	*offs = c->leb_size - lprops->free;
 	ubifs_release_lprops(c);
 
-	if (*free == c->leb_size) {
+	if (*offs == 0) {
 		/*
 		 * Ensure that empty LEBs have been unmapped. They may not have
 		 * been, for example, because of an unclean unmount.  Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 			return err;
 	}
 
-	dbg_find("found LEB %d, free %d", lnum, *free);
-	ubifs_assert(*free >= min_space);
+	dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
+	ubifs_assert(*offs <= c->leb_size - min_space);
 	return lnum;
 
 out:
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
  * have to waste large pieces of free space at the end of LEB B, because nodes
  * from LEB A would not fit. And the worst situation is when all nodes are of
  * maximum size. So dark watermark is the amount of free + dirty space in LEB
- * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC might
  * be unable to reclaim it. So, LEBs with free + dirty greater than dark
  * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
  * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
 #include "ubifs.h"
 
 /*
- * GC tries to optimize the way it fit nodes to available space, and it sorts
- * nodes a little. The below constants are watermarks which define "large",
- * "medium", and "small" nodes.
- */
-#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
-#define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ
-
-/*
  * GC may need to move more than one LEB to make progress. The below constants
  * define "soft" and "hard" limits on the number of LEBs the garbage collector
  * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
 }
 
 /**
- * joinup - bring data nodes for an inode together.
- * @c: UBIFS file-system description object
- * @sleb: describes scanned LEB
- * @inum: inode number
- * @blk: block number
- * @data: list to which to add data nodes
+ * list_sort - sort a list.
+ * @priv: private data, passed to @cmp
+ * @head: the list to sort
+ * @cmp: the elements comparison function
  *
- * This function looks at the first few nodes in the scanned LEB @sleb and adds
- * them to @data if they are data nodes from @inum and have a larger block
- * number than @blk. This function returns %0 on success and a negative error
- * code on failure.
+ * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
+ * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
+ * in ascending order.
+ *
+ * The comparison function @cmp is supposed to return a negative value if @a is
+ * than @b, and a positive value if @a is greater than @b. If @a and @b are
+ * equivalent, then it does not matter what this function returns.
  */
-static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
-		  unsigned int blk, struct list_head *data)
+static void list_sort(void *priv, struct list_head *head,
+		      int (*cmp)(void *priv, struct list_head *a,
+				 struct list_head *b))
 {
-	int err, cnt = 6, lnum = sleb->lnum, offs;
-	struct ubifs_scan_node *snod, *tmp;
-	union ubifs_key *key;
+	struct list_head *p, *q, *e, *list, *tail, *oldhead;
+	int insize, nmerges, psize, qsize, i;
+
+	if (list_empty(head))
+		return;
+
+	list = head->next;
+	list_del(head);
+	insize = 1;
+	for (;;) {
+		p = oldhead = list;
+		list = tail = NULL;
+		nmerges = 0;
+
+		while (p) {
+			nmerges++;
+			q = p;
+			psize = 0;
+			for (i = 0; i < insize; i++) {
+				psize++;
+				q = q->next == oldhead ? NULL : q->next;
+				if (!q)
+					break;
+			}
 
-	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		key = &snod->key;
-		if (key_inum(c, key) == inum &&
-		    key_type(c, key) == UBIFS_DATA_KEY &&
-		    key_block(c, key) > blk) {
-			offs = snod->offs;
-			err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
-			if (err < 0)
-				return err;
-			list_del(&snod->list);
-			if (err) {
-				list_add_tail(&snod->list, data);
-				blk = key_block(c, key);
-			} else
-				kfree(snod);
-			cnt = 6;
-		} else if (--cnt == 0)
+			qsize = insize;
+			while (psize > 0 || (qsize > 0 && q)) {
+				if (!psize) {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				} else if (!qsize || !q) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else if (cmp(priv, p, q) <= 0) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				}
+				if (tail)
+					tail->next = e;
+				else
+					list = e;
+				e->prev = tail;
+				tail = e;
+			}
+			p = q;
+		}
+
+		tail->next = list;
+		list->prev = tail;
+
+		if (nmerges <= 1)
 			break;
+
+		insize *= 2;
 	}
-	return 0;
+
+	head->next = list;
+	head->prev = list->prev;
+	list->prev->next = head;
+	list->prev = head;
 }
 
 /**
- * move_nodes - move nodes.
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @a: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		unsigned int blka = key_block(c, &sa->key);
+		unsigned int blkb = key_block(c, &sb->key);
+
+		if (blka <= blkb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/*
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @a: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and sorted by length in descending order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash valuer order.
+ */
+int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	int typea, typeb;
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	typea = key_type(c, &sa->key);
+	typeb = key_type(c, &sb->key);
+	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+
+	/* Inodes go before directory entries */
+	if (typea == UBIFS_INO_KEY) {
+		if (typeb == UBIFS_INO_KEY)
+			return sb->len - sa->len;
+		return -1;
+	}
+	if (typeb == UBIFS_INO_KEY)
+		return 1;
+
+	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		uint32_t hasha = key_hash(c, &sa->key);
+		uint32_t hashb = key_hash(c, &sb->key);
+
+		if (hasha <= hashb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * sort_nodes - sort nodes for GC.
  * @c: UBIFS file-system description object
- * @sleb: describes nodes to move
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
  *
- * This function moves valid nodes from data LEB described by @sleb to the GC
- * journal head. The obsolete nodes are dropped.
+ * This function sorts the list of inodes to garbage collect. First of all, it
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number;
  *
- * When moving nodes we have to deal with classical bin-packing problem: the
- * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
- * where the nodes in the @sleb->nodes list are the elements which should be
- * fit optimally to the bins. This function uses the "first fit decreasing"
- * strategy, although it does not really sort the nodes but just split them on
- * 3 classes - large, medium, and small, so they are roughly sorted.
+ * Non-data nodes are sorted as follows.
+ *   o First go inode nodes - they are sorted in descending length order.
+ *   o Then go directory entry nodes - they are sorted in hash order, which
+ *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
+ *     inode number go before direntry nodes with higher parent inode number,
+ *     and direntry nodes with lower name hash values go before direntry nodes
+ *     with higher name hash values.
  *
- * This function returns zero in case of success, %-EAGAIN if commit is
- * required, and other negative error codes in case of other failures.
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
  */
-static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		      struct list_head *nondata, int *min)
 {
 	struct ubifs_scan_node *snod, *tmp;
-	struct list_head data, large, medium, small;
-	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
-	int avail, err, min = INT_MAX;
-	unsigned int blk = 0;
-	ino_t inum = 0;
 
-	INIT_LIST_HEAD(&data);
-	INIT_LIST_HEAD(&large);
-	INIT_LIST_HEAD(&medium);
-	INIT_LIST_HEAD(&small);
+	*min = INT_MAX;
 
-	while (!list_empty(&sleb->nodes)) {
-		struct list_head *lst = sleb->nodes.next;
-
-		snod = list_entry(lst, struct ubifs_scan_node, list);
+	/* Separate data nodes and non-data nodes */
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		int err;
 
 		ubifs_assert(snod->type != UBIFS_IDX_NODE);
 		ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
 					 snod->offs, 0);
 		if (err < 0)
-			goto out;
+			return err;
 
-		list_del(lst);
 		if (!err) {
 			/* The node is obsolete, remove it from the list */
+			list_del(&snod->list);
 			kfree(snod);
 			continue;
 		}
 
-		/*
-		 * Sort the list of nodes so that data nodes go first, large
-		 * nodes go second, and small nodes go last.
-		 */
-		if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
-			if (inum != key_inum(c, &snod->key)) {
-				if (inum) {
-					/*
-					 * Try to move data nodes from the same
-					 * inode together.
-					 */
-					err = joinup(c, sleb, inum, blk, &data);
-					if (err)
-						goto out;
-				}
-				inum = key_inum(c, &snod->key);
-				blk = key_block(c, &snod->key);
-			}
-			list_add_tail(lst, &data);
-		} else if (snod->len > MEDIUM_NODE_WM)
-			list_add_tail(lst, &large);
-		else if (snod->len > SMALL_NODE_WM)
-			list_add_tail(lst, &medium);
-		else
-			list_add_tail(lst, &small);
-
-		/* And find the smallest node */
-		if (snod->len < min)
-			min = snod->len;
+		if (snod->len < *min)
+			*min = snod->len;
+
+		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
+			list_move_tail(&snod->list, nondata);
 	}
 
-	/*
-	 * Join the tree lists so that we'd have one roughly sorted list
-	 * ('large' will be the head of the joined list).
-	 */
-	list_splice(&data, &large);
-	list_splice(&medium, large.prev);
-	list_splice(&small, large.prev);
+	/* Sort data and non-data nodes */
+	list_sort(c, &sleb->nodes, &data_nodes_cmp);
+	list_sort(c, nondata, &nondata_nodes_cmp);
+	return 0;
+}
+
+/**
+ * move_node - move a node.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ * @snod: the mode to move
+ * @wbuf: write-buffer to move node to
+ *
+ * This function moves node @snod to @wbuf, changes TNC correspondingly, and
+ * destroys @snod. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
+{
+	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
+
+	cond_resched();
+	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
+	if (err)
+		return err;
+
+	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+				snod->offs, new_lnum, new_offs,
+				snod->len);
+	list_del(&snod->list);
+	kfree(snod);
+	return err;
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. This function returns zero in case of success, %-EAGAIN if
+ * commit is required, and other negative error codes in case of other
+ * failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+	int err, min;
+	LIST_HEAD(nondata);
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 
 	if (wbuf->lnum == -1) {
 		/*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		 */
 		err = switch_gc_head(c);
 		if (err)
-			goto out;
+			return err;
 	}
 
+	err = sort_nodes(c, sleb, &nondata, &min);
+	if (err)
+		goto out;
+
 	/* Write nodes to their new location. Use the first-fit strategy */
 	while (1) {
-		avail = c->leb_size - wbuf->offs - wbuf->used;
-		list_for_each_entry_safe(snod, tmp, &large, list) {
-			int new_lnum, new_offs;
+		int avail;
+		struct ubifs_scan_node *snod, *tmp;
+
+		/* Move data nodes */
+		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			if  (snod->len > avail)
+				/*
+				 * Do not skip data nodes in order to optimize
+				 * bulk-read.
+				 */
+				break;
+
+			err = move_node(c, sleb, snod, wbuf);
+			if (err)
+				goto out;
+		}
 
+		/* Move non-data nodes */
+		list_for_each_entry_safe(snod, tmp, &nondata, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
 			if (avail < min)
 				break;
 
-			if (snod->len > avail)
-				/* This node does not fit */
+			if  (snod->len > avail) {
+				/*
+				 * Keep going only if this is an inode with
+				 * some data. Otherwise stop and switch the GC
+				 * head. IOW, we assume that data-less inode
+				 * nodes and direntry nodes are roughly of the
+				 * same size.
+				 */
+				if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+				    snod->len == UBIFS_INO_NODE_SZ)
+					break;
 				continue;
+			}
 
-			cond_resched();
-
-			new_lnum = wbuf->lnum;
-			new_offs = wbuf->offs + wbuf->used;
-			err = ubifs_wbuf_write_nolock(wbuf, snod->node,
-						      snod->len);
+			err = move_node(c, sleb, snod, wbuf);
 			if (err)
 				goto out;
-			err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
-						snod->offs, new_lnum, new_offs,
-						snod->len);
-			if (err)
-				goto out;
-
-			avail = c->leb_size - wbuf->offs - wbuf->used;
-			list_del(&snod->list);
-			kfree(snod);
 		}
 
-		if (list_empty(&large))
+		if (list_empty(&sleb->nodes) && list_empty(&nondata))
 			break;
 
 		/*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 	return 0;
 
 out:
-	list_for_each_entry_safe(snod, tmp, &large, list) {
-		list_del(&snod->list);
-		kfree(snod);
-	}
+	list_splice_tail(&nondata, &sleb->nodes);
 	return err;
 }
 
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
  */
 static int reserve_space(struct ubifs_info *c, int jhead, int len)
 {
-	int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
+	int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
 	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
 
 	/*
@@ -139,10 +139,9 @@ again:
 	 * Write buffer wasn't seek'ed or there is no enough space - look for an
 	 * LEB with some empty space.
 	 */
-	lnum = ubifs_find_free_space(c, len, &free, squeeze);
+	lnum = ubifs_find_free_space(c, len, &offs, squeeze);
 	if (lnum >= 0) {
 		/* Found an LEB, add it to the journal head */
-		offs = c->leb_size - free;
 		err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
 		if (err)
 			goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
  * @host: host inode
  *
  * This function writes the updated version of an extended attribute inode and
- * the host inode tho the journal (to the base head). The host inode is written
+ * the host inode to the journal (to the base head). The host inode is written
  * after the extended attribute inode in order to guarantee that the extended
  * attribute will be flushed when the inode is synchronized by 'fsync()' and
  * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
  * @c: UBIFS file-system description object
  * @key: the key to get hash from
  */
-static inline int key_hash(const struct ubifs_info *c,
-			   const union ubifs_key *key)
+static inline uint32_t key_hash(const struct ubifs_info *c,
+				const union ubifs_key *key)
 {
 	return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
 }
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
  * @c: UBIFS file-system description object
  * @k: the key to get hash from
  */
-static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
+static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
 {
 	const union ubifs_key *key = k;
 
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	}
 
 	/*
-	 * Make sure the the amount of space in buds will not exceed
+	 * Make sure the amount of space in buds will not exceed the
 	 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
 	 * limits.
 	 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
 				bud->jhead, c->leb_size - bud->start,
 				c->cmt_bud_bytes);
 			rb_erase(p1, &c->buds);
-			list_del(&bud->list);
 			/*
 			 * If the commit does not finish, the recovery will need
 			 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
 			 * commit i.e. do not allow them to be garbage
 			 * collected.
 			 */
-			list_add(&bud->list, &c->old_buds);
+			list_move(&bud->list, &c->old_buds);
 		}
 	}
 	spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		while (offs + len > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->lsave_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->ltab_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
 						       alen, UBI_SHORTTERM);
 				if (err)
 					return err;
-				dbg_chk_lpt_sz(c, 4, alen - wlen);
 			}
-			dbg_chk_lpt_sz(c, 2, 0);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
-			from = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
-			dbg_chk_lpt_sz(c, 2, alen - wlen);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
-			dbg_chk_lpt_sz(c, 2, alen - wlen);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
 /**
  * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
  * @c: the UBIFS file-system description object
- * @action: action
+ * @action: what to do
  * @len: length written
  *
  * This function returns %0 on success and a negative error code on failure.
+ * The @action argument may be one of:
+ *   o %0 - LPT debugging checking starts, initialize debugging variables;
+ *   o %1 - wrote an LPT node, increase LPT size by @len bytes;
+ *   o %2 - switched to a different LEB and wasted @len bytes;
+ *   o %3 - check that we've written the right number of bytes.
+ *   o %4 - wasted @len bytes;
  */
 int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 {
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 				       lnum, offs);
 			err = ubifs_unpack_nnode(c, buf, &nnode);
 			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
-				printk("%d:%d", nnode.nbranch[i].lnum,
+				printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
 				       nnode.nbranch[i].offs);
 				if (i != UBIFS_LPT_FANOUT - 1)
-					printk(", ");
+					printk(KERN_CONT ", ");
 			}
-			printk("\n");
+			printk(KERN_CONT "\n");
 			break;
 		}
 		case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
  * @lnum: LEB number of the LEB from which @buf was read
  * @offs: offset from which @buf was read
  *
- * This function scans @buf for more nodes and returns %0 is a node is found and
- * %1 if no more nodes are found.
+ * This function ensures that the corrupted node at @offs is the last thing
+ * written to a LEB. This function returns %1 if more data is not found and
+ * %0 if more data is found.
  */
 static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
 			int lnum, int offs)
 {
-	int skip, next_offs = 0;
+	struct ubifs_ch *ch = buf;
+	int skip, dlen = le32_to_cpu(ch->len);
 
-	if (len > UBIFS_DATA_NODE_SZ) {
-		struct ubifs_ch *ch = buf;
-		int dlen = le32_to_cpu(ch->len);
-
-		if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ &&
-		    dlen <= UBIFS_MAX_DATA_NODE_SZ)
-			/* The corrupt node looks like a data node */
-			next_offs = ALIGN(offs + dlen, 8);
-	}
-
-	if (c->min_io_size == 1)
-		skip = 8;
-	else
-		skip = ALIGN(offs + 1, c->min_io_size) - offs;
-
-	offs += skip;
-	buf += skip;
-	len -= skip;
-	while (len > 8) {
-		struct ubifs_ch *ch = buf;
-		uint32_t magic = le32_to_cpu(ch->magic);
-		int ret;
-
-		if (magic == UBIFS_NODE_MAGIC) {
-			ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
-			if (ret == SCANNED_A_NODE || ret > 0) {
-				/*
-				 * There is a small chance this is just data in
-				 * a data node, so check that possibility. e.g.
-				 * this is part of a file that itself contains
-				 * a UBIFS image.
-				 */
-				if (next_offs && offs + le32_to_cpu(ch->len) <=
-				    next_offs)
-					continue;
-				dbg_rcvry("unexpected node at %d:%d", lnum,
-					  offs);
-				return 0;
-			}
-		}
-		offs += 8;
-		buf += 8;
-		len -= 8;
+	/* Check for empty space after the corrupt node's common header */
+	skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
+	if (is_empty(buf + skip, len - skip))
+		return 1;
+	/*
+	 * The area after the common header size is not empty, so the common
+	 * header must be intact. Check it.
+	 */
+	if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
+		dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
+		return 0;
 	}
-	return 1;
+	/* Now we know the corrupt node's length we can skip over it */
+	skip = ALIGN(offs + dlen, c->min_io_size) - offs;
+	/* After which there should be empty space */
+	if (is_empty(buf + skip, len - skip))
+		return 1;
+	dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
+	return 0;
 }
 
 /**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
 		dirty -= c->leb_size - lp->free;
 		/*
 		 * If the replay order was perfect the dirty space would now be
-		 * zero. The order is not perfect because the the journal heads
+		 * zero. The order is not perfect because the journal heads
 		 * race with each other. This is not a problem but is does mean
 		 * that the dirty space may temporarily exceed c->leb_size
 		 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
 	if (tmp64 > DEFAULT_MAX_RP_SIZE)
 		tmp64 = DEFAULT_MAX_RP_SIZE;
 	sup->rp_size = cpu_to_le64(tmp64);
+	sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
 
 	err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
 	kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	if (IS_ERR(sup))
 		return PTR_ERR(sup);
 
+	c->fmt_version = le32_to_cpu(sup->fmt_version);
+	c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
+
 	/*
 	 * The software supports all previous versions but not future versions,
 	 * due to the unavailability of time-travelling equipment.
 	 */
-	c->fmt_version = le32_to_cpu(sup->fmt_version);
 	if (c->fmt_version > UBIFS_FORMAT_VERSION) {
-		ubifs_err("on-flash format version is %d, but software only "
-			  "supports up to version %d", c->fmt_version,
-			  UBIFS_FORMAT_VERSION);
-		err = -EINVAL;
-		goto out;
+		struct super_block *sb = c->vfs_sb;
+		int mounting_ro = sb->s_flags & MS_RDONLY;
+
+		ubifs_assert(!c->ro_media || mounting_ro);
+		if (!mounting_ro ||
+		    c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
+			ubifs_err("on-flash format version is w%d/r%d, but "
+				  "software only supports up to version "
+				  "w%d/r%d", c->fmt_version,
+				  c->ro_compat_version, UBIFS_FORMAT_VERSION,
+				  UBIFS_RO_COMPAT_VERSION);
+			if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
+				ubifs_msg("only R/O mounting is possible");
+				err = -EROFS;
+			} else
+				err = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * The FS is mounted R/O, and the media format is
+		 * R/O-compatible with the UBIFS implementation, so we can
+		 * mount.
+		 */
+		c->rw_incompat = 1;
 	}
 
 	if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
 	c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
 	c->main_first = c->leb_cnt - c->main_lebs;
-	c->report_rp_size = ubifs_reported_space(c, c->rp_size);
 
 	err = validate_sb(c, sup);
 out:
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
 		 * Move this one to the end of the list to provide some
 		 * fairness.
 		 */
-		list_del(&c->infos_list);
-		list_add_tail(&c->infos_list, &ubifs_infos);
+		list_move_tail(&c->infos_list, &ubifs_infos);
 		mutex_unlock(&c->umount_mutex);
 		if (freed >= nr)
 			break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
 			}
 
 			if (i == 1) {
-				list_del(&c->infos_list);
-				list_add_tail(&c->infos_list, &ubifs_infos);
+				list_move_tail(&c->infos_list, &ubifs_infos);
 				spin_unlock(&ubifs_infos_lock);
 
 				ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1182b66a5491..faa44f90608a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",no_chk_data_crc");
 
 	if (c->mount_opts.override_compr) {
-		seq_printf(s, ",compr=");
-		seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
+		seq_printf(s, ",compr=%s",
+			   ubifs_compr_name(c->mount_opts.compr_type));
 	}
 
 	return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
 	if (err)
 		return err;
 
+	/* Initialize effective LEB size used in budgeting calculations */
+	c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
 	return 0;
 }
 
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
 	long long tmp64;
 
 	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->report_rp_size = ubifs_reported_space(c, c->rp_size);
 
 	/*
 	 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_cbuf;
 
 		/* Create background thread */
-		c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+		c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
 		if (IS_ERR(c->bgt)) {
 			err = PTR_ERR(c->bgt);
 			c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
 		else {
 			c->need_recovery = 0;
 			ubifs_msg("recovery completed");
-			/* GC LEB has to be empty and taken at this point */
-			ubifs_assert(c->lst.taken_empty_lebs == 1);
+			/*
+			 * GC LEB has to be empty and taken at this point. But
+			 * the journal head LEBs may also be accounted as
+			 * "empty taken" if they are empty.
+			 */
+			ubifs_assert(c->lst.taken_empty_lebs > 0);
 		}
 	} else
-		ubifs_assert(c->lst.taken_empty_lebs == 1);
+		ubifs_assert(c->lst.taken_empty_lebs > 0);
 
 	err = dbg_check_filesystem(c);
 	if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
 	x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
 	ubifs_msg("journal size:       %lld bytes (%lld KiB, %lld MiB, %d "
 		  "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
-	ubifs_msg("media format:       %d (latest is %d)",
-		  c->fmt_version, UBIFS_FORMAT_VERSION);
+	ubifs_msg("media format:       w%d/r%d (latest is w%d/r%d)",
+		  c->fmt_version, c->ro_compat_version,
+		  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
 	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
 	ubifs_msg("reserved for root:  %llu bytes (%llu KiB)",
 		c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 {
 	int err, lnum;
 
+	if (c->rw_incompat) {
+		ubifs_err("the file-system is not R/W-compatible");
+		ubifs_msg("on-flash format version is w%d/r%d, but software "
+			  "only supports up to version w%d/r%d", c->fmt_version,
+			  c->ro_compat_version, UBIFS_FORMAT_VERSION,
+			  UBIFS_RO_COMPAT_VERSION);
+		return -EROFS;
+	}
+
 	mutex_lock(&c->umount_mutex);
 	dbg_save_space_info(c);
 	c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	ubifs_create_buds_lists(c);
 
 	/* Create background thread */
-	c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+	c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
 	if (IS_ERR(c->bgt)) {
 		err = PTR_ERR(c->bgt);
 		c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		c->bu.buf = NULL;
 	}
 
-	ubifs_assert(c->lst.taken_empty_lebs == 1);
+	ubifs_assert(c->lst.taken_empty_lebs > 0);
 	return 0;
 }
 
@@ -2034,7 +2051,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	/* 'fill_super()' opens ubi again so we must close it here */
 	ubi_close_volume(ubi);
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 out_deact:
 	up_write(&sb->s_umount);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
 	 * splitting in the middle of the colliding sequence. Also, when
 	 * removing the leftmost key, we would have to correct the key of the
 	 * parent node, which would introduce additional complications. Namely,
-	 * if we changed the the leftmost key of the parent znode, the garbage
+	 * if we changed the leftmost key of the parent znode, the garbage
 	 * collector would be unable to find it (GC is doing this when GC'ing
 	 * indexing LEBs). Although we already have an additional RB-tree where
 	 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
 /* UBIFS node magic number (must not have the padding byte first or last) */
 #define UBIFS_NODE_MAGIC  0x06101831
 
-/* UBIFS on-flash format version */
+/*
+ * UBIFS on-flash format version. This version is increased when the on-flash
+ * format is changing. If this happens, UBIFS is will support older versions as
+ * well. But older UBIFS code will not support newer formats. Format changes
+ * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
+ * a new feature.
+ *
+ * UBIFS went into mainline kernel with format version 4. The older formats
+ * were development formats.
+ */
 #define UBIFS_FORMAT_VERSION 4
 
+/*
+ * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
+ * implementations will not be able to mount newer formats in read-write mode.
+ * However, depending on the change, it may be possible to mount newer formats
+ * in R/O mode. This is indicated by the R/O compatibility version which is
+ * stored in the super-block.
+ *
+ * This is needed to support boot-loaders which only need R/O mounting. With
+ * this flag it is possible to do UBIFS format changes without a need to update
+ * boot-loaders.
+ */
+#define UBIFS_RO_COMPAT_VERSION 0
+
 /* Minimum logical eraseblock size in bytes */
 #define UBIFS_MIN_LEB_SZ (15*1024)
 
@@ -53,7 +75,7 @@
 
 /*
  * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
- * shorter than uncompressed data length, UBIFS preferes to leave this data
+ * shorter than uncompressed data length, UBIFS prefers to leave this data
  * node uncompress, because it'll be read faster.
  */
 #define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
  * @padding2: reserved for future, zeroes
  * @time_gran: time granularity in nanoseconds
  * @uuid: UUID generated when the file system image was created
+ * @ro_compat_version: UBIFS R/O compatibility version
  */
 struct ubifs_sb_node {
 	struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
 	__le64 rp_size;
 	__le32 time_gran;
 	__u8 uuid[16];
-	__u8 padding2[3972];
+	__le32 ro_compat_version;
+	__u8 padding2[3968];
 } __attribute__ ((packed));
 
 /**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
  *          by @commit_sem
  * @cnt_lock: protects @highest_inum and @max_sqnum counters
  * @fmt_version: UBIFS on-flash format version
+ * @ro_compat_version: R/O compatibility version
  * @uuid: UUID from super block
  *
  * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
  *                   recovery)
  * @bulk_read: enable bulk-reads
  * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ * @rw_incompat: the media is not R/W compatible
  *
  * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
  *             @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
  * @min_io_shift: number of bits in @min_io_size minus one
  * @leb_size: logical eraseblock size in bytes
  * @half_leb_size: half LEB size
+ * @idx_leb_size: how many bytes of an LEB are effectively available when it is
+ *                used to store indexing nodes (@leb_size - @max_idx_node_sz)
  * @leb_cnt: count of logical eraseblocks
  * @max_leb_cnt: maximum count of logical eraseblocks
  * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
  *             previous commit start
  * @uncat_list: list of un-categorized LEBs
  * @empty_list: list of empty LEBs
- * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size)
- * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size)
+ * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
+ * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
  * @freeable_cnt: number of freeable LEBs in @freeable_list
  *
  * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
 	unsigned long long cmt_no;
 	spinlock_t cnt_lock;
 	int fmt_version;
+	int ro_compat_version;
 	unsigned char uuid[16];
 
 	int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
 	unsigned int default_compr:2;
+	unsigned int rw_incompat:1;
 
 	struct mutex tnc_mutex;
 	struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
 	int min_io_shift;
 	int leb_size;
 	int half_leb_size;
+	int idx_leb_size;
 	int leb_cnt;
 	int max_leb_cnt;
 	int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
 
 /* find.c */
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
 			  int squeeze);
 int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
 int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 1b809bd494bd..e48e9a3af763 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
 {
 	struct buffer_head *bh = NULL;
 	int retval = 0;
-	kernel_lb_addr loc;
+	struct kernel_lb_addr loc;
 
 	loc.logicalBlockNum = bitmap->s_extPosition;
 	loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
 
-	bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block));
+	bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
 	if (!bh)
 		retval = -EIO;
 
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
 	return slot;
 }
 
-static bool udf_add_free_space(struct udf_sb_info *sbi,
-				u16 partition, u32 cnt)
+static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
 {
+	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct logicalVolIntegrityDesc *lvid;
 
-	if (sbi->s_lvid_bh == NULL)
-		return false;
+	if (!sbi->s_lvid_bh)
+		return;
 
 	lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
 	le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
-	return true;
+	udf_updated_lvid(sb);
 }
 
 static void udf_bitmap_free_blocks(struct super_block *sb,
 				   struct inode *inode,
 				   struct udf_bitmap *bitmap,
-				   kernel_lb_addr bloc, uint32_t offset,
+				   struct kernel_lb_addr *bloc,
+				   uint32_t offset,
 				   uint32_t count)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct buffer_head *bh = NULL;
+	struct udf_part_map *partmap;
 	unsigned long block;
 	unsigned long block_group;
 	unsigned long bit;
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 	unsigned long overflow;
 
 	mutex_lock(&sbi->s_alloc_mutex);
-	if (bloc.logicalBlockNum < 0 ||
-	    (bloc.logicalBlockNum + count) >
-		sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+	if (bloc->logicalBlockNum < 0 ||
+	    (bloc->logicalBlockNum + count) >
+		partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
-			  bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
-			  sbi->s_partmaps[bloc.partitionReferenceNum].
-							s_partition_len);
+			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
+			  count, partmap->s_partition_len);
 		goto error_return;
 	}
 
-	block = bloc.logicalBlockNum + offset +
+	block = bloc->logicalBlockNum + offset +
 		(sizeof(struct spaceBitmapDesc) << 3);
 
 	do {
@@ -206,8 +208,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 					((char *)bh->b_data)[(bit + i) >> 3]);
 			} else {
 				if (inode)
-					DQUOT_FREE_BLOCK(inode, 1);
-				udf_add_free_space(sbi, sbi->s_partition, 1);
+					vfs_dq_free_block(inode, 1);
+				udf_add_free_space(sb, sbi->s_partition, 1);
 			}
 		}
 		mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 	} while (overflow);
 
 error_return:
-	sb->s_dirt = 1;
-	if (sbi->s_lvid_bh)
-		mark_buffer_dirty(sbi->s_lvid_bh);
 	mutex_unlock(&sbi->s_alloc_mutex);
 }
 
@@ -261,11 +260,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 		while (bit < (sb->s_blocksize << 3) && block_count > 0) {
 			if (!udf_test_bit(bit, bh->b_data))
 				goto out;
-			else if (DQUOT_PREALLOC_BLOCK(inode, 1))
+			else if (vfs_dq_prealloc_block(inode, 1))
 				goto out;
 			else if (!udf_clear_bit(bit, bh->b_data)) {
 				udf_debug("bit already cleared for block %d\n", bit);
-				DQUOT_FREE_BLOCK(inode, 1);
+				vfs_dq_free_block(inode, 1);
 				goto out;
 			}
 			block_count--;
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 	} while (block_count > 0);
 
 out:
-	if (udf_add_free_space(sbi, partition, -alloc_count))
-		mark_buffer_dirty(sbi->s_lvid_bh);
-	sb->s_dirt = 1;
+	udf_add_free_space(sb, partition, -alloc_count);
 	mutex_unlock(&sbi->s_alloc_mutex);
 	return alloc_count;
 }
@@ -393,7 +390,7 @@ got_block:
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (inode && DQUOT_ALLOC_BLOCK(inode, 1)) {
+	if (inode && vfs_dq_alloc_block(inode, 1)) {
 		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
 		return 0;
@@ -409,9 +406,7 @@ got_block:
 
 	mark_buffer_dirty(bh);
 
-	if (udf_add_free_space(sbi, partition, -1))
-		mark_buffer_dirty(sbi->s_lvid_bh);
-	sb->s_dirt = 1;
+	udf_add_free_space(sb, partition, -1);
 	mutex_unlock(&sbi->s_alloc_mutex);
 	*err = 0;
 	return newblock;
@@ -425,26 +420,28 @@ error_return:
 static void udf_table_free_blocks(struct super_block *sb,
 				  struct inode *inode,
 				  struct inode *table,
-				  kernel_lb_addr bloc, uint32_t offset,
+				  struct kernel_lb_addr *bloc,
+				  uint32_t offset,
 				  uint32_t count)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *partmap;
 	uint32_t start, end;
 	uint32_t elen;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	struct extent_position oepos, epos;
 	int8_t etype;
 	int i;
 	struct udf_inode_info *iinfo;
 
 	mutex_lock(&sbi->s_alloc_mutex);
-	if (bloc.logicalBlockNum < 0 ||
-	    (bloc.logicalBlockNum + count) >
-		sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+	if (bloc->logicalBlockNum < 0 ||
+	    (bloc->logicalBlockNum + count) >
+		partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
-			  sbi->s_partmaps[bloc.partitionReferenceNum].
-							s_partition_len);
+			  partmap->s_partition_len);
 		goto error_return;
 	}
 
@@ -452,12 +449,11 @@ static void udf_table_free_blocks(struct super_block *sb,
 	/* We do this up front - There are some error conditions that
 	   could occure, but.. oh well */
 	if (inode)
-		DQUOT_FREE_BLOCK(inode, count);
-	if (udf_add_free_space(sbi, sbi->s_partition, count))
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		vfs_dq_free_block(inode, count);
+	udf_add_free_space(sb, sbi->s_partition, count);
 
-	start = bloc.logicalBlockNum + offset;
-	end = bloc.logicalBlockNum + offset + count - 1;
+	start = bloc->logicalBlockNum + offset;
+	end = bloc->logicalBlockNum + offset + count - 1;
 
 	epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
 	elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 				start += count;
 				count = 0;
 			}
-			udf_write_aext(table, &oepos, eloc, elen, 1);
+			udf_write_aext(table, &oepos, &eloc, elen, 1);
 		} else if (eloc.logicalBlockNum == (end + 1)) {
 			if ((0x3FFFFFFF - elen) <
 					(count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 				end -= count;
 				count = 0;
 			}
-			udf_write_aext(table, &oepos, eloc, elen, 1);
+			udf_write_aext(table, &oepos, &eloc, elen, 1);
 		}
 
 		if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
 		 */
 
 		int adsize;
-		short_ad *sad = NULL;
-		long_ad *lad = NULL;
+		struct short_ad *sad = NULL;
+		struct long_ad *lad = NULL;
 		struct allocExtDesc *aed;
 
 		eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
 			(count << sb->s_blocksize_bits);
 
 		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-			adsize = sizeof(short_ad);
+			adsize = sizeof(struct short_ad);
 		else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-			adsize = sizeof(long_ad);
+			adsize = sizeof(struct long_ad);
 		else {
 			brelse(oepos.bh);
 			brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 			elen -= sb->s_blocksize;
 
 			epos.bh = udf_tread(sb,
-					udf_get_lb_pblock(sb, epos.block, 0));
+					udf_get_lb_pblock(sb, &epos.block, 0));
 			if (!epos.bh) {
 				brelse(oepos.bh);
 				goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
 			if (sbi->s_udfrev >= 0x0200)
 				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
 					    3, 1, epos.block.logicalBlockNum,
-					    sizeof(tag));
+					    sizeof(struct tag));
 			else
 				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
 					    2, 1, epos.block.logicalBlockNum,
-					    sizeof(tag));
+					    sizeof(struct tag));
 
 			switch (iinfo->i_alloc_type) {
 			case ICBTAG_FLAG_AD_SHORT:
-				sad = (short_ad *)sptr;
+				sad = (struct short_ad *)sptr;
 				sad->extLength = cpu_to_le32(
 					EXT_NEXT_EXTENT_ALLOCDECS |
 					sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 					cpu_to_le32(epos.block.logicalBlockNum);
 				break;
 			case ICBTAG_FLAG_AD_LONG:
-				lad = (long_ad *)sptr;
+				lad = (struct long_ad *)sptr;
 				lad->extLength = cpu_to_le32(
 					EXT_NEXT_EXTENT_ALLOCDECS |
 					sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 
 		/* It's possible that stealing the block emptied the extent */
 		if (elen) {
-			udf_write_aext(table, &epos, eloc, elen, 1);
+			udf_write_aext(table, &epos, &eloc, elen, 1);
 
 			if (!epos.bh) {
 				iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
 	brelse(oepos.bh);
 
 error_return:
-	sb->s_dirt = 1;
 	mutex_unlock(&sbi->s_alloc_mutex);
 	return;
 }
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	int alloc_count = 0;
 	uint32_t elen, adsize;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	struct extent_position epos;
 	int8_t etype = -1;
 	struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 
 	iinfo = UDF_I(table);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return 0;
 
@@ -700,14 +695,14 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 		epos.offset -= adsize;
 
 		alloc_count = (elen >> sb->s_blocksize_bits);
-		if (inode && DQUOT_PREALLOC_BLOCK(inode,
+		if (inode && vfs_dq_prealloc_block(inode,
 			alloc_count > block_count ? block_count : alloc_count))
 			alloc_count = 0;
 		else if (alloc_count > block_count) {
 			alloc_count = block_count;
 			eloc.logicalBlockNum += alloc_count;
 			elen -= (alloc_count << sb->s_blocksize_bits);
-			udf_write_aext(table, &epos, eloc,
+			udf_write_aext(table, &epos, &eloc,
 					(etype << 30) | elen, 1);
 		} else
 			udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 
 	brelse(epos.bh);
 
-	if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) {
-		mark_buffer_dirty(sbi->s_lvid_bh);
-		sb->s_dirt = 1;
-	}
+	if (alloc_count)
+		udf_add_free_space(sb, partition, -alloc_count);
 	mutex_unlock(&sbi->s_alloc_mutex);
 	return alloc_count;
 }
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
 	uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
 	uint32_t newblock = 0, adsize;
 	uint32_t elen, goal_elen = 0;
-	kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+	struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
 	struct extent_position epos, goal_epos;
 	int8_t etype;
 	struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
 	*err = -ENOSPC;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return newblock;
 
@@ -806,7 +799,7 @@ static int udf_table_new_block(struct super_block *sb,
 	goal_eloc.logicalBlockNum++;
 	goal_elen -= sb->s_blocksize;
 
-	if (inode && DQUOT_ALLOC_BLOCK(inode, 1)) {
+	if (inode && vfs_dq_alloc_block(inode, 1)) {
 		brelse(goal_epos.bh);
 		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
 	}
 
 	if (goal_elen)
-		udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1);
+		udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
 	else
 		udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
 	brelse(goal_epos.bh);
 
-	if (udf_add_free_space(sbi, partition, -1))
-		mark_buffer_dirty(sbi->s_lvid_bh);
+	udf_add_free_space(sb, partition, -1);
 
-	sb->s_dirt = 1;
 	mutex_unlock(&sbi->s_alloc_mutex);
 	*err = 0;
 	return newblock;
 }
 
-inline void udf_free_blocks(struct super_block *sb,
-			    struct inode *inode,
-			    kernel_lb_addr bloc, uint32_t offset,
-			    uint32_t count)
+void udf_free_blocks(struct super_block *sb, struct inode *inode,
+		     struct kernel_lb_addr *bloc, uint32_t offset,
+		     uint32_t count)
 {
-	uint16_t partition = bloc.partitionReferenceNum;
+	uint16_t partition = bloc->partitionReferenceNum;
 	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
 
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
-		return udf_bitmap_free_blocks(sb, inode,
-					      map->s_uspace.s_bitmap,
-					      bloc, offset, count);
+		udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
+				       bloc, offset, count);
 	} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
-		return udf_table_free_blocks(sb, inode,
-					     map->s_uspace.s_table,
-					     bloc, offset, count);
+		udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
+				      bloc, offset, count);
 	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
-		return udf_bitmap_free_blocks(sb, inode,
-					      map->s_fspace.s_bitmap,
-					      bloc, offset, count);
+		udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
+				       bloc, offset, count);
 	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
-		return udf_table_free_blocks(sb, inode,
-					     map->s_fspace.s_table,
-					     bloc, offset, count);
-	} else {
-		return;
+		udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
+				      bloc, offset, count);
 	}
 }
 
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d1..2efd4d5291b6 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	uint8_t lfi;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	struct buffer_head *tmp, *bha[16];
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 			ret = -ENOENT;
 			goto out;
 		}
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (iinfo->i_alloc_type ==
 					ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else {
 			offset = 0;
 		}
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 			if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
 				i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
 			for (num = 0; i > 0; i--) {
-				block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i);
+				block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
 				tmp = udf_tgetblk(dir->i_sb, block);
 				if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
 					bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 			memcpy(fname, "..", flen);
 			dt_type = DT_DIR;
 		} else {
-			kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
+			struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
 
-			iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0);
+			iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
 			flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
 			dt_type = DT_UNKNOWN;
 		}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4cc..1d2c570704c8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
 
 #if 0
 static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
-				uint8_t ad_size, kernel_lb_addr fe_loc,
+				uint8_t ad_size, struct kernel_lb_addr fe_loc,
 				int *pos, int *offset, struct buffer_head **bh,
 				int *error)
 {
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 					 struct udf_fileident_bh *fibh,
 					 struct fileIdentDesc *cfi,
 					 struct extent_position *epos,
-					 kernel_lb_addr *eloc, uint32_t *elen,
+					 struct kernel_lb_addr *eloc, uint32_t *elen,
 					 sector_t *offset)
 {
 	struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 		    (EXT_RECORDED_ALLOCATED >> 30))
 			return NULL;
 
-		block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+		block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
 
 		(*offset)++;
 
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 			if (i + *offset > (*elen >> blocksize_bits))
 				i = (*elen >> blocksize_bits)-*offset;
 			for (num = 0; i > 0; i--) {
-				block = udf_get_lb_pblock(dir->i_sb, *eloc,
+				block = udf_get_lb_pblock(dir->i_sb, eloc,
 							  *offset + i);
 				tmp = udf_tgetblk(dir->i_sb, block);
 				if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 		    (EXT_RECORDED_ALLOCATED >> 30))
 			return NULL;
 
-		block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+		block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
 
 		(*offset)++;
 
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
 }
 
 #if 0
-static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
+static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
 {
-	extent_ad *ext;
+	struct extent_ad *ext;
 	struct fileEntry *fe;
 	uint8_t *ptr;
 
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
 	if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
 		ptr += *offset;
 
-	ext = (extent_ad *)ptr;
+	ext = (struct extent_ad *)ptr;
 
-	*offset = *offset + sizeof(extent_ad);
+	*offset = *offset + sizeof(struct extent_ad);
 	return ext;
 }
 #endif
 
-short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
+struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
 			      int inc)
 {
-	short_ad *sa;
+	struct short_ad *sa;
 
 	if ((!ptr) || (!offset)) {
 		printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
 		return NULL;
 	}
 
-	if ((*offset + sizeof(short_ad)) > maxoffset)
+	if ((*offset + sizeof(struct short_ad)) > maxoffset)
 		return NULL;
 	else {
-		sa = (short_ad *)ptr;
+		sa = (struct short_ad *)ptr;
 		if (sa->extLength == 0)
 			return NULL;
 	}
 
 	if (inc)
-		*offset += sizeof(short_ad);
+		*offset += sizeof(struct short_ad);
 	return sa;
 }
 
-long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
+struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
 {
-	long_ad *la;
+	struct long_ad *la;
 
 	if ((!ptr) || (!offset)) {
 		printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
 		return NULL;
 	}
 
-	if ((*offset + sizeof(long_ad)) > maxoffset)
+	if ((*offset + sizeof(struct long_ad)) > maxoffset)
 		return NULL;
 	else {
-		la = (long_ad *)ptr;
+		la = (struct long_ad *)ptr;
 		if (la->extLength == 0)
 			return NULL;
 	}
 
 	if (inc)
-		*offset += sizeof(long_ad);
+		*offset += sizeof(struct long_ad);
 	return la;
 }
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b31..4792b771aa80 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
 #define _ECMA_167_H 1
 
 /* Character set specification (ECMA 167r3 1/7.2.1) */
-typedef struct {
+struct charspec {
 	uint8_t		charSetType;
 	uint8_t		charSetInfo[63];
-} __attribute__ ((packed)) charspec;
+} __attribute__ ((packed));
 
 /* Character Set Type (ECMA 167r3 1/7.2.1.1) */
 #define CHARSPEC_TYPE_CS0		0x00	/* (1/7.2.2) */
@@ -57,7 +57,7 @@ typedef struct {
 typedef uint8_t		dstring;
 
 /* Timestamp (ECMA 167r3 1/7.3) */
-typedef struct {
+struct timestamp {
 	__le16		typeAndTimezone;
 	__le16		year;
 	uint8_t		month;
@@ -68,7 +68,7 @@ typedef struct {
 	uint8_t		centiseconds;
 	uint8_t		hundredsOfMicroseconds;
 	uint8_t		microseconds;
-} __attribute__ ((packed)) timestamp;
+} __attribute__ ((packed));
 
 /* Type and Time Zone (ECMA 167r3 1/7.3.1) */
 #define TIMESTAMP_TYPE_MASK		0xF000
@@ -78,11 +78,11 @@ typedef struct {
 #define TIMESTAMP_TIMEZONE_MASK		0x0FFF
 
 /* Entity identifier (ECMA 167r3 1/7.4) */
-typedef struct {
+struct regid {
 	uint8_t		flags;
 	uint8_t		ident[23];
 	uint8_t		identSuffix[8];
-} __attribute__ ((packed)) regid;
+} __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 1/7.4.1) */
 #define ENTITYID_FLAGS_DIRTY		0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
 
 /* Boot Descriptor (ECMA 167r3 2/9.4) */
 struct bootDesc {
-	uint8_t		structType;
-	uint8_t		stdIdent[VSD_STD_ID_LEN];
-	uint8_t		structVersion;
-	uint8_t		reserved1;
-	regid		archType;
-	regid		bootIdent;
-	__le32		bootExtLocation;
-	__le32		bootExtLength;
-	__le64		loadAddress;
-	__le64		startAddress;
-	timestamp	descCreationDateAndTime;
-	__le16		flags;
-	uint8_t		reserved2[32];
-	uint8_t		bootUse[1906];
+	uint8_t			structType;
+	uint8_t			stdIdent[VSD_STD_ID_LEN];
+	uint8_t			structVersion;
+	uint8_t			reserved1;
+	struct regid		archType;
+	struct regid		bootIdent;
+	__le32			bootExtLocation;
+	__le32			bootExtLength;
+	__le64			loadAddress;
+	__le64			startAddress;
+	struct timestamp	descCreationDateAndTime;
+	__le16			flags;
+	uint8_t			reserved2[32];
+	uint8_t			bootUse[1906];
 } __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 2/9.4.12) */
 #define BOOT_FLAGS_ERASE		0x01
 
 /* Extent Descriptor (ECMA 167r3 3/7.1) */
-typedef struct {
+struct extent_ad {
 	__le32		extLength;
 	__le32		extLocation;
-} __attribute__ ((packed)) extent_ad;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_extent_ad {
 	uint32_t	extLength;
 	uint32_t	extLocation;
-} kernel_extent_ad;
+};
 
 /* Descriptor Tag (ECMA 167r3 3/7.2) */
-typedef struct {
+struct tag {
 	__le16		tagIdent;
 	__le16		descVersion;
 	uint8_t		tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
 	__le16		descCRC;
 	__le16		descCRCLength;
 	__le32		tagLocation;
-} __attribute__ ((packed)) tag;
+} __attribute__ ((packed));
 
 /* Tag Identifier (ECMA 167r3 3/7.2.1) */
 #define TAG_IDENT_PVD			0x0001
@@ -190,28 +190,28 @@ struct NSRDesc {
 
 /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
 struct primaryVolDesc {
-	tag		descTag;
-	__le32		volDescSeqNum;
-	__le32		primaryVolDescNum;
-	dstring		volIdent[32];
-	__le16		volSeqNum;
-	__le16		maxVolSeqNum;
-	__le16		interchangeLvl;
-	__le16		maxInterchangeLvl;
-	__le32		charSetList;
-	__le32		maxCharSetList;
-	dstring		volSetIdent[128];
-	charspec	descCharSet;
-	charspec	explanatoryCharSet;
-	extent_ad	volAbstract;
-	extent_ad	volCopyright;
-	regid		appIdent;
-	timestamp	recordingDateAndTime;
-	regid		impIdent;
-	uint8_t		impUse[64];
-	__le32		predecessorVolDescSeqLocation;
-	__le16		flags;
-	uint8_t		reserved[22];
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	__le32			primaryVolDescNum;
+	dstring			volIdent[32];
+	__le16			volSeqNum;
+	__le16			maxVolSeqNum;
+	__le16			interchangeLvl;
+	__le16			maxInterchangeLvl;
+	__le32			charSetList;
+	__le32			maxCharSetList;
+	dstring			volSetIdent[128];
+	struct charspec		descCharSet;
+	struct charspec		explanatoryCharSet;
+	struct extent_ad	volAbstract;
+	struct extent_ad	volCopyright;
+	struct regid		appIdent;
+	struct timestamp	recordingDateAndTime;
+	struct regid		impIdent;
+	uint8_t			impUse[64];
+	__le32			predecessorVolDescSeqLocation;
+	__le16			flags;
+	uint8_t			reserved[22];
 } __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
 
 /* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
 struct anchorVolDescPtr {
-	tag		descTag;
-	extent_ad	mainVolDescSeqExt;
-	extent_ad	reserveVolDescSeqExt;
-	uint8_t	 	reserved[480];
+	struct tag		descTag;
+	struct extent_ad	mainVolDescSeqExt;
+	struct extent_ad	reserveVolDescSeqExt;
+	uint8_t	 		reserved[480];
 } __attribute__ ((packed));
 
 /* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
 struct volDescPtr {
-	tag		descTag;
-	__le32		volDescSeqNum;
-	extent_ad	nextVolDescSeqExt;
-	uint8_t		reserved[484];
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	struct extent_ad	nextVolDescSeqExt;
+	uint8_t			reserved[484];
 } __attribute__ ((packed));
 
 /* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
 struct impUseVolDesc {
-	tag		descTag;
+	struct tag	descTag;
 	__le32		volDescSeqNum;
-	regid		impIdent;
+	struct regid	impIdent;
 	uint8_t		impUse[460];
 } __attribute__ ((packed));
 
 /* Partition Descriptor (ECMA 167r3 3/10.5) */
 struct partitionDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
 	__le16 partitionFlags;
 	__le16 partitionNumber;
-	regid partitionContents;
+	struct regid partitionContents;
 	uint8_t partitionContentsUse[128];
 	__le32 accessType;
 	__le32 partitionStartingLocation;
 	__le32 partitionLength;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[128];
 	uint8_t reserved[156];
 } __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
 
 /* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
 struct logicalVolDesc {
-	tag		descTag;
-	__le32		volDescSeqNum;
-	charspec	descCharSet;
-	dstring		logicalVolIdent[128];
-	__le32		logicalBlockSize;
-	regid		domainIdent;
-	uint8_t		logicalVolContentsUse[16];
-	__le32		mapTableLength;
-	__le32		numPartitionMaps;
-	regid		impIdent;
-	uint8_t		impUse[128];
-	extent_ad	integritySeqExt;
-	uint8_t		partitionMaps[0];
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	struct charspec		descCharSet;
+	dstring			logicalVolIdent[128];
+	__le32			logicalBlockSize;
+	struct regid		domainIdent;
+	uint8_t			logicalVolContentsUse[16];
+	__le32			mapTableLength;
+	__le32			numPartitionMaps;
+	struct regid		impIdent;
+	uint8_t			impUse[128];
+	struct extent_ad	integritySeqExt;
+	uint8_t			partitionMaps[0];
 } __attribute__ ((packed));
 
 /* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
 
 /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
 struct unallocSpaceDesc {
-	tag		descTag;
-	__le32		volDescSeqNum;
-	__le32		numAllocDescs;
-	extent_ad	allocDescs[0];
+	struct tag		descTag;
+	__le32			volDescSeqNum;
+	__le32			numAllocDescs;
+	struct extent_ad	allocDescs[0];
 } __attribute__ ((packed));
 
 /* Terminating Descriptor (ECMA 167r3 3/10.9) */
 struct terminatingDesc {
-	tag		descTag;
+	struct tag	descTag;
 	uint8_t		reserved[496];
 } __attribute__ ((packed));
 
 /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
 struct logicalVolIntegrityDesc {
-	tag		descTag;
-	timestamp	recordingDateAndTime;
-	__le32		integrityType;
-	extent_ad	nextIntegrityExt;
-	uint8_t		logicalVolContentsUse[32];
-	__le32		numOfPartitions;
-	__le32		lengthOfImpUse;
-	__le32		freeSpaceTable[0];
-	__le32		sizeTable[0];
-	uint8_t		impUse[0];
+	struct tag		descTag;
+	struct timestamp	recordingDateAndTime;
+	__le32			integrityType;
+	struct extent_ad	nextIntegrityExt;
+	uint8_t			logicalVolContentsUse[32];
+	__le32			numOfPartitions;
+	__le32			lengthOfImpUse;
+	__le32			freeSpaceTable[0];
+	__le32			sizeTable[0];
+	uint8_t			impUse[0];
 } __attribute__ ((packed));
 
 /* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
 #define LVID_INTEGRITY_TYPE_CLOSE	0x00000001
 
 /* Recorded Address (ECMA 167r3 4/7.1) */
-typedef struct {
+struct lb_addr {
 	__le32		logicalBlockNum;
 	__le16	 	partitionReferenceNum;
-} __attribute__ ((packed)) lb_addr;
+} __attribute__ ((packed));
 
 /* ... and its in-core analog */
-typedef struct {
+struct kernel_lb_addr {
 	uint32_t		logicalBlockNum;
 	uint16_t	 	partitionReferenceNum;
-} kernel_lb_addr;
+};
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
-typedef struct {
+struct short_ad {
         __le32		extLength;
         __le32		extPosition;
-} __attribute__ ((packed)) short_ad;
+} __attribute__ ((packed));
 
 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
-typedef struct {
+struct long_ad {
 	__le32		extLength;
-	lb_addr		extLocation;
+	struct lb_addr	extLocation;
 	uint8_t		impUse[6];
-} __attribute__ ((packed)) long_ad;
+} __attribute__ ((packed));
 
-typedef struct {
-	uint32_t	extLength;
-	kernel_lb_addr	extLocation;
-	uint8_t		impUse[6];
-} kernel_long_ad;
+struct kernel_long_ad {
+	uint32_t		extLength;
+	struct kernel_lb_addr	extLocation;
+	uint8_t			impUse[6];
+};
 
 /* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
-typedef struct {
+struct ext_ad {
 	__le32		extLength;
 	__le32		recordedLength;
 	__le32		informationLength;
-	lb_addr		extLocation;
-} __attribute__ ((packed)) ext_ad;
+	struct lb_addr	extLocation;
+} __attribute__ ((packed));
 
-typedef struct {
-	uint32_t	extLength;
-	uint32_t	recordedLength;
-	uint32_t	informationLength;
-	kernel_lb_addr	extLocation;
-} kernel_ext_ad;
+struct kernel_ext_ad {
+	uint32_t		extLength;
+	uint32_t		recordedLength;
+	uint32_t		informationLength;
+	struct kernel_lb_addr	extLocation;
+};
 
 /* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
 
@@ -415,44 +415,44 @@ typedef struct {
 
 /* File Set Descriptor (ECMA 167r3 4/14.1) */
 struct fileSetDesc {
-	tag		descTag;
-	timestamp	recordingDateAndTime;
-	__le16		interchangeLvl;
-	__le16		maxInterchangeLvl;
-	__le32		charSetList;
-	__le32		maxCharSetList;
-	__le32		fileSetNum;
-	__le32		fileSetDescNum;
-	charspec	logicalVolIdentCharSet;
-	dstring		logicalVolIdent[128];
-	charspec	fileSetCharSet;
-	dstring		fileSetIdent[32];
-	dstring		copyrightFileIdent[32];
-	dstring		abstractFileIdent[32];
-	long_ad		rootDirectoryICB;
-	regid		domainIdent;
-	long_ad		nextExt;
-	long_ad		streamDirectoryICB;
-	uint8_t		reserved[32];
+	struct tag		descTag;
+	struct timestamp	recordingDateAndTime;
+	__le16			interchangeLvl;
+	__le16			maxInterchangeLvl;
+	__le32			charSetList;
+	__le32			maxCharSetList;
+	__le32			fileSetNum;
+	__le32			fileSetDescNum;
+	struct charspec		logicalVolIdentCharSet;
+	dstring			logicalVolIdent[128];
+	struct charspec		fileSetCharSet;
+	dstring			fileSetIdent[32];
+	dstring			copyrightFileIdent[32];
+	dstring			abstractFileIdent[32];
+	struct long_ad		rootDirectoryICB;
+	struct regid		domainIdent;
+	struct long_ad		nextExt;
+	struct long_ad		streamDirectoryICB;
+	uint8_t			reserved[32];
 } __attribute__ ((packed));
 
 /* Partition Header Descriptor (ECMA 167r3 4/14.3) */
 struct partitionHeaderDesc {
-	short_ad	unallocSpaceTable;
-	short_ad	unallocSpaceBitmap;
-	short_ad	partitionIntegrityTable;
-	short_ad	freedSpaceTable;
-	short_ad	freedSpaceBitmap;
+	struct short_ad	unallocSpaceTable;
+	struct short_ad	unallocSpaceBitmap;
+	struct short_ad	partitionIntegrityTable;
+	struct short_ad	freedSpaceTable;
+	struct short_ad	freedSpaceBitmap;
 	uint8_t		reserved[88];
 } __attribute__ ((packed));
 
 /* File Identifier Descriptor (ECMA 167r3 4/14.4) */
 struct fileIdentDesc {
-	tag		descTag;
+	struct tag	descTag;
 	__le16		fileVersionNum;
 	uint8_t		fileCharacteristics;
 	uint8_t		lengthFileIdent;
-	long_ad		icb;
+	struct long_ad	icb;
 	__le16		lengthOfImpUse;
 	uint8_t		impUse[0];
 	uint8_t		fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
 
 /* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
 struct allocExtDesc {
-	tag		descTag;
+	struct tag	descTag;
 	__le32		previousAllocExtLocation;
 	__le32		lengthAllocDescs;
 } __attribute__ ((packed));
 
 /* ICB Tag (ECMA 167r3 4/14.6) */
-typedef struct {
+struct icbtag {
 	__le32		priorRecordedNumDirectEntries;
 	__le16		strategyType;
 	__le16		strategyParameter;
 	__le16		numEntries;
 	uint8_t		reserved;
 	uint8_t		fileType;
-	lb_addr		parentICBLocation;
+	struct lb_addr	parentICBLocation;
 	__le16		flags;
-} __attribute__ ((packed)) icbtag;
+} __attribute__ ((packed));
 
 /* Strategy Type (ECMA 167r3 4/14.6.2) */
 #define ICBTAG_STRATEGY_TYPE_UNDEF	0x0000
@@ -528,41 +528,41 @@ typedef struct {
 
 /* Indirect Entry (ECMA 167r3 4/14.7) */
 struct indirectEntry {
-	tag		descTag;
-	icbtag		icbTag;
-	long_ad		indirectICB;
+	struct tag	descTag;
+	struct icbtag	icbTag;
+	struct long_ad	indirectICB;
 } __attribute__ ((packed));
 
 /* Terminal Entry (ECMA 167r3 4/14.8) */
 struct terminalEntry {
-	tag		descTag;
-	icbtag		icbTag;
+	struct tag	descTag;
+	struct icbtag	icbTag;
 } __attribute__ ((packed));
 
 /* File Entry (ECMA 167r3 4/14.9) */
 struct fileEntry {
-	tag		descTag;
-	icbtag		icbTag;
-	__le32		uid;
-	__le32		gid;
-	__le32		permissions;
-	__le16		fileLinkCount;
-	uint8_t		recordFormat;
-	uint8_t		recordDisplayAttr;
-	__le32		recordLength;
-	__le64		informationLength;
-	__le64		logicalBlocksRecorded;
-	timestamp	accessTime;
-	timestamp	modificationTime;
-	timestamp	attrTime;
-	__le32		checkpoint;
-	long_ad		extendedAttrICB;
-	regid		impIdent;
-	__le64		uniqueID;
-	__le32		lengthExtendedAttr;
-	__le32		lengthAllocDescs;
-	uint8_t		extendedAttr[0];
-	uint8_t		allocDescs[0];
+	struct tag		descTag;
+	struct icbtag		icbTag;
+	__le32			uid;
+	__le32			gid;
+	__le32			permissions;
+	__le16			fileLinkCount;
+	uint8_t			recordFormat;
+	uint8_t			recordDisplayAttr;
+	__le32			recordLength;
+	__le64			informationLength;
+	__le64			logicalBlocksRecorded;
+	struct timestamp	accessTime;
+	struct timestamp	modificationTime;
+	struct timestamp	attrTime;
+	__le32			checkpoint;
+	struct long_ad		extendedAttrICB;
+	struct regid		impIdent;
+	__le64			uniqueID;
+	__le32			lengthExtendedAttr;
+	__le32			lengthAllocDescs;
+	uint8_t			extendedAttr[0];
+	uint8_t			allocDescs[0];
 } __attribute__ ((packed));
 
 /* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
 
 /* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
 struct extendedAttrHeaderDesc {
-	tag		descTag;
+	struct tag	descTag;
 	__le32		impAttrLocation;
 	__le32		appAttrLocation;
 } __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
 	uint8_t		reserved[3];
 	__le32		attrLength;
 	__le32		impUseLength;
-	regid		impIdent;
+	struct regid	impIdent;
 	uint8_t		impUse[0];
 } __attribute__ ((packed));
 
@@ -698,7 +698,7 @@ struct appUseExtAttr {
 	uint8_t		reserved[3];
 	__le32		attrLength;
 	__le32		appUseLength;
-	regid		appIdent;
+	struct regid	appIdent;
 	uint8_t		appUse[0];
 } __attribute__ ((packed));
 
@@ -712,15 +712,15 @@ struct appUseExtAttr {
 
 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */
 struct unallocSpaceEntry {
-	tag		descTag;
-	icbtag		icbTag;
+	struct tag	descTag;
+	struct icbtag	icbTag;
 	__le32		lengthAllocDescs;
 	uint8_t		allocDescs[0];
 } __attribute__ ((packed));
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
-	tag		descTag;
+	struct tag	descTag;
 	__le32		numOfBits;
 	__le32		numOfBytes;
 	uint8_t		bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
-	tag		descTag;
-	icbtag		icbTag;
-	timestamp	recordingDateAndTime;
-	uint8_t		integrityType;
-	uint8_t		reserved[175];
-	regid		impIdent;
-	uint8_t		impUse[256];
+	struct tag		descTag;
+	struct icbtag		icbTag;
+	struct timestamp	recordingDateAndTime;
+	uint8_t			integrityType;
+	uint8_t			reserved[175];
+	struct regid		impIdent;
+	uint8_t			impUse[256];
 } __attribute__ ((packed));
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
-	tag		descTag;
-	icbtag		icbTag;
-	__le32		uid;
-	__le32		gid;
-	__le32		permissions;
-	__le16		fileLinkCount;
-	uint8_t		recordFormat;
-	uint8_t		recordDisplayAttr;
-	__le32		recordLength;
-	__le64		informationLength;
-	__le64		objectSize;
-	__le64		logicalBlocksRecorded;
-	timestamp	accessTime;
-	timestamp	modificationTime;
-	timestamp	createTime;
-	timestamp	attrTime;
-	__le32		checkpoint;
-	__le32		reserved;
-	long_ad		extendedAttrICB;
-	long_ad		streamDirectoryICB;
-	regid		impIdent;
-	__le64		uniqueID;
-	__le32		lengthExtendedAttr;
-	__le32		lengthAllocDescs;
-	uint8_t		extendedAttr[0];
-	uint8_t		allocDescs[0];
+	struct tag		descTag;
+	struct icbtag		icbTag;
+	__le32			uid;
+	__le32			gid;
+	__le32			permissions;
+	__le16			fileLinkCount;
+	uint8_t			recordFormat;
+	uint8_t			recordDisplayAttr;
+	__le32			recordLength;
+	__le64			informationLength;
+	__le64			objectSize;
+	__le64			logicalBlocksRecorded;
+	struct timestamp	accessTime;
+	struct timestamp	modificationTime;
+	struct timestamp	createTime;
+	struct timestamp	attrTime;
+	__le32			checkpoint;
+	__le32			reserved;
+	struct long_ad		extendedAttrICB;
+	struct long_ad		streamDirectoryICB;
+	struct regid		impIdent;
+	__le64			uniqueID;
+	__le32			lengthExtendedAttr;
+	__le32			lengthAllocDescs;
+	uint8_t			extendedAttr[0];
+	uint8_t			allocDescs[0];
 } __attribute__ ((packed));
 
 #endif /* _ECMA_167_H */
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 31fc84297ddb..c10fa39f97e2 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	clear_inode(inode);
 
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
-
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 
-	udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
 struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		if (!(++uniqueID & 0x00000000FFFFFFFFUL))
 			uniqueID += 16;
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
 				dinfo->i_location.partitionReferenceNum;
-	inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+	inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
 	inode->i_blocks = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
@@ -154,8 +153,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
+	if (vfs_dq_alloc_inode(inode)) {
+		vfs_dq_drop(inode);
 		inode->i_flags |= S_NOQUOTA;
 		inode->i_nlink = 0;
 		iput(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7f..e7533f785636 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
 					sector_t *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-				 kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_merge_extents(struct inode *,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
-			       kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 {
 	int newblock;
 	struct buffer_head *dbh = NULL;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint8_t alloctype;
 	struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	epos.bh = NULL;
 	epos.block = iinfo->i_location;
 	epos.offset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &epos, eloc, elen, 0);
+	udf_add_aext(inode, &epos, &eloc, elen, 0);
 	/* UniqueID stuff */
 
 	brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
 int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-		    kernel_long_ad *last_ext, sector_t blocks)
+		    struct kernel_long_ad *last_ext, sector_t blocks)
 {
 	sector_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
 	struct super_block *sb = inode->i_sb;
-	kernel_lb_addr prealloc_loc = {};
+	struct kernel_lb_addr prealloc_loc = {};
 	int prealloc_len = 0;
 	struct udf_inode_info *iinfo;
 
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	}
 
 	if (fake) {
-		udf_add_aext(inode, last_pos, last_ext->extLocation,
+		udf_add_aext(inode, last_pos, &last_ext->extLocation,
 			     last_ext->extLength, 1);
 		count++;
 	} else
-		udf_write_aext(inode, last_pos, last_ext->extLocation,
+		udf_write_aext(inode, last_pos, &last_ext->extLocation,
 				last_ext->extLength, 1);
 
 	/* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	/* Create enough extents to cover the whole hole */
 	while (blocks > add) {
 		blocks -= add;
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	if (blocks) {
 		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			(blocks << sb->s_blocksize_bits);
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 out:
 	/* Do we have some preallocated blocks saved? */
 	if (prealloc_len) {
-		if (udf_add_aext(inode, last_pos, prealloc_loc,
+		if (udf_add_aext(inode, last_pos, &prealloc_loc,
 				 prealloc_len, 1) == -1)
 			return -1;
 		last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
 
 	/* last_pos should point to the last written extent... */
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		last_pos->offset -= sizeof(short_ad);
+		last_pos->offset -= sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		last_pos->offset -= sizeof(long_ad);
+		last_pos->offset -= sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 {
 	static sector_t last_block;
 	struct buffer_head *result = NULL;
-	kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+	struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
 	struct extent_position prev_epos, cur_epos, next_epos;
 	int count = 0, startnum = 0, endnum = 0;
 	uint32_t elen = 0, tmpelen;
-	kernel_lb_addr eloc, tmpeloc;
+	struct kernel_lb_addr eloc, tmpeloc;
 	int c = 1;
 	loff_t lbcount = 0, b_off = 0;
 	uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 			elen = EXT_RECORDED_ALLOCATED |
 				((elen + inode->i_sb->s_blocksize - 1) &
 				 ~(inode->i_sb->s_blocksize - 1));
-			etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
+			etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
 		}
 		brelse(prev_epos.bh);
 		brelse(cur_epos.bh);
 		brelse(next_epos.bh);
-		newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 		*phys = newblock;
 		return NULL;
 	}
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 		} else {
 			/* Create a fake extent when there's not one */
 			memset(&laarr[0].extLocation, 0x00,
-				sizeof(kernel_lb_addr));
+				sizeof(struct kernel_lb_addr));
 			laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
 			/* Will udf_extend_file() create real extent from
 			   a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 			laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 				inode->i_sb->s_blocksize;
 			memset(&laarr[c].extLocation, 0x00,
-				sizeof(kernel_lb_addr));
+				sizeof(struct kernel_lb_addr));
 			count++;
 			endnum++;
 		}
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 
 static void udf_split_extents(struct inode *inode, int *c, int offset,
 			      int newblocknum,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 		if (offset) {
 			if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 				udf_free_blocks(inode->i_sb, inode,
-						laarr[curr].extLocation,
+						&laarr[curr].extLocation,
 						0, offset);
 				laarr[curr].extLength =
 					EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 }
 
 static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
-				 kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+				 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 				 int *endnum)
 {
 	int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 					 inode->i_sb->s_blocksize_bits);
 			else {
 				memmove(&laarr[c + 2], &laarr[c + 1],
-					sizeof(long_ad) * (*endnum - (c + 1)));
+					sizeof(struct long_ad) * (*endnum - (c + 1)));
 				(*endnum)++;
 				laarr[c + 1].extLocation.logicalBlockNum = next;
 				laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 					if (*endnum > (i + 1))
 						memmove(&laarr[i],
 							&laarr[i + 1],
-							sizeof(long_ad) *
+							sizeof(struct long_ad) *
 							(*endnum - (i + 1)));
 					i--;
 					(*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 }
 
 static void udf_merge_extents(struct inode *inode,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 
 	for (i = 0; i < (*endnum - 1); i++) {
-		kernel_long_ad *li /*l[i]*/ = &laarr[i];
-		kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+		struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+		struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
 
 		if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
 			(((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
 					 blocksize - 1) & ~(blocksize - 1));
 				if (*endnum > (i + 2))
 					memmove(&laarr[i + 1], &laarr[i + 2],
-						sizeof(long_ad) *
+						sizeof(struct long_ad) *
 						(*endnum - (i + 2)));
 				i--;
 				(*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
 				(EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
 			   ((lip1->extLength >> 30) ==
 				(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
-			udf_free_blocks(inode->i_sb, inode, li->extLocation, 0,
+			udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
 					((li->extLength &
 					  UDF_EXTENT_LENGTH_MASK) +
 					 blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
 					  blocksize - 1) & ~(blocksize - 1));
 				if (*endnum > (i + 2))
 					memmove(&laarr[i + 1], &laarr[i + 2],
-						sizeof(long_ad) *
+						sizeof(struct long_ad) *
 						(*endnum - (i + 2)));
 				i--;
 				(*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
 		} else if ((li->extLength >> 30) ==
 					(EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					li->extLocation, 0,
+					&li->extLocation, 0,
 					((li->extLength &
 						UDF_EXTENT_LENGTH_MASK) +
 					 blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
 }
 
 static void udf_update_extents(struct inode *inode,
-			       kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			       int startnum, int endnum,
 			       struct extent_position *epos)
 {
 	int start = 0, i;
-	kernel_lb_addr tmploc;
+	struct kernel_lb_addr tmploc;
 	uint32_t tmplen;
 
 	if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
 
 	for (i = start; i < endnum; i++) {
 		udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
-		udf_write_aext(inode, epos, laarr[i].extLocation,
+		udf_write_aext(inode, epos, &laarr[i].extLocation,
 			       laarr[i].extLength, 1);
 	}
 }
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
 	 *      i_nlink = 1
 	 *      i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
 		       inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
+		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
 					&ident);
 		if (ident == TAG_IDENT_IE && ibh) {
 			struct buffer_head *nbh = NULL;
-			kernel_lb_addr loc;
+			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
 			if (ie->indirectICB.extLength &&
-				(nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+				(nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
 							&ident))) {
 				if (ident == TAG_IDENT_FE ||
 					ident == TAG_IDENT_EFE) {
 					memcpy(&iinfo->i_location,
 						&loc,
-						sizeof(kernel_lb_addr));
+						sizeof(struct kernel_lb_addr));
 					brelse(bh);
 					brelse(ibh);
 					brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	inode->i_size = le64_to_cpu(fe->informationLength);
 	iinfo->i_lenExtents = inode->i_size;
 
-	inode->i_mode = udf_convert_permissions(fe);
-	inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask;
+	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+			sbi->s_fmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_fmode;
+	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+			sbi->s_dmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_dmode;
+	else
+		inode->i_mode = udf_convert_permissions(fe);
+	inode->i_mode &= ~sbi->s_umask;
 
 	if (iinfo->i_efe == 0) {
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
 	bh = udf_tread(inode->i_sb,
 			udf_get_lb_pblock(inode->i_sb,
-					  iinfo->i_location, 0));
+					  &iinfo->i_location, 0));
 	if (!bh) {
 		udf_debug("bread failure\n");
 		return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
 		crclen = sizeof(struct unallocSpaceEntry) +
-				iinfo->i_lenAlloc - sizeof(tag);
+				iinfo->i_lenAlloc - sizeof(struct tag);
 		use->descTag.tagLocation = cpu_to_le32(
 						iinfo->i_location.
 							logicalBlockNum);
 		use->descTag.descCRCLength = cpu_to_le16(crclen);
 		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
-							   sizeof(tag),
+							   sizeof(struct tag),
 							   crclen));
 		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->informationLength = cpu_to_le64(inode->i_size);
 
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		regid *eid;
+		struct regid *eid;
 		struct deviceSpec *dsea =
 			(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
 		if (!dsea) {
 			dsea = (struct deviceSpec *)
 				udf_add_extendedattr(inode,
 						     sizeof(struct deviceSpec) +
-						     sizeof(regid), 12, 0x3);
+						     sizeof(struct regid), 12, 0x3);
 			dsea->attrType = cpu_to_le32(12);
 			dsea->attrSubtype = 1;
 			dsea->attrLength = cpu_to_le32(
 						sizeof(struct deviceSpec) +
-						sizeof(regid));
-			dsea->impUseLength = cpu_to_le32(sizeof(regid));
+						sizeof(struct regid));
+			dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
 		}
-		eid = (regid *)dsea->impUse;
-		memset(eid, 0, sizeof(regid));
+		eid = (struct regid *)dsea->impUse;
+		memset(eid, 0, sizeof(struct regid));
 		strcpy(eid->ident, UDF_ID_DEVELOPER);
 		eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
 		eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
 		udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
 		udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
-		memset(&(fe->impIdent), 0, sizeof(regid));
+		memset(&(fe->impIdent), 0, sizeof(struct regid));
 		strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
 		fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 		fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
 		udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
 
-		memset(&(efe->impIdent), 0, sizeof(regid));
+		memset(&(efe->impIdent), 0, sizeof(struct regid));
 		strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
 		efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 		efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->descTag.tagLocation = cpu_to_le32(
 					iinfo->i_location.logicalBlockNum);
 	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
-								sizeof(tag);
+								sizeof(struct tag);
 	fe->descTag.descCRCLength = cpu_to_le16(crclen);
-	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
 						  crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
+struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 		return NULL;
 
 	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr));
+		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
 		__udf_read_inode(inode);
 		unlock_new_inode(inode);
 	}
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 	if (is_bad_inode(inode))
 		goto out_iput;
 
-	if (ino.logicalBlockNum >= UDF_SB(sb)->
-			s_partmaps[ino.partitionReferenceNum].s_partition_len) {
+	if (ino->logicalBlockNum >= UDF_SB(sb)->
+			s_partmaps[ino->partitionReferenceNum].s_partition_len) {
 		udf_debug("block=%d, partition=%d out of range\n",
-			  ino.logicalBlockNum, ino.partitionReferenceNum);
+			  ino->logicalBlockNum, ino->partitionReferenceNum);
 		make_bad_inode(inode);
 		goto out_iput;
 	}
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 }
 
 int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
-		    kernel_lb_addr eloc, uint32_t elen, int inc)
+		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
-	short_ad *sad = NULL;
-	long_ad *lad = NULL;
+	struct short_ad *sad = NULL;
+	struct long_ad *lad = NULL;
 	struct allocExtDesc *aed;
 	int8_t etype;
 	uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		ptr = epos->bh->b_data + epos->offset;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
-		kernel_lb_addr obloc = epos->block;
+		struct kernel_lb_addr obloc = epos->block;
 
 		epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
 						obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		if (!epos->block.logicalBlockNum)
 			return -1;
 		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-								 epos->block,
+								 &epos->block,
 								 0));
 		if (!nbh)
 			return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		}
 		if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-				    epos->block.logicalBlockNum, sizeof(tag));
+				    epos->block.logicalBlockNum, sizeof(struct tag));
 		else
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-				    epos->block.logicalBlockNum, sizeof(tag));
+				    epos->block.logicalBlockNum, sizeof(struct tag));
 		switch (iinfo->i_alloc_type) {
 		case ICBTAG_FLAG_AD_SHORT:
-			sad = (short_ad *)sptr;
+			sad = (struct short_ad *)sptr;
 			sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 						     inode->i_sb->s_blocksize);
 			sad->extPosition =
 				cpu_to_le32(epos->block.logicalBlockNum);
 			break;
 		case ICBTAG_FLAG_AD_LONG:
-			lad = (long_ad *)sptr;
+			lad = (struct long_ad *)sptr;
 			lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 						     inode->i_sb->s_blocksize);
 			lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
-		      kernel_lb_addr eloc, uint32_t elen, int inc)
+		      struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 
 	switch (iinfo->i_alloc_type) {
 	case ICBTAG_FLAG_AD_SHORT:
-		sad = (short_ad *)ptr;
+		sad = (struct short_ad *)ptr;
 		sad->extLength = cpu_to_le32(elen);
-		sad->extPosition = cpu_to_le32(eloc.logicalBlockNum);
-		adsize = sizeof(short_ad);
+		sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+		adsize = sizeof(struct short_ad);
 		break;
 	case ICBTAG_FLAG_AD_LONG:
-		lad = (long_ad *)ptr;
+		lad = (struct long_ad *)ptr;
 		lad->extLength = cpu_to_le32(elen);
-		lad->extLocation = cpu_to_lelb(eloc);
+		lad->extLocation = cpu_to_lelb(*eloc);
 		memset(lad->impUse, 0x00, sizeof(lad->impUse));
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 		break;
 	default:
 		return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
-		     kernel_lb_addr *eloc, uint32_t *elen, int inc)
+		     struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int8_t etype;
 
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 		epos->block = *eloc;
 		epos->offset = sizeof(struct allocExtDesc);
 		brelse(epos->bh);
-		block = udf_get_lb_pblock(inode->i_sb, epos->block, 0);
+		block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
 		epos->bh = udf_tread(inode->i_sb, block);
 		if (!epos->bh) {
 			udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
-			kernel_lb_addr *eloc, uint32_t *elen, int inc)
+			struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int alen;
 	int8_t etype;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
 }
 
 static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
-			      kernel_lb_addr neloc, uint32_t nelen)
+			      struct kernel_lb_addr neloc, uint32_t nelen)
 {
-	kernel_lb_addr oeloc;
+	struct kernel_lb_addr oeloc;
 	uint32_t oelen;
 	int8_t etype;
 
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
 		get_bh(epos.bh);
 
 	while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
-		udf_write_aext(inode, &epos, neloc, nelen, 1);
+		udf_write_aext(inode, &epos, &neloc, nelen, 1);
 		neloc = oeloc;
 		nelen = (etype << 30) | oelen;
 	}
-	udf_add_aext(inode, &epos, neloc, nelen, 1);
+	udf_add_aext(inode, &epos, &neloc, nelen, 1);
 	brelse(epos.bh);
 
 	return (nelen >> 30);
 }
 
 int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
-		       kernel_lb_addr eloc, uint32_t elen)
+		       struct kernel_lb_addr eloc, uint32_t elen)
 {
 	struct extent_position oepos;
 	int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 
 	iinfo = UDF_I(inode);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 		return -1;
 
 	while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
-		udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
 		if (oepos.bh != epos.bh) {
 			oepos.block = epos.block;
 			brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			oepos.offset = epos.offset - adsize;
 		}
 	}
-	memset(&eloc, 0x00, sizeof(kernel_lb_addr));
+	memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
 	elen = 0;
 
 	if (epos.bh != oepos.bh) {
-		udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= (adsize * 2);
 			mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			mark_buffer_dirty_inode(oepos.bh, inode);
 		}
 	} else {
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= adsize;
 			mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 }
 
 int8_t inode_bmap(struct inode *inode, sector_t block,
-		  struct extent_position *pos, kernel_lb_addr *eloc,
+		  struct extent_position *pos, struct kernel_lb_addr *eloc,
 		  uint32_t *elen, sector_t *offset)
 {
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 
 long udf_block_map(struct inode *inode, sector_t block)
 {
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
 
 	if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
 						(EXT_RECORDED_ALLOCATED >> 30))
-		ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 	else
 		ret = 0;
 
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1..9215700c00a4 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 			}
 		}
 		/* rewrite CRC + checksum of eahd */
-		crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+		crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
 		eahd->descTag.descCRCLength = cpu_to_le16(crclen);
 		eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
-						sizeof(tag), crclen));
+						sizeof(struct tag), crclen));
 		eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
 		iinfo->i_lenEAttr += size;
 		return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
 struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 				    uint32_t location, uint16_t *ident)
 {
-	tag *tag_p;
+	struct tag *tag_p;
 	struct buffer_head *bh = NULL;
 
 	/* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 		return NULL;
 	}
 
-	tag_p = (tag *)(bh->b_data);
+	tag_p = (struct tag *)(bh->b_data);
 
 	*ident = le16_to_cpu(tag_p->tagIdent);
 
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 	}
 
 	/* Verify the descriptor CRC */
-	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
+	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
 	    le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
-					bh->b_data + sizeof(tag),
+					bh->b_data + sizeof(struct tag),
 					le16_to_cpu(tag_p->descCRCLength)))
 		return bh;
 
@@ -255,27 +255,28 @@ error_out:
 	return NULL;
 }
 
-struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc,
+struct buffer_head *udf_read_ptagged(struct super_block *sb,
+				     struct kernel_lb_addr *loc,
 				     uint32_t offset, uint16_t *ident)
 {
 	return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
-			       loc.logicalBlockNum + offset, ident);
+			       loc->logicalBlockNum + offset, ident);
 }
 
 void udf_update_tag(char *data, int length)
 {
-	tag *tptr = (tag *)data;
-	length -= sizeof(tag);
+	struct tag *tptr = (struct tag *)data;
+	length -= sizeof(struct tag);
 
 	tptr->descCRCLength = cpu_to_le16(length);
-	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
+	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
 	tptr->tagChecksum = udf_tag_checksum(tptr);
 }
 
 void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 		 uint32_t loc, int length)
 {
-	tag *tptr = (tag *)data;
+	struct tag *tptr = (struct tag *)data;
 	tptr->tagIdent = cpu_to_le16(ident);
 	tptr->descVersion = cpu_to_le16(version);
 	tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 	udf_update_tag(data, length);
 }
 
-u8 udf_tag_checksum(const tag *t)
+u8 udf_tag_checksum(const struct tag *t)
 {
 	u8 *data = (u8 *)t;
 	u8 checksum = 0;
 	int i;
-	for (i = 0; i < sizeof(tag); ++i)
+	for (i = 0; i < sizeof(struct tag); ++i)
 		if (i != 4) /* position of checksum */
 			checksum += data[i];
 	return checksum;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941..6a29fa34c478 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
 		 uint8_t *impuse, uint8_t *fileident)
 {
-	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag);
+	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
 	uint16_t crc;
 	int offset;
 	uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		memset(fibh->ebh->b_data, 0x00, padlen + offset);
 	}
 
-	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
-		      sizeof(struct fileIdentDesc) - sizeof(tag));
+	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
+		      sizeof(struct fileIdentDesc) - sizeof(struct tag));
 
 	if (fibh->sbh == fibh->ebh) {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
-			      crclen + sizeof(tag) -
+			      crclen + sizeof(struct tag) -
 			      sizeof(struct fileIdentDesc));
 	} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
 		crc = crc_itu_t(crc, fibh->ebh->b_data +
 					sizeof(struct fileIdentDesc) +
 					fibh->soffset,
-			      crclen + sizeof(tag) -
+			      crclen + sizeof(struct tag) -
 					sizeof(struct fileIdentDesc));
 	} else {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	loff_t size;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
 			goto out_err;
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #ifdef UDF_RECOVERY
 	/* temporary shorthand for specifying files by inode number */
 	if (!strncmp(dentry->d_name.name, ".B=", 3)) {
-		kernel_lb_addr lb = {
+		struct kernel_lb_addr lb = {
 			.logicalBlockNum = 0,
 			.partitionReferenceNum =
 				simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #endif /* UDF_RECOVERY */
 
 	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+		struct kernel_lb_addr loc;
+
 		if (fibh.sbh != fibh.ebh)
 			brelse(fibh.ebh);
 		brelse(fibh.sbh);
 
-		inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));
+		loc = lelb_to_cpu(cfi.icb.extLocation);
+		inode = udf_iget(dir->i_sb, &loc);
 		if (!inode) {
 			unlock_kernel();
 			return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen = 0;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
 			block = udf_get_lb_pblock(dir->i_sb,
-					dinfo->i_location, 0);
+					&dinfo->i_location, 0);
 			fibh->soffset = fibh->eoffset = sb->s_blocksize;
 			goto add;
 		}
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -409,10 +412,10 @@ add:
 	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
 		elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
 		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-			epos.offset -= sizeof(short_ad);
+			epos.offset -= sizeof(struct short_ad);
 		else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-			epos.offset -= sizeof(long_ad);
-		udf_write_aext(dir, &epos, eloc, elen, 1);
+			epos.offset -= sizeof(struct long_ad);
+		udf_write_aext(dir, &epos, &eloc, elen, 1);
 	}
 	f_pos += nfidlen;
 
@@ -494,10 +497,10 @@ add:
 	memset(cfi, 0, sizeof(struct fileIdentDesc));
 	if (UDF_SB(sb)->s_udfrev >= 0x0200)
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	else
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	cfi->fileVersionNum = cpu_to_le16(1);
 	cfi->lengthFileIdent = namelen;
 	cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
 	cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
 
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
-		memset(&(cfi->icb), 0x00, sizeof(long_ad));
+		memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
 
 	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
 	loff_t f_pos;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
 	else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
 			      &epos, &eloc, &elen, &offset) ==
 					(EXT_RECORDED_ALLOCATED >> 30)) {
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi, cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_rmdir;
 	retval = -ENOTEMPTY;
 	if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi;
 	struct fileIdentDesc cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &page_symlink_inode_operations;
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
-		kernel_lb_addr eloc;
+		struct kernel_lb_addr eloc;
 		uint32_t bsize;
 
 		block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 				iinfo->i_location.partitionReferenceNum;
 		bsize = inode->i_sb->s_blocksize;
 		iinfo->i_lenExtents = bsize;
-		udf_add_aext(inode, &epos, eloc, bsize, 0);
+		udf_add_aext(inode, &epos, &eloc, bsize, 0);
 		brelse(epos.bh);
 
 		block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct fileIdentDesc ocfi, ncfi;
 	struct buffer_head *dir_bh = NULL;
 	int retval = -ENOENT;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
 	lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 		brelse(ofibh.sbh);
 	}
 	tloc = lelb_to_cpu(ocfi.icb.extLocation);
-	if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0)
+	if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
 	    != old_inode->i_ino)
 		goto end_rename;
 
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (!dir_fi)
 			goto end_rename;
 		tloc = lelb_to_cpu(dir_fi->icb.extLocation);
-		if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) !=
+		if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
 				old_dir->i_ino)
 			goto end_rename;
 
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	ncfi.fileVersionNum = ocfi.fileVersionNum;
 	ncfi.fileCharacteristics = ocfi.fileCharacteristics;
-	memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad));
+	memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
 	udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
 
 	/* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
 
 static struct dentry *udf_get_parent(struct dentry *child)
 {
+	struct kernel_lb_addr tloc;
 	struct inode *inode = NULL;
 	struct qstr dotdot = {.name = "..", .len = 2};
 	struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
 
-	inode = udf_iget(child->d_inode->i_sb,
-			 lelb_to_cpu(cfi.icb.extLocation));
+	tloc = lelb_to_cpu(cfi.icb.extLocation);
+	inode = udf_iget(child->d_inode->i_sb, &tloc);
 	if (!inode)
 		goto out_unlock;
 	unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 					u16 partref, __u32 generation)
 {
 	struct inode *inode;
-	kernel_lb_addr loc;
+	struct kernel_lb_addr loc;
 
 	if (block == 0)
 		return ERR_PTR(-ESTALE);
 
 	loc.logicalBlockNum = block;
 	loc.partitionReferenceNum = partref;
-	inode = udf_iget(sb, loc);
+	inode = udf_iget(sb, &loc);
 
 	if (inode == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
 {
 	int len = *lenp;
 	struct inode *inode =  de->d_inode;
-	kernel_lb_addr location = UDF_I(inode)->i_location;
+	struct kernel_lb_addr location = UDF_I(inode)->i_location;
 	struct fid *fid = (struct fid *)fh;
 	int type = FILEID_UDF_WITHOUT_PARENT;
 
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd2..fbff74654df2 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
 /* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
 /* Implementation Use (UDF 2.50 2.2.6.4) */
 struct logicalVolIntegrityDescImpUse {
-	regid		impIdent;
+	struct regid	impIdent;
 	__le32		numFiles;
 	__le32		numDirs;
 	__le16		minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
 /* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
 /* Implementation Use (UDF 2.50 2.2.7.2) */
 struct impUseVolDescImpUse {
-	charspec	LVICharset;
+	struct charspec	LVICharset;
 	dstring		logicalVolIdent[128];
 	dstring		LVInfo1[36];
 	dstring		LVInfo2[36];
 	dstring		LVInfo3[36];
-	regid		impIdent;
+	struct regid	impIdent;
 	uint8_t		impUse[128];
 } __attribute__ ((packed));
 
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
 	uint8_t		partitionMapType;
 	uint8_t		partitionMapLength;
 	uint8_t		reserved1[2];
-	regid		partIdent;
+	struct regid	partIdent;
 	__le16		volSeqNum;
 	__le16		partitionNum;
 } __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
 	uint8_t		partitionMapType;
 	uint8_t		partitionMapLength;
 	uint8_t		reserved1[2];
-	regid		partIdent;
+	struct regid	partIdent;
 	__le16		volSeqNum;
 	__le16		partitionNum;
 	uint8_t		reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
 	uint8_t partitionMapType;
 	uint8_t partitionMapLength;
 	uint8_t reserved1[2];
-	regid partIdent;
+	struct regid partIdent;
 	__le16 volSeqNum;
 	__le16 partitionNum;
 	__le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
 	uint8_t		partitionMapType;
 	uint8_t		partitionMapLength;
 	uint8_t		reserved1[2];
-	regid		partIdent;
+	struct regid	partIdent;
 	__le16		volSeqNum;
 	__le16		partitionNum;
 	__le32		metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
 /* Virtual Allocation Table (UDF 1.5 2.2.10) */
 struct virtualAllocationTable15 {
 	__le32		VirtualSector[0];
-	regid		vatIdent;
+	struct regid	vatIdent;
 	__le32		previousVATICBLoc;
 } __attribute__ ((packed));
 
@@ -192,8 +192,8 @@ struct sparingEntry {
 } __attribute__ ((packed));
 
 struct sparingTable {
-	tag 		descTag;
-	regid		sparingIdent;
+	struct tag	descTag;
+	struct regid	sparingIdent;
 	__le16		reallocationTableLen;
 	__le16		reserved;
 	__le32		sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
 #define ICBTAG_FILE_TYPE_MIRROR		0xFB
 #define ICBTAG_FILE_TYPE_BITMAP		0xFC
 
-/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
+/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
 struct allocDescImpUse {
 	__le16		flags;
 	uint8_t		impUse[4];
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6..4b540ee632d5 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
 {
 	struct super_block *sb = inode->i_sb;
 	struct udf_part_map *map;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t ext_offset;
 	struct extent_position epos = {};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b..72348cc855a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
 /* These are the "meat" - everything else is stuffing */
 static int udf_fill_super(struct super_block *, void *, int);
 static void udf_put_super(struct super_block *);
-static void udf_write_super(struct super_block *);
+static int udf_sync_fs(struct super_block *, int);
 static int udf_remount_fs(struct super_block *, int *, char *);
-static int udf_check_valid(struct super_block *, int, int);
-static int udf_vrs(struct super_block *sb, int silent);
-static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad);
-static void udf_find_anchor(struct super_block *);
-static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
-			    kernel_lb_addr *);
+static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
+static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
+			    struct kernel_lb_addr *);
 static void udf_load_fileset(struct super_block *, struct buffer_head *,
-			     kernel_lb_addr *);
+			     struct kernel_lb_addr *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
 	.delete_inode	= udf_delete_inode,
 	.clear_inode	= udf_clear_inode,
 	.put_super	= udf_put_super,
-	.write_super	= udf_write_super,
+	.sync_fs	= udf_sync_fs,
 	.statfs		= udf_statfs,
 	.remount_fs	= udf_remount_fs,
 	.show_options	= udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
 	mode_t umask;
 	gid_t gid;
 	uid_t uid;
+	mode_t fmode;
+	mode_t dmode;
 	struct nls_table *nls_map;
 };
 
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
 
 	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
 		seq_puts(seq, ",nostrict");
-	if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE)
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
 		seq_printf(seq, ",bs=%lu", sb->s_blocksize);
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
 		seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
 		seq_printf(seq, ",gid=%u", sbi->s_gid);
 	if (sbi->s_umask != 0)
 		seq_printf(seq, ",umask=%o", sbi->s_umask);
+	if (sbi->s_fmode != UDF_INVALID_MODE)
+		seq_printf(seq, ",mode=%o", sbi->s_fmode);
+	if (sbi->s_dmode != UDF_INVALID_MODE)
+		seq_printf(seq, ",dmode=%o", sbi->s_dmode);
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
 		seq_printf(seq, ",session=%u", sbi->s_session);
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
 		seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
-	/*
-	 * s_anchor[2] could be zeroed out in case there is no anchor
-	 * in the specified block, but then the "anchor=N" option
-	 * originally given by the user wasn't effective, so it's OK
-	 * if we don't show it.
-	 */
-	if (sbi->s_anchor[2] != 0)
-		seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
+	if (sbi->s_anchor != 0)
+		seq_printf(seq, ",anchor=%u", sbi->s_anchor);
 	/*
 	 * volume, partition, fileset and rootdir seem to be ignored
 	 * currently
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
  *
  *	gid=		Set the default group.
  *	umask=		Set the default umask.
+ *	mode=		Set the default file permissions.
+ *	dmode=		Set the default directory permissions.
  *	uid=		Set the default user.
  *	bs=		Set the block size.
  *	unhide		Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
 	Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
 	Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
 	Opt_rootdir, Opt_utf8, Opt_iocharset,
-	Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
+	Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
+	Opt_fmode, Opt_dmode
 };
 
 static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
 	{Opt_rootdir,	"rootdir=%u"},
 	{Opt_utf8,	"utf8"},
 	{Opt_iocharset,	"iocharset=%s"},
+	{Opt_fmode,     "mode=%o"},
+	{Opt_dmode,     "dmode=%o"},
 	{Opt_err,	NULL}
 };
 
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 	int option;
 
 	uopt->novrs = 0;
-	uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
 	uopt->partition = 0xFFFF;
 	uopt->session = 0xFFFFFFFF;
 	uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 		switch (token) {
 		case Opt_novrs:
 			uopt->novrs = 1;
+			break;
 		case Opt_bs:
 			if (match_int(&args[0], &option))
 				return 0;
 			uopt->blocksize = option;
+			uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
 			break;
 		case Opt_unhide:
 			uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 		case Opt_gforget:
 			uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
 			break;
+		case Opt_fmode:
+			if (match_octal(args, &option))
+				return 0;
+			uopt->fmode = option & 0777;
+			break;
+		case Opt_dmode:
+			if (match_octal(args, &option))
+				return 0;
+			uopt->dmode = option & 0777;
+			break;
 		default:
 			printk(KERN_ERR "udf: bad mount option \"%s\" "
 			       "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 	return 1;
 }
 
-static void udf_write_super(struct super_block *sb)
-{
-	lock_kernel();
-
-	if (!(sb->s_flags & MS_RDONLY))
-		udf_open_lvid(sb);
-	sb->s_dirt = 0;
-
-	unlock_kernel();
-}
-
 static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 {
 	struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	uopt.uid   = sbi->s_uid;
 	uopt.gid   = sbi->s_gid;
 	uopt.umask = sbi->s_umask;
+	uopt.fmode = sbi->s_fmode;
+	uopt.dmode = sbi->s_dmode;
 
 	if (!udf_parse_options(options, &uopt, true))
 		return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	sbi->s_uid   = uopt.uid;
 	sbi->s_gid   = uopt.gid;
 	sbi->s_umask = uopt.umask;
+	sbi->s_fmode = uopt.fmode;
+	sbi->s_dmode = uopt.dmode;
 
 	if (sbi->s_lvid_bh) {
 		int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	return 0;
 }
 
-static int udf_vrs(struct super_block *sb, int silent)
+/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
+/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
+static loff_t udf_check_vsd(struct super_block *sb)
 {
 	struct volStructDesc *vsd = NULL;
 	loff_t sector = 32768;
 	int sectorsize;
 	struct buffer_head *bh = NULL;
-	int iso9660 = 0;
 	int nsr02 = 0;
 	int nsr03 = 0;
 	struct udf_sb_info *sbi;
 
-	/* Block size must be a multiple of 512 */
-	if (sb->s_blocksize & 511)
-		return 0;
 	sbi = UDF_SB(sb);
-
 	if (sb->s_blocksize < sizeof(struct volStructDesc))
 		sectorsize = sizeof(struct volStructDesc);
 	else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
 			break;
 		} else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
 				    VSD_STD_ID_LEN)) {
-			iso9660 = sector;
 			switch (vsd->structType) {
 			case 0:
 				udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
 		return 0;
 }
 
-/*
- * Check whether there is an anchor block in the given block
- */
-static int udf_check_anchor_block(struct super_block *sb, sector_t block)
-{
-	struct buffer_head *bh;
-	uint16_t ident;
-
-	if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
-	    udf_fixed_to_variable(block) >=
-	    sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
-		return 0;
-
-	bh = udf_read_tagged(sb, block, block, &ident);
-	if (!bh)
-		return 0;
-	brelse(bh);
-
-	return ident == TAG_IDENT_AVDP;
-}
-
-/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
-{
-	sector_t last[6];
-	int i;
-	struct udf_sb_info *sbi = UDF_SB(sb);
-
-	last[0] = lastblock;
-	last[1] = last[0] - 1;
-	last[2] = last[0] + 1;
-	last[3] = last[0] - 2;
-	last[4] = last[0] - 150;
-	last[5] = last[0] - 152;
-
-	/*  according to spec, anchor is in either:
-	 *     block 256
-	 *     lastblock-256
-	 *     lastblock
-	 *  however, if the disc isn't closed, it could be 512 */
-
-	for (i = 0; i < ARRAY_SIZE(last); i++) {
-		if (last[i] < 0)
-			continue;
-		if (last[i] >= sb->s_bdev->bd_inode->i_size >>
-				sb->s_blocksize_bits)
-			continue;
-
-		if (udf_check_anchor_block(sb, last[i])) {
-			sbi->s_anchor[0] = last[i];
-			sbi->s_anchor[1] = last[i] - 256;
-			return last[i];
-		}
-
-		if (last[i] < 256)
-			continue;
-
-		if (udf_check_anchor_block(sb, last[i] - 256)) {
-			sbi->s_anchor[1] = last[i] - 256;
-			return last[i];
-		}
-	}
-
-	if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
-		sbi->s_anchor[0] = sbi->s_session + 256;
-		return last[0];
-	}
-	if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
-		sbi->s_anchor[0] = sbi->s_session + 512;
-		return last[0];
-	}
-	return 0;
-}
-
-/*
- * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
- * be the last block on the media.
- *
- * Return 1 if not found, 0 if ok
- *
- */
-static void udf_find_anchor(struct super_block *sb)
-{
-	sector_t lastblock;
-	struct buffer_head *bh = NULL;
-	uint16_t ident;
-	int i;
-	struct udf_sb_info *sbi = UDF_SB(sb);
-
-	lastblock = udf_scan_anchors(sb, sbi->s_last_block);
-	if (lastblock)
-		goto check_anchor;
-
-	/* No anchor found? Try VARCONV conversion of block numbers */
-	UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
-	/* Firstly, we try to not convert number of the last block */
-	lastblock = udf_scan_anchors(sb,
-				udf_variable_to_fixed(sbi->s_last_block));
-	if (lastblock)
-		goto check_anchor;
-
-	/* Secondly, we try with converted number of the last block */
-	lastblock = udf_scan_anchors(sb, sbi->s_last_block);
-	if (!lastblock) {
-		/* VARCONV didn't help. Clear it. */
-		UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
-	}
-
-check_anchor:
-	/*
-	 * Check located anchors and the anchor block supplied via
-	 * mount options
-	 */
-	for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
-		if (!sbi->s_anchor[i])
-			continue;
-		bh = udf_read_tagged(sb, sbi->s_anchor[i],
-					sbi->s_anchor[i], &ident);
-		if (!bh)
-			sbi->s_anchor[i] = 0;
-		else {
-			brelse(bh);
-			if (ident != TAG_IDENT_AVDP)
-				sbi->s_anchor[i] = 0;
-		}
-	}
-
-	sbi->s_last_block = lastblock;
-}
-
 static int udf_find_fileset(struct super_block *sb,
-			    kernel_lb_addr *fileset,
-			    kernel_lb_addr *root)
+			    struct kernel_lb_addr *fileset,
+			    struct kernel_lb_addr *root)
 {
 	struct buffer_head *bh = NULL;
 	long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
 
 	if (fileset->logicalBlockNum != 0xFFFFFFFF ||
 	    fileset->partitionReferenceNum != 0xFFFF) {
-		bh = udf_read_ptagged(sb, *fileset, 0, &ident);
+		bh = udf_read_ptagged(sb, fileset, 0, &ident);
 
 		if (!bh) {
 			return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
 	sbi = UDF_SB(sb);
 	if (!bh) {
 		/* Search backwards through the partitions */
-		kernel_lb_addr newfileset;
+		struct kernel_lb_addr newfileset;
 
 /* --> cvg: FIXME - is it reasonable? */
 		return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
 			newfileset.logicalBlockNum = 0;
 
 			do {
-				bh = udf_read_ptagged(sb, newfileset, 0,
+				bh = udf_read_ptagged(sb, &newfileset, 0,
 						      &ident);
 				if (!bh) {
 					newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
 static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 {
 	struct primaryVolDesc *pvoldesc;
-	struct ustr instr;
-	struct ustr outstr;
+	struct ustr *instr, *outstr;
 	struct buffer_head *bh;
 	uint16_t ident;
+	int ret = 1;
+
+	instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!instr)
+		return 1;
+
+	outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!outstr)
+		goto out1;
 
 	bh = udf_read_tagged(sb, block, block, &ident);
 	if (!bh)
-		return 1;
+		goto out2;
+
 	BUG_ON(ident != TAG_IDENT_PVD);
 
 	pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 	if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
 			      pvoldesc->recordingDateAndTime)) {
 #ifdef UDFFS_DEBUG
-		timestamp *ts = &pvoldesc->recordingDateAndTime;
+		struct timestamp *ts = &pvoldesc->recordingDateAndTime;
 		udf_debug("recording time %04u/%02u/%02u"
 			  " %02u:%02u (%x)\n",
 			  le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 #endif
 	}
 
-	if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32))
-		if (udf_CS0toUTF8(&outstr, &instr)) {
-			strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name,
-				outstr.u_len > 31 ? 31 : outstr.u_len);
+	if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
+		if (udf_CS0toUTF8(outstr, instr)) {
+			strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
+				outstr->u_len > 31 ? 31 : outstr->u_len);
 			udf_debug("volIdent[] = '%s'\n",
 					UDF_SB(sb)->s_volume_ident);
 		}
 
-	if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128))
-		if (udf_CS0toUTF8(&outstr, &instr))
-			udf_debug("volSetIdent[] = '%s'\n", outstr.u_name);
+	if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
+		if (udf_CS0toUTF8(outstr, instr))
+			udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
 
 	brelse(bh);
-	return 0;
+	ret = 0;
+out2:
+	kfree(outstr);
+out1:
+	kfree(instr);
+	return ret;
 }
 
 static int udf_load_metadata_files(struct super_block *sb, int partition)
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct udf_part_map *map;
 	struct udf_meta_data *mdata;
-	kernel_lb_addr addr;
+	struct kernel_lb_addr addr;
 	int fe_error = 0;
 
 	map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	udf_debug("Metadata file location: block = %d part = %d\n",
 			  addr.logicalBlockNum, addr.partitionReferenceNum);
 
-	mdata->s_metadata_fe = udf_iget(sb, addr);
+	mdata->s_metadata_fe = udf_iget(sb, &addr);
 
 	if (mdata->s_metadata_fe == NULL) {
 		udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	udf_debug("Mirror metadata file location: block = %d part = %d\n",
 			  addr.logicalBlockNum, addr.partitionReferenceNum);
 
-	mdata->s_mirror_fe = udf_iget(sb, addr);
+	mdata->s_mirror_fe = udf_iget(sb, &addr);
 
 	if (mdata->s_mirror_fe == NULL) {
 		if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 		udf_debug("Bitmap file location: block = %d part = %d\n",
 			addr.logicalBlockNum, addr.partitionReferenceNum);
 
-		mdata->s_bitmap_fe = udf_iget(sb, addr);
+		mdata->s_bitmap_fe = udf_iget(sb, &addr);
 
 		if (mdata->s_bitmap_fe == NULL) {
 			if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
 }
 
 static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
-			     kernel_lb_addr *root)
+			     struct kernel_lb_addr *root)
 {
 	struct fileSetDesc *fset;
 
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 
 	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
 	if (phd->unallocSpaceTable.extLength) {
-		kernel_lb_addr loc = {
+		struct kernel_lb_addr loc = {
 			.logicalBlockNum = le32_to_cpu(
 				phd->unallocSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
 
-		map->s_uspace.s_table = udf_iget(sb, loc);
+		map->s_uspace.s_table = udf_iget(sb, &loc);
 		if (!map->s_uspace.s_table) {
 			udf_debug("cannot load unallocSpaceTable (part %d)\n",
 					p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 		udf_debug("partitionIntegrityTable (part %d)\n", p_index);
 
 	if (phd->freedSpaceTable.extLength) {
-		kernel_lb_addr loc = {
+		struct kernel_lb_addr loc = {
 			.logicalBlockNum = le32_to_cpu(
 				phd->freedSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
 
-		map->s_fspace.s_table = udf_iget(sb, loc);
+		map->s_fspace.s_table = udf_iget(sb, &loc);
 		if (!map->s_fspace.s_table) {
 			udf_debug("cannot load freedSpaceTable (part %d)\n",
 				p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct udf_part_map *map = &sbi->s_partmaps[p_index];
-	kernel_lb_addr ino;
+	struct kernel_lb_addr ino;
 	struct buffer_head *bh = NULL;
 	struct udf_inode_info *vati;
 	uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 	/* VAT file entry is in the last recorded block */
 	ino.partitionReferenceNum = type1_index;
 	ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
-	sbi->s_vat_inode = udf_iget(sb, ino);
+	sbi->s_vat_inode = udf_iget(sb, &ino);
 	if (!sbi->s_vat_inode)
 		return 1;
 
@@ -1322,7 +1208,7 @@ out_bh:
 }
 
 static int udf_load_logicalvol(struct super_block *sb, sector_t block,
-			       kernel_lb_addr *fileset)
+			       struct kernel_lb_addr *fileset)
 {
 	struct logicalVolDesc *lvd;
 	int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 	}
 
 	if (fileset) {
-		long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]);
+		struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
 
 		*fileset = lelb_to_cpu(la->extLocation);
 		udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
  * udf_load_logicalvolint
  *
  */
-static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
+static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
 {
 	struct buffer_head *bh = NULL;
 	uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
  *	Written, tested, and released.
  */
 static noinline int udf_process_sequence(struct super_block *sb, long block,
-				long lastblock, kernel_lb_addr *fileset)
+				long lastblock, struct kernel_lb_addr *fileset)
 {
 	struct buffer_head *bh = NULL;
 	struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
 	return 0;
 }
 
+static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
+			     struct kernel_lb_addr *fileset)
+{
+	struct anchorVolDescPtr *anchor;
+	long main_s, main_e, reserve_s, reserve_e;
+	struct udf_sb_info *sbi;
+
+	sbi = UDF_SB(sb);
+	anchor = (struct anchorVolDescPtr *)bh->b_data;
+
+	/* Locate the main sequence */
+	main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
+	main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
+	main_e = main_e >> sb->s_blocksize_bits;
+	main_e += main_s;
+
+	/* Locate the reserve sequence */
+	reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
+	reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
+	reserve_e = reserve_e >> sb->s_blocksize_bits;
+	reserve_e += reserve_s;
+
+	/* Process the main & reserve sequences */
+	/* responsible for finding the PartitionDesc(s) */
+	if (!udf_process_sequence(sb, main_s, main_e, fileset))
+		return 1;
+	return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
+}
+
 /*
- * udf_check_valid()
+ * Check whether there is an anchor block in the given block and
+ * load Volume Descriptor Sequence if so.
  */
-static int udf_check_valid(struct super_block *sb, int novrs, int silent)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block,
+				  struct kernel_lb_addr *fileset)
 {
-	long block;
-	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct buffer_head *bh;
+	uint16_t ident;
+	int ret;
 
-	if (novrs) {
-		udf_debug("Validity check skipped because of novrs option\n");
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
+	    udf_fixed_to_variable(block) >=
+	    sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+		return 0;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return 0;
+	if (ident != TAG_IDENT_AVDP) {
+		brelse(bh);
 		return 0;
 	}
-	/* Check that it is NSR02 compliant */
-	/* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
-	block = udf_vrs(sb, silent);
-	if (block == -1)
-		udf_debug("Failed to read byte 32768. Assuming open "
-			  "disc. Skipping validity check\n");
-	if (block && !sbi->s_last_block)
-		sbi->s_last_block = udf_get_last_block(sb);
-	return !block;
+	ret = udf_load_sequence(sb, bh, fileset);
+	brelse(bh);
+	return ret;
 }
 
-static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset)
+/* Search for an anchor volume descriptor pointer */
+static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
+				 struct kernel_lb_addr *fileset)
 {
-	struct anchorVolDescPtr *anchor;
-	uint16_t ident;
-	struct buffer_head *bh;
-	long main_s, main_e, reserve_s, reserve_e;
+	sector_t last[6];
 	int i;
-	struct udf_sb_info *sbi;
-
-	if (!sb)
-		return 1;
-	sbi = UDF_SB(sb);
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	int last_count = 0;
 
-	for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
-		if (!sbi->s_anchor[i])
+	/* First try user provided anchor */
+	if (sbi->s_anchor) {
+		if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
+			return lastblock;
+	}
+	/*
+	 * according to spec, anchor is in either:
+	 *     block 256
+	 *     lastblock-256
+	 *     lastblock
+	 *  however, if the disc isn't closed, it could be 512.
+	 */
+	if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
+		return lastblock;
+	/*
+	 * The trouble is which block is the last one. Drives often misreport
+	 * this so we try various possibilities.
+	 */
+	last[last_count++] = lastblock;
+	if (lastblock >= 1)
+		last[last_count++] = lastblock - 1;
+	last[last_count++] = lastblock + 1;
+	if (lastblock >= 2)
+		last[last_count++] = lastblock - 2;
+	if (lastblock >= 150)
+		last[last_count++] = lastblock - 150;
+	if (lastblock >= 152)
+		last[last_count++] = lastblock - 152;
+
+	for (i = 0; i < last_count; i++) {
+		if (last[i] >= sb->s_bdev->bd_inode->i_size >>
+				sb->s_blocksize_bits)
 			continue;
-
-		bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i],
-				     &ident);
-		if (!bh)
+		if (udf_check_anchor_block(sb, last[i], fileset))
+			return last[i];
+		if (last[i] < 256)
 			continue;
+		if (udf_check_anchor_block(sb, last[i] - 256, fileset))
+			return last[i];
+	}
 
-		anchor = (struct anchorVolDescPtr *)bh->b_data;
+	/* Finally try block 512 in case media is open */
+	if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
+		return last[0];
+	return 0;
+}
 
-		/* Locate the main sequence */
-		main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
-		main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
-		main_e = main_e >> sb->s_blocksize_bits;
-		main_e += main_s;
+/*
+ * Find an anchor volume descriptor and load Volume Descriptor Sequence from
+ * area specified by it. The function expects sbi->s_lastblock to be the last
+ * block on the media.
+ *
+ * Return 1 if ok, 0 if not found.
+ *
+ */
+static int udf_find_anchor(struct super_block *sb,
+			   struct kernel_lb_addr *fileset)
+{
+	sector_t lastblock;
+	struct udf_sb_info *sbi = UDF_SB(sb);
 
-		/* Locate the reserve sequence */
-		reserve_s = le32_to_cpu(
-				anchor->reserveVolDescSeqExt.extLocation);
-		reserve_e = le32_to_cpu(
-				anchor->reserveVolDescSeqExt.extLength);
-		reserve_e = reserve_e >> sb->s_blocksize_bits;
-		reserve_e += reserve_s;
+	lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
+	if (lastblock)
+		goto out;
 
-		brelse(bh);
+	/* No anchor found? Try VARCONV conversion of block numbers */
+	UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+	/* Firstly, we try to not convert number of the last block */
+	lastblock = udf_scan_anchors(sb,
+				udf_variable_to_fixed(sbi->s_last_block),
+				fileset);
+	if (lastblock)
+		goto out;
 
-		/* Process the main & reserve sequences */
-		/* responsible for finding the PartitionDesc(s) */
-		if (!(udf_process_sequence(sb, main_s, main_e,
-					   fileset) &&
-		      udf_process_sequence(sb, reserve_s, reserve_e,
-					   fileset)))
-			break;
+	/* Secondly, we try with converted number of the last block */
+	lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
+	if (!lastblock) {
+		/* VARCONV didn't help. Clear it. */
+		UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
+		return 0;
 	}
+out:
+	sbi->s_last_block = lastblock;
+	return 1;
+}
 
-	if (i == ARRAY_SIZE(sbi->s_anchor)) {
-		udf_debug("No Anchor block found\n");
-		return 1;
+/*
+ * Check Volume Structure Descriptor, find Anchor block and load Volume
+ * Descriptor Sequence
+ */
+static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
+			int silent, struct kernel_lb_addr *fileset)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	loff_t nsr_off;
+
+	if (!sb_set_blocksize(sb, uopt->blocksize)) {
+		if (!silent)
+			printk(KERN_WARNING "UDF-fs: Bad block size\n");
+		return 0;
+	}
+	sbi->s_last_block = uopt->lastblock;
+	if (!uopt->novrs) {
+		/* Check that it is NSR02 compliant */
+		nsr_off = udf_check_vsd(sb);
+		if (!nsr_off) {
+			if (!silent)
+				printk(KERN_WARNING "UDF-fs: No VRS found\n");
+			return 0;
+		}
+		if (nsr_off == -1)
+			udf_debug("Failed to read byte 32768. Assuming open "
+				  "disc. Skipping validity check\n");
+		if (!sbi->s_last_block)
+			sbi->s_last_block = udf_get_last_block(sb);
+	} else {
+		udf_debug("Validity check skipped because of novrs option\n");
 	}
-	udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
 
-	return 0;
+	/* Look for anchor block and load Volume Descriptor Sequence */
+	sbi->s_anchor = uopt->anchor;
+	if (!udf_find_anchor(sb, fileset)) {
+		if (!silent)
+			printk(KERN_WARNING "UDF-fs: No anchor found\n");
+		return 0;
+	}
+	return 1;
 }
 
 static void udf_open_lvid(struct super_block *sb)
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	struct logicalVolIntegrityDesc *lvid;
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+
 	if (!bh)
 		return;
-
 	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
 	lvidiu = udf_sb_lvidiu(sbi);
 
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
 	udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
 				CURRENT_TIME);
-	lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
+	lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
 
 	lvid->descTag.descCRC = cpu_to_le16(
-		crc_itu_t(0, (char *)lvid + sizeof(tag),
+		crc_itu_t(0, (char *)lvid + sizeof(struct tag),
 			le16_to_cpu(lvid->descTag.descCRCLength)));
 
 	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
 	mark_buffer_dirty(bh);
+	sbi->s_lvid_dirty = 0;
 }
 
 static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
 		return;
 
 	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
-
-	if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
-		return;
-
 	lvidiu = udf_sb_lvidiu(sbi);
 	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
 	lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
 
 	lvid->descTag.descCRC = cpu_to_le16(
-			crc_itu_t(0, (char *)lvid + sizeof(tag),
+			crc_itu_t(0, (char *)lvid + sizeof(struct tag),
 				le16_to_cpu(lvid->descTag.descCRCLength)));
 
 	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
 	mark_buffer_dirty(bh);
+	sbi->s_lvid_dirty = 0;
 }
 
 static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
 static int udf_fill_super(struct super_block *sb, void *options, int silent)
 {
 	int i;
+	int ret;
 	struct inode *inode = NULL;
 	struct udf_options uopt;
-	kernel_lb_addr rootdir, fileset;
+	struct kernel_lb_addr rootdir, fileset;
 	struct udf_sb_info *sbi;
 
 	uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
 	uopt.uid = -1;
 	uopt.gid = -1;
 	uopt.umask = 0;
+	uopt.fmode = UDF_INVALID_MODE;
+	uopt.dmode = UDF_INVALID_MODE;
 
 	sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
 	if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sbi->s_uid = uopt.uid;
 	sbi->s_gid = uopt.gid;
 	sbi->s_umask = uopt.umask;
+	sbi->s_fmode = uopt.fmode;
+	sbi->s_dmode = uopt.dmode;
 	sbi->s_nls_map = uopt.nls_map;
 
-	/* Set the block size for all transfers */
-	if (!sb_min_blocksize(sb, uopt.blocksize)) {
-		udf_debug("Bad block size (%d)\n", uopt.blocksize);
-		printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
-		goto error_out;
-	}
-
 	if (uopt.session == 0xFFFFFFFF)
 		sbi->s_session = udf_get_last_session(sb);
 	else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 
 	udf_debug("Multi-session=%d\n", sbi->s_session);
 
-	sbi->s_last_block = uopt.lastblock;
-	sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
-	sbi->s_anchor[2] = uopt.anchor;
-
-	if (udf_check_valid(sb, uopt.novrs, silent)) {
-		/* read volume recognition sequences */
-		printk(KERN_WARNING "UDF-fs: No VRS found\n");
-		goto error_out;
-	}
-
-	udf_find_anchor(sb);
-
 	/* Fill in the rest of the superblock */
 	sb->s_op = &udf_sb_ops;
 	sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
 
-	if (udf_load_sequence(sb, &fileset)) {
+	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+	} else {
+		uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+			if (!silent)
+				printk(KERN_NOTICE
+				       "UDF-fs: Rescanning with blocksize "
+				       "%d\n", UDF_DEFAULT_BLOCKSIZE);
+			uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
+			ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		}
+	}
+	if (!ret) {
 		printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
 		goto error_out;
 	}
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	}
 
 	if (!silent) {
-		timestamp ts;
+		struct timestamp ts;
 		udf_time_to_disk_stamp(&ts, sbi->s_record_time);
 		udf_info("UDF: Mounting volume '%s', "
 			 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* Assign the root inode */
 	/* assign inodes by physical block number */
 	/* perhaps it's not extensible enough, but for now ... */
-	inode = udf_iget(sb, rootdir);
+	inode = udf_iget(sb, &rootdir);
 	if (!inode) {
 		printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
 				"partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
+static int udf_sync_fs(struct super_block *sb, int wait)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	if (sbi->s_lvid_dirty) {
+		/*
+		 * Blockdevice will be synced later so we don't have to submit
+		 * the buffer for IO
+		 */
+		mark_buffer_dirty(sbi->s_lvid_bh);
+		sb->s_dirt = 0;
+		sbi->s_lvid_dirty = 0;
+	}
+	mutex_unlock(&sbi->s_alloc_mutex);
+
+	return 0;
+}
+
 static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	if (sbi->s_lvid_bh != NULL)
 		lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 					  le32_to_cpu(lvidiu->numDirs)) : 0)
 			+ buf->f_bfree;
 	buf->f_ffree = buf->f_bfree;
-	/* __kernel_fsid_t f_fsid */
 	buf->f_namelen = UDF_NAME_LEN - 2;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 	unsigned int accum = 0;
 	int index;
 	int block = 0, newblock;
-	kernel_lb_addr loc;
+	struct kernel_lb_addr loc;
 	uint32_t bytes;
 	uint8_t *ptr;
 	uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 
 	loc.logicalBlockNum = bitmap->s_extPosition;
 	loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
-	bh = udf_read_ptagged(sb, loc, 0, &ident);
+	bh = udf_read_ptagged(sb, &loc, 0, &ident);
 
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 		bytes -= cur_bytes;
 		if (bytes) {
 			brelse(bh);
-			newblock = udf_get_lb_pblock(sb, loc, ++block);
+			newblock = udf_get_lb_pblock(sb, &loc, ++block);
 			bh = udf_tread(sb, newblock);
 			if (!bh) {
 				udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
 {
 	unsigned int accum = 0;
 	uint32_t elen;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	int8_t etype;
 	struct extent_position epos;
 
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f9424..225527cdc885 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
 #include "udf_sb.h"
 
 static void extent_trunc(struct inode *inode, struct extent_position *epos,
-			 kernel_lb_addr eloc, int8_t etype, uint32_t elen,
+			 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
 			 uint32_t nelen)
 {
-	kernel_lb_addr neloc = {};
+	struct kernel_lb_addr neloc = {};
 	int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
 					last_block);
 			etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
 		} else
-			neloc = eloc;
+			neloc = *eloc;
 		nelen = (etype << 30) | nelen;
 	}
 
 	if (elen != nelen) {
-		udf_write_aext(inode, epos, neloc, nelen, 0);
+		udf_write_aext(inode, epos, &neloc, nelen, 0);
 		if (last_block - first_block > 0) {
 			if (etype == (EXT_RECORDED_ALLOCATED >> 30))
 				mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
 void udf_truncate_tail_extent(struct inode *inode)
 {
 	struct extent_position epos = {};
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen, nelen;
 	uint64_t lbcount = 0;
 	int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
 		return;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		BUG();
 
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
 				       (unsigned)elen);
 			nelen = elen - (lbcount - inode->i_size);
 			epos.offset -= adsize;
-			extent_trunc(inode, &epos, eloc, etype, elen, nelen);
+			extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
 			epos.offset += adsize;
 			if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
 				printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
 void udf_discard_prealloc(struct inode *inode)
 {
 	struct extent_position epos = { NULL, 0, {0, 0} };
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint64_t lbcount = 0;
 	int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
 		return;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
 	if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 		epos.offset -= adsize;
 		lbcount -= elen;
-		extent_trunc(inode, &epos, eloc, etype, elen, 0);
+		extent_trunc(inode, &epos, &eloc, etype, elen, 0);
 		if (!epos.bh) {
 			iinfo->i_lenAlloc =
 				epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
 void udf_truncate_extents(struct inode *inode)
 {
 	struct extent_position epos;
-	kernel_lb_addr eloc, neloc = {};
+	struct kernel_lb_addr eloc, neloc = {};
 	uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
 	int8_t etype;
 	struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		BUG();
 
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
 		(inode->i_size & (sb->s_blocksize - 1));
 	if (etype != -1) {
 		epos.offset -= adsize;
-		extent_trunc(inode, &epos, eloc, etype, elen, byte_offset);
+		extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
 		epos.offset += adsize;
 		if (byte_offset)
 			lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
 		while ((etype = udf_current_aext(inode, &epos, &eloc,
 						 &elen, 0)) != -1) {
 			if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
-				udf_write_aext(inode, &epos, neloc, nelen, 0);
+				udf_write_aext(inode, &epos, &neloc, nelen, 0);
 				if (indirect_ext_len) {
 					/* We managed to free all extents in the
 					 * indirect extent - free it too */
 					BUG_ON(!epos.bh);
-					udf_free_blocks(sb, inode, epos.block,
+					udf_free_blocks(sb, inode, &epos.block,
 							0, indirect_ext_len);
 				} else if (!epos.bh) {
 					iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
 				epos.offset = sizeof(struct allocExtDesc);
 				epos.block = eloc;
 				epos.bh = udf_tread(sb,
-						udf_get_lb_pblock(sb, eloc, 0));
+						udf_get_lb_pblock(sb, &eloc, 0));
 				if (elen)
 					indirect_ext_len =
 						(elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
 				else
 					indirect_ext_len = 1;
 			} else {
-				extent_trunc(inode, &epos, eloc, etype,
+				extent_trunc(inode, &epos, &eloc, etype,
 					     elen, 0);
 				epos.offset += adsize;
 			}
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
 
 		if (indirect_ext_len) {
 			BUG_ON(!epos.bh);
-			udf_free_blocks(sb, inode, epos.block, 0,
+			udf_free_blocks(sb, inode, &epos.block, 0,
 					indirect_ext_len);
 		} else if (!epos.bh) {
 			iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
 			udf_update_alloc_ext_desc(inode, &epos, lenalloc);
 	} else if (inode->i_size) {
 		if (byte_offset) {
-			kernel_long_ad extent;
+			struct kernel_long_ad extent;
 
 			/*
 			 *  OK, there is not extent covering inode->i_size and
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5d..e58d1de41073 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
 struct udf_inode_info {
 	struct timespec		i_crtime;
 	/* Physical address of inode */
-	kernel_lb_addr		i_location;
+	struct kernel_lb_addr		i_location;
 	__u64			i_unique;
 	__u32			i_lenEAttr;
 	__u32			i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
 	unsigned		i_strat4096 : 1;
 	unsigned		reserved : 26;
 	union {
-		short_ad	*i_sad;
-		long_ad		*i_lad;
+		struct short_ad	*i_sad;
+		struct long_ad		*i_lad;
 		__u8		*i_data;
 	} i_ext;
 	struct inode vfs_inode;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a9725..d113b72c2768 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
 #define UDF_FLAG_GID_SET	16
 #define UDF_FLAG_SESSION_SET	17
 #define UDF_FLAG_LASTBLOCK_SET	18
+#define UDF_FLAG_BLOCKSIZE_SET	19
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
@@ -48,6 +49,8 @@
 #define UDF_SPARABLE_MAP15		0x1522U
 #define UDF_METADATA_MAP25		0x2511U
 
+#define UDF_INVALID_MODE		((mode_t)-1)
+
 #pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
 
 struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
 
 	/* Sector headers */
 	__s32			s_session;
-	__u32			s_anchor[3];
+	__u32			s_anchor;
 	__u32			s_last_block;
 
 	struct buffer_head	*s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
 	mode_t			s_umask;
 	gid_t			s_gid;
 	uid_t			s_uid;
+	mode_t			s_fmode;
+	mode_t			s_dmode;
 
 	/* Root Info */
 	struct timespec		s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
 	struct inode		*s_vat_inode;
 
 	struct mutex		s_alloc_mutex;
+	/* Protected by s_alloc_mutex */
+	unsigned int		s_lvid_dirty;
 };
 
 static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f13..cac51b77a5d1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
 		return 0;
 }
 
-#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
-
 /* computes tag checksum */
-u8 udf_tag_checksum(const tag *t);
+u8 udf_tag_checksum(const struct tag *t);
 
 struct dentry;
 struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
 };
 
 struct generic_desc {
-	tag		descTag;
+	struct tag	descTag;
 	__le32		volDescSeqNum;
 };
 
@@ -108,11 +106,22 @@ struct ustr {
 struct extent_position {
 	struct buffer_head *bh;
 	uint32_t offset;
-	kernel_lb_addr block;
+	struct kernel_lb_addr block;
 };
 
 /* super.c */
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
+static inline void udf_updated_lvid(struct super_block *sb)
+{
+	struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
+
+	BUG_ON(!bh);
+	WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
+		     bh->b_data)->integrityType !=
+		     cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
+	sb->s_dirt = 1;
+	UDF_SB(sb)->s_lvid_dirty = 1;
+}
 
 /* namei.c */
 extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
 		     unsigned long);
 
 /* inode.c */
-extern struct inode *udf_iget(struct super_block *, kernel_lb_addr);
+extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
 extern void udf_expand_file_adinicb(struct inode *, int, int *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
 extern int udf_write_inode(struct inode *, int);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
-			   kernel_long_ad *, sector_t);
+			   struct kernel_long_ad *, sector_t);
 extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
-			 kernel_lb_addr *, uint32_t *, sector_t *);
+			 struct kernel_lb_addr *, uint32_t *, sector_t *);
 extern int8_t udf_add_aext(struct inode *, struct extent_position *,
-			   kernel_lb_addr, uint32_t, int);
+			   struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_write_aext(struct inode *, struct extent_position *,
-			     kernel_lb_addr, uint32_t, int);
+			     struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_delete_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 extern int8_t udf_next_aext(struct inode *, struct extent_position *,
-			    kernel_lb_addr *, uint32_t *, int);
+			    struct kernel_lb_addr *, uint32_t *, int);
 extern int8_t udf_current_aext(struct inode *, struct extent_position *,
-			       kernel_lb_addr *, uint32_t *, int);
+			       struct kernel_lb_addr *, uint32_t *, int);
 
 /* misc.c */
 extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
 extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
 					   uint32_t, uint16_t *);
 extern struct buffer_head *udf_read_ptagged(struct super_block *,
-					    kernel_lb_addr, uint32_t,
+					    struct kernel_lb_addr *, uint32_t,
 					    uint16_t *);
 extern void udf_update_tag(char *, int);
 extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
 					  uint32_t);
 extern int udf_relocate_blocks(struct super_block *, long, long *);
 
+static inline uint32_t
+udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
+		  uint32_t offset)
+{
+	return udf_get_pblock(sb, loc->logicalBlockNum,
+			loc->partitionReferenceNum, offset);
+}
+
 /* unicode.c */
 extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
 
 /* balloc.c */
 extern void udf_free_blocks(struct super_block *, struct inode *,
-			    kernel_lb_addr, uint32_t, uint32_t);
+			    struct kernel_lb_addr *, uint32_t, uint32_t);
 extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
 			       uint32_t, uint32_t);
 extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
 						struct udf_fileident_bh *,
 						struct fileIdentDesc *,
 						struct extent_position *,
-						kernel_lb_addr *, uint32_t *,
+						struct kernel_lb_addr *, uint32_t *,
 						sector_t *);
 extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
 					       int *offset);
-extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
-extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
+extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
+extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 
 /* udftime.c */
 extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
-						timestamp src);
-extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
+						struct timestamp src);
+extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
 
 #endif				/* __UDF_DECL_H */
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428c..6a9f3a9cc428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
 #include <asm/byteorder.h>
 #include <linux/string.h>
 
-static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
+static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
 {
-	kernel_lb_addr out;
+	struct kernel_lb_addr out;
 
 	out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
 	out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
 	return out;
 }
 
-static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
+static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
 {
-	lb_addr out;
+	struct lb_addr out;
 
 	out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
 	out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
 	return out;
 }
 
-static inline short_ad lesa_to_cpu(short_ad in)
+static inline struct short_ad lesa_to_cpu(struct short_ad in)
 {
-	short_ad out;
+	struct short_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
 	return out;
 }
 
-static inline short_ad cpu_to_lesa(short_ad in)
+static inline struct short_ad cpu_to_lesa(struct short_ad in)
 {
-	short_ad out;
+	struct short_ad out;
 
 	out.extLength = cpu_to_le32(in.extLength);
 	out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
 	return out;
 }
 
-static inline kernel_long_ad lela_to_cpu(long_ad in)
+static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
 {
-	kernel_long_ad out;
+	struct kernel_long_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
 	return out;
 }
 
-static inline long_ad cpu_to_lela(kernel_long_ad in)
+static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
 {
-	long_ad out;
+	struct long_ad out;
 
 	out.extLength = cpu_to_le32(in.extLength);
 	out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
 	return out;
 }
 
-static inline kernel_extent_ad leea_to_cpu(extent_ad in)
+static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
 {
-	kernel_extent_ad out;
+	struct kernel_extent_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extLocation = le32_to_cpu(in.extLocation);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b5..b8c828c4d200 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
-struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
+struct timespec *
+udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
 {
 	int yday;
 	u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
 	return dest;
 }
 
-timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
+struct timestamp *
+udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
 {
 	long int days, rem, y;
 	const unsigned short int *ip;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58e..cefa8c8913e6 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 {
 	const uint8_t *ocu;
 	uint8_t cmp_id, ocu_len;
-	int i;
+	int i, len;
 
 
 	ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 		if (cmp_id == 16)
 			c = (c << 8) | ocu[i++];
 
-		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
-					      UDF_NAME_LEN - utf_o->u_len);
+		len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
+				    UDF_NAME_LEN - utf_o->u_len);
+		/* Valid character? */
+		if (len >= 0)
+			utf_o->u_len += len;
+		else
+			utf_o->u_name[utf_o->u_len++] = '?';
 	}
 	utf_o->u_cmpID = 8;
 
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
 			int length)
 {
-	unsigned len, i, max_val;
+	int len;
+	unsigned i, max_val;
 	uint16_t uni_char;
 	int u_len;
 
@@ -302,8 +308,13 @@ try_again:
 	u_len = 0U;
 	for (i = 0U; i < uni->u_len; i++) {
 		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
-		if (len <= 0)
+		if (!len)
 			continue;
+		/* Invalid character, deal with it */
+		if (len < 0) {
+			len = 1;
+			uni_char = '?';
+		}
 
 		if (uni_char > max_val) {
 			max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 		     int flen)
 {
-	struct ustr filename, unifilename;
-	int len;
+	struct ustr *filename, *unifilename;
+	int len = 0;
 
-	if (udf_build_ustr_exact(&unifilename, sname, flen))
+	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!filename)
 		return 0;
 
+	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!unifilename)
+		goto out1;
+
+	if (udf_build_ustr_exact(unifilename, sname, flen))
+		goto out2;
+
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		if (!udf_CS0toUTF8(&filename, &unifilename)) {
+		if (!udf_CS0toUTF8(filename, unifilename)) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
-			return 0;
+			goto out2;
 		}
 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
-				  &unifilename)) {
+		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+				  unifilename)) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
-			return 0;
+			goto out2;
 		}
 	} else
-		return 0;
-
-	len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
-				     unifilename.u_name, unifilename.u_len);
-	if (len)
-		return len;
-
-	return 0;
+		goto out2;
+
+	len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+				     unifilename->u_name, unifilename->u_len);
+out2:
+	kfree(unifilename);
+out1:
+	kfree(filename);
+	return len;
 }
 
 int udf_put_filename(struct super_block *sb, const uint8_t *sname,
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 0d9ada173739..54c16ec95dff 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 				   "bit already cleared for fragment %u", i);
 	}
 	
-	DQUOT_FREE_BLOCK (inode, count);
+	vfs_dq_free_block(inode, count);
 
 	
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
 		ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
-		DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
+		vfs_dq_free_block(inode, uspi->s_fpb);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
 		uspi->cs_total.cs_nbfree++;
@@ -556,7 +556,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 		fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
-	if(DQUOT_ALLOC_BLOCK(inode, count)) {
+	if (vfs_dq_alloc_block(inode, count)) {
 		*err = -EDQUOT;
 		return 0;
 	}
@@ -664,7 +664,7 @@ cg_found:
 		for (i = count; i < uspi->s_fpb; i++)
 			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 		i = uspi->s_fpb - count;
-		DQUOT_FREE_BLOCK(inode, i);
+		vfs_dq_free_block(inode, i);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
 		uspi->cs_total.cs_nffree += i;
@@ -676,7 +676,7 @@ cg_found:
 	result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
 	if (result == INVBLOCK)
 		return 0;
-	if(DQUOT_ALLOC_BLOCK(inode, count)) {
+	if (vfs_dq_alloc_block(inode, count)) {
 		*err = -EDQUOT;
 		return 0;
 	}
@@ -747,7 +747,7 @@ gotit:
 	ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, -1);
-	if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
+	if (vfs_dq_alloc_block(inode, uspi->s_fpb)) {
 		*err = -EDQUOT;
 		return INVBLOCK;
 	}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 6f5dcf006096..3527c00fef0d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
 
 	is_directory = S_ISDIR(inode->i_mode);
 
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	clear_inode (inode);
 
@@ -355,8 +355,8 @@ cg_found:
 
 	unlock_super (sb);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
+	if (vfs_dq_alloc_inode(inode)) {
+		vfs_dq_drop(inode);
 		err = -EDQUOT;
 		goto fail_without_unlock;
 	}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 39f877898565..3d2512c21f05 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -622,7 +622,6 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	mode_t mode;
-	unsigned i;
 
 	/*
 	 * Copy data to the in-core inode.
@@ -655,11 +654,12 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 
 	
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
+		memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
 	} else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
+		memcpy(ufsi->i_u1.i_symlink, ufs_inode->ui_u2.ui_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink) - 1);
+		ufsi->i_u1.i_symlink[sizeof(ufs_inode->ui_u2.ui_symlink) - 1] = 0;
 	}
 	return 0;
 }
@@ -669,7 +669,6 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	mode_t mode;
-	unsigned i;
 
 	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
 	/*
@@ -704,12 +703,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
 	*/
 
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufsi->i_u1.u2_i_data[i] =
-				ufs2_inode->ui_u2.ui_addr.ui_db[i];
+		memcpy(ufsi->i_u1.u2_i_data, &ufs2_inode->ui_u2.ui_addr,
+		       sizeof(ufs2_inode->ui_u2.ui_addr));
 	} else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
+		memcpy(ufsi->i_u1.i_symlink, ufs2_inode->ui_u2.ui_symlink,
+		       sizeof(ufs2_inode->ui_u2.ui_symlink) - 1);
+		ufsi->i_u1.i_symlink[sizeof(ufs2_inode->ui_u2.ui_symlink) - 1] = 0;
 	}
 	return 0;
 }
@@ -781,7 +780,6 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 {
 	struct super_block *sb = inode->i_sb;
  	struct ufs_inode_info *ufsi = UFS_I(inode);
- 	unsigned i;
 
 	ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
 	ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink);
@@ -809,12 +807,12 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 		/* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
 		ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0];
 	} else if (inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufs_inode->ui_u2.ui_addr.ui_db[i] = ufsi->i_u1.i_data[i];
+		memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.i_data,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
 	}
 	else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufs_inode->ui_u2.ui_symlink[i] = ufsi->i_u1.i_symlink[i];
+		memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink));
 	}
 
 	if (!inode->i_nlink)
@@ -825,7 +823,6 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
 {
 	struct super_block *sb = inode->i_sb;
  	struct ufs_inode_info *ufsi = UFS_I(inode);
- 	unsigned i;
 
 	UFSD("ENTER\n");
 	ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
@@ -850,11 +847,11 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
 		/* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
 		ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0];
 	} else if (inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufs_inode->ui_u2.ui_addr.ui_db[i] = ufsi->i_u1.u2_i_data[i];
+		memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.u2_i_data,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
 	} else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufs_inode->ui_u2.ui_symlink[i] = ufsi->i_u1.i_symlink[i];
+		memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink));
  	}
 
 	if (!inode->i_nlink)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index e3a9b1fac75a..23119fe7ad62 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -147,7 +147,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 	} else {
 		/* fast symlink */
 		inode->i_op = &ufs_fast_symlink_inode_operations;
-		memcpy((char*)&UFS_I(inode)->i_u1.i_data,symname,l);
+		memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l);
 		inode->i_size = l-1;
 	}
 	mark_inode_dirty(inode);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e65212dfb60e..60359291761f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -41,7 +41,7 @@
  * Stefan Reinauer <stepan@home.culture.mipt.ru>
  *
  * Module usage counts added on 96/04/29 by
- * Gertjan van Wingerde <gertjan@cs.vu.nl>
+ * Gertjan van Wingerde <gwingerde@gmail.com>
  *
  * Clean swab support on 19970406 by
  * Francois-Rene Rideau <fare@tunes.org>
@@ -636,6 +636,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned block_size, super_block_size;
 	unsigned flags;
 	unsigned super_block_offset;
+	unsigned maxsymlen;
 	int ret = -EINVAL;
 
 	uspi = NULL;
@@ -1069,6 +1070,16 @@ magic_found:
 		uspi->s_maxsymlinklen =
 		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
 
+	if (uspi->fs_magic == UFS2_MAGIC)
+		maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR);
+	else
+		maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR);
+	if (uspi->s_maxsymlinklen > maxsymlen) {
+		ufs_warning(sb, __func__, "ufs_read_super: excessive maximum "
+			    "fast symlink size (%u)\n", uspi->s_maxsymlinklen);
+		uspi->s_maxsymlinklen = maxsymlen;
+	}
+
 	inode = ufs_iget(sb, UFS_ROOTINO);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
@@ -1257,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ufs_super_block_first *usb1;
 	struct ufs_super_block_second *usb2;
 	struct ufs_super_block_third *usb3;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	lock_kernel();
 
@@ -1279,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
 	buf->f_files = uspi->s_ncg * uspi->s_ipg;
 	buf->f_namelen = UFS_MAXNAMLEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	unlock_kernel();
 
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 11c035168ea6..69b3427d7885 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -23,7 +23,7 @@ struct ufs_sb_info {
 struct ufs_inode_info {
 	union {
 		__fs32	i_data[15];
-		__u8	i_symlink[4*15];
+		__u8	i_symlink[2 * 4 * 15];
 		__fs64	u2_i_data[15];
 	} i_u1;
 	__u32	i_flags;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c3dc491fff89..60f107e47fe9 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix quota/, \
 				   xfs_qm_syscalls.o \
 				   xfs_qm_bhv.o \
 				   xfs_qm.o)
+xfs-$(CONFIG_XFS_QUOTA)		+= linux-2.6/xfs_quotaops.o
 
 ifeq ($(CONFIG_XFS_QUOTA),y)
 xfs-$(CONFIG_PROC_FS)		+= quota/xfs_qm_stats.o
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
deleted file mode 100644
index 2a88d56c4dc2..000000000000
--- a/fs/xfs/linux-2.6/mutex.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_MUTEX_H__
-#define __XFS_SUPPORT_MUTEX_H__
-
-#include <linux/mutex.h>
-
-typedef struct mutex mutex_t;
-
-#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198f771e..c13f67300fe7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = {
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= xfs_vm_direct_IO,
 	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d71dc44e21ed..aa1016bb9134 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -34,6 +34,12 @@
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
 
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
 STATIC int xfsbufd_wakeup(int, gfp_t);
@@ -166,6 +172,75 @@ test_page_region(
 }
 
 /*
+ *	Mapping of multi-page buffers into contiguous virtual space
+ */
+
+typedef struct a_list {
+	void		*vm_addr;
+	struct a_list	*next;
+} a_list_t;
+
+static a_list_t		*as_free_head;
+static int		as_list_len;
+static DEFINE_SPINLOCK(as_lock);
+
+/*
+ *	Try to batch vunmaps because they are costly.
+ */
+STATIC void
+free_address(
+	void		*addr)
+{
+	a_list_t	*aentry;
+
+#ifdef CONFIG_XEN
+	/*
+	 * Xen needs to be able to make sure it can get an exclusive
+	 * RO mapping of pages it wants to turn into a pagetable.  If
+	 * a newly allocated page is also still being vmap()ed by xfs,
+	 * it will cause pagetable construction to fail.  This is a
+	 * quick workaround to always eagerly unmap pages so that Xen
+	 * is happy.
+	 */
+	vunmap(addr);
+	return;
+#endif
+
+	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
+	if (likely(aentry)) {
+		spin_lock(&as_lock);
+		aentry->next = as_free_head;
+		aentry->vm_addr = addr;
+		as_free_head = aentry;
+		as_list_len++;
+		spin_unlock(&as_lock);
+	} else {
+		vunmap(addr);
+	}
+}
+
+STATIC void
+purge_addresses(void)
+{
+	a_list_t	*aentry, *old;
+
+	if (as_free_head == NULL)
+		return;
+
+	spin_lock(&as_lock);
+	aentry = as_free_head;
+	as_free_head = NULL;
+	as_list_len = 0;
+	spin_unlock(&as_lock);
+
+	while ((old = aentry) != NULL) {
+		vunmap(aentry->vm_addr);
+		aentry = aentry->next;
+		kfree(old);
+	}
+}
+
+/*
  *	Internal xfs_buf_t object manipulation
  */
 
@@ -264,7 +339,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
+			free_address(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -386,8 +461,10 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                       -1, PAGE_KERNEL);
+		if (as_list_len > 64)
+			purge_addresses();
+		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
+					VM_MAP, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
@@ -1364,10 +1441,12 @@ xfs_unregister_buftarg(
 
 void
 xfs_free_buftarg(
-	xfs_buftarg_t		*btp)
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*btp)
 {
 	xfs_flush_buftarg(btp, 1);
-	xfs_blkdev_issue_flush(btp);
+	if (mp->m_flags & XFS_MOUNT_BARRIER)
+		xfs_blkdev_issue_flush(btp);
 	xfs_free_bufhash(btp);
 	iput(btp->bt_mapping->host);
 
@@ -1672,6 +1751,8 @@ xfsbufd(
 			count++;
 		}
 
+		if (as_list_len > 0)
+			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 288ae7c4c800..9b4d666ad31f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
  *	Handling of buftargs.
  */
 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
-extern void xfs_free_buftarg(xfs_buftarg_t *);
+extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0c..f4e255441574 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
 STATIC int
 xfs_vm_page_mkwrite(
 	struct vm_area_struct	*vma,
-	struct page		*page)
+	struct vm_fault		*vmf)
 {
-	return block_page_mkwrite(vma, page, xfs_get_blocks);
+	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
 const struct file_operations xfs_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4bd112313f33..d0b499418a7d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -34,6 +34,7 @@
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_ioctl.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
@@ -78,92 +79,74 @@ xfs_find_handle(
 	int			hsize;
 	xfs_handle_t		handle;
 	struct inode		*inode;
+	struct file		*file = NULL;
+	struct path		path;
+	int			error;
+	struct xfs_inode	*ip;
 
-	memset((char *)&handle, 0, sizeof(handle));
-
-	switch (cmd) {
-	case XFS_IOC_PATH_TO_FSHANDLE:
-	case XFS_IOC_PATH_TO_HANDLE: {
-		struct path path;
-		int error = user_lpath((const char __user *)hreq->path, &path);
+	if (cmd == XFS_IOC_FD_TO_HANDLE) {
+		file = fget(hreq->fd);
+		if (!file)
+			return -EBADF;
+		inode = file->f_path.dentry->d_inode;
+	} else {
+		error = user_lpath((const char __user *)hreq->path, &path);
 		if (error)
 			return error;
-
-		ASSERT(path.dentry);
-		ASSERT(path.dentry->d_inode);
-		inode = igrab(path.dentry->d_inode);
-		path_put(&path);
-		break;
+		inode = path.dentry->d_inode;
 	}
+	ip = XFS_I(inode);
 
-	case XFS_IOC_FD_TO_HANDLE: {
-		struct file	*file;
-
-		file = fget(hreq->fd);
-		if (!file)
-		    return -EBADF;
+	/*
+	 * We can only generate handles for inodes residing on a XFS filesystem,
+	 * and only for regular files, directories or symbolic links.
+	 */
+	error = -EINVAL;
+	if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+		goto out_put;
 
-		ASSERT(file->f_path.dentry);
-		ASSERT(file->f_path.dentry->d_inode);
-		inode = igrab(file->f_path.dentry->d_inode);
-		fput(file);
-		break;
-	}
+	error = -EBADF;
+	if (!S_ISREG(inode->i_mode) &&
+	    !S_ISDIR(inode->i_mode) &&
+	    !S_ISLNK(inode->i_mode))
+		goto out_put;
 
-	default:
-		ASSERT(0);
-		return -XFS_ERROR(EINVAL);
-	}
 
-	if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
-		/* we're not in XFS anymore, Toto */
-		iput(inode);
-		return -XFS_ERROR(EINVAL);
-	}
+	memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
 
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		break;
-	default:
-		iput(inode);
-		return -XFS_ERROR(EBADF);
-	}
-
-	/* now we can grab the fsid */
-	memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid,
-			sizeof(xfs_fsid_t));
-	hsize = sizeof(xfs_fsid_t);
-
-	if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
-		xfs_inode_t	*ip = XFS_I(inode);
+	if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
+		/*
+		 * This handle only contains an fsid, zero the rest.
+		 */
+		memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
+		hsize = sizeof(xfs_fsid_t);
+	} else {
 		int		lock_mode;
 
-		/* need to get access to the xfs_inode to read the generation */
 		lock_mode = xfs_ilock_map_shared(ip);
-
-		/* fill in fid section of handle from inode */
 		handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
 					sizeof(handle.ha_fid.fid_len);
 		handle.ha_fid.fid_pad = 0;
 		handle.ha_fid.fid_gen = ip->i_d.di_gen;
 		handle.ha_fid.fid_ino = ip->i_ino;
-
 		xfs_iunlock_map_shared(ip, lock_mode);
 
 		hsize = XFS_HSIZE(handle);
 	}
 
-	/* now copy our handle into the user buffer & write out the size */
+	error = -EFAULT;
 	if (copy_to_user(hreq->ohandle, &handle, hsize) ||
-	    copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
-		iput(inode);
-		return -XFS_ERROR(EFAULT);
-	}
+	    copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+		goto out_put;
 
-	iput(inode);
-	return 0;
+	error = 0;
+
+ out_put:
+	if (cmd == XFS_IOC_FD_TO_HANDLE)
+		fput(file);
+	else
+		path_put(&path);
+	return error;
 }
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 7aa53fefc67f..6075382336d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -211,8 +211,13 @@ xfs_vn_mknod(
 	 * Irix uses Missed'em'V split, but doesn't want to see
 	 * the upper 5 bits of (14bit) major.
 	 */
-	if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
-		return -EINVAL;
+	if (S_ISCHR(mode) || S_ISBLK(mode)) {
+		if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+			return -EINVAL;
+		rdev = sysv_encode_dev(rdev);
+	} else {
+		rdev = 0;
+	}
 
 	if (test_default_acl && test_default_acl(dir)) {
 		if (!_ACL_ALLOC(default_acl)) {
@@ -224,28 +229,11 @@ xfs_vn_mknod(
 		}
 	}
 
-	xfs_dentry_to_name(&name, dentry);
-
 	if (IS_POSIXACL(dir) && !default_acl)
-		mode &= ~current->fs->umask;
-
-	switch (mode & S_IFMT) {
-	case S_IFCHR:
-	case S_IFBLK:
-	case S_IFIFO:
-	case S_IFSOCK:
-		rdev = sysv_encode_dev(rdev);
-	case S_IFREG:
-		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
-		break;
-	case S_IFDIR:
-		error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
-		break;
-	default:
-		error = EINVAL;
-		break;
-	}
+		mode &= ~current_umask();
 
+	xfs_dentry_to_name(&name, dentry);
+	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
 	if (unlikely(error))
 		goto out_free_acl;
 
@@ -416,7 +404,7 @@ xfs_vn_symlink(
 	mode_t		mode;
 
 	mode = S_IFLNK |
-		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
 	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
@@ -553,9 +541,6 @@ xfs_vn_getattr(
 	stat->uid = ip->i_d.di_uid;
 	stat->gid = ip->i_d.di_gid;
 	stat->ino = ip->i_ino;
-#if XFS_BIG_INUMS
-	stat->ino += mp->m_inoadd;
-#endif
 	stat->atime = inode->i_atime;
 	stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 	stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 507492d6dccd..f65a53f8752f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -38,7 +38,6 @@
 #include <kmem.h>
 #include <mrlock.h>
 #include <sv.h>
-#include <mutex.h>
 #include <time.h>
 
 #include <support/ktrace.h>
@@ -51,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/file.h>
 #include <linux/swap.h>
 #include <linux/errno.h>
@@ -147,17 +147,6 @@
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
 
-/*
- * IRIX (BSD) quotactl makes use of separate commands for user/group,
- * whereas on Linux the syscall encodes this information into the cmd
- * field (see the QCMD macro in quota.h).  These macros help keep the
- * code portable - they are not visible from the syscall interface.
- */
-#define Q_XSETGQLIM	XQM_CMD(8)	/* set groups disk limits */
-#define Q_XGETGQUOTA	XQM_CMD(9)	/* get groups disk limits */
-#define Q_XSETPQLIM	XQM_CMD(10)	/* set projects disk limits */
-#define Q_XGETPQUOTA	XQM_CMD(11)	/* get projects disk limits */
-
 #define dfltprid	0
 #define MAXPATHLEN	1024
 
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644
index 000000000000..94d9a633d3d9
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_dmapi.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "quota/xfs_qm.h"
+#include <linux/quota.h>
+
+
+STATIC int
+xfs_quota_type(int type)
+{
+	switch (type) {
+	case USRQUOTA:
+		return XFS_DQ_USER;
+	case GRPQUOTA:
+		return XFS_DQ_GROUP;
+	default:
+		return XFS_DQ_PROJ;
+	}
+}
+
+STATIC int
+xfs_fs_quota_sync(
+	struct super_block	*sb,
+	int			type)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	return -xfs_sync_inodes(mp, SYNC_DELWRI);
+}
+
+STATIC int
+xfs_fs_get_xstate(
+	struct super_block	*sb,
+	struct fs_quota_stat	*fqs)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+STATIC int
+xfs_fs_set_xstate(
+	struct super_block	*sb,
+	unsigned int		uflags,
+	int			op)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+	unsigned int		flags = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (uflags & XFS_QUOTA_UDQ_ACCT)
+		flags |= XFS_UQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_PDQ_ACCT)
+		flags |= XFS_PQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_GDQ_ACCT)
+		flags |= XFS_GQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_UDQ_ENFD)
+		flags |= XFS_UQUOTA_ENFD;
+	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+		flags |= XFS_OQUOTA_ENFD;
+
+	switch (op) {
+	case Q_XQUOTAON:
+		return -xfs_qm_scall_quotaon(mp, flags);
+	case Q_XQUOTAOFF:
+		if (!XFS_IS_QUOTA_ON(mp))
+			return -EINVAL;
+		return -xfs_qm_scall_quotaoff(mp, flags);
+	case Q_XQUOTARM:
+		if (XFS_IS_QUOTA_ON(mp))
+			return -EINVAL;
+		return -xfs_qm_scall_trunc_qfiles(mp, flags);
+	}
+
+	return -EINVAL;
+}
+
+STATIC int
+xfs_fs_get_xquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+
+	return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+}
+
+STATIC int
+xfs_fs_set_xquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+}
+
+struct quotactl_ops xfs_quotactl_operations = {
+	.quota_sync		= xfs_fs_quota_sync,
+	.get_xstate		= xfs_fs_get_xstate,
+	.set_xstate		= xfs_fs_set_xstate,
+	.get_xquota		= xfs_fs_get_xquota,
+	.set_xquota		= xfs_fs_set_xquota,
+};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c71e226da7f5..bb685269f832 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -68,7 +68,6 @@
 #include <linux/freezer.h>
 #include <linux/parser.h>
 
-static struct quotactl_ops xfs_quotactl_operations;
 static struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
@@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_RTDEV	"rtdev"		/* realtime I/O device */
 #define MNTOPT_BIOSIZE	"biosize"	/* log2 of preferred buffered io size */
 #define MNTOPT_WSYNC	"wsync"		/* safe-mode nfs compatible mount */
-#define MNTOPT_INO64	"ino64"		/* force inodes into 64-bit range */
 #define MNTOPT_NOALIGN	"noalign"	/* turn off stripe alignment */
 #define MNTOPT_SWALLOC	"swalloc"	/* turn on stripe width allocation */
 #define MNTOPT_SUNIT	"sunit"		/* data volume stripe unit */
@@ -180,7 +178,7 @@ xfs_parseargs(
 	int			dswidth = 0;
 	int			iosize = 0;
 	int			dmapi_implies_ikeep = 1;
-	uchar_t			iosizelog = 0;
+	__uint8_t		iosizelog = 0;
 
 	/*
 	 * Copy binary VFS mount flags we are interested in.
@@ -291,16 +289,6 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
 		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
 			mp->m_flags |= XFS_MOUNT_NORECOVERY;
-		} else if (!strcmp(this_char, MNTOPT_INO64)) {
-#if XFS_BIG_INUMS
-			mp->m_flags |= XFS_MOUNT_INO64;
-			mp->m_inoadd = XFS_INO64_OFFSET;
-#else
-			cmn_err(CE_WARN,
-				"XFS: %s option not allowed on this system",
-				this_char);
-			return EINVAL;
-#endif
 		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
 			mp->m_flags |= XFS_MOUNT_NOALIGN;
 		} else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
@@ -529,7 +517,6 @@ xfs_showargs(
 		/* the few simple ones we can get from the mount struct */
 		{ XFS_MOUNT_IKEEP,		"," MNTOPT_IKEEP },
 		{ XFS_MOUNT_WSYNC,		"," MNTOPT_WSYNC },
-		{ XFS_MOUNT_INO64,		"," MNTOPT_INO64 },
 		{ XFS_MOUNT_NOALIGN,		"," MNTOPT_NOALIGN },
 		{ XFS_MOUNT_SWALLOC,		"," MNTOPT_SWALLOC },
 		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
@@ -634,7 +621,7 @@ xfs_max_file_offset(
 	return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
-int
+STATIC int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
 	const char		*name,
@@ -651,7 +638,7 @@ xfs_blkdev_get(
 	return -error;
 }
 
-void
+STATIC void
 xfs_blkdev_put(
 	struct block_device	*bdev)
 {
@@ -734,15 +721,15 @@ xfs_close_devices(
 {
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
 		struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-		xfs_free_buftarg(mp->m_logdev_targp);
+		xfs_free_buftarg(mp, mp->m_logdev_targp);
 		xfs_blkdev_put(logdev);
 	}
 	if (mp->m_rtdev_targp) {
 		struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-		xfs_free_buftarg(mp->m_rtdev_targp);
+		xfs_free_buftarg(mp, mp->m_rtdev_targp);
 		xfs_blkdev_put(rtdev);
 	}
-	xfs_free_buftarg(mp->m_ddev_targp);
+	xfs_free_buftarg(mp, mp->m_ddev_targp);
 }
 
 /*
@@ -811,9 +798,9 @@ xfs_open_devices(
 
  out_free_rtdev_targ:
 	if (mp->m_rtdev_targp)
-		xfs_free_buftarg(mp->m_rtdev_targp);
+		xfs_free_buftarg(mp, mp->m_rtdev_targp);
  out_free_ddev_targ:
-	xfs_free_buftarg(mp->m_ddev_targp);
+	xfs_free_buftarg(mp, mp->m_ddev_targp);
  out_close_rtdev:
 	if (rtdev)
 		xfs_blkdev_put(rtdev);
@@ -872,7 +859,7 @@ xfsaild_wakeup(
 	wake_up_process(ailp->xa_task);
 }
 
-int
+STATIC int
 xfsaild(
 	void	*data)
 {
@@ -990,26 +977,57 @@ xfs_fs_write_inode(
 	int			sync)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
 	int			error = 0;
-	int			flags = 0;
 
 	xfs_itrace_entry(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
 	if (sync) {
 		error = xfs_wait_on_pages(ip, 0, -1);
 		if (error)
-			goto out_error;
-		flags |= FLUSH_SYNC;
+			goto out;
 	}
-	error = xfs_inode_flush(ip, flags);
 
-out_error:
+	/*
+	 * Bypass inodes which have already been cleaned by
+	 * the inode flush clustering code inside xfs_iflush
+	 */
+	if (xfs_inode_clean(ip))
+		goto out;
+
+	/*
+	 * We make this non-blocking if the inode is contended, return
+	 * EAGAIN to indicate to the caller that they did not succeed.
+	 * This prevents the flush path from blocking on inodes inside
+	 * another operation right now, they get caught later by xfs_sync.
+	 */
+	if (sync) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		xfs_iflock(ip);
+
+		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+	} else {
+		error = EAGAIN;
+		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+			goto out;
+		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+			goto out_unlock;
+
+		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+	}
+
+ out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ out:
 	/*
 	 * if we failed to write out the inode then mark
 	 * it dirty again so we'll try again later.
 	 */
 	if (error)
 		xfs_mark_inode_dirty_sync(ip);
-
 	return -error;
 }
 
@@ -1169,18 +1187,12 @@ xfs_fs_statfs(
 	statp->f_bfree = statp->f_bavail =
 				sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
 	fakeinos = statp->f_bfree << sbp->sb_inopblog;
-#if XFS_BIG_INUMS
-	fakeinos += mp->m_inoadd;
-#endif
 	statp->f_files =
 	    MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
 	if (mp->m_maxicount)
-#if XFS_BIG_INUMS
-		if (!mp->m_inoadd)
-#endif
-			statp->f_files = min_t(typeof(statp->f_files),
-						statp->f_files,
-						mp->m_maxicount);
+		statp->f_files = min_t(typeof(statp->f_files),
+					statp->f_files,
+					mp->m_maxicount);
 	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
 	spin_unlock(&mp->m_sb_lock);
 
@@ -1302,57 +1314,6 @@ xfs_fs_show_options(
 	return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
 }
 
-STATIC int
-xfs_fs_quotasync(
-	struct super_block	*sb,
-	int			type)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL);
-}
-
-STATIC int
-xfs_fs_getxstate(
-	struct super_block	*sb,
-	struct fs_quota_stat	*fqs)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
-}
-
-STATIC int
-xfs_fs_setxstate(
-	struct super_block	*sb,
-	unsigned int		flags,
-	int			op)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags);
-}
-
-STATIC int
-xfs_fs_getxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb),
-				 (type == USRQUOTA) ? Q_XGETQUOTA :
-				  ((type == GRPQUOTA) ? Q_XGETGQUOTA :
-				   Q_XGETPQUOTA), id, (caddr_t)fdq);
-}
-
-STATIC int
-xfs_fs_setxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	return -XFS_QM_QUOTACTL(XFS_M(sb),
-				 (type == USRQUOTA) ? Q_XSETQLIM :
-				  ((type == GRPQUOTA) ? Q_XSETGQLIM :
-				   Q_XSETPQLIM), id, (caddr_t)fdq);
-}
-
 /*
  * This function fills in xfs_mount_t fields based on mount args.
  * Note: the superblock _has_ now been read in.
@@ -1435,7 +1396,9 @@ xfs_fs_fill_super(
 	sb_min_blocksize(sb, BBSIZE);
 	sb->s_xattr = xfs_xattr_handlers;
 	sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
 	sb->s_qcop = &xfs_quotactl_operations;
+#endif
 	sb->s_op = &xfs_super_operations;
 
 	error = xfs_dmops_get(mp);
@@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = {
 	.show_options		= xfs_fs_show_options,
 };
 
-static struct quotactl_ops xfs_quotactl_operations = {
-	.quota_sync		= xfs_fs_quotasync,
-	.get_xstate		= xfs_fs_getxstate,
-	.set_xstate		= xfs_fs_setxstate,
-	.get_xquota		= xfs_fs_getxquota,
-	.set_xquota		= xfs_fs_setxquota,
-};
-
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index d5d776d4cd67..5a2ea3a21781 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
 extern struct xattr_handler *xfs_xattr_handlers[];
+extern struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 5f6de1efe1f6..04f058c848ae 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -19,6 +19,7 @@
 #define XFS_SYNC_H 1
 
 struct xfs_mount;
+struct xfs_perag;
 
 typedef struct bhv_vfs_sync_work {
 	struct list_head	w_list;
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f65983a230d3..ad7fbead4c97 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -41,11 +41,6 @@ struct attrlist_cursor_kern;
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
 /*
- * Flags for xfs_inode_flush
- */
-#define FLUSH_SYNC		1	/* wait for flush to complete	*/
-
-/*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
 #define FI_NONE			0	/* none */
@@ -55,33 +50,6 @@ struct attrlist_cursor_kern;
 					   the operation completes. */
 
 /*
- * Dealing with bad inodes
- */
-static inline int VN_BAD(struct inode *vp)
-{
-	return is_bad_inode(vp);
-}
-
-/*
- * Extracting atime values in various formats
- */
-static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
-{
-	bs_atime->tv_sec = vp->i_atime.tv_sec;
-	bs_atime->tv_nsec = vp->i_atime.tv_nsec;
-}
-
-static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
-{
-	*ts = vp->i_atime;
-}
-
-static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
-{
-	*tt = vp->i_atime.tv_sec;
-}
-
-/*
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 6543c0b29753..e4babcc63423 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -804,7 +804,7 @@ xfs_qm_dqlookup(
 	uint			flist_locked;
 	xfs_dquot_t		*d;
 
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	ASSERT(mutex_is_locked(&qh->qh_lock));
 
 	flist_locked = B_FALSE;
 
@@ -877,7 +877,7 @@ xfs_qm_dqlookup(
 			/*
 			 * move the dquot to the front of the hashchain
 			 */
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			ASSERT(mutex_is_locked(&qh->qh_lock));
 			if (dqp->HL_PREVP != &qh->qh_next) {
 				xfs_dqtrace_entry(dqp,
 						  "DQLOOKUP: HASH MOVETOFRONT");
@@ -892,13 +892,13 @@ xfs_qm_dqlookup(
 			}
 			xfs_dqtrace_entry(dqp, "LOOKUP END");
 			*O_dqpp = dqp;
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			ASSERT(mutex_is_locked(&qh->qh_lock));
 			return (0);
 		}
 	}
 
 	*O_dqpp = NULL;
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	ASSERT(mutex_is_locked(&qh->qh_lock));
 	return (1);
 }
 
@@ -956,7 +956,7 @@ xfs_qm_dqget(
 			ASSERT(ip->i_gdquot == NULL);
 	}
 #endif
-	XFS_DQ_HASH_LOCK(h);
+	mutex_lock(&h->qh_lock);
 
 	/*
 	 * Look in the cache (hashtable).
@@ -971,7 +971,7 @@ xfs_qm_dqget(
 		 */
 		ASSERT(*O_dqpp);
 		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-		XFS_DQ_HASH_UNLOCK(h);
+		mutex_unlock(&h->qh_lock);
 		xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
 		return (0);	/* success */
 	}
@@ -991,7 +991,7 @@ xfs_qm_dqget(
 	 * we don't keep the lock across a disk read
 	 */
 	version = h->qh_version;
-	XFS_DQ_HASH_UNLOCK(h);
+	mutex_unlock(&h->qh_lock);
 
 	/*
 	 * Allocate the dquot on the kernel heap, and read the ondisk
@@ -1056,7 +1056,7 @@ xfs_qm_dqget(
 	/*
 	 * Hashlock comes after ilock in lock order
 	 */
-	XFS_DQ_HASH_LOCK(h);
+	mutex_lock(&h->qh_lock);
 	if (version != h->qh_version) {
 		xfs_dquot_t *tmpdqp;
 		/*
@@ -1072,7 +1072,7 @@ xfs_qm_dqget(
 			 * and start over.
 			 */
 			xfs_qm_dqput(tmpdqp);
-			XFS_DQ_HASH_UNLOCK(h);
+			mutex_unlock(&h->qh_lock);
 			xfs_qm_dqdestroy(dqp);
 			XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
 			goto again;
@@ -1083,7 +1083,7 @@ xfs_qm_dqget(
 	 * Put the dquot at the beginning of the hash-chain and mp's list
 	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
 	 */
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
+	ASSERT(mutex_is_locked(&h->qh_lock));
 	dqp->q_hash = h;
 	XQM_HASHLIST_INSERT(h, dqp);
 
@@ -1102,7 +1102,7 @@ xfs_qm_dqget(
 	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
 
 	xfs_qm_mplist_unlock(mp);
-	XFS_DQ_HASH_UNLOCK(h);
+	mutex_unlock(&h->qh_lock);
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	xfs_dqtrace_entry(dqp, "DQGET DONE");
@@ -1440,7 +1440,7 @@ xfs_qm_dqpurge(
 	xfs_mount_t	*mp = dqp->q_mount;
 
 	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
+	ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
 
 	xfs_dqlock(dqp);
 	/*
@@ -1453,7 +1453,7 @@ xfs_qm_dqpurge(
 	 */
 	if (dqp->q_nrefs != 0) {
 		xfs_dqunlock(dqp);
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		mutex_unlock(&dqp->q_hash->qh_lock);
 		return (1);
 	}
 
@@ -1517,7 +1517,7 @@ xfs_qm_dqpurge(
 	memset(&dqp->q_core, 0, sizeof(dqp->q_core));
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
-	XFS_DQ_HASH_UNLOCK(thishash);
+	mutex_unlock(&thishash->qh_lock);
 	return (0);
 }
 
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index d443e93b4331..de0f402ddb4c 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -34,7 +34,7 @@
  */
 typedef struct xfs_dqhash {
 	struct xfs_dquot *qh_next;
-	mutex_t		  qh_lock;
+	struct mutex	  qh_lock;
 	uint		  qh_version;	/* ever increasing version */
 	uint		  qh_nelems;	/* number of dquots on the list */
 } xfs_dqhash_t;
@@ -81,7 +81,7 @@ typedef struct xfs_dquot {
 	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
 	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
 	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
-	mutex_t		 q_qlock;	/* quota lock */
+	struct mutex	 q_qlock;	/* quota lock */
 	struct completion q_flush;	/* flush completion queue */
 	atomic_t          q_pincount;	/* dquot pin count */
 	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
@@ -109,19 +109,6 @@ enum {
 
 #define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
 
-#ifdef DEBUG
-static inline int
-XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
-{
-	if (mutex_trylock(&dqp->q_qlock)) {
-		mutex_unlock(&dqp->q_qlock);
-		return 0;
-	}
-	return 1;
-}
-#endif
-
-
 /*
  * Manage the q_flush completion queue embedded in the dquot.  This completion
  * queue synchronizes processes attempting to flush the in-core dquot back to
@@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
 	complete(&dqp->q_flush);
 }
 
+#define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7a2beb64314f..5b6695049e00 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,7 +55,7 @@
  * quota functionality, including maintaining the freelist and hash
  * tables of dquots.
  */
-mutex_t		xfs_Gqm_lock;
+struct mutex	xfs_Gqm_lock;
 struct xfs_qm	*xfs_Gqm;
 uint		ndquot;
 
@@ -69,8 +69,6 @@ STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
 STATIC void	xfs_qm_freelist_init(xfs_frlist_t *);
 STATIC void	xfs_qm_freelist_destroy(xfs_frlist_t *);
-STATIC int	xfs_qm_mplist_nowait(xfs_mount_t *);
-STATIC int	xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
 
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
@@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = {
 };
 
 #ifdef DEBUG
-extern mutex_t	qcheck_lock;
+extern struct mutex	qcheck_lock;
 #endif
 
 #ifdef QUOTADEBUG
@@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref(
 	 * the structure could disappear between the entry to this routine and
 	 * a HOLD operation if not locked.
 	 */
-	XFS_QM_LOCK(xfs_Gqm);
+	mutex_lock(&xfs_Gqm_lock);
 
 	if (xfs_Gqm == NULL)
 		xfs_Gqm = xfs_Gqm_init();
@@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref(
 	 * debugging and statistical purposes, but ...
 	 * Just take a reference and get out.
 	 */
-	XFS_QM_HOLD(xfs_Gqm);
-	XFS_QM_UNLOCK(xfs_Gqm);
+	xfs_Gqm->qm_nrefs++;
+	mutex_unlock(&xfs_Gqm_lock);
 
 	return 0;
 }
@@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref(
 	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
 	 * be restarted.
 	 */
-	XFS_QM_LOCK(xfs_Gqm);
-	XFS_QM_RELE(xfs_Gqm);
-	if (xfs_Gqm->qm_nrefs == 0) {
+	mutex_lock(&xfs_Gqm_lock);
+	if (--xfs_Gqm->qm_nrefs == 0) {
 		xfs_qm_destroy(xfs_Gqm);
 		xfs_Gqm = NULL;
 	}
-	XFS_QM_UNLOCK(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm_lock);
 }
 
 /*
@@ -577,10 +574,10 @@ xfs_qm_dqpurge_int(
 			continue;
 		}
 
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
 			nrecl = XFS_QI_MPLRECLAIMS(mp);
 			xfs_qm_mplist_unlock(mp);
-			XFS_DQ_HASH_LOCK(dqp->q_hash);
+			mutex_lock(&dqp->q_hash->qh_lock);
 			xfs_qm_mplist_lock(mp);
 
 			/*
@@ -590,7 +587,7 @@ xfs_qm_dqpurge_int(
 			 * this point, but somebody might be taking things off.
 			 */
 			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
-				XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+				mutex_unlock(&dqp->q_hash->qh_lock);
 				goto again;
 			}
 		}
@@ -632,7 +629,6 @@ xfs_qm_dqattach_one(
 	xfs_dqid_t	id,
 	uint		type,
 	uint		doalloc,
-	uint		dolock,
 	xfs_dquot_t	*udqhint, /* hint */
 	xfs_dquot_t	**IO_idqpp)
 {
@@ -641,16 +637,16 @@ xfs_qm_dqattach_one(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	error = 0;
+
 	/*
 	 * See if we already have it in the inode itself. IO_idqpp is
 	 * &i_udquot or &i_gdquot. This made the code look weird, but
 	 * made the logic a lot simpler.
 	 */
-	if ((dqp = *IO_idqpp)) {
-		if (dolock)
-			xfs_dqlock(dqp);
+	dqp = *IO_idqpp;
+	if (dqp) {
 		xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
-		goto done;
+		return 0;
 	}
 
 	/*
@@ -659,38 +655,38 @@ xfs_qm_dqattach_one(
 	 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
 	 * the user dquot.
 	 */
-	ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-	if (udqhint && !dolock)
+	if (udqhint) {
+		ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
 		xfs_dqlock(udqhint);
 
-	/*
-	 * No need to take dqlock to look at the id.
-	 * The ID can't change until it gets reclaimed, and it won't
-	 * be reclaimed as long as we have a ref from inode and we hold
-	 * the ilock.
-	 */
-	if (udqhint &&
-	    (dqp = udqhint->q_gdquot) &&
-	    (be32_to_cpu(dqp->q_core.d_id) == id)) {
-		ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-		xfs_dqlock(dqp);
-		XFS_DQHOLD(dqp);
-		ASSERT(*IO_idqpp == NULL);
-		*IO_idqpp = dqp;
-		if (!dolock) {
+		/*
+		 * No need to take dqlock to look at the id.
+		 *
+		 * The ID can't change until it gets reclaimed, and it won't
+		 * be reclaimed as long as we have a ref from inode and we
+		 * hold the ilock.
+		 */
+		dqp = udqhint->q_gdquot;
+		if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
+			xfs_dqlock(dqp);
+			XFS_DQHOLD(dqp);
+			ASSERT(*IO_idqpp == NULL);
+			*IO_idqpp = dqp;
+
 			xfs_dqunlock(dqp);
 			xfs_dqunlock(udqhint);
+			return 0;
 		}
-		goto done;
-	}
-	/*
-	 * We can't hold a dquot lock when we call the dqget code.
-	 * We'll deadlock in no time, because of (not conforming to)
-	 * lock ordering - the inodelock comes before any dquot lock,
-	 * and we may drop and reacquire the ilock in xfs_qm_dqget().
-	 */
-	if (udqhint)
+
+		/*
+		 * We can't hold a dquot lock when we call the dqget code.
+		 * We'll deadlock in no time, because of (not conforming to)
+		 * lock ordering - the inodelock comes before any dquot lock,
+		 * and we may drop and reacquire the ilock in xfs_qm_dqget().
+		 */
 		xfs_dqunlock(udqhint);
+	}
+
 	/*
 	 * Find the dquot from somewhere. This bumps the
 	 * reference count of dquot and returns it locked.
@@ -698,48 +694,19 @@ xfs_qm_dqattach_one(
 	 * disk and we didn't ask it to allocate;
 	 * ESRCH if quotas got turned off suddenly.
 	 */
-	if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
-				 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
-		if (udqhint && dolock)
-			xfs_dqlock(udqhint);
-		goto done;
-	}
+	error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+	if (error)
+		return error;
 
 	xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+
 	/*
 	 * dqget may have dropped and re-acquired the ilock, but it guarantees
 	 * that the dquot returned is the one that should go in the inode.
 	 */
 	*IO_idqpp = dqp;
-	ASSERT(dqp);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! dolock) {
-		xfs_dqunlock(dqp);
-		goto done;
-	}
-	if (! udqhint)
-		goto done;
-
-	ASSERT(udqhint);
-	ASSERT(dolock);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! xfs_qm_dqlock_nowait(udqhint)) {
-		xfs_dqunlock(dqp);
-		xfs_dqlock(udqhint);
-		xfs_dqlock(dqp);
-	}
-      done:
-#ifdef QUOTADEBUG
-	if (udqhint) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-	}
-	if (! error) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	}
-#endif
-	return error;
+	xfs_dqunlock(dqp);
+	return 0;
 }
 
 
@@ -754,24 +721,15 @@ xfs_qm_dqattach_one(
 STATIC void
 xfs_qm_dqattach_grouphint(
 	xfs_dquot_t	*udq,
-	xfs_dquot_t	*gdq,
-	uint		locked)
+	xfs_dquot_t	*gdq)
 {
 	xfs_dquot_t	*tmp;
 
-#ifdef QUOTADEBUG
-	if (locked) {
-		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		ASSERT(XFS_DQ_IS_LOCKED(gdq));
-	}
-#endif
-	if (! locked)
-		xfs_dqlock(udq);
+	xfs_dqlock(udq);
 
 	if ((tmp = udq->q_gdquot)) {
 		if (tmp == gdq) {
-			if (! locked)
-				xfs_dqunlock(udq);
+			xfs_dqunlock(udq);
 			return;
 		}
 
@@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint(
 		 * because the freelist lock comes before dqlocks.
 		 */
 		xfs_dqunlock(udq);
-		if (locked)
-			xfs_dqunlock(gdq);
 		/*
 		 * we took a hard reference once upon a time in dqget,
 		 * so give it back when the udquot no longer points at it
@@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint(
 
 	} else {
 		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		if (! locked) {
-			xfs_dqlock(gdq);
-		}
+		xfs_dqlock(gdq);
 	}
 
 	ASSERT(XFS_DQ_IS_LOCKED(udq));
@@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint(
 		XFS_DQHOLD(gdq);
 		udq->q_gdquot = gdq;
 	}
-	if (! locked) {
-		xfs_dqunlock(gdq);
-		xfs_dqunlock(udq);
-	}
+
+	xfs_dqunlock(gdq);
+	xfs_dqunlock(udq);
 }
 
 
@@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint(
  * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
  * into account.
  * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
- * much made this code a complete mess, but it has been pretty useful.
  * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
  * Inode may get unlocked and relocked in here, and the caller must deal with
  * the consequences.
@@ -851,7 +802,6 @@ xfs_qm_dqattach(
 	if (XFS_IS_UQUOTA_ON(mp)) {
 		error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						NULL, &ip->i_udquot);
 		if (error)
 			goto done;
@@ -863,11 +813,9 @@ xfs_qm_dqattach(
 		error = XFS_IS_GQUOTA_ON(mp) ?
 			xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						ip->i_udquot, &ip->i_gdquot) :
 			xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
 						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
 						ip->i_udquot, &ip->i_gdquot);
 		/*
 		 * Don't worry about the udquot that we may have
@@ -898,22 +846,13 @@ xfs_qm_dqattach(
 		/*
 		 * Attach i_gdquot to the gdquot hint inside the i_udquot.
 		 */
-		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
-					 flags & XFS_QMOPT_DQLOCK);
+		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
 	}
 
       done:
 
 #ifdef QUOTADEBUG
 	if (! error) {
-		if (ip->i_udquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
-		}
-		if (ip->i_gdquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
-		}
 		if (XFS_IS_UQUOTA_ON(mp))
 			ASSERT(ip->i_udquot);
 		if (XFS_IS_OQUOTA_ON(mp))
@@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist(
 		 * a dqlookup process that holds the hashlock that is
 		 * waiting for the freelist lock.
 		 */
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
 			dqp = dqp->dq_flnext;
@@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist(
 			/* XXX put a sentinel so that we can come back here */
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
-			XFS_DQ_HASH_UNLOCK(hash);
+			mutex_unlock(&hash->qh_lock);
 			xfs_qm_freelist_unlock(xfs_Gqm);
 			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
 				return nreclaimed;
@@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist(
 		XQM_HASHLIST_REMOVE(hash, dqp);
 		xfs_dqfunlock(dqp);
 		xfs_qm_mplist_unlock(dqp->q_mount);
-		XFS_DQ_HASH_UNLOCK(hash);
+		mutex_unlock(&hash->qh_lock);
 
  off_freelist:
 		XQM_FREELIST_REMOVE(dqp);
@@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void)
 			continue;
 		}
 
-		if (! xfs_qm_dqhashlock_nowait(dqp))
+		if (!mutex_trylock(&dqp->q_hash->qh_lock))
 			goto mplistunlock;
 
 		ASSERT(dqp->q_nrefs == 0);
@@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void)
 		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
 		XQM_FREELIST_REMOVE(dqp);
 		dqpout = dqp;
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		mutex_unlock(&dqp->q_hash->qh_lock);
  mplistunlock:
 		xfs_qm_mplist_unlock(dqp->q_mount);
 		xfs_dqfunlock(dqp);
@@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
 {
 	xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
 }
-
-STATIC int
-xfs_qm_dqhashlock_nowait(
-	xfs_dquot_t *dqp)
-{
-	int locked;
-
-	locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
-	return locked;
-}
-
-int
-xfs_qm_freelist_lock_nowait(
-	xfs_qm_t *xqm)
-{
-	int locked;
-
-	locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
-	return locked;
-}
-
-STATIC int
-xfs_qm_mplist_nowait(
-	xfs_mount_t	*mp)
-{
-	int locked;
-
-	ASSERT(mp->m_quotainfo);
-	locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
-	return locked;
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index ddf09166387c..a371954cae1b 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -27,7 +27,7 @@ struct xfs_qm;
 struct xfs_inode;
 
 extern uint		ndquot;
-extern mutex_t		xfs_Gqm_lock;
+extern struct mutex	xfs_Gqm_lock;
 extern struct xfs_qm	*xfs_Gqm;
 extern kmem_zone_t	*qm_dqzone;
 extern kmem_zone_t	*qm_dqtrxzone;
@@ -79,7 +79,7 @@ typedef xfs_dqhash_t	xfs_dqlist_t;
 typedef struct xfs_frlist {
        struct xfs_dquot *qh_next;
        struct xfs_dquot *qh_prev;
-       mutex_t		 qh_lock;
+       struct mutex	 qh_lock;
        uint		 qh_version;
        uint		 qh_nelems;
 } xfs_frlist_t;
@@ -115,7 +115,7 @@ typedef struct xfs_quotainfo {
 	xfs_qwarncnt_t	 qi_bwarnlimit;	 /* limit for blks warnings */
 	xfs_qwarncnt_t	 qi_iwarnlimit;	 /* limit for inodes warnings */
 	xfs_qwarncnt_t	 qi_rtbwarnlimit;/* limit for rt blks warnings */
-	mutex_t		 qi_quotaofflock;/* to serialize quotaoff */
+	struct mutex	 qi_quotaofflock;/* to serialize quotaoff */
 	xfs_filblks_t	 qi_dqchunklen;	 /* # BBs in a chunk of dqs */
 	uint		 qi_dqperchunk;	 /* # ondisk dqs in above chunk */
 	xfs_qcnt_t	 qi_bhardlimit;	 /* default data blk hard limit */
@@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_IWARNLIMIT	5
 #define XFS_QM_RTBWARNLIMIT	5
 
-#define XFS_QM_LOCK(xqm)	(mutex_lock(&xqm##_lock))
-#define XFS_QM_UNLOCK(xqm)	(mutex_unlock(&xqm##_lock))
-#define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
-#define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
-
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
 extern void		xfs_qm_mount_quotas(xfs_mount_t *);
 extern int		xfs_qm_quotacheck(xfs_mount_t *);
@@ -178,6 +173,16 @@ extern void		xfs_qm_dqdetach(xfs_inode_t *);
 extern int		xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void		xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
 
+/* quota ops */
+extern int		xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+extern int		xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+extern int		xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+extern int		xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+extern int		xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+extern int		xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+
 /* vop stuff */
 extern int		xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
 					uid_t, gid_t, prid_t, uint,
@@ -194,11 +199,6 @@ extern int		xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
 /* list stuff */
 extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
 extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
-extern int		xfs_qm_freelist_lock_nowait(xfs_qm_t *);
-
-/* system call interface */
-extern int		xfs_qm_quotactl(struct xfs_mount *, int, int,
-				xfs_caddr_t);
 
 #ifdef DEBUG
 extern int		xfs_qm_internalqcheck(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bc6c5cca3e12..63037c689a4b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = {
 	.xfs_dqvopchownresv	= xfs_qm_vop_chown_reserve,
 	.xfs_dqstatvfs		= xfs_qm_statvfs,
 	.xfs_dqsync		= xfs_qm_sync,
-	.xfs_quotactl		= xfs_qm_quotactl,
 	.xfs_dqtrxops		= &xfs_trans_dquot_ops,
 };
 EXPORT_SYMBOL(xfs_qmcore_xfs);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68139b38aede..c7b66f6506ce 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -57,135 +57,16 @@
 # define qdprintk(s, args...)	do { } while (0)
 #endif
 
-STATIC int	xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-STATIC int	xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
 					uint);
-STATIC uint	xfs_qm_import_flags(uint);
 STATIC uint	xfs_qm_export_flags(uint);
-STATIC uint	xfs_qm_import_qtype_flags(uint);
 STATIC uint	xfs_qm_export_qtype_flags(uint);
 STATIC void	xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
 					fs_disk_quota_t *);
 
 
 /*
- * The main distribution switch of all XFS quotactl system calls.
- */
-int
-xfs_qm_quotactl(
-	xfs_mount_t	*mp,
-	int		cmd,
-	int		id,
-	xfs_caddr_t	addr)
-{
-	int		error;
-
-	ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
-
-	/*
-	 * The following commands are valid even when quotaoff.
-	 */
-	switch (cmd) {
-	case Q_XQUOTARM:
-		/*
-		 * Truncate quota files. quota must be off.
-		 */
-		if (XFS_IS_QUOTA_ON(mp))
-			return XFS_ERROR(EINVAL);
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_trunc_qfiles(mp,
-			       xfs_qm_import_qtype_flags(*(uint *)addr)));
-
-	case Q_XGETQSTAT:
-		/*
-		 * Get quota status information.
-		 */
-		return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
-
-	case Q_XQUOTAON:
-		/*
-		 * QUOTAON - enabling quota enforcement.
-		 * Quota accounting must be turned on at mount time.
-		 */
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_quotaon(mp,
-					  xfs_qm_import_flags(*(uint *)addr)));
-
-	case Q_XQUOTAOFF:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		break;
-
-	case Q_XQUOTASYNC:
-		return xfs_sync_inodes(mp, SYNC_DELWRI);
-
-	default:
-		break;
-	}
-
-	if (! XFS_IS_QUOTA_ON(mp))
-		return XFS_ERROR(ESRCH);
-
-	switch (cmd) {
-	case Q_XQUOTAOFF:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_quotaoff(mp,
-					    xfs_qm_import_flags(*(uint *)addr),
-					    B_FALSE);
-		break;
-
-	case Q_XGETQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETGQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETPQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					(fs_disk_quota_t *)addr);
-		break;
-
-	case Q_XSETQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETGQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETPQLIM:
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					     (fs_disk_quota_t *)addr);
-		break;
-
-	default:
-		error = XFS_ERROR(EINVAL);
-		break;
-	}
-
-	return (error);
-}
-
-/*
  * Turn off quota accounting and/or enforcement for all udquots and/or
  * gdquots. Called only at unmount time.
  *
@@ -193,11 +74,10 @@ xfs_qm_quotactl(
  * incore, and modifies the ondisk dquot directly. Therefore, for example,
  * it is an error to call this twice, without purging the cache.
  */
-STATIC int
+int
 xfs_qm_scall_quotaoff(
 	xfs_mount_t		*mp,
-	uint			flags,
-	boolean_t		force)
+	uint			flags)
 {
 	uint			dqtype;
 	int			error;
@@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff(
 	xfs_qoff_logitem_t	*qoffstart;
 	int			nculprits;
 
-	if (!force && !capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
 	/*
 	 * No file system can have quotas enabled on disk but not in core.
 	 * Note that quota utilities (like quotaoff) _expect_
@@ -375,7 +253,7 @@ out_error:
 	return (error);
 }
 
-STATIC int
+int
 xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
@@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles(
 	int		error = 0, error2 = 0;
 	xfs_inode_t	*qip;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
@@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles(
  * effect immediately.
  * (Switching on quota accounting must be done at mount time.)
  */
-STATIC int
+int
 xfs_qm_scall_quotaon(
 	xfs_mount_t	*mp,
 	uint		flags)
@@ -426,9 +302,6 @@ xfs_qm_scall_quotaon(
 	uint		accflags;
 	__int64_t	sbflags;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
 	/*
 	 * Switching on quota accounting must be done at mount time.
@@ -517,7 +390,7 @@ xfs_qm_scall_quotaon(
 /*
  * Return quota status information, such as uquota-off, enforcements, etc.
  */
-STATIC int
+int
 xfs_qm_scall_getqstat(
 	xfs_mount_t	*mp,
 	fs_quota_stat_t *out)
@@ -582,7 +455,7 @@ xfs_qm_scall_getqstat(
 /*
  * Adjust quota limits, and start/stop timers accordingly.
  */
-STATIC int
+int
 xfs_qm_scall_setqlim(
 	xfs_mount_t		*mp,
 	xfs_dqid_t		id,
@@ -595,9 +468,6 @@ xfs_qm_scall_setqlim(
 	int			error;
 	xfs_qcnt_t		hard, soft;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
 	if ((newlim->d_fieldmask &
 	    (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
 		return (0);
@@ -742,7 +612,7 @@ xfs_qm_scall_setqlim(
 	return error;
 }
 
-STATIC int
+int
 xfs_qm_scall_getquota(
 	xfs_mount_t	*mp,
 	xfs_dqid_t	id,
@@ -935,30 +805,6 @@ xfs_qm_export_dquot(
 }
 
 STATIC uint
-xfs_qm_import_qtype_flags(
-	uint		uflags)
-{
-	uint		oflags = 0;
-
-	/*
-	 * Can't be more than one, or none.
-	 */
-	if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
-			(XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
-	    ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
-			(XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
-	    ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
-			(XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
-	    ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
-		return (0);
-
-	oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
-	oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
-	oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
-	return oflags;
-}
-
-STATIC uint
 xfs_qm_export_qtype_flags(
 	uint flags)
 {
@@ -979,26 +825,6 @@ xfs_qm_export_qtype_flags(
 }
 
 STATIC uint
-xfs_qm_import_flags(
-	uint uflags)
-{
-	uint flags = 0;
-
-	if (uflags & XFS_QUOTA_UDQ_ACCT)
-		flags |= XFS_UQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_PDQ_ACCT)
-		flags |= XFS_PQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_GDQ_ACCT)
-		flags |= XFS_GQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_UDQ_ENFD)
-		flags |= XFS_UQUOTA_ENFD;
-	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
-		flags |= XFS_OQUOTA_ENFD;
-	return (flags);
-}
-
-
-STATIC uint
 xfs_qm_export_flags(
 	uint flags)
 {
@@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab;
 xfs_dqhash_t *qmtest_gdqtab;
 int	      qmtest_hashmask;
 int	      qmtest_nfails;
-mutex_t	      qcheck_lock;
+struct mutex  qcheck_lock;
 
 #define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
 				 (__psunsigned_t)(id)) & \
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index c4fcea600bc2..8286b2842b6b 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -42,34 +42,24 @@
 #define XFS_QI_QOFFLOCK(mp)	((mp)->m_quotainfo->qi_quotaofflock)
 
 #define XFS_QI_MPL_LIST(mp)	((mp)->m_quotainfo->qi_dqlist)
-#define XFS_QI_MPLLOCK(mp)	((mp)->m_quotainfo->qi_dqlist.qh_lock)
 #define XFS_QI_MPLNEXT(mp)	((mp)->m_quotainfo->qi_dqlist.qh_next)
 #define XFS_QI_MPLNDQUOTS(mp)	((mp)->m_quotainfo->qi_dqlist.qh_nelems)
 
-#define XQMLCK(h)			(mutex_lock(&((h)->qh_lock)))
-#define XQMUNLCK(h)			(mutex_unlock(&((h)->qh_lock)))
-#ifdef DEBUG
-struct xfs_dqhash;
-static inline int XQMISLCKD(struct xfs_dqhash *h)
-{
-	if (mutex_trylock(&h->qh_lock)) {
-		mutex_unlock(&h->qh_lock);
-		return 0;
-	}
-	return 1;
-}
-#endif
-
-#define XFS_DQ_HASH_LOCK(h)		XQMLCK(h)
-#define XFS_DQ_HASH_UNLOCK(h)		XQMUNLCK(h)
-#define XFS_DQ_IS_HASH_LOCKED(h)	XQMISLCKD(h)
-
-#define xfs_qm_mplist_lock(mp)		XQMLCK(&(XFS_QI_MPL_LIST(mp)))
-#define xfs_qm_mplist_unlock(mp)	XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
-#define XFS_QM_IS_MPLIST_LOCKED(mp)	XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
-
-#define xfs_qm_freelist_lock(qm)	XQMLCK(&((qm)->qm_dqfreelist))
-#define xfs_qm_freelist_unlock(qm)	XQMUNLCK(&((qm)->qm_dqfreelist))
+#define xfs_qm_mplist_lock(mp) \
+	mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define xfs_qm_mplist_nowait(mp) \
+	mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define xfs_qm_mplist_unlock(mp) \
+	mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define XFS_QM_IS_MPLIST_LOCKED(mp) \
+	mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
+
+#define xfs_qm_freelist_lock(qm) \
+	mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
+#define xfs_qm_freelist_lock_nowait(qm) \
+	mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
+#define xfs_qm_freelist_unlock(qm) \
+	mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
 
 /*
  * Hash into a bucket in the dquot hash table, based on <mp, id>.
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 99611381e740..447173bcf96d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -624,10 +624,9 @@ xfs_trans_dqresv(
 	xfs_qcnt_t	*resbcountp;
 	xfs_quotainfo_t	*q = mp->m_quotainfo;
 
-	if (! (flags & XFS_QMOPT_DQLOCK)) {
-		xfs_dqlock(dqp);
-	}
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	xfs_dqlock(dqp);
+
 	if (flags & XFS_TRANS_DQ_RES_BLKS) {
 		hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
 		if (!hardlimit)
@@ -740,10 +739,8 @@ xfs_trans_dqresv(
 	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
 
 error_return:
-	if (! (flags & XFS_QMOPT_DQLOCK)) {
-		xfs_dqunlock(dqp);
-	}
-	return (error);
+	xfs_dqunlock(dqp);
+	return error;
 }
 
 
@@ -753,8 +750,7 @@ error_return:
  * grp/prj quotas is important, because this follows a both-or-nothing
  * approach.
  *
- * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked.
- *	   XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
  *	   XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
  *	   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
  *	   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index ae5482965424..3f3610a7ee05 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -24,6 +24,7 @@
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
+#include "xfs_error.h"
 
 static char		message[1024];	/* keep it off the stack */
 static DEFINE_SPINLOCK(xfs_err_lock);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 5830c040ea7e..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,10 +17,6 @@
  */
 #include <xfs.h>
 
-static DEFINE_MUTEX(uuid_monitor);
-static int	uuid_table_size;
-static uuid_t	*uuid_table;
-
 /* IRIX interpretation of an uuid_t */
 typedef struct {
 	__be32	uu_timelow;
@@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
 	fsid[1] = be32_to_cpu(uup->uu_timelow);
 }
 
-void
-uuid_create_nil(uuid_t *uuid)
-{
-	memset(uuid, 0, sizeof(*uuid));
-}
-
 int
 uuid_is_nil(uuid_t *uuid)
 {
@@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
 {
 	return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
 }
-
-/*
- * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
- * 64-bit words.  NOTE: This function can not be changed EVER.  Although
- * brain-dead, some applications depend on this 64-bit value remaining
- * persistent.  Specifically, DMI vendors store the value as a persistent
- * filehandle.
- */
-__uint64_t
-uuid_hash64(uuid_t *uuid)
-{
-	__uint64_t	*sp = (__uint64_t *)uuid;
-
-	return sp[0] + sp[1];
-}
-
-int
-uuid_table_insert(uuid_t *uuid)
-{
-	int	i, hole;
-
-	mutex_lock(&uuid_monitor);
-	for (i = 0, hole = -1; i < uuid_table_size; i++) {
-		if (uuid_is_nil(&uuid_table[i])) {
-			hole = i;
-			continue;
-		}
-		if (uuid_equal(uuid, &uuid_table[i])) {
-			mutex_unlock(&uuid_monitor);
-			return 0;
-		}
-	}
-	if (hole < 0) {
-		uuid_table = kmem_realloc(uuid_table,
-			(uuid_table_size + 1) * sizeof(*uuid_table),
-			uuid_table_size  * sizeof(*uuid_table),
-			KM_SLEEP);
-		hole = uuid_table_size++;
-	}
-	uuid_table[hole] = *uuid;
-	mutex_unlock(&uuid_monitor);
-	return 1;
-}
-
-void
-uuid_table_remove(uuid_t *uuid)
-{
-	int	i;
-
-	mutex_lock(&uuid_monitor);
-	for (i = 0; i < uuid_table_size; i++) {
-		if (uuid_is_nil(&uuid_table[i]))
-			continue;
-		if (!uuid_equal(uuid, &uuid_table[i]))
-			continue;
-		uuid_create_nil(&uuid_table[i]);
-		break;
-	}
-	ASSERT(i < uuid_table_size);
-	mutex_unlock(&uuid_monitor);
-}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index cff5b607d445..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,12 +22,8 @@ typedef struct {
 	unsigned char	__u_bits[16];
 } uuid_t;
 
-extern void uuid_create_nil(uuid_t *uuid);
 extern int uuid_is_nil(uuid_t *uuid);
 extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
 extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
-extern __uint64_t uuid_hash64(uuid_t *uuid);
-extern int uuid_table_insert(uuid_t *uuid);
-extern void uuid_table_remove(uuid_t *uuid);
 
 #endif	/* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 143d63ecb20a..c8641f713caa 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,8 +223,8 @@ typedef struct xfs_perag
 		be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
 #define	XFS_MIN_FREELIST_PAG(pag,mp)	\
 	(XFS_MIN_FREELIST_RAW(		\
-		(uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
-		(uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
 
 #define XFS_AGB_TO_FSB(mp,agno,agbno)	\
 	(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 028e44e58ea9..2cf944eb796d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1872,6 +1872,25 @@ xfs_alloc_compute_maxlevels(
 }
 
 /*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag)
+{
+	xfs_extlen_t		need, delta = 0;
+
+	need = XFS_MIN_FREELIST_PAG(pag, mp);
+	if (need > pag->pagf_flcount)
+		delta = need - pag->pagf_flcount;
+
+	if (pag->pagf_longest > delta)
+		return pag->pagf_longest - delta;
+	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
  */
@@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist(
 	}
 
 	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-		need = XFS_MIN_FREELIST_PAG(pag, mp);
-		delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
 		/*
 		 * If it looks like there isn't a long enough extent, or enough
 		 * total blocks, reject it.
 		 */
-		longest = (pag->pagf_longest > delta) ?
-			(pag->pagf_longest - delta) :
-			(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
+		need = XFS_MIN_FREELIST_PAG(pag, mp);
+		longest = xfs_alloc_longest_free_extent(mp, pag);
 		if ((args->minlen + args->alignment + args->minalignslop - 1) >
 				longest ||
 		    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 588172796f7b..e704caee10df 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg {
 #define XFS_ALLOC_USERDATA		1	/* allocation is for user data*/
 #define XFS_ALLOC_INITIAL_USER_DATA	2	/* special case start of file */
 
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+		struct xfs_perag *pag);
 
 #ifdef __KERNEL__
 
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 6c323f8a4cd1..afdc8911637d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
 		 * minimum offset only needs to be the space required for 
 		 * the btree root.
 		 */ 
-		if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset)
+		if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+		    xfs_default_attroffset(dp))
 			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
 		break;
 		
@@ -298,6 +299,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
 }
 
 /*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+STATIC void
+xfs_attr_fork_reset(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp)
+{
+	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	ip->i_d.di_forkoff = 0;
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT(ip->i_afp == NULL);
+
+	ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
  * Remove an attribute from the shortform attribute list structure.
  */
 int
@@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 	 */
 	totsize -= size;
 	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
-				!(args->op_flags & XFS_DA_OP_ADDNAME) &&
-				(mp->m_flags & XFS_MOUNT_ATTR2) &&
-				(dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) {
-		/*
-		 * Last attribute now removed, revert to original
-		 * inode format making all literal area available
-		 * to the data fork once more.
-		 */
-		xfs_idestroy_fork(dp, XFS_ATTR_FORK);
-		dp->i_d.di_forkoff = 0;
-		dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-		ASSERT(dp->i_d.di_anextents == 0);
-		ASSERT(dp->i_afp == NULL);
-		dp->i_df.if_ext_max =
-			XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+	    (mp->m_flags & XFS_MOUNT_ATTR2) &&
+	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+	    !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+		xfs_attr_fork_reset(dp, args->trans);
 	} else {
 		xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
 		dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 	if (forkoff == -1) {
 		ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
 		ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
-
-		/*
-		 * Last attribute was removed, revert to original
-		 * inode format making all literal area available
-		 * to the data fork once more.
-		 */
-		xfs_idestroy_fork(dp, XFS_ATTR_FORK);
-		dp->i_d.di_forkoff = 0;
-		dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-		ASSERT(dp->i_d.di_anextents == 0);
-		ASSERT(dp->i_afp == NULL);
-		dp->i_df.if_ext_max =
-			XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		xfs_attr_fork_reset(dp, args->trans);
 		goto out;
 	}
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c852cd65aaea..3a6ed426327a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2479,7 +2479,7 @@ xfs_bmap_adjacent(
 	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
 	/*
 	 * If allocating at eof, and there's a previous real block,
-	 * try to use it's last block as our starting point.
+	 * try to use its last block as our starting point.
 	 */
 	if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
 	    !isnullstartblock(ap->prevp->br_startblock) &&
@@ -2712,9 +2712,6 @@ xfs_bmap_btalloc(
 	xfs_agnumber_t	startag;
 	xfs_alloc_arg_t	args;
 	xfs_extlen_t	blen;
-	xfs_extlen_t	delta;
-	xfs_extlen_t	longest;
-	xfs_extlen_t	need;
 	xfs_extlen_t	nextminlen = 0;
 	xfs_perag_t	*pag;
 	int		nullfb;		/* true if ap->firstblock isn't set */
@@ -2796,13 +2793,8 @@ xfs_bmap_btalloc(
 			 * See xfs_alloc_fix_freelist...
 			 */
 			if (pag->pagf_init) {
-				need = XFS_MIN_FREELIST_PAG(pag, mp);
-				delta = need > pag->pagf_flcount ?
-					need - pag->pagf_flcount : 0;
-				longest = (pag->pagf_longest > delta) ?
-					(pag->pagf_longest - delta) :
-					(pag->pagf_flcount > 0 ||
-					 pag->pagf_longest > 0);
+				xfs_extlen_t	longest;
+				longest = xfs_alloc_longest_free_extent(mp, pag);
 				if (blen < longest)
 					blen = longest;
 			} else
@@ -3577,6 +3569,27 @@ xfs_bmap_extents_to_btree(
 }
 
 /*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			offset;
+
+	if (mp->m_sb.sb_inodesize == 256) {
+		offset = XFS_LITINO(mp) -
+				XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	} else {
+		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+	}
+
+	ASSERT(offset < XFS_LITINO(mp));
+	return offset;
+}
+
+/*
  * Helper routine to reset inode di_forkoff field when switching
  * attribute fork from local to extent format - we reset it where
  * possible to make space available for inline data fork extents.
@@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset(
 	int		whichfork)
 {
 	if (whichfork == XFS_ATTR_FORK &&
-	    (ip->i_d.di_format != XFS_DINODE_FMT_DEV) &&
-	    (ip->i_d.di_format != XFS_DINODE_FMT_UUID) &&
-	    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
-	    ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) {
-		ip->i_d.di_forkoff = mp->m_attroffset >> 3;
-		ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) /
-					(uint)sizeof(xfs_bmbt_rec_t);
-		ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) /
-					(uint)sizeof(xfs_bmbt_rec_t);
+	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+		if (dfl_forkoff > ip->i_d.di_forkoff) {
+			ip->i_d.di_forkoff = dfl_forkoff;
+			ip->i_df.if_ext_max =
+				XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+			ip->i_afp->if_ext_max =
+				XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
+		}
 	}
 }
 
@@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork(
 	case XFS_DINODE_FMT_BTREE:
 		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
 		if (!ip->i_d.di_forkoff)
-			ip->i_d.di_forkoff = mp->m_attroffset >> 3;
+			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
 		else if (mp->m_flags & XFS_MOUNT_ATTR2)
 			version = 2;
 		break;
@@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels(
 	 * (a signed 16-bit number, xfs_aextnum_t).
 	 *
 	 * Note that we can no longer assume that if we are in ATTR1 that
-	 * the fork offset of all the inodes will be (m_attroffset >> 3)
-	 * because we could have mounted with ATTR2 and then mounted back
-	 * with ATTR1, keeping the di_forkoff's fixed but probably at
-	 * various positions. Therefore, for both ATTR1 and ATTR2
-	 * we have to assume the worst case scenario of a minimum size
-	 * available.
+	 * the fork offset of all the inodes will be
+	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+	 * with ATTR2 and then mounted back with ATTR1, keeping the
+	 * di_forkoff's fixed but probably at various positions. Therefore,
+	 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+	 * of a minimum size available.
 	 */
 	if (whichfork == XFS_DATA_FORK) {
 		maxleafents = MAXEXTNUM;
@@ -4804,7 +4820,7 @@ xfs_bmapi(
 	xfs_extlen_t	minlen;		/* min allocation size */
 	xfs_mount_t	*mp;		/* xfs mount structure */
 	int		n;		/* current extent index */
-	int		nallocs;	/* number of extents alloc\'d */
+	int		nallocs;	/* number of extents alloc'd */
 	xfs_extnum_t	nextents;	/* number of extents in file */
 	xfs_fileoff_t	obno;		/* old block number (offset) */
 	xfs_bmbt_irec_t	prev;		/* previous file extent record */
@@ -6204,7 +6220,7 @@ xfs_bmap_get_bp(
 	return(bp);
 }
 
-void
+STATIC void
 xfs_check_block(
 	struct xfs_btree_block	*block,
 	xfs_mount_t		*mp,
@@ -6494,7 +6510,7 @@ xfs_bmap_count_tree(
 	block = XFS_BUF_TO_BLOCK(bp);
 
 	if (--level) {
-		/* Not at node above leafs, count this level of nodes */
+		/* Not at node above leaves, count this level of nodes */
 		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 		while (nextbno != NULLFSBLOCK) {
 			if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index be2979d88d32..1b8ff9256bd0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -125,7 +125,7 @@ typedef struct xfs_bmalloca {
 	struct xfs_bmbt_irec	*gotp;	/* extent after, or delayed */
 	xfs_extlen_t		alen;	/* i/o length asked/allocated */
 	xfs_extlen_t		total;	/* total blocks needed for xaction */
-	xfs_extlen_t		minlen;	/* mininum allocation size (blocks) */
+	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
 	xfs_extlen_t		minleft; /* amount must be left after alloc */
 	char			eof;	/* set if allocating past last extent */
 	char			wasdel;	/* replacing a delayed allocation */
@@ -338,6 +338,10 @@ xfs_check_nostate_extents(
 	xfs_extnum_t		idx,
 	xfs_extnum_t		num);
 
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip);
+
 #ifdef __KERNEL__
 
 /*
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e73c332eb23f..e9df99574829 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -1883,7 +1883,7 @@ xfs_btree_lshift(
 
 	/*
 	 * We add one entry to the left side and remove one for the right side.
-	 * Accout for it here, the changes will be updated on disk and logged
+	 * Account for it here, the changes will be updated on disk and logged
 	 * later.
 	 */
 	lrecs++;
@@ -3535,7 +3535,7 @@ xfs_btree_delrec(
 	XFS_BTREE_STATS_INC(cur, join);
 
 	/*
-	 * Fix up the the number of records and right block pointer in the
+	 * Fix up the number of records and right block pointer in the
 	 * surviving block, and log it.
 	 */
 	xfs_btree_set_numrecs(left, lrecs + rrecs);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 789fffdf8b2f..4f852b735b96 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -41,7 +41,7 @@ extern kmem_zone_t	*xfs_btree_cur_zone;
 /*
  * Generic btree header.
  *
- * This is a comination of the actual format used on disk for short and long
+ * This is a combination of the actual format used on disk for short and long
  * format btrees.  The first three fields are shared by both format, but
  * the pointers are different and should be used with care.
  *
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c45f74ff1a5b..9ff6e57a5075 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
  * This is implemented with some source-level loop unrolling.
  */
 xfs_dahash_t
-xfs_da_hashname(const uchar_t *name, int namelen)
+xfs_da_hashname(const __uint8_t *name, int namelen)
 {
 	xfs_dahash_t hash;
 
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 70b710c1792d..8c536167bf75 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -91,9 +91,9 @@ enum xfs_dacmp {
  * Structure to ease passing around component names.
  */
 typedef struct xfs_da_args {
-	const uchar_t	*name;		/* string (maybe not NULL terminated) */
+	const __uint8_t	*name;		/* string (maybe not NULL terminated) */
 	int		namelen;	/* length of string (maybe no NULL) */
-	uchar_t		*value;		/* set of bytes (maybe contain NULLs) */
+	__uint8_t	*value;		/* set of bytes (maybe contain NULLs) */
 	int		valuelen;	/* length of value */
 	int		flags;		/* argument flags (eg: ATTR_NOCREATE) */
 	xfs_dahash_t	hashval;	/* hash value of name */
@@ -185,7 +185,7 @@ typedef struct xfs_da_state {
 	unsigned char		inleaf;		/* insert into 1->lf, 0->splf */
 	unsigned char		extravalid;	/* T/F: extrablk is in use */
 	unsigned char		extraafter;	/* T/F: extrablk is after new */
-	xfs_da_state_blk_t	extrablk;	/* for double-splits on leafs */
+	xfs_da_state_blk_t	extrablk;	/* for double-splits on leaves */
 						/* for dirv2 extrablk is data */
 } xfs_da_state_t;
 
@@ -251,7 +251,7 @@ xfs_daddr_t	xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 					  xfs_dabuf_t *dead_buf);
 
-uint xfs_da_hashname(const uchar_t *name_string, int name_length);
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
 enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
 				const char *name, int len);
 
@@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
 
 extern struct kmem_zone *xfs_da_state_zone;
 extern struct kmem_zone *xfs_dabuf_zone;
+extern const struct xfs_nameops xfs_default_nameops;
 
 #endif	/* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f8278cfcc1d3..e6d839bddbf0 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -79,6 +79,12 @@ xfs_swapext(
 		goto out_put_target_file;
 	}
 
+	if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
+	    IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
+		error = XFS_ERROR(EINVAL);
+		goto out_put_target_file;
+	}
+
 	ip = XFS_I(file->f_path.dentry->d_inode);
 	tip = XFS_I(target_file->f_path.dentry->d_inode);
 
@@ -118,19 +124,17 @@ xfs_swap_extents(
 	xfs_bstat_t	*sbp = &sxp->sx_stat;
 	xfs_ifork_t	*tempifp, *ifp, *tifp;
 	int		ilf_fields, tilf_fields;
-	static uint	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
 	int		error = 0;
 	int		aforkblks = 0;
 	int		taforkblks = 0;
 	__uint64_t	tmp;
-	char		locked = 0;
 
 	mp = ip->i_mount;
 
 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
 	if (!tempifp) {
 		error = XFS_ERROR(ENOMEM);
-		goto error0;
+		goto out;
 	}
 
 	sbp = &sxp->sx_stat;
@@ -143,25 +147,24 @@ xfs_swap_extents(
 	 */
 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-	locked = 1;
 
 	/* Verify that both files have the same format */
 	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
 		error = XFS_ERROR(EINVAL);
-		goto error0;
+		goto out_unlock;
 	}
 
 	/* Verify both files are either real-time or non-realtime */
 	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
 		error = XFS_ERROR(EINVAL);
-		goto error0;
+		goto out_unlock;
 	}
 
 	/* Should never get a local format */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
 	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
 		error = XFS_ERROR(EINVAL);
-		goto error0;
+		goto out_unlock;
 	}
 
 	if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -169,13 +172,13 @@ xfs_swap_extents(
 		error = xfs_flushinval_pages(tip, 0, -1,
 				FI_REMAPF_LOCKED);
 		if (error)
-			goto error0;
+			goto out_unlock;
 	}
 
 	/* Verify O_DIRECT for ftmp */
 	if (VN_CACHED(VFS_I(tip)) != 0) {
 		error = XFS_ERROR(EINVAL);
-		goto error0;
+		goto out_unlock;
 	}
 
 	/* Verify all data are being swapped */
@@ -183,7 +186,7 @@ xfs_swap_extents(
 	    sxp->sx_length != ip->i_d.di_size ||
 	    sxp->sx_length != tip->i_d.di_size) {
 		error = XFS_ERROR(EFAULT);
-		goto error0;
+		goto out_unlock;
 	}
 
 	/*
@@ -193,7 +196,7 @@ xfs_swap_extents(
 	 */
 	if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
 		error = XFS_ERROR(EINVAL);
-		goto error0;
+		goto out_unlock;
 	}
 
 	/*
@@ -208,7 +211,7 @@ xfs_swap_extents(
 	    (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
 	    (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
 		error = XFS_ERROR(EBUSY);
-		goto error0;
+		goto out_unlock;
 	}
 
 	/* We need to fail if the file is memory mapped.  Once we have tossed
@@ -219,7 +222,7 @@ xfs_swap_extents(
 	 */
 	if (VN_MAPPED(VFS_I(ip))) {
 		error = XFS_ERROR(EBUSY);
-		goto error0;
+		goto out_unlock;
 	}
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -242,8 +245,7 @@ xfs_swap_extents(
 		xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
 		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
 		xfs_trans_cancel(tp, 0);
-		locked = 0;
-		goto error0;
+		goto out;
 	}
 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 
@@ -253,19 +255,15 @@ xfs_swap_extents(
 	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
 	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
 		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			goto error0;
-		}
+		if (error)
+			goto out_trans_cancel;
 	}
 	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
 	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
 		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
 			&taforkblks);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			goto error0;
-		}
+		if (error)
+			goto out_trans_cancel;
 	}
 
 	/*
@@ -332,10 +330,10 @@ xfs_swap_extents(
 
 
 	IHOLD(ip);
-	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
 	IHOLD(tip);
-	xfs_trans_ijoin(tp, tip, lock_flags);
+	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
 	xfs_trans_log_inode(tp, ip,  ilf_fields);
 	xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -344,19 +342,19 @@ xfs_swap_extents(
 	 * If this is a synchronous mount, make sure that the
 	 * transaction goes to disk before returning to the user.
 	 */
-	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(tp);
-	}
 
 	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
-	locked = 0;
 
- error0:
-	if (locked) {
-		xfs_iunlock(ip,  lock_flags);
-		xfs_iunlock(tip, lock_flags);
-	}
-	if (tempifp != NULL)
-		kmem_free(tempifp);
+out_unlock:
+	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out:
+	kmem_free(tempifp);
 	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp, 0);
+	goto out_unlock;
 }
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 162e8726df5e..e5b153b2e6a3 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt {
 /*
  * Inode size for given fs.
  */
-#define	XFS_LITINO(mp)	((mp)->m_litino)
+#define XFS_LITINO(mp) \
+	((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
+
 #define	XFS_BROOT_SIZE_ADJ	\
 	(XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
 
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 1afb12278b8d..c657bec6d951 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,8 +46,6 @@
 
 struct xfs_name xfs_name_dotdot = {"..", 2};
 
-extern const struct xfs_nameops xfs_default_nameops;
-
 /*
  * ASCII case-insensitive (ie. A-Z) support for directories that was
  * used in IRIX.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e1f0a06aaf04..ab52e9e1c1ee 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -448,7 +448,6 @@ xfs_dir2_block_getdents(
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	char			*ptr;		/* current data entry */
 	int			wantoff;	/* starting block offset */
-	xfs_ino_t		ino;
 	xfs_off_t		cook;
 
 	mp = dp->i_mount;
@@ -509,16 +508,12 @@ xfs_dir2_block_getdents(
 
 		cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
 					    (char *)dep - (char *)block);
-		ino = be64_to_cpu(dep->inumber);
-#if XFS_BIG_INUMS
-		ino += mp->m_inoadd;
-#endif
 
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
 		if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
-			    ino, DT_UNKNOWN)) {
+			    be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
 			*offset = cook & 0x7fffffff;
 			xfs_da_brelse(NULL, bp);
 			return 0;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index b816e0252739..efbc290c7fec 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -38,7 +38,7 @@ struct xfs_trans;
 
 /*
  * Directory address space divided into sections,
- * spaces separated by 32gb.
+ * spaces separated by 32GB.
  */
 #define	XFS_DIR2_SPACE_SIZE	(1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
 #define	XFS_DIR2_DATA_SPACE	0
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ef805a374eec..fa913e459442 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -549,7 +549,7 @@ xfs_dir2_leaf_addname(
  * Check the internal consistency of a leaf1 block.
  * Pop an assert if something is wrong.
  */
-void
+STATIC void
 xfs_dir2_leaf_check(
 	xfs_inode_t		*dp,		/* incore directory inode */
 	xfs_dabuf_t		*bp)		/* leaf's buffer */
@@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents(
 	int			ra_index;	/* *map index for read-ahead */
 	int			ra_offset;	/* map entry offset for ra */
 	int			ra_want;	/* readahead count wanted */
-	xfs_ino_t		ino;
 
 	/*
 	 * If the offset is at or past the largest allowed value,
@@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents(
 			continue;
 		}
 
-		/*
-		 * Copy the entry into the putargs, and try formatting it.
-		 */
 		dep = (xfs_dir2_data_entry_t *)ptr;
-
 		length = xfs_dir2_data_entsize(dep->namelen);
 
-		ino = be64_to_cpu(dep->inumber);
-#if XFS_BIG_INUMS
-		ino += mp->m_inoadd;
-#endif
-
-		/*
-		 * Won't fit.  Return to caller.
-		 */
 		if (filldir(dirent, dep->name, dep->namelen,
 			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
-			    ino, DT_UNKNOWN))
+			    be64_to_cpu(dep->inumber), DT_UNKNOWN))
 			break;
 
 		/*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index fa6c3a5ddbc6..5a81ccd1045b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove(
 	}
 	xfs_dir2_leafn_check(dp, bp);
 	/*
-	 * Return indication of whether this leaf block is emtpy enough
+	 * Return indication of whether this leaf block is empty enough
 	 * to justify trying to join it with a neighbor.
 	 */
 	*rval =
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index a8a8a6efad5b..e89734e84646 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -748,11 +748,7 @@ xfs_dir2_sf_getdents(
 	 * Put . entry unless we're starting past it.
 	 */
 	if (*offset <= dot_offset) {
-		ino = dp->i_ino;
-#if XFS_BIG_INUMS
-		ino += mp->m_inoadd;
-#endif
-		if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
+		if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
 			*offset = dot_offset & 0x7fffffff;
 			return 0;
 		}
@@ -763,9 +759,6 @@ xfs_dir2_sf_getdents(
 	 */
 	if (*offset <= dotdot_offset) {
 		ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
-#if XFS_BIG_INUMS
-		ino += mp->m_inoadd;
-#endif
 		if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
 			*offset = dotdot_offset & 0x7fffffff;
 			return 0;
@@ -786,10 +779,6 @@ xfs_dir2_sf_getdents(
 		}
 
 		ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
-#if XFS_BIG_INUMS
-		ino += mp->m_inoadd;
-#endif
-
 		if (filldir(dirent, sfep->name, sfep->namelen,
 			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
 			*offset = off & 0x7fffffff;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2f049f63e85f..0d22c56fdf64 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,12 +33,10 @@ typedef struct xfs_extent {
  * conversion routine.
  */
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_extent_32 {
 	__uint64_t	ext_start;
 	__uint32_t	ext_len;
 } __attribute__((packed)) xfs_extent_32_t;
-#endif
 
 typedef struct xfs_extent_64 {
 	__uint64_t	ext_start;
@@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format {
 	xfs_extent_t		efi_extents[1];	/* array of extents to free */
 } xfs_efi_log_format_t;
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_efi_log_format_32 {
 	__uint16_t		efi_type;	/* efi log item type */
 	__uint16_t		efi_size;	/* size of this item */
@@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 {
 	__uint64_t		efi_id;		/* efi identifier */
 	xfs_extent_32_t		efi_extents[1];	/* array of extents to free */
 } __attribute__((packed)) xfs_efi_log_format_32_t;
-#endif
 
 typedef struct xfs_efi_log_format_64 {
 	__uint16_t		efi_type;	/* efi log item type */
@@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format {
 	xfs_extent_t		efd_extents[1];	/* array of extents freed */
 } xfs_efd_log_format_t;
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_efd_log_format_32 {
 	__uint16_t		efd_type;	/* efd log item type */
 	__uint16_t		efd_size;	/* size of this item */
@@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 {
 	__uint64_t		efd_efi_id;	/* id of corresponding efi */
 	xfs_extent_32_t		efd_extents[1];	/* array of extents freed */
 } __attribute__((packed)) xfs_efd_log_format_32_t;
-#endif
 
 typedef struct xfs_efd_log_format_64 {
 	__uint16_t		efd_type;	/* efd log item type */
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index f3bb75da384e..6c87c8f304ef 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,7 +140,7 @@ _xfs_filestream_pick_ag(
 	xfs_extlen_t	minlen)
 {
 	int		err, trylock, nscan;
-	xfs_extlen_t	delta, longest, need, free, minfree, maxfree = 0;
+	xfs_extlen_t	longest, free, minfree, maxfree = 0;
 	xfs_agnumber_t	ag, max_ag = NULLAGNUMBER;
 	struct xfs_perag *pag;
 
@@ -186,12 +186,7 @@ _xfs_filestream_pick_ag(
 			goto next_ag;
 		}
 
-		need = XFS_MIN_FREELIST_PAG(pag, mp);
-		delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
-		longest = (pag->pagf_longest > delta) ?
-		          (pag->pagf_longest - delta) :
-		          (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
-
+		longest = xfs_alloc_longest_free_extent(mp, pag);
 		if (((minlen && longest >= minlen) ||
 		     (!minlen && pag->pagf_freeblks >= minfree)) &&
 		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 680d0e0ec932..8379e3bca26c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -576,7 +576,7 @@ out:
 	if (fdblks_delta) {
 		/*
 		 * If we are putting blocks back here, m_resblks_avail is
-		 * already at it's max so this will put it in the free pool.
+		 * already at its max so this will put it in the free pool.
 		 *
 		 * If we need space, we'll either succeed in getting it
 		 * from the free block count or we'll get an enospc. If
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab016e5ae7be..3120a3a5e20f 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc(
 		args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
 
 		/* Allow space for the inode btree to split. */
-		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+		args.minleft = args.mp->m_in_maxlevels - 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	} else
@@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc(
 		/*
 		 * Allow space for the inode btree to split.
 		 */
-		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+		args.minleft = args.mp->m_in_maxlevels - 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
@@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc(
 		 * Initialize all inodes in this buffer and then log them.
 		 *
 		 * XXX: It would be much better if we had just one transaction to
-		 *      log a whole cluster of inodes instead of all the indivdual
+		 *      log a whole cluster of inodes instead of all the individual
 		 *      transactions causing a lot of log traffic.
 		 */
 		xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
@@ -943,7 +943,7 @@ nextag:
 	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
 				   XFS_INODES_PER_CHUNK) == 0);
 	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
-	XFS_INOBT_CLR_FREE(&rec, offset);
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
 	rec.ir_freecount--;
 	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
 			rec.ir_free)))
@@ -1105,11 +1105,11 @@ xfs_difree(
 	 */
 	off = agino - rec.ir_startino;
 	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
-	ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
+	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
 	/*
 	 * Mark the inode free & increment the count.
 	 */
-	XFS_INOBT_SET_FREE(&rec, off);
+	rec.ir_free |= XFS_INOBT_MASK(off);
 	rec.ir_freecount++;
 
 	/*
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 99f2408e8d8e..c282a9af5393 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur(
 }
 
 /*
- * intial value of ptr for lookup
+ * initial value of ptr for lookup
  */
 STATIC void
 xfs_inobt_init_ptr_from_cur(
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 5580e255ff06..f782ad0c4769 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -32,14 +32,14 @@ struct xfs_mount;
 #define	XFS_IBT_MAGIC	0x49414254	/* 'IABT' */
 
 typedef	__uint64_t	xfs_inofree_t;
-#define	XFS_INODES_PER_CHUNK	(NBBY * sizeof(xfs_inofree_t))
+#define	XFS_INODES_PER_CHUNK		(NBBY * sizeof(xfs_inofree_t))
 #define	XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
-#define	XFS_INOBT_ALL_FREE	((xfs_inofree_t)-1)
+#define	XFS_INOBT_ALL_FREE		((xfs_inofree_t)-1)
+#define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
 
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
-	return (((n) >= XFS_INODES_PER_CHUNK ? \
-		(xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
+	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
 }
 
 /*
@@ -69,20 +69,6 @@ typedef struct xfs_inobt_key {
 typedef __be32 xfs_inobt_ptr_t;
 
 /*
- * Bit manipulations for ir_free.
- */
-#define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
-#define	XFS_INOBT_IS_FREE(rp,i)		\
-		(((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
-#define	XFS_INOBT_SET_FREE(rp,i)	((rp)->ir_free |= XFS_INOBT_MASK(i))
-#define	XFS_INOBT_CLR_FREE(rp,i)	((rp)->ir_free &= ~XFS_INOBT_MASK(i))
-
-/*
- * Maximum number of inode btree levels.
- */
-#define	XFS_IN_MAXLEVELS(mp)		((mp)->m_in_maxlevels)
-
-/*
  * block numbers in the AG.
  */
 #define	XFS_IBT_BLOCK(mp)		((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e2fb6210d4c5..478e587087fe 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -246,9 +246,6 @@ xfs_iget_cache_miss(
 		goto out_destroy;
 	}
 
-	if (lock_flags)
-		xfs_ilock(ip, lock_flags);
-
 	/*
 	 * Preload the radix tree so we can insert safely under the
 	 * write spinlock. Note that we cannot sleep inside the preload
@@ -256,7 +253,16 @@ xfs_iget_cache_miss(
 	 */
 	if (radix_tree_preload(GFP_KERNEL)) {
 		error = EAGAIN;
-		goto out_unlock;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
 	}
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
@@ -284,7 +290,6 @@ xfs_iget_cache_miss(
 out_preload_end:
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
-out_unlock:
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
 out_destroy:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1f175fa34b22..f879c1bc4b96 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp {
 
 /*
  * NOTE:  This structure must be kept identical to struct xfs_dinode
- * 	  in xfs_dinode.h except for the endianess annotations.
+ * 	  in xfs_dinode.h except for the endianness annotations.
  */
 typedef struct xfs_icdinode {
 	__uint16_t	di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9957d0602d54..a52ac125f055 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format {
 	__int32_t		ilf_boffset;	/* off of inode in buffer */
 } xfs_inode_log_format_t;
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_inode_log_format_32 {
 	__uint16_t		ilf_type;	/* inode log item type */
 	__uint16_t		ilf_size;	/* size of this item */
@@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 {
 	__int32_t		ilf_len;	/* len of inode buffer */
 	__int32_t		ilf_boffset;	/* off of inode in buffer */
 } __attribute__((packed)) xfs_inode_log_format_32_t;
-#endif
 
 typedef struct xfs_inode_log_format_64 {
 	__uint16_t		ilf_type;	/* inode log item type */
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ee1a0c134cc2..a1cc1322fc0f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -63,7 +63,7 @@ typedef enum {
  */
 
 typedef struct xfs_iomap {
-	xfs_daddr_t		iomap_bn;	/* first 512b blk of mapping */
+	xfs_daddr_t		iomap_bn;	/* first 512B blk of mapping */
 	xfs_buftarg_t		*iomap_target;
 	xfs_off_t		iomap_offset;	/* offset of mapping, bytes */
 	xfs_off_t		iomap_bsize;	/* size of mapping, bytes */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf98a805ec90..aeb2d2221c7d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -83,7 +83,12 @@ xfs_bulkstat_one_iget(
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
-	vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
+	/*
+	 * We are reading the atime from the Linux inode because the
+	 * dinode might not be uptodate.
+	 */
+	buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
+	buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
 	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
 	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
 	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -579,7 +584,7 @@ xfs_bulkstat(
 				 * first inode of the cluster.
 				 *
 				 * Careful with clustidx.   There can be
-				 * multple clusters per chunk, a single
+				 * multiple clusters per chunk, a single
 				 * cluster per chunk or a cluster that has
 				 * inodes represented from several different
 				 * chunks (if blocksize is large).
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f4726f702a9e..f76c6d7cea21 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -574,7 +574,7 @@ xfs_log_mount(
 	error = xfs_trans_ail_init(mp);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
-		goto error;
+		goto out_free_log;
 	}
 	mp->m_log->l_ailp = mp->m_ail;
 
@@ -594,20 +594,22 @@ xfs_log_mount(
 			mp->m_flags |= XFS_MOUNT_RDONLY;
 		if (error) {
 			cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
-			goto error;
+			goto out_destroy_ail;
 		}
 	}
 
 	/* Normal transactions can now occur */
 	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
-	/* End mounting message in xfs_log_mount_finish */
 	return 0;
-error:
-	xfs_log_unmount_dealloc(mp);
+
+out_destroy_ail:
+	xfs_trans_ail_destroy(mp);
+out_free_log:
+	xlog_dealloc_log(mp->m_log);
 out:
 	return error;
-}	/* xfs_log_mount */
+}
 
 /*
  * Finish the recovery of the file system.  This is separate from
@@ -633,19 +635,6 @@ xfs_log_mount_finish(xfs_mount_t *mp)
 }
 
 /*
- * Unmount processing for the log.
- */
-int
-xfs_log_unmount(xfs_mount_t *mp)
-{
-	int		error;
-
-	error = xfs_log_unmount_write(mp);
-	xfs_log_unmount_dealloc(mp);
-	return error;
-}
-
-/*
  * Final log writes as part of unmount.
  *
  * Mark the filesystem clean as unmount happens.  Note that during relocation
@@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
  * and deallocate the log as the aild references the log.
  */
 void
-xfs_log_unmount_dealloc(xfs_mount_t *mp)
+xfs_log_unmount(xfs_mount_t *mp)
 {
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
@@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
 /*
  * Return size of each in-core log record buffer.
  *
- * All machines get 8 x 32KB buffers by default, unless tuned otherwise.
+ * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
  *
  * If the filesystem blocksize is too large, we may need to choose a
  * larger size since the directory code currently logs entire blocks.
@@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t	*mp,
 		}
 
 		if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-			/* # headers = size / 32K
-			 * one header holds cycles from 32K of data
+			/* # headers = size / 32k
+			 * one header holds cycles from 32k of data
 			 */
 
 			xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
@@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t	*mp,
 		goto done;
 	}
 
-	/* All machines use 32KB buffers by default. */
+	/* All machines use 32kB buffers by default. */
 	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
 	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
 
@@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t	*mp,
 	log->l_iclog_hsize = BBSIZE;
 	log->l_iclog_heads = 1;
 
-	/*
-	 * For 16KB, we use 3 32KB buffers.  For 32KB block sizes, we use
-	 * 4 32KB buffers.  For 64KB block sizes, we use 8 32KB buffers.
-	 */
-	if (mp->m_sb.sb_blocksize >= 16*1024) {
-		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
-		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
-		if (mp->m_logbufs <= 0) {
-			switch (mp->m_sb.sb_blocksize) {
-			    case 16*1024:			/* 16 KB */
-				log->l_iclog_bufs = 3;
-				break;
-			    case 32*1024:			/* 32 KB */
-				log->l_iclog_bufs = 4;
-				break;
-			    case 64*1024:			/* 64 KB */
-				log->l_iclog_bufs = 8;
-				break;
-			    default:
-				xlog_panic("XFS: Invalid blocksize");
-				break;
-			}
-		}
-	}
-
-done:	/* are we being asked to make the sizes selected above visible? */
+done:
+	/* are we being asked to make the sizes selected above visible? */
 	if (mp->m_logbufs == 0)
 		mp->m_logbufs = log->l_iclog_bufs;
 	if (mp->m_logbsize == 0)
@@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
  */
 
 /*
- * Free a used ticket when it's refcount falls to zero.
+ * Free a used ticket when its refcount falls to zero.
  */
 void
 xfs_log_ticket_put(
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 8a3e84e900a3..d0c9baa50b1a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -170,9 +170,8 @@ int	  xfs_log_write(struct xfs_mount *mp,
 			int		 nentries,
 			xfs_log_ticket_t ticket,
 			xfs_lsn_t	 *start_lsn);
-int	  xfs_log_unmount(struct xfs_mount *mp);
 int	  xfs_log_unmount_write(struct xfs_mount *mp);
-void      xfs_log_unmount_dealloc(struct xfs_mount *mp);
+void      xfs_log_unmount(struct xfs_mount *mp);
 int	  xfs_log_force_umount(struct xfs_mount *mp, int logerror);
 int	  xfs_log_need_covered(struct xfs_mount *mp);
 
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 654167be0efb..bcad5f4c1fd1 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -359,7 +359,7 @@ typedef struct xlog_in_core {
 	int			ic_size;
 	int			ic_offset;
 	int			ic_bwritecnt;
-	ushort_t		ic_state;
+	unsigned short		ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
 #ifdef XFS_LOG_TRACE
 	struct ktrace		*ic_trace;
@@ -455,7 +455,6 @@ extern void	 xlog_recover_process_iunlinks(xlog_t *log);
 
 extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void	 xlog_put_bp(struct xfs_buf *);
-extern int	 xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
 
 extern kmem_zone_t	*xfs_log_ticket_zone;
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b1047de2fffd..7ba450116d4f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -94,12 +94,30 @@ xlog_put_bp(
 	xfs_buf_free(bp);
 }
 
+STATIC xfs_caddr_t
+xlog_align(
+	xlog_t		*log,
+	xfs_daddr_t	blk_no,
+	int		nbblks,
+	xfs_buf_t	*bp)
+{
+	xfs_caddr_t	ptr;
+
+	if (!log->l_sectbb_log)
+		return XFS_BUF_PTR(bp);
+
+	ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
+	ASSERT(XFS_BUF_SIZE(bp) >=
+		BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
+	return ptr;
+}
+
 
 /*
  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
  */
-int
-xlog_bread(
+STATIC int
+xlog_bread_noalign(
 	xlog_t		*log,
 	xfs_daddr_t	blk_no,
 	int		nbblks,
@@ -137,6 +155,24 @@ xlog_bread(
 	return error;
 }
 
+STATIC int
+xlog_bread(
+	xlog_t		*log,
+	xfs_daddr_t	blk_no,
+	int		nbblks,
+	xfs_buf_t	*bp,
+	xfs_caddr_t	*offset)
+{
+	int		error;
+
+	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+	if (error)
+		return error;
+
+	*offset = xlog_align(log, blk_no, nbblks, bp);
+	return 0;
+}
+
 /*
  * Write out the buffer at the given block for the given number of blocks.
  * The buffer is kept locked across the write and is returned locked.
@@ -180,24 +216,6 @@ xlog_bwrite(
 	return error;
 }
 
-STATIC xfs_caddr_t
-xlog_align(
-	xlog_t		*log,
-	xfs_daddr_t	blk_no,
-	int		nbblks,
-	xfs_buf_t	*bp)
-{
-	xfs_caddr_t	ptr;
-
-	if (!log->l_sectbb_log)
-		return XFS_BUF_PTR(bp);
-
-	ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
-	ASSERT(XFS_BUF_SIZE(bp) >=
-		BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
-	return ptr;
-}
-
 #ifdef DEBUG
 /*
  * dump debug superblock and log record information
@@ -211,11 +229,11 @@ xlog_header_check_dump(
 
 	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
 	for (b = 0; b < 16; b++)
-		cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
+		cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
 	cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
 	cmn_err(CE_DEBUG, "    log : uuid = ");
 	for (b = 0; b < 16; b++)
-		cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
+		cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
 	cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
 }
 #else
@@ -321,9 +339,9 @@ xlog_find_cycle_start(
 
 	mid_blk = BLK_AVG(first_blk, *last_blk);
 	while (mid_blk != first_blk && mid_blk != *last_blk) {
-		if ((error = xlog_bread(log, mid_blk, 1, bp)))
+		error = xlog_bread(log, mid_blk, 1, bp, &offset);
+		if (error)
 			return error;
-		offset = xlog_align(log, mid_blk, 1, bp);
 		mid_cycle = xlog_get_cycle(offset);
 		if (mid_cycle == cycle) {
 			*last_blk = mid_blk;
@@ -379,10 +397,10 @@ xlog_find_verify_cycle(
 
 		bcount = min(bufblks, (start_blk + nbblks - i));
 
-		if ((error = xlog_bread(log, i, bcount, bp)))
+		error = xlog_bread(log, i, bcount, bp, &buf);
+		if (error)
 			goto out;
 
-		buf = xlog_align(log, i, bcount, bp);
 		for (j = 0; j < bcount; j++) {
 			cycle = xlog_get_cycle(buf);
 			if (cycle == stop_on_cycle_no) {
@@ -436,9 +454,9 @@ xlog_find_verify_log_record(
 			return ENOMEM;
 		smallmem = 1;
 	} else {
-		if ((error = xlog_bread(log, start_blk, num_blks, bp)))
+		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+		if (error)
 			goto out;
-		offset = xlog_align(log, start_blk, num_blks, bp);
 		offset += ((num_blks - 1) << BBSHIFT);
 	}
 
@@ -453,9 +471,9 @@ xlog_find_verify_log_record(
 		}
 
 		if (smallmem) {
-			if ((error = xlog_bread(log, i, 1, bp)))
+			error = xlog_bread(log, i, 1, bp, &offset);
+			if (error)
 				goto out;
-			offset = xlog_align(log, i, 1, bp);
 		}
 
 		head = (xlog_rec_header_t *)offset;
@@ -559,15 +577,18 @@ xlog_find_head(
 	bp = xlog_get_bp(log, 1);
 	if (!bp)
 		return ENOMEM;
-	if ((error = xlog_bread(log, 0, 1, bp)))
+
+	error = xlog_bread(log, 0, 1, bp, &offset);
+	if (error)
 		goto bp_err;
-	offset = xlog_align(log, 0, 1, bp);
+
 	first_half_cycle = xlog_get_cycle(offset);
 
 	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
-	if ((error = xlog_bread(log, last_blk, 1, bp)))
+	error = xlog_bread(log, last_blk, 1, bp, &offset);
+	if (error)
 		goto bp_err;
-	offset = xlog_align(log, last_blk, 1, bp);
+
 	last_half_cycle = xlog_get_cycle(offset);
 	ASSERT(last_half_cycle != 0);
 
@@ -817,9 +838,10 @@ xlog_find_tail(
 	if (!bp)
 		return ENOMEM;
 	if (*head_blk == 0) {				/* special case */
-		if ((error = xlog_bread(log, 0, 1, bp)))
+		error = xlog_bread(log, 0, 1, bp, &offset);
+		if (error)
 			goto bread_err;
-		offset = xlog_align(log, 0, 1, bp);
+
 		if (xlog_get_cycle(offset) == 0) {
 			*tail_blk = 0;
 			/* leave all other log inited values alone */
@@ -832,9 +854,10 @@ xlog_find_tail(
 	 */
 	ASSERT(*head_blk < INT_MAX);
 	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
-		if ((error = xlog_bread(log, i, 1, bp)))
+		error = xlog_bread(log, i, 1, bp, &offset);
+		if (error)
 			goto bread_err;
-		offset = xlog_align(log, i, 1, bp);
+
 		if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
 			found = 1;
 			break;
@@ -848,9 +871,10 @@ xlog_find_tail(
 	 */
 	if (!found) {
 		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
-			if ((error = xlog_bread(log, i, 1, bp)))
+			error = xlog_bread(log, i, 1, bp, &offset);
+			if (error)
 				goto bread_err;
-			offset = xlog_align(log, i, 1, bp);
+
 			if (XLOG_HEADER_MAGIC_NUM ==
 			    be32_to_cpu(*(__be32 *)offset)) {
 				found = 2;
@@ -922,10 +946,10 @@ xlog_find_tail(
 	if (*head_blk == after_umount_blk &&
 	    be32_to_cpu(rhead->h_num_logops) == 1) {
 		umount_data_blk = (i + hblks) % log->l_logBBsize;
-		if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
+		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+		if (error)
 			goto bread_err;
-		}
-		offset = xlog_align(log, umount_data_blk, 1, bp);
+
 		op_head = (xlog_op_header_t *)offset;
 		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
 			/*
@@ -1017,9 +1041,10 @@ xlog_find_zeroed(
 	bp = xlog_get_bp(log, 1);
 	if (!bp)
 		return ENOMEM;
-	if ((error = xlog_bread(log, 0, 1, bp)))
+	error = xlog_bread(log, 0, 1, bp, &offset);
+	if (error)
 		goto bp_err;
-	offset = xlog_align(log, 0, 1, bp);
+
 	first_cycle = xlog_get_cycle(offset);
 	if (first_cycle == 0) {		/* completely zeroed log */
 		*blk_no = 0;
@@ -1028,9 +1053,10 @@ xlog_find_zeroed(
 	}
 
 	/* check partially zeroed log */
-	if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
+	error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+	if (error)
 		goto bp_err;
-	offset = xlog_align(log, log_bbnum-1, 1, bp);
+
 	last_cycle = xlog_get_cycle(offset);
 	if (last_cycle != 0) {		/* log completely written to */
 		xlog_put_bp(bp);
@@ -1152,10 +1178,10 @@ xlog_write_log_records(
 	 */
 	balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
 	if (balign != start_block) {
-		if ((error = xlog_bread(log, start_block, 1, bp))) {
-			xlog_put_bp(bp);
-			return error;
-		}
+		error = xlog_bread_noalign(log, start_block, 1, bp);
+		if (error)
+			goto out_put_bp;
+
 		j = start_block - balign;
 	}
 
@@ -1175,10 +1201,14 @@ xlog_write_log_records(
 			balign = BBTOB(ealign - start_block);
 			error = XFS_BUF_SET_PTR(bp, offset + balign,
 						BBTOB(sectbb));
-			if (!error)
-				error = xlog_bread(log, ealign, sectbb, bp);
-			if (!error)
-				error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+			if (error)
+				break;
+
+			error = xlog_bread_noalign(log, ealign, sectbb, bp);
+			if (error)
+				break;
+
+			error = XFS_BUF_SET_PTR(bp, offset, bufblks);
 			if (error)
 				break;
 		}
@@ -1195,6 +1225,8 @@ xlog_write_log_records(
 		start_block += endcount;
 		j = 0;
 	}
+
+ out_put_bp:
 	xlog_put_bp(bp);
 	return error;
 }
@@ -1455,10 +1487,19 @@ xlog_recover_add_to_trans(
 	item = item->ri_prev;
 
 	if (item->ri_total == 0) {		/* first region to be added */
-		item->ri_total	= in_f->ilf_size;
-		ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
-		item->ri_buf = kmem_zalloc((item->ri_total *
-					    sizeof(xfs_log_iovec_t)), KM_SLEEP);
+		if (in_f->ilf_size == 0 ||
+		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+			xlog_warn(
+	"XFS: bad number of regions (%d) in inode log format",
+				  in_f->ilf_size);
+			ASSERT(0);
+			return XFS_ERROR(EIO);
+		}
+
+		item->ri_total = in_f->ilf_size;
+		item->ri_buf =
+			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+				    KM_SLEEP);
 	}
 	ASSERT(item->ri_total > item->ri_cnt);
 	/* Description region is ri_buf[0] */
@@ -2502,16 +2543,10 @@ xlog_recover_do_inode_trans(
 	}
 
 write_inode_buffer:
-	if (ITEM_TYPE(item) == XFS_LI_INODE) {
-		ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-		bp->b_mount = mp;
-		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
-		xfs_bdwrite(mp, bp);
-	} else {
-		XFS_BUF_STALE(bp);
-		error = xfs_bwrite(mp, bp);
-	}
-
+	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+	bp->b_mount = mp;
+	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+	xfs_bdwrite(mp, bp);
 error:
 	if (need_free)
 		kmem_free(in_f);
@@ -2760,51 +2795,48 @@ xlog_recover_do_trans(
 	int			error = 0;
 	xlog_recover_item_t	*item, *first_item;
 
-	if ((error = xlog_recover_reorder_trans(trans)))
+	error = xlog_recover_reorder_trans(trans);
+	if (error)
 		return error;
+
 	first_item = item = trans->r_itemq;
 	do {
-		/*
-		 * we don't need to worry about the block number being
-		 * truncated in > 1 TB buffers because in user-land,
-		 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so
-		 * the blknos will get through the user-mode buffer
-		 * cache properly.  The only bad case is o32 kernels
-		 * where xfs_daddr_t is 32-bits but mount will warn us
-		 * off a > 1 TB filesystem before we get here.
-		 */
-		if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
-			if  ((error = xlog_recover_do_buffer_trans(log, item,
-								 pass)))
-				break;
-		} else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
-			if ((error = xlog_recover_do_inode_trans(log, item,
-								pass)))
-				break;
-		} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
-			if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
-						  pass)))
-				break;
-		} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
+		switch (ITEM_TYPE(item)) {
+		case XFS_LI_BUF:
+			error = xlog_recover_do_buffer_trans(log, item, pass);
+			break;
+		case XFS_LI_INODE:
+			error = xlog_recover_do_inode_trans(log, item, pass);
+			break;
+		case XFS_LI_EFI:
+			error = xlog_recover_do_efi_trans(log, item,
+							  trans->r_lsn, pass);
+			break;
+		case XFS_LI_EFD:
 			xlog_recover_do_efd_trans(log, item, pass);
-		} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
-			if ((error = xlog_recover_do_dquot_trans(log, item,
-								   pass)))
-					break;
-		} else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
-			if ((error = xlog_recover_do_quotaoff_trans(log, item,
-								   pass)))
-					break;
-		} else {
-			xlog_warn("XFS: xlog_recover_do_trans");
+			error = 0;
+			break;
+		case XFS_LI_DQUOT:
+			error = xlog_recover_do_dquot_trans(log, item, pass);
+			break;
+		case XFS_LI_QUOTAOFF:
+			error = xlog_recover_do_quotaoff_trans(log, item,
+							       pass);
+			break;
+		default:
+			xlog_warn(
+	"XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
 			ASSERT(0);
 			error = XFS_ERROR(EIO);
 			break;
 		}
+
+		if (error)
+			return error;
 		item = item->ri_next;
 	} while (first_item != item);
 
-	return error;
+	return 0;
 }
 
 /*
@@ -3481,9 +3513,11 @@ xlog_do_recovery_pass(
 		hbp = xlog_get_bp(log, 1);
 		if (!hbp)
 			return ENOMEM;
-		if ((error = xlog_bread(log, tail_blk, 1, hbp)))
+
+		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
+		if (error)
 			goto bread_err1;
-		offset = xlog_align(log, tail_blk, 1, hbp);
+
 		rhead = (xlog_rec_header_t *)offset;
 		error = xlog_valid_rec_header(log, rhead, tail_blk);
 		if (error)
@@ -3517,9 +3551,10 @@ xlog_do_recovery_pass(
 	memset(rhash, 0, sizeof(rhash));
 	if (tail_blk <= head_blk) {
 		for (blk_no = tail_blk; blk_no < head_blk; ) {
-			if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+			if (error)
 				goto bread_err2;
-			offset = xlog_align(log, blk_no, hblks, hbp);
+
 			rhead = (xlog_rec_header_t *)offset;
 			error = xlog_valid_rec_header(log, rhead, blk_no);
 			if (error)
@@ -3527,10 +3562,11 @@ xlog_do_recovery_pass(
 
 			/* blocks in data section */
 			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-			error = xlog_bread(log, blk_no + hblks, bblks, dbp);
+			error = xlog_bread(log, blk_no + hblks, bblks, dbp,
+					   &offset);
 			if (error)
 				goto bread_err2;
-			offset = xlog_align(log, blk_no + hblks, bblks, dbp);
+
 			xlog_unpack_data(rhead, offset, log);
 			if ((error = xlog_recover_process_data(log,
 						rhash, rhead, offset, pass)))
@@ -3553,10 +3589,10 @@ xlog_do_recovery_pass(
 			wrapped_hblks = 0;
 			if (blk_no + hblks <= log->l_logBBsize) {
 				/* Read header in one read */
-				error = xlog_bread(log, blk_no, hblks, hbp);
+				error = xlog_bread(log, blk_no, hblks, hbp,
+						   &offset);
 				if (error)
 					goto bread_err2;
-				offset = xlog_align(log, blk_no, hblks, hbp);
 			} else {
 				/* This LR is split across physical log end */
 				if (blk_no != log->l_logBBsize) {
@@ -3564,12 +3600,13 @@ xlog_do_recovery_pass(
 					ASSERT(blk_no <= INT_MAX);
 					split_hblks = log->l_logBBsize - (int)blk_no;
 					ASSERT(split_hblks > 0);
-					if ((error = xlog_bread(log, blk_no,
-							split_hblks, hbp)))
+					error = xlog_bread(log, blk_no,
+							   split_hblks, hbp,
+							   &offset);
+					if (error)
 						goto bread_err2;
-					offset = xlog_align(log, blk_no,
-							split_hblks, hbp);
 				}
+
 				/*
 				 * Note: this black magic still works with
 				 * large sector sizes (non-512) only because:
@@ -3587,14 +3624,19 @@ xlog_do_recovery_pass(
 				error = XFS_BUF_SET_PTR(hbp,
 						bufaddr + BBTOB(split_hblks),
 						BBTOB(hblks - split_hblks));
-				if (!error)
-					error = xlog_bread(log, 0,
-							wrapped_hblks, hbp);
-				if (!error)
-					error = XFS_BUF_SET_PTR(hbp, bufaddr,
+				if (error)
+					goto bread_err2;
+
+				error = xlog_bread_noalign(log, 0,
+							   wrapped_hblks, hbp);
+				if (error)
+					goto bread_err2;
+
+				error = XFS_BUF_SET_PTR(hbp, bufaddr,
 							BBTOB(hblks));
 				if (error)
 					goto bread_err2;
+
 				if (!offset)
 					offset = xlog_align(log, 0,
 							wrapped_hblks, hbp);
@@ -3610,10 +3652,10 @@ xlog_do_recovery_pass(
 
 			/* Read in data for log record */
 			if (blk_no + bblks <= log->l_logBBsize) {
-				error = xlog_bread(log, blk_no, bblks, dbp);
+				error = xlog_bread(log, blk_no, bblks, dbp,
+						   &offset);
 				if (error)
 					goto bread_err2;
-				offset = xlog_align(log, blk_no, bblks, dbp);
 			} else {
 				/* This log record is split across the
 				 * physical end of log */
@@ -3627,12 +3669,13 @@ xlog_do_recovery_pass(
 					split_bblks =
 						log->l_logBBsize - (int)blk_no;
 					ASSERT(split_bblks > 0);
-					if ((error = xlog_bread(log, blk_no,
-							split_bblks, dbp)))
+					error = xlog_bread(log, blk_no,
+							split_bblks, dbp,
+							&offset);
+					if (error)
 						goto bread_err2;
-					offset = xlog_align(log, blk_no,
-							split_bblks, dbp);
 				}
+
 				/*
 				 * Note: this black magic still works with
 				 * large sector sizes (non-512) only because:
@@ -3649,15 +3692,19 @@ xlog_do_recovery_pass(
 				error = XFS_BUF_SET_PTR(dbp,
 						bufaddr + BBTOB(split_bblks),
 						BBTOB(bblks - split_bblks));
-				if (!error)
-					error = xlog_bread(log, wrapped_hblks,
-							bblks - split_bblks,
-							dbp);
-				if (!error)
-					error = XFS_BUF_SET_PTR(dbp, bufaddr,
-							h_size);
 				if (error)
 					goto bread_err2;
+
+				error = xlog_bread_noalign(log, wrapped_hblks,
+						bblks - split_bblks,
+						dbp);
+				if (error)
+					goto bread_err2;
+
+				error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
+				if (error)
+					goto bread_err2;
+
 				if (!offset)
 					offset = xlog_align(log, wrapped_hblks,
 						bblks - split_bblks, dbp);
@@ -3674,17 +3721,21 @@ xlog_do_recovery_pass(
 
 		/* read first part of physical log */
 		while (blk_no < head_blk) {
-			if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+			if (error)
 				goto bread_err2;
-			offset = xlog_align(log, blk_no, hblks, hbp);
+
 			rhead = (xlog_rec_header_t *)offset;
 			error = xlog_valid_rec_header(log, rhead, blk_no);
 			if (error)
 				goto bread_err2;
+
 			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-			if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+			error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+					   &offset);
+			if (error)
 				goto bread_err2;
-			offset = xlog_align(log, blk_no+hblks, bblks, dbp);
+
 			xlog_unpack_data(rhead, offset, log);
 			if ((error = xlog_recover_process_data(log, rhash,
 							rhead, offset, pass)))
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 35300250e86d..b101990df027 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 
-STATIC int	xfs_uuid_mount(xfs_mount_t *);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
 
@@ -121,6 +120,84 @@ static const struct {
     { sizeof(xfs_sb_t),			 0 }
 };
 
+static DEFINE_MUTEX(xfs_uuid_table_mutex);
+static int xfs_uuid_table_size;
+static uuid_t *xfs_uuid_table;
+
+/*
+ * See if the UUID is unique among mounted XFS filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
+ */
+STATIC int
+xfs_uuid_mount(
+	struct xfs_mount	*mp)
+{
+	uuid_t			*uuid = &mp->m_sb.sb_uuid;
+	int			hole, i;
+
+	if (mp->m_flags & XFS_MOUNT_NOUUID)
+		return 0;
+
+	if (uuid_is_nil(uuid)) {
+		cmn_err(CE_WARN,
+			"XFS: Filesystem %s has nil UUID - can't mount",
+			mp->m_fsname);
+		return XFS_ERROR(EINVAL);
+	}
+
+	mutex_lock(&xfs_uuid_table_mutex);
+	for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
+		if (uuid_is_nil(&xfs_uuid_table[i])) {
+			hole = i;
+			continue;
+		}
+		if (uuid_equal(uuid, &xfs_uuid_table[i]))
+			goto out_duplicate;
+	}
+
+	if (hole < 0) {
+		xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
+			xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
+			KM_SLEEP);
+		hole = xfs_uuid_table_size++;
+	}
+	xfs_uuid_table[hole] = *uuid;
+	mutex_unlock(&xfs_uuid_table_mutex);
+
+	return 0;
+
+ out_duplicate:
+	mutex_unlock(&xfs_uuid_table_mutex);
+	cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
+			 mp->m_fsname);
+	return XFS_ERROR(EINVAL);
+}
+
+STATIC void
+xfs_uuid_unmount(
+	struct xfs_mount	*mp)
+{
+	uuid_t			*uuid = &mp->m_sb.sb_uuid;
+	int			i;
+
+	if (mp->m_flags & XFS_MOUNT_NOUUID)
+		return;
+
+	mutex_lock(&xfs_uuid_table_mutex);
+	for (i = 0; i < xfs_uuid_table_size; i++) {
+		if (uuid_is_nil(&xfs_uuid_table[i]))
+			continue;
+		if (!uuid_equal(uuid, &xfs_uuid_table[i]))
+			continue;
+		memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
+		break;
+	}
+	ASSERT(i < xfs_uuid_table_size);
+	mutex_unlock(&xfs_uuid_table_mutex);
+}
+
+
 /*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
@@ -256,6 +333,22 @@ xfs_mount_validate_sb(
 		return XFS_ERROR(ENOSYS);
 	}
 
+	/*
+	 * Currently only very few inode sizes are supported.
+	 */
+	switch (sbp->sb_inodesize) {
+	case 256:
+	case 512:
+	case 1024:
+	case 2048:
+		break;
+	default:
+		xfs_fs_mount_cmn_err(flags,
+			"inode size of %d bytes not supported",
+			sbp->sb_inodesize);
+		return XFS_ERROR(ENOSYS);
+	}
+
 	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_fs_mount_cmn_err(flags,
@@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
 	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
 	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
 	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-	mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
 	mp->m_blockmask = sbp->sb_blocksize - 1;
 	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
 	mp->m_blockwmask = mp->m_blockwsize - 1;
 
-	/*
-	 * Setup for attributes, in case they get created.
-	 * This value is for inodes getting attributes for the first time,
-	 * the per-inode value is for old attribute values.
-	 */
-	ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
-	switch (sbp->sb_inodesize) {
-	case 256:
-		mp->m_attroffset = XFS_LITINO(mp) -
-				   XFS_BMDR_SPACE_CALC(MINABTPTRS);
-		break;
-	case 512:
-	case 1024:
-	case 2048:
-		mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
-		break;
-	default:
-		ASSERT(0);
-	}
-	ASSERT(mp->m_attroffset < XFS_LITINO(mp));
-
 	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
 	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
 	mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
@@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 	for (index = 0; index < agcount; index++) {
 		/*
 		 * read the agf, then the agi. This gets us
-		 * all the inforamtion we need and populates the
+		 * all the information we need and populates the
 		 * per-ag structures for us.
 		 */
 		error = xfs_alloc_pagf_init(mp, NULL, index, 0);
@@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp)
 }
 
 /*
- * xfs_mountfs
- *
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
  *	- if we're a 32-bit kernel, do a size check on the superblock
@@ -905,7 +974,6 @@ xfs_mountfs(
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
 	uint		quotamount, quotaflags;
-	int		uuid_mounted = 0;
 	int		error = 0;
 
 	xfs_mount_common(mp, sbp);
@@ -960,7 +1028,7 @@ xfs_mountfs(
 	 */
 	error = xfs_update_alignment(mp);
 	if (error)
-		goto error1;
+		goto out;
 
 	xfs_alloc_compute_maxlevels(mp);
 	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
@@ -971,19 +1039,9 @@ xfs_mountfs(
 
 	mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
 
-	/*
-	 * XFS uses the uuid from the superblock as the unique
-	 * identifier for fsid.  We can not use the uuid from the volume
-	 * since a single partition filesystem is identical to a single
-	 * partition volume/filesystem.
-	 */
-	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
-		if (xfs_uuid_mount(mp)) {
-			error = XFS_ERROR(EINVAL);
-			goto error1;
-		}
-		uuid_mounted=1;
-	}
+	error = xfs_uuid_mount(mp);
+	if (error)
+		goto out;
 
 	/*
 	 * Set the minimum read and write sizes
@@ -1007,7 +1065,7 @@ xfs_mountfs(
 	 */
 	error = xfs_check_sizes(mp);
 	if (error)
-		goto error1;
+		goto out_remove_uuid;
 
 	/*
 	 * Initialize realtime fields in the mount structure
@@ -1015,7 +1073,7 @@ xfs_mountfs(
 	error = xfs_rtmount_init(mp);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: RT mount failed");
-		goto error1;
+		goto out_remove_uuid;
 	}
 
 	/*
@@ -1045,26 +1103,26 @@ xfs_mountfs(
 	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
 				  KM_MAYFAIL);
 	if (!mp->m_perag)
-		goto error1;
+		goto out_remove_uuid;
 
 	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
 
+	if (!sbp->sb_logblocks) {
+		cmn_err(CE_WARN, "XFS: no log defined");
+		XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto out_free_perag;
+	}
+
 	/*
 	 * log's mount-time initialization. Perform 1st part recovery if needed
 	 */
-	if (likely(sbp->sb_logblocks > 0)) {	/* check for volume case */
-		error = xfs_log_mount(mp, mp->m_logdev_targp,
-				      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
-				      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
-		if (error) {
-			cmn_err(CE_WARN, "XFS: log mount failed");
-			goto error2;
-		}
-	} else {	/* No log has been defined */
-		cmn_err(CE_WARN, "XFS: no log defined");
-		XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
-		error = XFS_ERROR(EFSCORRUPTED);
-		goto error2;
+	error = xfs_log_mount(mp, mp->m_logdev_targp,
+			      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+			      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+	if (error) {
+		cmn_err(CE_WARN, "XFS: log mount failed");
+		goto out_free_perag;
 	}
 
 	/*
@@ -1086,15 +1144,14 @@ xfs_mountfs(
 	 * If we are currently making the filesystem, the initialisation will
 	 * fail as the perag data is in an undefined state.
 	 */
-
 	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
 	    !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
 	     !mp->m_sb.sb_inprogress) {
 		error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
-		if (error) {
-			goto error2;
-		}
+		if (error)
+			goto out_free_perag;
 	}
+
 	/*
 	 * Get and sanity-check the root inode.
 	 * Save the pointer to it in the mount structure.
@@ -1102,7 +1159,7 @@ xfs_mountfs(
 	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: failed to read root inode");
-		goto error3;
+		goto out_log_dealloc;
 	}
 
 	ASSERT(rip != NULL);
@@ -1116,7 +1173,7 @@ xfs_mountfs(
 		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
 				 mp);
 		error = XFS_ERROR(EFSCORRUPTED);
-		goto error4;
+		goto out_rele_rip;
 	}
 	mp->m_rootip = rip;	/* save it */
 
@@ -1131,7 +1188,7 @@ xfs_mountfs(
 		 * Free up the root inode.
 		 */
 		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
-		goto error4;
+		goto out_rele_rip;
 	}
 
 	/*
@@ -1143,7 +1200,7 @@ xfs_mountfs(
 		error = xfs_mount_log_sb(mp, mp->m_update_flags);
 		if (error) {
 			cmn_err(CE_WARN, "XFS: failed to write sb changes");
-			goto error4;
+			goto out_rtunmount;
 		}
 	}
 
@@ -1152,7 +1209,7 @@ xfs_mountfs(
 	 */
 	error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
 	if (error)
-		goto error4;
+		goto out_rtunmount;
 
 	/*
 	 * Finish recovering the file system.  This part needed to be
@@ -1162,7 +1219,7 @@ xfs_mountfs(
 	error = xfs_log_mount_finish(mp);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: log mount finish failed");
-		goto error4;
+		goto out_rtunmount;
 	}
 
 	/*
@@ -1170,7 +1227,7 @@ xfs_mountfs(
 	 */
 	error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
 	if (error)
-		goto error4;
+		goto out_rtunmount;
 
 	/*
 	 * Now we are mounted, reserve a small amount of unused space for
@@ -1194,18 +1251,17 @@ xfs_mountfs(
 
 	return 0;
 
- error4:
-	/*
-	 * Free up the root inode.
-	 */
+ out_rtunmount:
+	xfs_rtunmount_inodes(mp);
+ out_rele_rip:
 	IRELE(rip);
- error3:
-	xfs_log_unmount_dealloc(mp);
- error2:
+ out_log_dealloc:
+	xfs_log_unmount(mp);
+ out_free_perag:
 	xfs_free_perag(mp);
- error1:
-	if (uuid_mounted)
-		uuid_table_remove(&mp->m_sb.sb_uuid);
+ out_remove_uuid:
+	xfs_uuid_unmount(mp);
+ out:
 	return error;
 }
 
@@ -1226,15 +1282,12 @@ xfs_unmountfs(
 	 */
 	XFS_QM_UNMOUNT(mp);
 
-	if (mp->m_rbmip)
-		IRELE(mp->m_rbmip);
-	if (mp->m_rsumip)
-		IRELE(mp->m_rsumip);
+	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
 
 	/*
 	 * We can potentially deadlock here if we have an inode cluster
-	 * that has been freed has it's buffer still pinned in memory because
+	 * that has been freed has its buffer still pinned in memory because
 	 * the transaction is still sitting in a iclog. The stale inodes
 	 * on that buffer will have their flush locks held until the
 	 * transaction hits the disk and the callbacks run. the inode
@@ -1266,7 +1319,7 @@ xfs_unmountfs(
 	 * Unreserve any blocks we have so that when we unmount we don't account
 	 * the reserved free space as used. This is really only necessary for
 	 * lazy superblock counting because it trusts the incore superblock
-	 * counters to be aboslutely correct on clean unmount.
+	 * counters to be absolutely correct on clean unmount.
 	 *
 	 * We don't bother correcting this elsewhere for lazy superblock
 	 * counting because on mount of an unclean filesystem we reconstruct the
@@ -1288,10 +1341,9 @@ xfs_unmountfs(
 				"Freespace may not be correct on next mount.");
 	xfs_unmountfs_writesb(mp);
 	xfs_unmountfs_wait(mp); 		/* wait for async bufs */
-	xfs_log_unmount(mp);			/* Done! No more fs ops. */
-
-	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
-		uuid_table_remove(&mp->m_sb.sb_uuid);
+	xfs_log_unmount_write(mp);
+	xfs_log_unmount(mp);
+	xfs_uuid_unmount(mp);
 
 #if defined(DEBUG)
 	xfs_errortag_clearall(mp, 0);
@@ -1793,29 +1845,6 @@ xfs_freesb(
 }
 
 /*
- * See if the UUID is unique among mounted XFS filesystems.
- * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
- */
-STATIC int
-xfs_uuid_mount(
-	xfs_mount_t	*mp)
-{
-	if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
-		cmn_err(CE_WARN,
-			"XFS: Filesystem %s has nil UUID - can't mount",
-			mp->m_fsname);
-		return -1;
-	}
-	if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
-		cmn_err(CE_WARN,
-			"XFS: Filesystem %s has duplicate UUID - can't mount",
-			mp->m_fsname);
-		return -1;
-	}
-	return 0;
-}
-
-/*
  * Used to log changes to the superblock unit and width fields which could
  * be altered by the mount options, as well as any potential sb_features2
  * fixup. Only the first superblock is updated.
@@ -1868,7 +1897,7 @@ xfs_mount_log_sb(
  * we disable the per-cpu counter and go through the slow path.
  *
  * The slow path is the current xfs_mod_incore_sb() function.  This means that
- * when we disable a per-cpu counter, we need to drain it's resources back to
+ * when we disable a per-cpu counter, we need to drain its resources back to
  * the global superblock. We do this after disabling the counter to prevent
  * more threads from queueing up on the counter.
  *
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f5e9937f9bdb..7af44adffc8f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -136,7 +136,6 @@ typedef int	(*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
 			struct xfs_dquot *, struct xfs_dquot *, uint);
 typedef void	(*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
 typedef int	(*xfs_dqsync_t)(struct xfs_mount *, int flags);
-typedef int	(*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
 
 typedef struct xfs_qmops {
 	xfs_qminit_t		xfs_qminit;
@@ -154,7 +153,6 @@ typedef struct xfs_qmops {
 	xfs_dqvopchownresv_t	xfs_dqvopchownresv;
 	xfs_dqstatvfs_t		xfs_dqstatvfs;
 	xfs_dqsync_t		xfs_dqsync;
-	xfs_quotactl_t		xfs_quotactl;
 	struct xfs_dqtrxops	*xfs_dqtrxops;
 } xfs_qmops_t;
 
@@ -188,8 +186,6 @@ typedef struct xfs_qmops {
 	(*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
 #define XFS_QM_DQSYNC(mp, flags) \
 	(*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
-#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
-	(*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
 
 #ifdef HAVE_PERCPU_SB
 
@@ -273,19 +269,17 @@ typedef struct xfs_mount {
 	uint			m_inobt_mnr[2];	/* min inobt btree records */
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
-	uint			m_in_maxlevels;	/* XFS_IN_MAXLEVELS */
+	uint			m_in_maxlevels;	/* max inobt btree levels. */
 	struct xfs_perag	*m_perag;	/* per-ag accounting info */
 	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
 	__uint64_t		m_flags;	/* global mount flags */
-	uint			m_attroffset;	/* inode attribute offset */
 	uint			m_dir_node_ents; /* #entries in a dir danode */
 	uint			m_attr_node_ents; /* #entries in attr danode */
 	int			m_ialloc_inos;	/* inodes in inode allocation */
 	int			m_ialloc_blks;	/* blocks in inode allocation */
-	int			m_litino;	/* size of inode union area */
 	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
 	uint			m_qflags;	/* quota status flags */
 	xfs_trans_reservations_t m_reservations;/* precomputed res values */
@@ -293,9 +287,6 @@ typedef struct xfs_mount {
 	__uint64_t		m_maxioffset;	/* maximum inode offset */
 	__uint64_t		m_resblks;	/* total reserved blocks */
 	__uint64_t		m_resblks_avail;/* available reserved blocks */
-#if XFS_BIG_INUMS
-	xfs_ino_t		m_inoadd;	/* add value for ino64_offset */
-#endif
 	int			m_dalign;	/* stripe unit */
 	int			m_swidth;	/* stripe width */
 	int			m_sinoalign;	/* stripe unit inode alignment */
@@ -337,7 +328,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
 						   must be synchronous except
 						   for space allocations */
-#define XFS_MOUNT_INO64		(1ULL << 1)
 #define XFS_MOUNT_DMAPI		(1ULL << 2)	/* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
@@ -389,8 +379,8 @@ typedef struct xfs_mount {
  * Synchronous read and write sizes.  This should be
  * better for NFSv2 wsync filesystems.
  */
-#define	XFS_WSYNC_READIO_LOG	15	/* 32K */
-#define	XFS_WSYNC_WRITEIO_LOG	14	/* 16K */
+#define	XFS_WSYNC_READIO_LOG	15	/* 32k */
+#define	XFS_WSYNC_WRITEIO_LOG	14	/* 16k */
 
 /*
  * Allow large block sizes to be reported to userspace programs if the
@@ -500,9 +490,6 @@ typedef struct xfs_mod_sb {
 	int64_t		msb_delta;	/* Change to make to specified field */
 } xfs_mod_sb_t;
 
-#define	XFS_MOUNT_ILOCK(mp)	mutex_lock(&((mp)->m_ilock))
-#define	XFS_MOUNT_IUNLOCK(mp)	mutex_unlock(&((mp)->m_ilock))
-
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
 extern int	xfs_mountfs(xfs_mount_t *mp);
 extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 27f80581520a..e101790ea8e7 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = {
 	.xfs_dqvopchownresv	= (xfs_dqvopchownresv_t) fs_noerr,
 	.xfs_dqstatvfs		= (xfs_dqstatvfs_t) fs_noval,
 	.xfs_dqsync		= (xfs_dqsync_t) fs_noerr,
-	.xfs_quotactl		= (xfs_quotactl_t) fs_nosys,
 };
 
 int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 48965ecaa155..f5d1202dde25 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_QUOTA_H__
 #define __XFS_QUOTA_H__
 
+struct xfs_trans;
+
 /*
  * The ondisk form of a dquot structure.
  */
@@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat {
  * to a single function. None of these XFS_QMOPT_* flags are meant to have
  * persistent values (ie. their values can and will change between versions)
  */
-#define XFS_QMOPT_DQLOCK	0x0000001 /* dqlock */
 #define XFS_QMOPT_DQALLOC	0x0000002 /* alloc dquot ondisk if needed */
 #define XFS_QMOPT_UQUOTA	0x0000004 /* user dquot requested */
 #define XFS_QMOPT_PQUOTA	0x0000008 /* project dquot requested */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c5bb86f3ec05..385f6dceba5d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2288,6 +2288,16 @@ xfs_rtmount_inodes(
 	return 0;
 }
 
+void
+xfs_rtunmount_inodes(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_rbmip)
+		IRELE(mp->m_rbmip);
+	if (mp->m_rsumip)
+		IRELE(mp->m_rsumip);
+}
+
 /*
  * Pick an extent for allocation at the start of a new realtime file.
  * Use the sequence number stored in the atime field of the bitmap inode.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8d8dcd215716..b2d67adb6a08 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,8 +23,8 @@ struct xfs_trans;
 
 /* Min and max rt extent sizes, specified in bytes */
 #define	XFS_MAX_RTEXTSIZE	(1024 * 1024 * 1024)	/* 1GB */
-#define	XFS_DFL_RTEXTSIZE	(64 * 1024)	        /* 64KB */
-#define	XFS_MIN_RTEXTSIZE	(4 * 1024)		/* 4KB */
+#define	XFS_DFL_RTEXTSIZE	(64 * 1024)	        /* 64kB */
+#define	XFS_MIN_RTEXTSIZE	(4 * 1024)		/* 4kB */
 
 /*
  * Constants for bit manipulations.
@@ -108,6 +108,9 @@ xfs_rtfree_extent(
 int					/* error */
 xfs_rtmount_init(
 	struct xfs_mount	*mp);	/* file system mount structure */
+void
+xfs_rtunmount_inodes(
+	struct xfs_mount	*mp);
 
 /*
  * Get the bitmap and summary inodes into the mount structure
@@ -146,6 +149,7 @@ xfs_growfs_rt(
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
 # define xfs_rtmount_init(m)    (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtunmount_inodes(m)
 #endif	/* CONFIG_XFS_RT */
 
 #endif	/* __KERNEL__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index d6fe4a88d79f..775249a54f6f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  * In a write transaction we can allocate a maximum of 2
  * extents.  This gives:
  *    the inode getting the new extents: inode size
- *    the inode\'s bmap btree: max depth * block size
+ *    the inode's bmap btree: max depth * block size
  *    the agfs of the ags from which the extents are allocated: 2 * sector
  *    the superblock free block counter: sector size
  *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
@@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * In truncating a file we free up to two extents at once.  We can modify:
  *    the inode being truncated: inode size
- *    the inode\'s bmap btree: (max depth + 1) * block size
+ *    the inode's bmap btree: (max depth + 1) * block size
  * And the bmap_finish transaction can free the blocks and bmap blocks:
  *    the agf for each of the ags: 4 * sector size
  *    the agfl for each of the ags: 4 * sector size
@@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
 	  (128 * 5) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	   (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	   (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	    XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define	XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
@@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  *    the new inode: inode size
  *    the inode btree entry: 1 block
  *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode\'s bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 KB
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 kB
  * Or in the first xact we allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
@@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	  (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
 	 (2 * (mp)->m_sb.sb_sectsize + \
 	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define	XFS_SYMLINK_LOG_RES(mp)	((mp)->m_reservations.tr_symlink)
@@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  *    the inode btree entry: block size
  *    the superblock for the nlink flag: sector size
  *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode\'s bmap btree: (max depth + v2) * block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
  * Or in the first xact we allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
@@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	  (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
 	 (3 * (mp)->m_sb.sb_sectsize + \
 	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define	XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
@@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
 	 (128 * 5) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 
 
@@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * Removing the attribute fork of a file
  *    the inode being truncated: inode size
- *    the inode\'s bmap btree: max depth * block size
+ *    the inode's bmap btree: max depth * block size
  * And the bmap_finish transaction can free the blocks and bmap blocks:
  *    the agf for each of the ags: 4 * sector size
  *    the agfl for each of the ags: 4 * sector size
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2d47f10f8bed..f31271c30de9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -79,7 +79,7 @@ xfs_trans_ail_tail(
  * the push is run asynchronously in a separate thread, so we return the tail
  * of the log right now instead of the tail after the push. This means we will
  * either continue right away, or we will sleep waiting on the async thread to
- * do it's work.
+ * do its work.
  *
  * We do this unlocked - we only need to know whether there is anything in the
  * AIL at the time we are called. We don't need to access the contents of
@@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next(
 /*
  * Now that the traversal is complete, we need to remove the cursor
  * from the list of traversing cursors. Avoid removing the embedded
- * push cursor, but use the fact it is alway present to make the
+ * push cursor, but use the fact it is always present to make the
  * list deletion simple.
  */
 void
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index e110bf57d7f4..eb3fc57f9eef 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,7 +22,7 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has it's own ailp */
+/* XXX: from here down needed until struct xfs_trans has its own ailp */
 #include "xfs_bit.h"
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 4ea2e5074bdd..7d2c920dfb9c 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
 #define	XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define	XFS_IALLOC_SPACE_RES(mp)	\
-	(XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
+	(XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b2f724502f1b..d725428c9df6 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -21,14 +21,6 @@
 #ifdef __KERNEL__
 
 /*
- * POSIX Extensions
- */
-typedef unsigned char		uchar_t;
-typedef unsigned short		ushort_t;
-typedef unsigned int		uint_t;
-typedef unsigned long		ulong_t;
-
-/*
  * Additional type declarations for XFS
  */
 typedef signed char		__int8_t;
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index fcc2285d03ed..79b9e5ea5359 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -374,7 +374,7 @@ xfs_truncate_file(
 
 	/*
 	 * Follow the normal truncate locking protocol.  Since we
-	 * hold the inode in the transaction, we know that it's number
+	 * hold the inode in the transaction, we know that its number
 	 * of references will stay constant.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0e55c5d7db5f..7394c7af5de5 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1136,7 +1136,7 @@ xfs_inactive(
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
 	 */
-	if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
+	if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
 		ASSERT(ip->i_df.if_real_bytes == 0);
 		ASSERT(ip->i_df.if_broot_bytes == 0);
 		return VN_INACTIVE_CACHE;
@@ -1387,23 +1387,28 @@ xfs_create(
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t		*ip;
-	xfs_trans_t		*tp;
+	int			is_dir = S_ISDIR(mode);
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_inode	*ip = NULL;
+	struct xfs_trans	*tp = NULL;
 	int			error;
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	boolean_t		unlock_dp_on_error = B_FALSE;
-	int			dm_event_sent = 0;
 	uint			cancel_flags;
 	int			committed;
 	xfs_prid_t		prid;
-	struct xfs_dquot	*udqp, *gdqp;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
+	uint			log_res;
+	uint			log_count;
 
-	ASSERT(!*ipp);
 	xfs_itrace_entry(dp);
 
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
 				dp, DM_RIGHT_NULL, NULL,
@@ -1412,84 +1417,97 @@ xfs_create(
 
 		if (error)
 			return error;
-		dm_event_sent = 1;
 	}
 
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	/* Return through std_return after this point. */
-
-	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 		prid = dp->i_d.di_projid;
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = dfltprid;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = XFS_QM_DQVOPALLOC(mp, dp,
 			current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
 
-	ip = NULL;
+	if (is_dir) {
+		rdev = 0;
+		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+		log_res = XFS_MKDIR_LOG_RES(mp);
+		log_count = XFS_MKDIR_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+	} else {
+		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+		log_res = XFS_CREATE_LOG_RES(mp);
+		log_count = XFS_CREATE_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+
 	/*
 	 * Initially assume that the file does not exist and
 	 * reserve the resources for that case.  If that is not
 	 * the case we'll drop the one we have and get a more
 	 * appropriate transaction later.
 	 */
-	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	error = xfs_trans_reserve(tp, resblks, log_res, 0,
+			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
-				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, 0, log_res, 0,
+				XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error) {
 		cancel_flags = 0;
-		goto error_return;
+		goto out_trans_cancel;
 	}
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = B_TRUE;
 
-	xfs_bmap_init(&free_list, &first_block);
+	/*
+	 * Check for directory link count overflow.
+	 */
+	if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
+		error = XFS_ERROR(EMLINK);
+		goto out_trans_cancel;
+	}
 
-	ASSERT(ip == NULL);
+	xfs_bmap_init(&free_list, &first_block);
 
 	/*
 	 * Reserve disk quota and the inode.
 	 */
 	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
 	if (error)
-		goto error_return;
+		goto out_trans_cancel;
 
 	error = xfs_dir_canenter(tp, dp, name, resblks);
 	if (error)
-		goto error_return;
-	error = xfs_dir_ialloc(&tp, dp, mode, 1,
-			rdev, credp, prid, resblks > 0,
-			&ip, &committed);
+		goto out_trans_cancel;
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to them, but a directory also the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+			       prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
-			goto error_return;
-		goto abort_return;
+			goto out_trans_cancel;
+		goto out_trans_abort;
 	}
-	xfs_itrace_ref(ip);
 
 	/*
 	 * At this point, we've gotten a newly allocated inode.
 	 * It is locked (and joined to the transaction).
 	 */
-
+	xfs_itrace_ref(ip);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	/*
@@ -1508,19 +1526,28 @@ xfs_create(
 					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
-		goto abort_return;
+		goto out_trans_abort;
 	}
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
+	if (is_dir) {
+		error = xfs_dir_init(tp, ip, dp);
+		if (error)
+			goto out_bmap_cancel;
+
+		error = xfs_bumplink(tp, dp);
+		if (error)
+			goto out_bmap_cancel;
+	}
+
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * create transaction goes to disk before returning to
 	 * the user.
 	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 		xfs_trans_set_sync(tp);
-	}
 
 	/*
 	 * Attach the dquot(s) to the inodes and modify them incore.
@@ -1537,16 +1564,13 @@ xfs_create(
 	IHOLD(ip);
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		xfs_bmap_cancel(&free_list);
-		goto abort_rele;
-	}
+	if (error)
+		goto out_abort_rele;
 
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error) {
 		IRELE(ip);
-		tp = NULL;
-		goto error_return;
+		goto out_dqrele;
 	}
 
 	XFS_QM_DQRELE(mp, udqp);
@@ -1555,26 +1579,22 @@ xfs_create(
 	*ipp = ip;
 
 	/* Fallthrough to std_return with error = 0  */
-
-std_return:
-	if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
-	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-			dp, DM_RIGHT_NULL,
-			*ipp ? ip : NULL,
-			DM_RIGHT_NULL, name->name, NULL,
-			mode, error, 0);
+ std_return:
+	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
+		XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
+				ip, DM_RIGHT_NULL, name->name, NULL, mode,
+				error, 0);
 	}
+
 	return error;
 
- abort_return:
+ out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+ out_trans_abort:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
- error_return:
-	if (tp != NULL)
-		xfs_trans_cancel(tp, cancel_flags);
-
+ out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+ out_dqrele:
 	XFS_QM_DQRELE(mp, udqp);
 	XFS_QM_DQRELE(mp, gdqp);
 
@@ -1583,20 +1603,18 @@ std_return:
 
 	goto std_return;
 
- abort_rele:
+ out_abort_rele:
 	/*
 	 * Wait until after the current transaction is aborted to
 	 * release the inode.  This prevents recursive transactions
 	 * and deadlocks from xfs_inactive.
 	 */
+	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
 	xfs_trans_cancel(tp, cancel_flags);
 	IRELE(ip);
-
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-
-	goto std_return;
+	unlock_dp_on_error = B_FALSE;
+	goto out_dqrele;
 }
 
 #ifdef DEBUG
@@ -2004,8 +2022,10 @@ xfs_link(
 	/* Return through std_return after this point. */
 
 	error = XFS_QM_DQATTACH(mp, sip, 0);
-	if (!error && sip != tdp)
-		error = XFS_QM_DQATTACH(mp, tdp, 0);
+	if (error)
+		goto std_return;
+
+	error = XFS_QM_DQATTACH(mp, tdp, 0);
 	if (error)
 		goto std_return;
 
@@ -2110,209 +2130,6 @@ std_return:
 	goto std_return;
 }
 
-
-int
-xfs_mkdir(
-	xfs_inode_t             *dp,
-	struct xfs_name		*dir_name,
-	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
-{
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t		*cdp;	/* inode of created dir */
-	xfs_trans_t		*tp;
-	int			cancel_flags;
-	int			error;
-	int			committed;
-	xfs_bmap_free_t         free_list;
-	xfs_fsblock_t           first_block;
-	boolean_t		unlock_dp_on_error = B_FALSE;
-	boolean_t		created = B_FALSE;
-	int			dm_event_sent = 0;
-	xfs_prid_t		prid;
-	struct xfs_dquot	*udqp, *gdqp;
-	uint			resblks;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	tp = NULL;
-
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-					dp, DM_RIGHT_NULL, NULL,
-					DM_RIGHT_NULL, dir_name->name, NULL,
-					mode, 0, 0);
-		if (error)
-			return error;
-		dm_event_sent = 1;
-	}
-
-	/* Return through std_return after this point. */
-
-	xfs_itrace_entry(dp);
-
-	mp = dp->i_mount;
-	udqp = gdqp = NULL;
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
-	else
-		prid = (xfs_prid_t)dfltprid;
-
-	/*
-	 * Make sure that we have allocated dquot(s) on disk.
-	 */
-	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
-	if (error)
-		goto std_return;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
-	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
-				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
-	if (error == ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
-					  XFS_TRANS_PERM_LOG_RES,
-					  XFS_MKDIR_LOG_COUNT);
-	}
-	if (error) {
-		cancel_flags = 0;
-		goto error_return;
-	}
-
-	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-	unlock_dp_on_error = B_TRUE;
-
-	/*
-	 * Check for directory link count overflow.
-	 */
-	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
-		error = XFS_ERROR(EMLINK);
-		goto error_return;
-	}
-
-	/*
-	 * Reserve disk quota and the inode.
-	 */
-	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
-	if (error)
-		goto error_return;
-
-	error = xfs_dir_canenter(tp, dp, dir_name, resblks);
-	if (error)
-		goto error_return;
-	/*
-	 * create the directory inode.
-	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, 2,
-			0, credp, prid, resblks > 0,
-		&cdp, NULL);
-	if (error) {
-		if (error == ENOSPC)
-			goto error_return;
-		goto abort_return;
-	}
-	xfs_itrace_ref(cdp);
-
-	/*
-	 * Now we add the directory inode to the transaction.
-	 * We waited until now since xfs_dir_ialloc might start
-	 * a new transaction.  Had we joined the transaction
-	 * earlier, the locks might have gotten released. An error
-	 * from here on will result in the transaction cancel
-	 * unlocking dp so don't do it explicitly in the error path.
-	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	unlock_dp_on_error = B_FALSE;
-
-	xfs_bmap_init(&free_list, &first_block);
-
-	error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
-					&first_block, &free_list, resblks ?
-					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
-	if (error) {
-		ASSERT(error != ENOSPC);
-		goto error1;
-	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	error = xfs_dir_init(tp, cdp, dp);
-	if (error)
-		goto error2;
-
-	error = xfs_bumplink(tp, dp);
-	if (error)
-		goto error2;
-
-	created = B_TRUE;
-
-	*ipp = cdp;
-	IHOLD(cdp);
-
-	/*
-	 * Attach the dquots to the new inode and modify the icount incore.
-	 */
-	XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * mkdir transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-		xfs_trans_set_sync(tp);
-	}
-
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		IRELE(cdp);
-		goto error2;
-	}
-
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-	if (error) {
-		IRELE(cdp);
-	}
-
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit. */
-
-std_return:
-	if ((created || (error != 0 && dm_event_sent != 0)) &&
-	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-					dp, DM_RIGHT_NULL,
-					created ? cdp : NULL,
-					DM_RIGHT_NULL,
-					dir_name->name, NULL,
-					mode, error, 0);
-	}
-	return error;
-
- error2:
- error1:
-	xfs_bmap_cancel(&free_list);
- abort_return:
-	cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-	xfs_trans_cancel(tp, cancel_flags);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-
-	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
-}
-
 int
 xfs_symlink(
 	xfs_inode_t		*dp,
@@ -2587,51 +2404,6 @@ std_return:
 }
 
 int
-xfs_inode_flush(
-	xfs_inode_t	*ip,
-	int		flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		error = 0;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	/*
-	 * Bypass inodes which have already been cleaned by
-	 * the inode flush clustering code inside xfs_iflush
-	 */
-	if (xfs_inode_clean(ip))
-		return 0;
-
-	/*
-	 * We make this non-blocking if the inode is contended,
-	 * return EAGAIN to indicate to the caller that they
-	 * did not succeed. This prevents the flush path from
-	 * blocking on inodes inside another operation right
-	 * now, they get caught later by xfs_sync.
-	 */
-	if (flags & FLUSH_SYNC) {
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		xfs_iflock(ip);
-	} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			return EAGAIN;
-		}
-	} else {
-		return EAGAIN;
-	}
-
-	error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
-						    : XFS_IFLUSH_ASYNC_NOBLOCK);
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	return error;
-}
-
-
-int
 xfs_set_dmattrs(
 	xfs_inode_t     *ip,
 	u_int		evmask,
@@ -2676,7 +2448,7 @@ xfs_reclaim(
 	ASSERT(!VN_MAPPED(VFS_I(ip)));
 
 	/* bad inode, get out here ASAP */
-	if (VN_BAD(VFS_I(ip))) {
+	if (is_bad_inode(VFS_I(ip))) {
 		xfs_ireclaim(ip);
 		return 0;
 	}
@@ -3090,7 +2862,7 @@ xfs_free_file_space(
 
 	/*
 	 * Need to zero the stuff we're not freeing, on disk.
-	 * If its a realtime file & can't use unwritten extents then we
+	 * If it's a realtime file & can't use unwritten extents then we
 	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
 	 * will take care of it for us.
 	 */
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 76df328c61b4..04373c6c61ff 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
-int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
-		mode_t mode, struct xfs_inode **ipp, cred_t *credp);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, mode_t mode, struct xfs_inode **ipp,
 		cred_t *credp);
-int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,