From 30f712c9dd69348aa51351d5cb6d366bf4fae31d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 25 Jun 2014 14:57:53 +1000
Subject: libxfs: move source files

Move all the source files that are shared with userspace into
libxfs/. This is done as one big chunk simpy to get it done
quickly

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/Makefile                    |   63 +-
 fs/xfs/libxfs/xfs_alloc.c          | 2630 +++++++++++++++++
 fs/xfs/libxfs/xfs_alloc_btree.c    |  504 ++++
 fs/xfs/libxfs/xfs_attr.c           | 1459 ++++++++++
 fs/xfs/libxfs/xfs_attr_leaf.c      | 2697 +++++++++++++++++
 fs/xfs/libxfs/xfs_attr_remote.c    |  628 ++++
 fs/xfs/libxfs/xfs_bmap.c           | 5609 ++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_bmap_btree.c     |  967 +++++++
 fs/xfs/libxfs/xfs_btree.c          | 3989 +++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_da_btree.c       | 2665 +++++++++++++++++
 fs/xfs/libxfs/xfs_da_format.c      |  911 ++++++
 fs/xfs/libxfs/xfs_dir2.c           |  762 +++++
 fs/xfs/libxfs/xfs_dir2_block.c     | 1265 ++++++++
 fs/xfs/libxfs/xfs_dir2_data.c      | 1050 +++++++
 fs/xfs/libxfs/xfs_dir2_leaf.c      | 1831 ++++++++++++
 fs/xfs/libxfs/xfs_dir2_node.c      | 2284 +++++++++++++++
 fs/xfs/libxfs/xfs_dir2_priv.h      |  274 ++
 fs/xfs/libxfs/xfs_dir2_sf.c        | 1184 ++++++++
 fs/xfs/libxfs/xfs_dquot_buf.c      |  290 ++
 fs/xfs/libxfs/xfs_ialloc.c         | 2189 ++++++++++++++
 fs/xfs/libxfs/xfs_ialloc_btree.c   |  422 +++
 fs/xfs/libxfs/xfs_inode_buf.c      |  479 +++
 fs/xfs/libxfs/xfs_inode_fork.c     | 1906 ++++++++++++
 fs/xfs/libxfs/xfs_log_rlimit.c     |  150 +
 fs/xfs/libxfs/xfs_rtbitmap.c       |  973 +++++++
 fs/xfs/libxfs/xfs_symlink_remote.c |  201 ++
 fs/xfs/libxfs/xfs_trans_resv.c     |  894 ++++++
 fs/xfs/xfs_alloc.c                 | 2630 -----------------
 fs/xfs/xfs_alloc_btree.c           |  504 ----
 fs/xfs/xfs_attr.c                  | 1459 ----------
 fs/xfs/xfs_attr_leaf.c             | 2697 -----------------
 fs/xfs/xfs_attr_remote.c           |  628 ----
 fs/xfs/xfs_bmap.c                  | 5609 ------------------------------------
 fs/xfs/xfs_bmap_btree.c            |  967 -------
 fs/xfs/xfs_btree.c                 | 3989 -------------------------
 fs/xfs/xfs_da_btree.c              | 2665 -----------------
 fs/xfs/xfs_da_format.c             |  911 ------
 fs/xfs/xfs_dir2.c                  |  762 -----
 fs/xfs/xfs_dir2_block.c            | 1265 --------
 fs/xfs/xfs_dir2_data.c             | 1050 -------
 fs/xfs/xfs_dir2_leaf.c             | 1831 ------------
 fs/xfs/xfs_dir2_node.c             | 2284 ---------------
 fs/xfs/xfs_dir2_priv.h             |  274 --
 fs/xfs/xfs_dir2_sf.c               | 1184 --------
 fs/xfs/xfs_dquot_buf.c             |  290 --
 fs/xfs/xfs_ialloc.c                | 2189 --------------
 fs/xfs/xfs_ialloc_btree.c          |  422 ---
 fs/xfs/xfs_inode_buf.c             |  479 ---
 fs/xfs/xfs_inode_fork.c            | 1906 ------------
 fs/xfs/xfs_log_rlimit.c            |  150 -
 fs/xfs/xfs_rtbitmap.c              |  973 -------
 fs/xfs/xfs_symlink_remote.c        |  201 --
 fs/xfs/xfs_trans_resv.c            |  894 ------
 53 files changed, 38245 insertions(+), 38244 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_alloc.c
 create mode 100644 fs/xfs/libxfs/xfs_alloc_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_attr.c
 create mode 100644 fs/xfs/libxfs/xfs_attr_leaf.c
 create mode 100644 fs/xfs/libxfs/xfs_attr_remote.c
 create mode 100644 fs/xfs/libxfs/xfs_bmap.c
 create mode 100644 fs/xfs/libxfs/xfs_bmap_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_da_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_da_format.c
 create mode 100644 fs/xfs/libxfs/xfs_dir2.c
 create mode 100644 fs/xfs/libxfs/xfs_dir2_block.c
 create mode 100644 fs/xfs/libxfs/xfs_dir2_data.c
 create mode 100644 fs/xfs/libxfs/xfs_dir2_leaf.c
 create mode 100644 fs/xfs/libxfs/xfs_dir2_node.c
 create mode 100644 fs/xfs/libxfs/xfs_dir2_priv.h
 create mode 100644 fs/xfs/libxfs/xfs_dir2_sf.c
 create mode 100644 fs/xfs/libxfs/xfs_dquot_buf.c
 create mode 100644 fs/xfs/libxfs/xfs_ialloc.c
 create mode 100644 fs/xfs/libxfs/xfs_ialloc_btree.c
 create mode 100644 fs/xfs/libxfs/xfs_inode_buf.c
 create mode 100644 fs/xfs/libxfs/xfs_inode_fork.c
 create mode 100644 fs/xfs/libxfs/xfs_log_rlimit.c
 create mode 100644 fs/xfs/libxfs/xfs_rtbitmap.c
 create mode 100644 fs/xfs/libxfs/xfs_symlink_remote.c
 create mode 100644 fs/xfs/libxfs/xfs_trans_resv.c
 delete mode 100644 fs/xfs/xfs_alloc.c
 delete mode 100644 fs/xfs/xfs_alloc_btree.c
 delete mode 100644 fs/xfs/xfs_attr.c
 delete mode 100644 fs/xfs/xfs_attr_leaf.c
 delete mode 100644 fs/xfs/xfs_attr_remote.c
 delete mode 100644 fs/xfs/xfs_bmap.c
 delete mode 100644 fs/xfs/xfs_bmap_btree.c
 delete mode 100644 fs/xfs/xfs_btree.c
 delete mode 100644 fs/xfs/xfs_da_btree.c
 delete mode 100644 fs/xfs/xfs_da_format.c
 delete mode 100644 fs/xfs/xfs_dir2.c
 delete mode 100644 fs/xfs/xfs_dir2_block.c
 delete mode 100644 fs/xfs/xfs_dir2_data.c
 delete mode 100644 fs/xfs/xfs_dir2_leaf.c
 delete mode 100644 fs/xfs/xfs_dir2_node.c
 delete mode 100644 fs/xfs/xfs_dir2_priv.h
 delete mode 100644 fs/xfs/xfs_dir2_sf.c
 delete mode 100644 fs/xfs/xfs_dquot_buf.c
 delete mode 100644 fs/xfs/xfs_ialloc.c
 delete mode 100644 fs/xfs/xfs_ialloc_btree.c
 delete mode 100644 fs/xfs/xfs_inode_buf.c
 delete mode 100644 fs/xfs/xfs_inode_fork.c
 delete mode 100644 fs/xfs/xfs_log_rlimit.c
 delete mode 100644 fs/xfs/xfs_rtbitmap.c
 delete mode 100644 fs/xfs/xfs_symlink_remote.c
 delete mode 100644 fs/xfs/xfs_trans_resv.c

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 4c5edf0df9a3..0dfa26d626f5 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -28,7 +28,35 @@ xfs-y				+= xfs_trace.o
 
 # build the libxfs code first
 xfs-y				+= $(addprefix libxfs/, \
+				   xfs_alloc.o \
+				   xfs_alloc_btree.o \
+				   xfs_attr.o \
+				   xfs_attr_leaf.o \
+				   xfs_attr_remote.o \
+				   xfs_bmap.o \
+				   xfs_bmap_btree.o \
+				   xfs_btree.o \
+				   xfs_da_btree.o \
+				   xfs_da_format.o \
+				   xfs_dir2.o \
+				   xfs_dir2_block.o \
+				   xfs_dir2_data.o \
+				   xfs_dir2_leaf.o \
+				   xfs_dir2_node.o \
+				   xfs_dir2_sf.o \
+				   xfs_dquot_buf.o \
+				   xfs_ialloc.o \
+				   xfs_ialloc_btree.o \
+				   xfs_inode_fork.o \
+				   xfs_inode_buf.o \
+				   xfs_log_rlimit.o \
 				   xfs_sb.o \
+				   xfs_symlink_remote.o \
+				   xfs_trans_resv.o \
+				   )
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
+				   xfs_rtbitmap.o \
 				   )
 
 # highlevel code
@@ -51,6 +79,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
+				   xfs_inode.o \
 				   xfs_itable.o \
 				   xfs_message.o \
 				   xfs_mount.o \
@@ -62,41 +91,14 @@ xfs-y				+= xfs_aops.o \
 				   kmem.o \
 				   uuid.o
 
-# code shared with libxfs
-xfs-y				+= xfs_alloc.o \
-				   xfs_alloc_btree.o \
-				   xfs_attr.o \
-				   xfs_attr_leaf.o \
-				   xfs_attr_remote.o \
-				   xfs_bmap.o \
-				   xfs_bmap_btree.o \
-				   xfs_btree.o \
-				   xfs_da_btree.o \
-				   xfs_da_format.o \
-				   xfs_dir2.o \
-				   xfs_dir2_block.o \
-				   xfs_dir2_data.o \
-				   xfs_dir2_leaf.o \
-				   xfs_dir2_node.o \
-				   xfs_dir2_sf.o \
-				   xfs_dquot_buf.o \
-				   xfs_ialloc.o \
-				   xfs_ialloc_btree.o \
-				   xfs_icreate_item.o \
-				   xfs_inode.o \
-				   xfs_inode_fork.o \
-				   xfs_inode_buf.o \
-				   xfs_log_recover.o \
-				   xfs_log_rlimit.o \
-				   xfs_symlink_remote.o \
-				   xfs_trans_resv.o
-
 # low-level transaction/log code
 xfs-y				+= xfs_log.o \
 				   xfs_log_cil.o \
 				   xfs_buf_item.o \
 				   xfs_extfree_item.o \
+				   xfs_icreate_item.o \
 				   xfs_inode_item.o \
+				   xfs_log_recover.o \
 				   xfs_trans_ail.o \
 				   xfs_trans_buf.o \
 				   xfs_trans_extfree.o \
@@ -112,8 +114,7 @@ xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
 				   xfs_quotaops.o
 
 # xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
-				   xfs_rtbitmap.o
+xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
new file mode 100644
index 000000000000..d43813267a80
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -0,0 +1,2630 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+
+struct workqueue_struct *xfs_alloc_wq;
+
+#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define	XFSA_FIXUP_BNO_OK	1
+#define	XFSA_FIXUP_CNT_OK	2
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
+		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int				/* error */
+xfs_alloc_lookup_eq(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int				/* error */
+xfs_alloc_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int				/* error */
+xfs_alloc_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len)	/* length of extent */
+{
+	union xfs_btree_rec	rec;
+
+	rec.alloc.ar_startblock = cpu_to_be32(bno);
+	rec.alloc.ar_blockcount = cpu_to_be32(len);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat)	/* output: success/failure */
+{
+	union xfs_btree_rec	*rec;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (!error && *stat == 1) {
+		*bno = be32_to_cpu(rec->alloc.ar_startblock);
+		*len = be32_to_cpu(rec->alloc.ar_blockcount);
+	}
+	return error;
+}
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC void
+xfs_alloc_compute_aligned(
+	xfs_alloc_arg_t	*args,		/* allocation argument structure */
+	xfs_agblock_t	foundbno,	/* starting block in found extent */
+	xfs_extlen_t	foundlen,	/* length in found extent */
+	xfs_agblock_t	*resbno,	/* result block number */
+	xfs_extlen_t	*reslen)	/* result length */
+{
+	xfs_agblock_t	bno;
+	xfs_extlen_t	len;
+
+	/* Trim busy sections out of found extent */
+	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+	if (args->alignment > 1 && len >= args->minlen) {
+		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
+		xfs_extlen_t	diff = aligned_bno - bno;
+
+		*resbno = aligned_bno;
+		*reslen = diff >= len ? 0 : len - diff;
+	} else {
+		*resbno = bno;
+		*reslen = len;
+	}
+}
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t			/* difference value (absolute) */
+xfs_alloc_compute_diff(
+	xfs_agblock_t	wantbno,	/* target starting block */
+	xfs_extlen_t	wantlen,	/* target length */
+	xfs_extlen_t	alignment,	/* target alignment */
+	char		userdata,	/* are we allocating data? */
+	xfs_agblock_t	freebno,	/* freespace's starting block */
+	xfs_extlen_t	freelen,	/* freespace's length */
+	xfs_agblock_t	*newbnop)	/* result: best start block from free */
+{
+	xfs_agblock_t	freeend;	/* end of freespace extent */
+	xfs_agblock_t	newbno1;	/* return block number */
+	xfs_agblock_t	newbno2;	/* other new block number */
+	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
+	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
+	xfs_agblock_t	wantend;	/* end of target extent */
+
+	ASSERT(freelen >= wantlen);
+	freeend = freebno + freelen;
+	wantend = wantbno + wantlen;
+	/*
+	 * We want to allocate from the start of a free extent if it is past
+	 * the desired block or if we are allocating user data and the free
+	 * extent is before desired block. The second case is there to allow
+	 * for contiguous allocation from the remaining free space if the file
+	 * grows in the short term.
+	 */
+	if (freebno >= wantbno || (userdata && freeend < wantend)) {
+		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else if (freeend >= wantend && alignment > 1) {
+		newbno1 = roundup(wantbno, alignment);
+		newbno2 = newbno1 - alignment;
+		if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+		else
+			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+		if (newbno2 < freebno)
+			newbno2 = NULLAGBLOCK;
+		else
+			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+			if (newlen1 < newlen2 ||
+			    (newlen1 == newlen2 &&
+			     XFS_ABSDIFF(newbno1, wantbno) >
+			     XFS_ABSDIFF(newbno2, wantbno)))
+				newbno1 = newbno2;
+		} else if (newbno2 != NULLAGBLOCK)
+			newbno1 = newbno2;
+	} else if (freeend >= wantend) {
+		newbno1 = wantbno;
+	} else if (alignment > 1) {
+		newbno1 = roundup(freeend - wantlen, alignment);
+		if (newbno1 > freeend - wantlen &&
+		    newbno1 - alignment >= freebno)
+			newbno1 -= alignment;
+		else if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else
+		newbno1 = freeend - wantlen;
+	*newbnop = newbno1;
+	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_extlen_t	k;
+	xfs_extlen_t	rlen;
+
+	ASSERT(args->mod < args->prod);
+	rlen = args->len;
+	ASSERT(rlen >= args->minlen);
+	ASSERT(rlen <= args->maxlen);
+	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+	    (args->mod == 0 && rlen < args->prod))
+		return;
+	k = rlen % args->prod;
+	if (k == args->mod)
+		return;
+	if (k > args->mod)
+		rlen = rlen - (k - args->mod);
+	else
+		rlen = rlen - args->prod + (args->mod - k);
+	if ((int)rlen < (int)args->minlen)
+		return;
+	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+	ASSERT(rlen % args->prod == args->mod);
+	args->len = rlen;
+}
+
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_agf_t	*agf;		/* a.g. freelist header */
+	int		diff;		/* free space difference */
+
+	if (args->minleft == 0)
+		return 1;
+	agf = XFS_BUF_TO_AGF(args->agbp);
+	diff = be32_to_cpu(agf->agf_freeblks)
+		- args->len - args->minleft;
+	if (diff >= 0)
+		return 1;
+	args->len += diff;		/* shrink the allocated space */
+	if (args->len >= args->minlen)
+		return 1;
+	args->agbno = NULLAGBLOCK;
+	return 0;
+}
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks.  The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int				/* error code */
+xfs_alloc_fixup_trees(
+	xfs_btree_cur_t	*cnt_cur,	/* cursor for by-size btree */
+	xfs_btree_cur_t	*bno_cur,	/* cursor for by-block btree */
+	xfs_agblock_t	fbno,		/* starting block of free extent */
+	xfs_extlen_t	flen,		/* length of free extent */
+	xfs_agblock_t	rbno,		/* starting block of returned extent */
+	xfs_extlen_t	rlen,		/* length of returned extent */
+	int		flags)		/* flags, XFSA_FIXUP_... */
+{
+	int		error;		/* error code */
+	int		i;		/* operation results */
+	xfs_agblock_t	nfbno1;		/* first new free startblock */
+	xfs_agblock_t	nfbno2;		/* second new free startblock */
+	xfs_extlen_t	nflen1=0;	/* first new free length */
+	xfs_extlen_t	nflen2=0;	/* second new free length */
+
+	/*
+	 * Look up the record in the by-size tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	/*
+	 * Look up the record in the by-block tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+
+#ifdef DEBUG
+	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+		struct xfs_btree_block	*bnoblock;
+		struct xfs_btree_block	*cntblock;
+
+		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+		XFS_WANT_CORRUPTED_RETURN(
+			bnoblock->bb_numrecs == cntblock->bb_numrecs);
+	}
+#endif
+
+	/*
+	 * Deal with all four cases: the allocated record is contained
+	 * within the freespace record, so we can have new freespace
+	 * at either (or both) end, or no freespace remaining.
+	 */
+	if (rbno == fbno && rlen == flen)
+		nfbno1 = nfbno2 = NULLAGBLOCK;
+	else if (rbno == fbno) {
+		nfbno1 = rbno + rlen;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else if (rbno + rlen == fbno + flen) {
+		nfbno1 = fbno;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else {
+		nfbno1 = fbno;
+		nflen1 = rbno - fbno;
+		nfbno2 = rbno + rlen;
+		nflen2 = (fbno + flen) - nfbno2;
+	}
+	/*
+	 * Delete the entry from the by-size btree.
+	 */
+	if ((error = xfs_btree_delete(cnt_cur, &i)))
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+	/*
+	 * Add new by-size btree entry(s).
+	 */
+	if (nfbno1 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	/*
+	 * Fix up the by-block btree entry(s).
+	 */
+	if (nfbno1 == NULLAGBLOCK) {
+		/*
+		 * No remaining freespace, just delete the by-block tree entry.
+		 */
+		if ((error = xfs_btree_delete(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	} else {
+		/*
+		 * Update the by-block entry to start later|be shorter.
+		 */
+		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+			return error;
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		/*
+		 * 2 resulting free entries, need to add one.
+		 */
+		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_btree_insert(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	return 0;
+}
+
+static bool
+xfs_agfl_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
+	int		i;
+
+	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+		return false;
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+		if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+			return false;
+	}
+	return true;
+}
+
+static void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/*
+	 * There is no verification of non-crc AGFLs because mkfs does not
+	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
+	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
+	 * can't verify just those entries are valid.
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_agfl_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agfl_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	/* no verification of non-crc AGFLs */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_agfl_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (bip)
+		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+	.verify_read = xfs_agfl_read_verify,
+	.verify_write = xfs_agfl_write_verify,
+};
+
+/*
+ * Read in the allocation group free block array.
+ */
+STATIC int				/* error */
+xfs_alloc_read_agfl(
+	xfs_mount_t	*mp,		/* mount point structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
+{
+	xfs_buf_t	*bp;		/* return value */
+	int		error;
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(
+			mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+	if (error)
+		return error;
+	xfs_buf_set_ref(bp, XFS_AGFL_REF);
+	*bpp = bp;
+	return 0;
+}
+
+STATIC int
+xfs_alloc_update_counters(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agbp,
+	long			len)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+
+	pag->pagf_freeblks += len;
+	be32_add_cpu(&agf->agf_freeblks, len);
+
+	xfs_trans_agblocks_delta(tp, len);
+	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+		     be32_to_cpu(agf->agf_length)))
+		return EFSCORRUPTED;
+
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+	return 0;
+}
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent(
+	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
+{
+	int		error=0;
+
+	ASSERT(args->minlen > 0);
+	ASSERT(args->maxlen > 0);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->mod < args->prod);
+	ASSERT(args->alignment > 0);
+	/*
+	 * Branch to correct routine based on the type.
+	 */
+	args->wasfromfl = 0;
+	switch (args->type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+		error = xfs_alloc_ag_vextent_size(args);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_alloc_ag_vextent_near(args);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_alloc_ag_vextent_exact(args);
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+
+	if (error || args->agbno == NULLAGBLOCK)
+		return error;
+
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(args->agbno % args->alignment == 0);
+
+	if (!args->wasfromfl) {
+		error = xfs_alloc_update_counters(args->tp, args->pag,
+						  args->agbp,
+						  -((long)(args->len)));
+		if (error)
+			return error;
+
+		ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+					      args->agbno, args->len));
+	}
+
+	if (!args->isfl) {
+		xfs_trans_mod_sb(args->tp, args->wasdel ?
+				 XFS_TRANS_SB_RES_FDBLOCKS :
+				 XFS_TRANS_SB_FDBLOCKS,
+				 -((long)(args->len)));
+	}
+
+	XFS_STATS_INC(xs_allocx);
+	XFS_STATS_ADD(xs_allocb, args->len);
+	return error;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_exact(
+	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+{
+	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
+	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
+	int		error;
+	xfs_agblock_t	fbno;	/* start block of found extent */
+	xfs_extlen_t	flen;	/* length of found extent */
+	xfs_agblock_t	tbno;	/* start block of trimmed extent */
+	xfs_extlen_t	tlen;	/* length of trimmed extent */
+	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	int		i;	/* success/failure of operation */
+
+	ASSERT(args->alignment == 1);
+
+	/*
+	 * Allocate/initialize a cursor for the by-number freespace btree.
+	 */
+	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+					  args->agno, XFS_BTNUM_BNO);
+
+	/*
+	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+	 * Look for the closest free block <= bno, it must contain bno
+	 * if any free block does.
+	 */
+	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+	if (error)
+		goto error0;
+	if (!i)
+		goto not_found;
+
+	/*
+	 * Grab the freespace record.
+	 */
+	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	ASSERT(fbno <= args->agbno);
+
+	/*
+	 * Check for overlapping busy extents.
+	 */
+	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+	/*
+	 * Give up if the start of the extent is busy, or the freespace isn't
+	 * long enough for the minimum request.
+	 */
+	if (tbno > args->agbno)
+		goto not_found;
+	if (tlen < args->minlen)
+		goto not_found;
+	tend = tbno + tlen;
+	if (tend < args->agbno + args->minlen)
+		goto not_found;
+
+	/*
+	 * End of extent will be smaller of the freespace end and the
+	 * maximal requested end.
+	 *
+	 * Fix the length according to mod and prod if given.
+	 */
+	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+						- args->agbno;
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args))
+		goto not_found;
+
+	ASSERT(args->agbno + args->len <= tend);
+
+	/*
+	 * We are allocating agbno for args->len
+	 * Allocate/initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+	ASSERT(args->agbno + args->len <=
+		be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+				      args->len, XFSA_FIXUP_BNO_OK);
+	if (error) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+		goto error0;
+	}
+
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+	args->wasfromfl = 0;
+	trace_xfs_alloc_exact_done(args);
+	return 0;
+
+not_found:
+	/* Didn't find it, return null. */
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	args->agbno = NULLAGBLOCK;
+	trace_xfs_alloc_exact_notfound(args);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	trace_xfs_alloc_exact_error(args);
+	return error;
+}
+
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+	struct xfs_alloc_arg	*args,	/* allocation argument structure */
+	struct xfs_btree_cur	**gcur,	/* good cursor */
+	struct xfs_btree_cur	**scur,	/* searching cursor */
+	xfs_agblock_t		gdiff,	/* difference for search comparison */
+	xfs_agblock_t		*sbno,	/* extent found by search */
+	xfs_extlen_t		*slen,	/* extent length */
+	xfs_agblock_t		*sbnoa,	/* aligned extent found by search */
+	xfs_extlen_t		*slena,	/* aligned extent length */
+	int			dir)	/* 0 = search right, 1 = search left */
+{
+	xfs_agblock_t		new;
+	xfs_agblock_t		sdiff;
+	int			error;
+	int			i;
+
+	/* The good extent is perfect, no need to  search. */
+	if (!gdiff)
+		goto out_use_good;
+
+	/*
+	 * Look until we find a better one, run out of space or run off the end.
+	 */
+	do {
+		error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+
+		/*
+		 * The good extent is closer than this one.
+		 */
+		if (!dir) {
+			if (*sbnoa >= args->agbno + gdiff)
+				goto out_use_good;
+		} else {
+			if (*sbnoa <= args->agbno - gdiff)
+				goto out_use_good;
+		}
+
+		/*
+		 * Same distance, compare length and pick the best.
+		 */
+		if (*slena >= args->minlen) {
+			args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+			xfs_alloc_fix_len(args);
+
+			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+						       args->alignment,
+						       args->userdata, *sbnoa,
+						       *slena, &new);
+
+			/*
+			 * Choose closer size and invalidate other cursor.
+			 */
+			if (sdiff < gdiff)
+				goto out_use_search;
+			goto out_use_good;
+		}
+
+		if (!dir)
+			error = xfs_btree_increment(*scur, 0, &i);
+		else
+			error = xfs_btree_decrement(*scur, 0, &i);
+		if (error)
+			goto error0;
+	} while (i);
+
+out_use_good:
+	xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+	*scur = NULL;
+	return 0;
+
+out_use_search:
+	xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+	*gcur = NULL;
+	return 0;
+
+error0:
+	/* caller invalidates cursors */
+	return error;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_near(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t	*bno_cur_gt;	/* cursor for bno btree, right side */
+	xfs_btree_cur_t	*bno_cur_lt;	/* cursor for bno btree, left side */
+	xfs_btree_cur_t	*cnt_cur;	/* cursor for count btree */
+	xfs_agblock_t	gtbno;		/* start bno of right side entry */
+	xfs_agblock_t	gtbnoa;		/* aligned ... */
+	xfs_extlen_t	gtdiff;		/* difference to right side entry */
+	xfs_extlen_t	gtlen;		/* length of right side entry */
+	xfs_extlen_t	gtlena;		/* aligned ... */
+	xfs_agblock_t	gtnew;		/* useful start bno of right side */
+	int		error;		/* error code */
+	int		i;		/* result code, temporary */
+	int		j;		/* result code, temporary */
+	xfs_agblock_t	ltbno;		/* start bno of left side entry */
+	xfs_agblock_t	ltbnoa;		/* aligned ... */
+	xfs_extlen_t	ltdiff;		/* difference to left side entry */
+	xfs_extlen_t	ltlen;		/* length of left side entry */
+	xfs_extlen_t	ltlena;		/* aligned ... */
+	xfs_agblock_t	ltnew;		/* useful start bno of left side */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
+#ifdef DEBUG
+	/*
+	 * Randomly don't execute the first algorithm.
+	 */
+	int		dofirst;	/* set to do first algorithm */
+
+	dofirst = prandom_u32() & 1;
+#endif
+
+restart:
+	bno_cur_lt = NULL;
+	bno_cur_gt = NULL;
+	ltlen = 0;
+	gtlena = 0;
+	ltlena = 0;
+
+	/*
+	 * Get a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+
+	/*
+	 * See if there are any free extents as big as maxlen.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
+		goto error0;
+	/*
+	 * If none, then pick up the last entry in the tree unless the
+	 * tree is empty.
+	 */
+	if (!i) {
+		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
+				&ltlen, &i)))
+			goto error0;
+		if (i == 0 || ltlen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_near_noentry(args);
+			return 0;
+		}
+		ASSERT(i == 1);
+	}
+	args->wasfromfl = 0;
+
+	/*
+	 * First algorithm.
+	 * If the requested extent is large wrt the freespaces available
+	 * in this a.g., then the cursor will be pointing to a btree entry
+	 * near the right edge of the tree.  If it's in the last btree leaf
+	 * block, then we just examine all the entries in that block
+	 * that are big enough, and pick the best one.
+	 * This is written as a while loop so we can break out of it,
+	 * but we never loop back to the top.
+	 */
+	while (xfs_btree_islastblock(cnt_cur, 0)) {
+		xfs_extlen_t	bdiff;
+		int		besti=0;
+		xfs_extlen_t	blen=0;
+		xfs_agblock_t	bnew=0;
+
+#ifdef DEBUG
+		if (dofirst)
+			break;
+#endif
+		/*
+		 * Start from the entry that lookup found, sequence through
+		 * all larger free blocks.  If we're actually pointing at a
+		 * record smaller than maxlen, go to the start of this block,
+		 * and skip all those smaller than minlen.
+		 */
+		if (ltlen || args->alignment > 1) {
+			cnt_cur->bc_ptrs[0] = 1;
+			do {
+				if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
+						&ltlen, &i)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+				if (ltlen >= args->minlen)
+					break;
+				if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
+					goto error0;
+			} while (i);
+			ASSERT(ltlen >= args->minlen);
+			if (!i)
+				break;
+		}
+		i = cnt_cur->bc_ptrs[0];
+		for (j = 1, blen = 0, bdiff = 0;
+		     !error && j && (blen < args->maxlen || bdiff > 0);
+		     error = xfs_btree_increment(cnt_cur, 0, &j)) {
+			/*
+			 * For each entry, decide if it's better than
+			 * the previous best entry.
+			 */
+			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
+			if (ltlena < args->minlen)
+				continue;
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			ASSERT(args->len >= args->minlen);
+			if (args->len < blen)
+				continue;
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
+			if (ltnew != NULLAGBLOCK &&
+			    (args->len > blen || ltdiff < bdiff)) {
+				bdiff = ltdiff;
+				bnew = ltnew;
+				blen = args->len;
+				besti = cnt_cur->bc_ptrs[0];
+			}
+		}
+		/*
+		 * It didn't work.  We COULD be in a case where
+		 * there's a good record somewhere, so try again.
+		 */
+		if (blen == 0)
+			break;
+		/*
+		 * Point at the best entry, and retrieve it again.
+		 */
+		cnt_cur->bc_ptrs[0] = besti;
+		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+		args->len = blen;
+		if (!xfs_alloc_fix_minleft(args)) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_near_nominleft(args);
+			return 0;
+		}
+		blen = args->len;
+		/*
+		 * We are allocating starting at bnew for blen blocks.
+		 */
+		args->agbno = bnew;
+		ASSERT(bnew >= ltbno);
+		ASSERT(bnew + blen <= ltbno + ltlen);
+		/*
+		 * Set up a cursor for the by-bno tree.
+		 */
+		bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+			args->agbp, args->agno, XFS_BTNUM_BNO);
+		/*
+		 * Fix up the btree entries.
+		 */
+		if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
+				ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
+			goto error0;
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+
+		trace_xfs_alloc_near_first(args);
+		return 0;
+	}
+	/*
+	 * Second algorithm.
+	 * Search in the by-bno tree to the left and to the right
+	 * simultaneously, until in each case we find a space big enough,
+	 * or run into the edge of the tree.  When we run into the edge,
+	 * we deallocate that cursor.
+	 * If both searches succeed, we compare the two spaces and pick
+	 * the better one.
+	 * With alignment, it's possible for both to fail; the upper
+	 * level algorithm that picks allocation groups for allocations
+	 * is not supposed to do this.
+	 */
+	/*
+	 * Allocate and initialize the cursor for the leftward search.
+	 */
+	bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO);
+	/*
+	 * Lookup <= bno to find the leftward search's starting point.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * Didn't find anything; use this cursor for the rightward
+		 * search.
+		 */
+		bno_cur_gt = bno_cur_lt;
+		bno_cur_lt = NULL;
+	}
+	/*
+	 * Found something.  Duplicate the cursor for the rightward search.
+	 */
+	else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
+		goto error0;
+	/*
+	 * Increment the cursor, so we will point at the entry just right
+	 * of the leftward entry if any, or to the leftmost entry.
+	 */
+	if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * It failed, there are no rightward entries.
+		 */
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
+		bno_cur_gt = NULL;
+	}
+	/*
+	 * Loop going left with the leftward cursor, right with the
+	 * rightward cursor, until either both directions give up or
+	 * we find an entry at least as big as minlen.
+	 */
+	do {
+		if (bno_cur_lt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
+			if (ltlena >= args->minlen)
+				break;
+			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_lt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_lt = NULL;
+			}
+		}
+		if (bno_cur_gt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			xfs_alloc_compute_aligned(args, gtbno, gtlen,
+						  &gtbnoa, &gtlena);
+			if (gtlena >= args->minlen)
+				break;
+			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_gt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_gt = NULL;
+			}
+		}
+	} while (bno_cur_lt || bno_cur_gt);
+
+	/*
+	 * Got both cursors still active, need to find better entry.
+	 */
+	if (bno_cur_lt && bno_cur_gt) {
+		if (ltlena >= args->minlen) {
+			/*
+			 * Left side is good, look for a right side entry.
+			 */
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_lt, &bno_cur_gt,
+						ltdiff, &gtbno, &gtlen,
+						&gtbnoa, &gtlena,
+						0 /* search right */);
+		} else {
+			ASSERT(gtlena >= args->minlen);
+
+			/*
+			 * Right side is good, look for a left side entry.
+			 */
+			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, gtbnoa,
+				gtlena, &gtnew);
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_gt, &bno_cur_lt,
+						gtdiff, &ltbno, &ltlen,
+						&ltbnoa, &ltlena,
+						1 /* search left */);
+		}
+
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * If we couldn't get anything, give up.
+	 */
+	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+		if (!forced++) {
+			trace_xfs_alloc_near_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		trace_xfs_alloc_size_neither(args);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+
+	/*
+	 * At this point we have selected a freespace entry, either to the
+	 * left or to the right.  If it's on the right, copy all the
+	 * useful variables to the "left" set so we only have one
+	 * copy of this code.
+	 */
+	if (bno_cur_gt) {
+		bno_cur_lt = bno_cur_gt;
+		bno_cur_gt = NULL;
+		ltbno = gtbno;
+		ltbnoa = gtbnoa;
+		ltlen = gtlen;
+		ltlena = gtlena;
+		j = 1;
+	} else
+		j = 0;
+
+	/*
+	 * Fix up the length and compute the useful address.
+	 */
+	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args)) {
+		trace_xfs_alloc_near_nominleft(args);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		return 0;
+	}
+	rlen = args->len;
+	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+				     args->userdata, ltbnoa, ltlena, &ltnew);
+	ASSERT(ltnew >= ltbno);
+	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
+	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	args->agbno = ltnew;
+
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
+			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
+		goto error0;
+
+	if (j)
+		trace_xfs_alloc_near_greater(args);
+	else
+		trace_xfs_alloc_near_lesser(args);
+
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+	return 0;
+
+ error0:
+	trace_xfs_alloc_near_error(args);
+	if (cnt_cur != NULL)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur_lt != NULL)
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
+	if (bno_cur_gt != NULL)
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_size(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t	*bno_cur;	/* cursor for bno btree */
+	xfs_btree_cur_t	*cnt_cur;	/* cursor for cnt btree */
+	int		error;		/* error result */
+	xfs_agblock_t	fbno;		/* start of found freespace */
+	xfs_extlen_t	flen;		/* length of found freespace */
+	int		i;		/* temp status variable */
+	xfs_agblock_t	rbno;		/* returned block number */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
+
+restart:
+	/*
+	 * Allocate and initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+	bno_cur = NULL;
+
+	/*
+	 * Look for an entry >= maxlen+alignment-1 blocks.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+			args->maxlen + args->alignment - 1, &i)))
+		goto error0;
+
+	/*
+	 * If none or we have busy extents that we cannot allocate from, then
+	 * we have to settle for a smaller extent. In the case that there are
+	 * no large extents, this will return the last entry in the tree unless
+	 * the tree is empty. In the case that there are only busy large
+	 * extents, this will return the largest small extent unless there
+	 * are no smaller extents available.
+	 */
+	if (!i || forced > 1) {
+		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+						   &fbno, &flen, &i);
+		if (error)
+			goto error0;
+		if (i == 0 || flen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_size_noentry(args);
+			return 0;
+		}
+		ASSERT(i == 1);
+		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+	} else {
+		/*
+		 * Search for a non-busy extent that is large enough.
+		 * If we are at low space, don't check, or if we fall of
+		 * the end of the btree, turn off the busy check and
+		 * restart.
+		 */
+		for (;;) {
+			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
+
+			if (rlen >= args->maxlen)
+				break;
+
+			error = xfs_btree_increment(cnt_cur, 0, &i);
+			if (error)
+				goto error0;
+			if (i == 0) {
+				/*
+				 * Our only valid extents must have been busy.
+				 * Make it unbusy by forcing the log out and
+				 * retrying. If we've been here before, forcing
+				 * the log isn't making the extents available,
+				 * which means they have probably been freed in
+				 * this transaction.  In that case, we have to
+				 * give up on them and we'll attempt a minlen
+				 * allocation the next time around.
+				 */
+				xfs_btree_del_cursor(cnt_cur,
+						     XFS_BTREE_NOERROR);
+				trace_xfs_alloc_size_busy(args);
+				if (!forced++)
+					xfs_log_force(args->mp, XFS_LOG_SYNC);
+				goto restart;
+			}
+		}
+	}
+
+	/*
+	 * In the first case above, we got the last entry in the
+	 * by-size btree.  Now we check to see if the space hits maxlen
+	 * once aligned; if not, we search left for something better.
+	 * This can't happen in the second case above.
+	 */
+	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
+	if (rlen < args->maxlen) {
+		xfs_agblock_t	bestfbno;
+		xfs_extlen_t	bestflen;
+		xfs_agblock_t	bestrbno;
+		xfs_extlen_t	bestrlen;
+
+		bestrlen = rlen;
+		bestrbno = rbno;
+		bestflen = flen;
+		bestfbno = fbno;
+		for (;;) {
+			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
+				goto error0;
+			if (i == 0)
+				break;
+			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (flen < bestrlen)
+				break;
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
+			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+				(rlen <= flen && rbno + rlen <= fbno + flen),
+				error0);
+			if (rlen > bestrlen) {
+				bestrlen = rlen;
+				bestrbno = rbno;
+				bestflen = flen;
+				bestfbno = fbno;
+				if (rlen == args->maxlen)
+					break;
+			}
+		}
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+				&i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		rlen = bestrlen;
+		rbno = bestrbno;
+		flen = bestflen;
+		fbno = bestfbno;
+	}
+	args->wasfromfl = 0;
+	/*
+	 * Fix up the length.
+	 */
+	args->len = rlen;
+	if (rlen < args->minlen) {
+		if (!forced++) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_size_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		goto out_nominleft;
+	}
+	xfs_alloc_fix_len(args);
+
+	if (!xfs_alloc_fix_minleft(args))
+		goto out_nominleft;
+	rlen = args->len;
+	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+	/*
+	 * Allocate and initialize a cursor for the by-block tree.
+	 */
+	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO);
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+			rbno, rlen, XFSA_FIXUP_CNT_OK)))
+		goto error0;
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	cnt_cur = bno_cur = NULL;
+	args->len = rlen;
+	args->agbno = rbno;
+	XFS_WANT_CORRUPTED_GOTO(
+		args->agbno + args->len <=
+			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+		error0);
+	trace_xfs_alloc_size_done(args);
+	return 0;
+
+error0:
+	trace_xfs_alloc_size_error(args);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	return error;
+
+out_nominleft:
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	trace_xfs_alloc_size_nominleft(args);
+	args->agbno = NULLAGBLOCK;
+	return 0;
+}
+
+/*
+ * Deal with the case where only small freespaces remain.
+ * Either return the contents of the last freespace record,
+ * or allocate space from the freelist if there is nothing in the tree.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_small(
+	xfs_alloc_arg_t	*args,	/* allocation argument structure */
+	xfs_btree_cur_t	*ccur,	/* by-size cursor */
+	xfs_agblock_t	*fbnop,	/* result block number */
+	xfs_extlen_t	*flenp,	/* result length */
+	int		*stat)	/* status: 0-freelist, 1-normal/none */
+{
+	int		error;
+	xfs_agblock_t	fbno;
+	xfs_extlen_t	flen;
+	int		i;
+
+	if ((error = xfs_btree_decrement(ccur, 0, &i)))
+		goto error0;
+	if (i) {
+		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	/*
+	 * Nothing in the btree, try the freelist.  Make sure
+	 * to respect minleft even when pulling from the
+	 * freelist.
+	 */
+	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
+		  > args->minleft)) {
+		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+		if (error)
+			goto error0;
+		if (fbno != NULLAGBLOCK) {
+			xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+					     args->userdata);
+
+			if (args->userdata) {
+				xfs_buf_t	*bp;
+
+				bp = xfs_btree_get_bufs(args->mp, args->tp,
+					args->agno, fbno, 0);
+				xfs_trans_binval(args->tp, bp);
+			}
+			args->len = 1;
+			args->agbno = fbno;
+			XFS_WANT_CORRUPTED_GOTO(
+				args->agbno + args->len <=
+				be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+				error0);
+			args->wasfromfl = 1;
+			trace_xfs_alloc_small_freelist(args);
+			*stat = 0;
+			return 0;
+		}
+		/*
+		 * Nothing in the freelist.
+		 */
+		else
+			flen = 0;
+	}
+	/*
+	 * Can't allocate from the freelist for some reason.
+	 */
+	else {
+		fbno = NULLAGBLOCK;
+		flen = 0;
+	}
+	/*
+	 * Can't do the allocation, give up.
+	 */
+	if (flen < args->minlen) {
+		args->agbno = NULLAGBLOCK;
+		trace_xfs_alloc_small_notenough(args);
+		flen = 0;
+	}
+	*fbnop = fbno;
+	*flenp = flen;
+	*stat = 1;
+	trace_xfs_alloc_small_done(args);
+	return 0;
+
+error0:
+	trace_xfs_alloc_small_error(args);
+	return error;
+}
+
+/*
+ * Free the extent starting at agno/bno for length.
+ */
+STATIC int			/* error */
+xfs_free_ag_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer for a.g. freelist header */
+	xfs_agnumber_t	agno,	/* allocation group number */
+	xfs_agblock_t	bno,	/* starting block number */
+	xfs_extlen_t	len,	/* length of extent */
+	int		isfl)	/* set if is freelist blocks - no sb acctg */
+{
+	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */
+	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */
+	int		error;		/* error return value */
+	xfs_agblock_t	gtbno;		/* start of right neighbor block */
+	xfs_extlen_t	gtlen;		/* length of right neighbor block */
+	int		haveleft;	/* have a left neighbor block */
+	int		haveright;	/* have a right neighbor block */
+	int		i;		/* temp, result code */
+	xfs_agblock_t	ltbno;		/* start of left neighbor block */
+	xfs_extlen_t	ltlen;		/* length of left neighbor block */
+	xfs_mount_t	*mp;		/* mount point struct for filesystem */
+	xfs_agblock_t	nbno;		/* new starting block of freespace */
+	xfs_extlen_t	nlen;		/* new length of freespace */
+	xfs_perag_t	*pag;		/* per allocation group data */
+
+	mp = tp->t_mountp;
+	/*
+	 * Allocate and initialize a cursor for the by-block btree.
+	 */
+	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+	cnt_cur = NULL;
+	/*
+	 * Look for a neighboring block on the left (lower block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+		goto error0;
+	if (haveleft) {
+		/*
+		 * There is a block to our left.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (ltbno + ltlen < bno)
+			haveleft = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+		}
+	}
+	/*
+	 * Look for a neighboring block on the right (higher block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
+		goto error0;
+	if (haveright) {
+		/*
+		 * There is a block to our right.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (bno + len < gtbno)
+			haveright = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+		}
+	}
+	/*
+	 * Now allocate and initialize a cursor for the by-size tree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+	/*
+	 * Have both left and right contiguous neighbors.
+	 * Merge all three into a single free block.
+	 */
+	if (haveleft && haveright) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Delete the old by-block entry for the right block.
+		 */
+		if ((error = xfs_btree_delete(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Move the by-block cursor back to the left neighbor.
+		 */
+		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+#ifdef DEBUG
+		/*
+		 * Check that this is the right record: delete didn't
+		 * mangle the cursor.
+		 */
+		{
+			xfs_agblock_t	xxbno;
+			xfs_extlen_t	xxlen;
+
+			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(
+				i == 1 && xxbno == ltbno && xxlen == ltlen,
+				error0);
+		}
+#endif
+		/*
+		 * Update remaining by-block entry to the new, joined block.
+		 */
+		nbno = ltbno;
+		nlen = len + ltlen + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a left contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveleft) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Back up the by-block cursor to the left neighbor, and
+		 * update its length.
+		 */
+		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		nbno = ltbno;
+		nlen = len + ltlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a right contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveright) {
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Update the starting block and length of the right
+		 * neighbor in the by-block tree.
+		 */
+		nbno = bno;
+		nlen = len + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * No contiguous neighbors.
+	 * Insert the new freespace into the by-block tree.
+	 */
+	else {
+		nbno = bno;
+		nlen = len;
+		if ((error = xfs_btree_insert(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	bno_cur = NULL;
+	/*
+	 * In all cases we need to insert the new freespace in the by-size tree.
+	 */
+	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+	if ((error = xfs_btree_insert(cnt_cur, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	cnt_cur = NULL;
+
+	/*
+	 * Update the freespace totals in the ag and superblock.
+	 */
+	pag = xfs_perag_get(mp, agno);
+	error = xfs_alloc_update_counters(tp, pag, agbp, len);
+	xfs_perag_put(pag);
+	if (error)
+		goto error0;
+
+	if (!isfl)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+	XFS_STATS_INC(xs_freex);
+	XFS_STATS_ADD(xs_freeb, len);
+
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+
+	return 0;
+
+ error0:
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_ag_maxlevels = level;
+}
+
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag)
+{
+	xfs_extlen_t		need, delta = 0;
+
+	need = XFS_MIN_FREELIST_PAG(pag, mp);
+	if (need > pag->pagf_flcount)
+		delta = need - pag->pagf_flcount;
+
+	if (pag->pagf_longest > delta)
+		return pag->pagf_longest - delta;
+	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ */
+STATIC int			/* error */
+xfs_alloc_fix_freelist(
+	xfs_alloc_arg_t	*args,	/* allocation argument structure */
+	int		flags)	/* XFS_ALLOC_FLAG_... */
+{
+	xfs_buf_t	*agbp;	/* agf buffer pointer */
+	xfs_agf_t	*agf;	/* a.g. freespace structure pointer */
+	xfs_buf_t	*agflbp;/* agfl buffer pointer */
+	xfs_agblock_t	bno;	/* freelist block */
+	xfs_extlen_t	delta;	/* new blocks needed in freelist */
+	int		error;	/* error result code */
+	xfs_extlen_t	longest;/* longest extent in allocation group */
+	xfs_mount_t	*mp;	/* file system mount point structure */
+	xfs_extlen_t	need;	/* total blocks needed in freelist */
+	xfs_perag_t	*pag;	/* per-ag information structure */
+	xfs_alloc_arg_t	targs;	/* local allocation arguments */
+	xfs_trans_t	*tp;	/* transaction pointer */
+
+	mp = args->mp;
+
+	pag = args->pag;
+	tp = args->tp;
+	if (!pag->pagf_init) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (!pag->pagf_init) {
+			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+			args->agbp = NULL;
+			return 0;
+		}
+	} else
+		agbp = NULL;
+
+	/*
+	 * If this is a metadata preferred pag and we are user data
+	 * then try somewhere else if we are not being asked to
+	 * try harder at this point
+	 */
+	if (pag->pagf_metadata && args->userdata &&
+	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+		args->agbp = NULL;
+		return 0;
+	}
+
+	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+		/*
+		 * If it looks like there isn't a long enough extent, or enough
+		 * total blocks, reject it.
+		 */
+		need = XFS_MIN_FREELIST_PAG(pag, mp);
+		longest = xfs_alloc_longest_free_extent(mp, pag);
+		if ((args->minlen + args->alignment + args->minalignslop - 1) >
+				longest ||
+		    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
+			   need - args->total) < (int)args->minleft)) {
+			if (agbp)
+				xfs_trans_brelse(tp, agbp);
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+
+	/*
+	 * Get the a.g. freespace buffer.
+	 * Can fail if we're not blocking on locks, and it's held.
+	 */
+	if (agbp == NULL) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (agbp == NULL) {
+			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+	/*
+	 * Figure out how many blocks we should have in the freelist.
+	 */
+	agf = XFS_BUF_TO_AGF(agbp);
+	need = XFS_MIN_FREELIST(agf, mp);
+	/*
+	 * If there isn't enough total or single-extent, reject it.
+	 */
+	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+		delta = need > be32_to_cpu(agf->agf_flcount) ?
+			(need - be32_to_cpu(agf->agf_flcount)) : 0;
+		longest = be32_to_cpu(agf->agf_longest);
+		longest = (longest > delta) ? (longest - delta) :
+			(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
+		if ((args->minlen + args->alignment + args->minalignslop - 1) >
+				longest ||
+		    ((int)(be32_to_cpu(agf->agf_freeblks) +
+		     be32_to_cpu(agf->agf_flcount) - need - args->total) <
+				(int)args->minleft)) {
+			xfs_trans_brelse(tp, agbp);
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+	/*
+	 * Make the freelist shorter if it's too long.
+	 */
+	while (be32_to_cpu(agf->agf_flcount) > need) {
+		xfs_buf_t	*bp;
+
+		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+		if (error)
+			return error;
+		if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
+			return error;
+		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Initialize the args structure.
+	 */
+	memset(&targs, 0, sizeof(targs));
+	targs.tp = tp;
+	targs.mp = mp;
+	targs.agbp = agbp;
+	targs.agno = args->agno;
+	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+	targs.type = XFS_ALLOCTYPE_THIS_AG;
+	targs.pag = pag;
+	if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+		return error;
+	/*
+	 * Make the freelist longer if it's too short.
+	 */
+	while (be32_to_cpu(agf->agf_flcount) < need) {
+		targs.agbno = 0;
+		targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
+		/*
+		 * Allocate as many blocks as possible at once.
+		 */
+		if ((error = xfs_alloc_ag_vextent(&targs))) {
+			xfs_trans_brelse(tp, agflbp);
+			return error;
+		}
+		/*
+		 * Stop if we run out.  Won't happen if callers are obeying
+		 * the restrictions correctly.  Can happen for free calls
+		 * on a completely full ag.
+		 */
+		if (targs.agbno == NULLAGBLOCK) {
+			if (flags & XFS_ALLOC_FLAG_FREEING)
+				break;
+			xfs_trans_brelse(tp, agflbp);
+			args->agbp = NULL;
+			return 0;
+		}
+		/*
+		 * Put each allocated block on the list.
+		 */
+		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+			error = xfs_alloc_put_freelist(tp, agbp,
+							agflbp, bno, 0);
+			if (error)
+				return error;
+		}
+	}
+	xfs_trans_brelse(tp, agflbp);
+	args->agbp = agbp;
+	return 0;
+}
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int				/* error */
+xfs_alloc_get_freelist(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
+	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
+	int		btreeblk) /* destination is a AGF btree */
+{
+	xfs_agf_t	*agf;	/* a.g. freespace structure */
+	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
+	xfs_agblock_t	bno;	/* block number returned */
+	__be32		*agfl_bno;
+	int		error;
+	int		logflags;
+	xfs_mount_t	*mp = tp->t_mountp;
+	xfs_perag_t	*pag;	/* per allocation group data */
+
+	/*
+	 * Freelist is empty, give up.
+	 */
+	agf = XFS_BUF_TO_AGF(agbp);
+	if (!agf->agf_flcount) {
+		*bnop = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * Read the array of free blocks.
+	 */
+	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+				    &agflbp);
+	if (error)
+		return error;
+
+
+	/*
+	 * Get the block number and update the data structures.
+	 */
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+	be32_add_cpu(&agf->agf_flfirst, 1);
+	xfs_trans_brelse(tp, agflbp);
+	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
+		agf->agf_flfirst = 0;
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	be32_add_cpu(&agf->agf_flcount, -1);
+	xfs_trans_agflist_delta(tp, -1);
+	pag->pagf_flcount--;
+	xfs_perag_put(pag);
+
+	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+	if (btreeblk) {
+		be32_add_cpu(&agf->agf_btreeblks, 1);
+		pag->pagf_btreeblks++;
+		logflags |= XFS_AGF_BTREEBLKS;
+	}
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+	*bnop = bno;
+
+	return 0;
+}
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
+	int		fields)	/* mask of fields to be logged (XFS_AGF_...) */
+{
+	int	first;		/* first byte offset */
+	int	last;		/* last byte offset */
+	static const short	offsets[] = {
+		offsetof(xfs_agf_t, agf_magicnum),
+		offsetof(xfs_agf_t, agf_versionnum),
+		offsetof(xfs_agf_t, agf_seqno),
+		offsetof(xfs_agf_t, agf_length),
+		offsetof(xfs_agf_t, agf_roots[0]),
+		offsetof(xfs_agf_t, agf_levels[0]),
+		offsetof(xfs_agf_t, agf_flfirst),
+		offsetof(xfs_agf_t, agf_fllast),
+		offsetof(xfs_agf_t, agf_flcount),
+		offsetof(xfs_agf_t, agf_freeblks),
+		offsetof(xfs_agf_t, agf_longest),
+		offsetof(xfs_agf_t, agf_btreeblks),
+		offsetof(xfs_agf_t, agf_uuid),
+		sizeof(xfs_agf_t)
+	};
+
+	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
+	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int					/* error */
+xfs_alloc_pagf_init(
+	xfs_mount_t		*mp,	/* file system mount structure */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags)	/* XFS_ALLOC_FLAGS_... */
+{
+	xfs_buf_t		*bp;
+	int			error;
+
+	if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
+		return error;
+	if (bp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int					/* error */
+xfs_alloc_put_freelist(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
+	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
+	xfs_agblock_t		bno,	/* block being freed */
+	int			btreeblk) /* block came from a AGF btree */
+{
+	xfs_agf_t		*agf;	/* a.g. freespace structure */
+	__be32			*blockp;/* pointer to array entry */
+	int			error;
+	int			logflags;
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_perag_t		*pag;	/* per allocation group data */
+	__be32			*agfl_bno;
+	int			startoff;
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	mp = tp->t_mountp;
+
+	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
+			be32_to_cpu(agf->agf_seqno), &agflbp)))
+		return error;
+	be32_add_cpu(&agf->agf_fllast, 1);
+	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
+		agf->agf_fllast = 0;
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	be32_add_cpu(&agf->agf_flcount, 1);
+	xfs_trans_agflist_delta(tp, 1);
+	pag->pagf_flcount++;
+
+	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+	if (btreeblk) {
+		be32_add_cpu(&agf->agf_btreeblks, -1);
+		pag->pagf_btreeblks--;
+		logflags |= XFS_AGF_BTREEBLKS;
+	}
+	xfs_perag_put(pag);
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+
+	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
+
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+	*blockp = cpu_to_be32(bno);
+	startoff = (char *)blockp - (char *)agflbp->b_addr;
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+
+	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+	xfs_trans_log_buf(tp, agflbp, startoff,
+			  startoff + sizeof(xfs_agblock_t) - 1);
+	return 0;
+}
+
+static bool
+xfs_agf_verify(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp)
+ {
+	struct xfs_agf	*agf = XFS_BUF_TO_AGF(bp);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+			return false;
+
+	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+	      be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+		return false;
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+		return false;
+
+	return true;;
+
+}
+
+static void
+xfs_agf_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+				XFS_ERRTAG_ALLOC_READ_AGF,
+				XFS_RANDOM_ALLOC_READ_AGF))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_agf_verify(mp, bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+	.verify_read = xfs_agf_read_verify,
+	.verify_write = xfs_agf_write_verify,
+};
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error */
+xfs_read_agf(
+	struct xfs_mount	*mp,	/* mount point structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags,	/* XFS_BUF_ */
+	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
+{
+	int		error;
+
+	trace_xfs_read_agf(mp, agno);
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(
+			mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+	if (error)
+		return error;
+	if (!*bpp)
+		return 0;
+
+	ASSERT(!(*bpp)->b_error);
+	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+	return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error */
+xfs_alloc_read_agf(
+	struct xfs_mount	*mp,	/* mount point structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags,	/* XFS_ALLOC_FLAG_... */
+	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
+{
+	struct xfs_agf		*agf;		/* ag freelist header */
+	struct xfs_perag	*pag;		/* per allocation group data */
+	int			error;
+
+	trace_xfs_alloc_read_agf(mp, agno);
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_read_agf(mp, tp, agno,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+			bpp);
+	if (error)
+		return error;
+	if (!*bpp)
+		return 0;
+	ASSERT(!(*bpp)->b_error);
+
+	agf = XFS_BUF_TO_AGF(*bpp);
+	pag = xfs_perag_get(mp, agno);
+	if (!pag->pagf_init) {
+		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+		pag->pagf_levels[XFS_BTNUM_BNOi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+		pag->pagf_levels[XFS_BTNUM_CNTi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+		spin_lock_init(&pag->pagb_lock);
+		pag->pagb_count = 0;
+		pag->pagb_tree = RB_ROOT;
+		pag->pagf_init = 1;
+	}
+#ifdef DEBUG
+	else if (!XFS_FORCED_SHUTDOWN(mp)) {
+		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
+		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
+		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+	}
+#endif
+	xfs_perag_put(pag);
+	return 0;
+}
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+{
+	xfs_agblock_t	agsize;	/* allocation group size */
+	int		error;
+	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
+	xfs_extlen_t	minleft;/* minimum left value, temp copy */
+	xfs_mount_t	*mp;	/* mount structure pointer */
+	xfs_agnumber_t	sagno;	/* starting allocation group number */
+	xfs_alloctype_t	type;	/* input allocation type */
+	int		bump_rotor = 0;
+	int		no_min = 0;
+	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+
+	mp = args->mp;
+	type = args->otype = args->type;
+	args->agbno = NULLAGBLOCK;
+	/*
+	 * Just fix this up, for the case where the last a.g. is shorter
+	 * (or there's only one a.g.) and the caller couldn't easily figure
+	 * that out (xfs_bmap_alloc).
+	 */
+	agsize = mp->m_sb.sb_agblocks;
+	if (args->maxlen > agsize)
+		args->maxlen = agsize;
+	if (args->alignment == 0)
+		args->alignment = 1;
+	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->minlen <= agsize);
+	ASSERT(args->mod < args->prod);
+	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+	    args->minlen > args->maxlen || args->minlen > agsize ||
+	    args->mod >= args->prod) {
+		args->fsbno = NULLFSBLOCK;
+		trace_xfs_alloc_vextent_badargs(args);
+		return 0;
+	}
+	minleft = args->minleft;
+
+	switch (type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+	case XFS_ALLOCTYPE_NEAR_BNO:
+	case XFS_ALLOCTYPE_THIS_BNO:
+		/*
+		 * These three force us into a single a.g.
+		 */
+		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+		args->pag = xfs_perag_get(mp, args->agno);
+		args->minleft = 0;
+		error = xfs_alloc_fix_freelist(args, 0);
+		args->minleft = minleft;
+		if (error) {
+			trace_xfs_alloc_vextent_nofix(args);
+			goto error0;
+		}
+		if (!args->agbp) {
+			trace_xfs_alloc_vextent_noagbp(args);
+			break;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		if ((error = xfs_alloc_ag_vextent(args)))
+			goto error0;
+		break;
+	case XFS_ALLOCTYPE_START_BNO:
+		/*
+		 * Try near allocation first, then anywhere-in-ag after
+		 * the first a.g. fails.
+		 */
+		if ((args->userdata  == XFS_ALLOC_INITIAL_USER_DATA) &&
+		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+			args->fsbno = XFS_AGB_TO_FSB(mp,
+					((mp->m_agfrotor / rotorstep) %
+					mp->m_sb.sb_agcount), 0);
+			bump_rotor = 1;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		args->type = XFS_ALLOCTYPE_NEAR_BNO;
+		/* FALLTHROUGH */
+	case XFS_ALLOCTYPE_ANY_AG:
+	case XFS_ALLOCTYPE_START_AG:
+	case XFS_ALLOCTYPE_FIRST_AG:
+		/*
+		 * Rotate through the allocation groups looking for a winner.
+		 */
+		if (type == XFS_ALLOCTYPE_ANY_AG) {
+			/*
+			 * Start with the last place we left off.
+			 */
+			args->agno = sagno = (mp->m_agfrotor / rotorstep) %
+					mp->m_sb.sb_agcount;
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		} else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+			/*
+			 * Start with allocation group given by bno.
+			 */
+			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			sagno = 0;
+			flags = 0;
+		} else {
+			if (type == XFS_ALLOCTYPE_START_AG)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			/*
+			 * Start with the given allocation group.
+			 */
+			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		}
+		/*
+		 * Loop over allocation groups twice; first time with
+		 * trylock set, second time without.
+		 */
+		for (;;) {
+			args->pag = xfs_perag_get(mp, args->agno);
+			if (no_min) args->minleft = 0;
+			error = xfs_alloc_fix_freelist(args, flags);
+			args->minleft = minleft;
+			if (error) {
+				trace_xfs_alloc_vextent_nofix(args);
+				goto error0;
+			}
+			/*
+			 * If we get a buffer back then the allocation will fly.
+			 */
+			if (args->agbp) {
+				if ((error = xfs_alloc_ag_vextent(args)))
+					goto error0;
+				break;
+			}
+
+			trace_xfs_alloc_vextent_loopfailed(args);
+
+			/*
+			 * Didn't work, figure out the next iteration.
+			 */
+			if (args->agno == sagno &&
+			    type == XFS_ALLOCTYPE_START_BNO)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			/*
+			* For the first allocation, we can try any AG to get
+			* space.  However, if we already have allocated a
+			* block, we don't want to try AGs whose number is below
+			* sagno. Otherwise, we may end up with out-of-order
+			* locking of AGF, which might cause deadlock.
+			*/
+			if (++(args->agno) == mp->m_sb.sb_agcount) {
+				if (args->firstblock != NULLFSBLOCK)
+					args->agno = sagno;
+				else
+					args->agno = 0;
+			}
+			/*
+			 * Reached the starting a.g., must either be done
+			 * or switch to non-trylock mode.
+			 */
+			if (args->agno == sagno) {
+				if (no_min == 1) {
+					args->agbno = NULLAGBLOCK;
+					trace_xfs_alloc_vextent_allfailed(args);
+					break;
+				}
+				if (flags == 0) {
+					no_min = 1;
+				} else {
+					flags = 0;
+					if (type == XFS_ALLOCTYPE_START_BNO) {
+						args->agbno = XFS_FSB_TO_AGBNO(mp,
+							args->fsbno);
+						args->type = XFS_ALLOCTYPE_NEAR_BNO;
+					}
+				}
+			}
+			xfs_perag_put(args->pag);
+		}
+		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+			if (args->agno == sagno)
+				mp->m_agfrotor = (mp->m_agfrotor + 1) %
+					(mp->m_sb.sb_agcount * rotorstep);
+			else
+				mp->m_agfrotor = (args->agno * rotorstep + 1) %
+					(mp->m_sb.sb_agcount * rotorstep);
+		}
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+	if (args->agbno == NULLAGBLOCK)
+		args->fsbno = NULLFSBLOCK;
+	else {
+		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+		ASSERT(args->len >= args->minlen);
+		ASSERT(args->len <= args->maxlen);
+		ASSERT(args->agbno % args->alignment == 0);
+		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+			args->len);
+#endif
+	}
+	xfs_perag_put(args->pag);
+	return 0;
+error0:
+	xfs_perag_put(args->pag);
+	return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int				/* error */
+xfs_free_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_fsblock_t	bno,	/* starting block number of extent */
+	xfs_extlen_t	len)	/* length of extent */
+{
+	xfs_alloc_arg_t	args;
+	int		error;
+
+	ASSERT(len != 0);
+	memset(&args, 0, sizeof(xfs_alloc_arg_t));
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+
+	/*
+	 * validate that the block number is legal - the enables us to detect
+	 * and handle a silent filesystem corruption rather than crashing.
+	 */
+	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+	if (args.agno >= args.mp->m_sb.sb_agcount)
+		return EFSCORRUPTED;
+
+	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+	if (args.agbno >= args.mp->m_sb.sb_agblocks)
+		return EFSCORRUPTED;
+
+	args.pag = xfs_perag_get(args.mp, args.agno);
+	ASSERT(args.pag);
+
+	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	if (error)
+		goto error0;
+
+	/* validate the extent size is legal now we have the agf locked */
+	if (args.agbno + len >
+			be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+		error = EFSCORRUPTED;
+		goto error0;
+	}
+
+	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+	if (!error)
+		xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
+error0:
+	xfs_perag_put(args.pag);
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
new file mode 100644
index 000000000000..8358f1ded94d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+
+
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_btnum);
+}
+
+STATIC void
+xfs_allocbt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	int			btnum = cur->bc_btnum;
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
+
+	ASSERT(ptr->s != 0);
+
+	agf->agf_roots[btnum] = ptr->s;
+	be32_add_cpu(&agf->agf_levels[btnum], inc);
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
+
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	int			error;
+	xfs_agblock_t		bno;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	/* Allocate the new block from the freelist. If we can't, give up.  */
+	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+				       &bno, 1);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+
+	if (bno == NULLAGBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
+	xfs_trans_agbtree_delta(cur->bc_tp, 1);
+	new->s = cpu_to_be32(bno);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+}
+
+STATIC int
+xfs_allocbt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_agblock_t		bno;
+	int			error;
+
+	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+	if (error)
+		return error;
+
+	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+			      XFS_EXTENT_BUSY_SKIP_DISCARD);
+	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+	xfs_trans_binval(cur->bc_tp, bp);
+	return 0;
+}
+
+/*
+ * Update the longest extent in the AGF
+ */
+STATIC void
+xfs_allocbt_update_lastrec(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	union xfs_btree_rec	*rec,
+	int			ptr,
+	int			reason)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag;
+	__be32			len;
+	int			numrecs;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+	switch (reason) {
+	case LASTREC_UPDATE:
+		/*
+		 * If this is the last leaf block and it's the last record,
+		 * then update the size of the longest extent in the AG.
+		 */
+		if (ptr != xfs_btree_get_numrecs(block))
+			return;
+		len = rec->alloc.ar_blockcount;
+		break;
+	case LASTREC_INSREC:
+		if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+		    be32_to_cpu(agf->agf_longest))
+			return;
+		len = rec->alloc.ar_blockcount;
+		break;
+	case LASTREC_DELREC:
+		numrecs = xfs_btree_get_numrecs(block);
+		if (ptr <= numrecs)
+			return;
+		ASSERT(ptr == numrecs + 1);
+
+		if (numrecs) {
+			xfs_alloc_rec_t *rrp;
+
+			rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+			len = rrp->ar_blockcount;
+		} else {
+			len = 0;
+		}
+
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	agf->agf_longest = len;
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
+	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+}
+
+STATIC int
+xfs_allocbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_alloc_mnr[level != 0];
+}
+
+STATIC int
+xfs_allocbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_alloc_mxr[level != 0];
+}
+
+STATIC void
+xfs_allocbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(rec->alloc.ar_startblock != 0);
+
+	key->alloc.ar_startblock = rec->alloc.ar_startblock;
+	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_key(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(key->alloc.ar_startblock != 0);
+
+	rec->alloc.ar_startblock = key->alloc.ar_startblock;
+	rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(cur->bc_rec.a.ar_startblock != 0);
+
+	rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+	rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+	ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+	ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_allocbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	xfs_alloc_rec_incore_t	*rec = &cur->bc_rec.a;
+	xfs_alloc_key_t		*kp = &key->alloc;
+	__int64_t		diff;
+
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+				rec->ar_startblock;
+	}
+
+	diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+	if (diff)
+		return diff;
+
+	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+static bool
+xfs_allocbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+
+	/*
+	 * magic number and level verification
+	 *
+	 * During growfs operations, we can't verify the exact level or owner as
+	 * the perag is not fully initialised and hence not attached to the
+	 * buffer.  In this case, check against the maximum tree depth.
+	 *
+	 * Similarly, during log recovery we will have a perag structure
+	 * attached, but the agf information will not yet have been initialised
+	 * from the on disk AGF. Again, we can only check against maximum limits
+	 * in this case.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_ABTB_MAGIC):
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
+		break;
+	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_ABTC_MAGIC):
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	/* numrecs verification */
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
+}
+
+static void
+xfs_allocbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_allocbt_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
+	}
+}
+
+static void
+xfs_allocbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_allocbt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+	xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+	.verify_read = xfs_allocbt_read_verify,
+	.verify_write = xfs_allocbt_write_verify,
+};
+
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_allocbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return be32_to_cpu(k1->alloc.ar_startblock) <
+		       be32_to_cpu(k2->alloc.ar_startblock);
+	} else {
+		return be32_to_cpu(k1->alloc.ar_blockcount) <
+			be32_to_cpu(k2->alloc.ar_blockcount) ||
+			(k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+			 be32_to_cpu(k1->alloc.ar_startblock) <
+			 be32_to_cpu(k2->alloc.ar_startblock));
+	}
+}
+
+STATIC int
+xfs_allocbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return be32_to_cpu(r1->alloc.ar_startblock) +
+			be32_to_cpu(r1->alloc.ar_blockcount) <=
+			be32_to_cpu(r2->alloc.ar_startblock);
+	} else {
+		return be32_to_cpu(r1->alloc.ar_blockcount) <
+			be32_to_cpu(r2->alloc.ar_blockcount) ||
+			(r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+			 be32_to_cpu(r1->alloc.ar_startblock) <
+			 be32_to_cpu(r2->alloc.ar_startblock));
+	}
+}
+#endif	/* DEBUG */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+	.rec_len		= sizeof(xfs_alloc_rec_t),
+	.key_len		= sizeof(xfs_alloc_key_t),
+
+	.dup_cursor		= xfs_allocbt_dup_cursor,
+	.set_root		= xfs_allocbt_set_root,
+	.alloc_block		= xfs_allocbt_alloc_block,
+	.free_block		= xfs_allocbt_free_block,
+	.update_lastrec		= xfs_allocbt_update_lastrec,
+	.get_minrecs		= xfs_allocbt_get_minrecs,
+	.get_maxrecs		= xfs_allocbt_get_maxrecs,
+	.init_key_from_rec	= xfs_allocbt_init_key_from_rec,
+	.init_rec_from_key	= xfs_allocbt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
+	.key_diff		= xfs_allocbt_key_diff,
+	.buf_ops		= &xfs_allocbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_allocbt_keys_inorder,
+	.recs_inorder		= xfs_allocbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur *			/* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_buf		*agbp,		/* buffer for agf structure */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_btnum_t		btnum)		/* btree identifier */
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_btnum = btnum;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	cur->bc_ops = &xfs_allocbt_ops;
+
+	if (btnum == XFS_BTNUM_CNT) {
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+	} else {
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+	}
+
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_private.a.agno = agno;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	return cur;
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_alloc_rec_t);
+	return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
new file mode 100644
index 000000000000..7d95b16f0919
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -0,0 +1,1459 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_get(xfs_da_args_t *args);
+STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+
+
+STATIC int
+xfs_attr_args_init(
+	struct xfs_da_args	*args,
+	struct xfs_inode	*dp,
+	const unsigned char	*name,
+	int			flags)
+{
+
+	if (!name)
+		return EINVAL;
+
+	memset(args, 0, sizeof(*args));
+	args->geo = dp->i_mount->m_attr_geo;
+	args->whichfork = XFS_ATTR_FORK;
+	args->dp = dp;
+	args->flags = flags;
+	args->name = name;
+	args->namelen = strlen((const char *)name);
+	if (args->namelen >= MAXNAMELEN)
+		return EFAULT;		/* match IRIX behaviour */
+
+	args->hashval = xfs_da_hashname(args->name, args->namelen);
+	return 0;
+}
+
+int
+xfs_inode_hasattr(
+	struct xfs_inode	*ip)
+{
+	if (!XFS_IFORK_Q(ip) ||
+	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     ip->i_d.di_anextents == 0))
+		return 0;
+	return 1;
+}
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+int
+xfs_attr_get(
+	struct xfs_inode	*ip,
+	const unsigned char	*name,
+	unsigned char		*value,
+	int			*valuelenp,
+	int			flags)
+{
+	struct xfs_da_args	args;
+	uint			lock_mode;
+	int			error;
+
+	XFS_STATS_INC(xs_attr_get);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return EIO;
+
+	if (!xfs_inode_hasattr(ip))
+		return ENOATTR;
+
+	error = xfs_attr_args_init(&args, ip, name, flags);
+	if (error)
+		return error;
+
+	args.value = value;
+	args.valuelen = *valuelenp;
+
+	lock_mode = xfs_ilock_attr_map_shared(ip);
+	if (!xfs_inode_hasattr(ip))
+		error = ENOATTR;
+	else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+		error = xfs_attr_shortform_getvalue(&args);
+	else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+		error = xfs_attr_leaf_get(&args);
+	else
+		error = xfs_attr_node_get(&args);
+	xfs_iunlock(ip, lock_mode);
+
+	*valuelenp = args.valuelen;
+	return error == EEXIST ? 0 : error;
+}
+
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+STATIC int
+xfs_attr_calc_size(
+	struct xfs_da_args	*args,
+	int			*local)
+{
+	struct xfs_mount	*mp = args->dp->i_mount;
+	int			size;
+	int			nblks;
+
+	/*
+	 * Determine space new attribute will use, and if it would be
+	 * "local" or "remote" (note: local != inline).
+	 */
+	size = xfs_attr_leaf_newentsize(args, local);
+	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+	if (*local) {
+		if (size > (args->geo->blksize / 2)) {
+			/* Double split possible */
+			nblks *= 2;
+		}
+	} else {
+		/*
+		 * Out of line attribute, cannot double split, but
+		 * make room for the attribute value itself.
+		 */
+		uint	dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+		nblks += dblocks;
+		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+	}
+
+	return nblks;
+}
+
+int
+xfs_attr_set(
+	struct xfs_inode	*dp,
+	const unsigned char	*name,
+	unsigned char		*value,
+	int			valuelen,
+	int			flags)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_args	args;
+	struct xfs_bmap_free	flist;
+	struct xfs_trans_res	tres;
+	xfs_fsblock_t		firstblock;
+	int			rsvd = (flags & ATTR_ROOT) != 0;
+	int			error, err2, committed, local;
+
+	XFS_STATS_INC(xs_attr_set);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return EIO;
+
+	error = xfs_attr_args_init(&args, dp, name, flags);
+	if (error)
+		return error;
+
+	args.value = value;
+	args.valuelen = valuelen;
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+	args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+	args.total = xfs_attr_calc_size(&args, &local);
+
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
+
+	/*
+	 * If the inode doesn't have an attribute fork, add one.
+	 * (inode must not be locked when we call this routine)
+	 */
+	if (XFS_IFORK_Q(dp) == 0) {
+		int sf_size = sizeof(xfs_attr_sf_hdr_t) +
+			XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
+
+		error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (rsvd)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+			 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+	tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+	error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
+	if (error) {
+		xfs_trans_cancel(args.trans, 0);
+		return error;
+	}
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
+				rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+				       XFS_QMOPT_RES_REGBLKS);
+	if (error) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+		return error;
+	}
+
+	xfs_trans_ijoin(args.trans, dp, 0);
+
+	/*
+	 * If the attribute list is non-existent or a shortform list,
+	 * upgrade it to a single-leaf-block attribute list.
+	 */
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+
+		/*
+		 * Build initial attribute list (if required).
+		 */
+		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+			xfs_attr_shortform_create(&args);
+
+		/*
+		 * Try to add the attr to the attribute list in
+		 * the inode.
+		 */
+		error = xfs_attr_shortform_addname(&args);
+		if (error != ENOSPC) {
+			/*
+			 * Commit the shortform mods, and we're done.
+			 * NOTE: this is also the error path (EEXIST, etc).
+			 */
+			ASSERT(args.trans != NULL);
+
+			/*
+			 * If this is a synchronous mount, make sure that
+			 * the transaction goes to disk before returning
+			 * to the user.
+			 */
+			if (mp->m_flags & XFS_MOUNT_WSYNC)
+				xfs_trans_set_sync(args.trans);
+
+			if (!error && (flags & ATTR_KERNOTIME) == 0) {
+				xfs_trans_ichgtime(args.trans, dp,
+							XFS_ICHGTIME_CHG);
+			}
+			err2 = xfs_trans_commit(args.trans,
+						 XFS_TRANS_RELEASE_LOG_RES);
+			xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+			return error ? error : err2;
+		}
+
+		/*
+		 * It won't fit in the shortform, transform to a leaf block.
+		 * GROT: another possible req'mt for a double-split btree op.
+		 */
+		xfs_bmap_init(args.flist, args.firstblock);
+		error = xfs_attr_shortform_to_leaf(&args);
+		if (!error) {
+			error = xfs_bmap_finish(&args.trans, args.flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args.trans = NULL;
+			xfs_bmap_cancel(&flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args.trans, dp, 0);
+
+		/*
+		 * Commit the leaf transformation.  We'll need another (linked)
+		 * transaction to add the new attribute to the leaf.
+		 */
+
+		error = xfs_trans_roll(&args.trans, dp);
+		if (error)
+			goto out;
+
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+		error = xfs_attr_leaf_addname(&args);
+	else
+		error = xfs_attr_node_addname(&args);
+	if (error)
+		goto out;
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(args.trans);
+
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	return error;
+
+out:
+	if (args.trans) {
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
+int
+xfs_attr_remove(
+	struct xfs_inode	*dp,
+	const unsigned char	*name,
+	int			flags)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_args	args;
+	struct xfs_bmap_free	flist;
+	xfs_fsblock_t		firstblock;
+	int			error;
+
+	XFS_STATS_INC(xs_attr_remove);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return EIO;
+
+	if (!xfs_inode_hasattr(dp))
+		return ENOATTR;
+
+	error = xfs_attr_args_init(&args, dp, name, flags);
+	if (error)
+		return error;
+
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+
+	/*
+	 * we have no control over the attribute names that userspace passes us
+	 * to remove, so we have to allow the name lookup prior to attribute
+	 * removal to fail.
+	 */
+	args.op_flags = XFS_DA_OP_OKNOENT;
+
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (flags & ATTR_ROOT)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+				  XFS_ATTRRM_SPACE_RES(mp), 0);
+	if (error) {
+		xfs_trans_cancel(args.trans, 0);
+		return error;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	/*
+	 * No need to make quota reservations here. We expect to release some
+	 * blocks not allocate in the common case.
+	 */
+	xfs_trans_ijoin(args.trans, dp, 0);
+
+	if (!xfs_inode_hasattr(dp)) {
+		error = ENOATTR;
+	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		error = xfs_attr_shortform_remove(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_removename(&args);
+	} else {
+		error = xfs_attr_node_removename(&args);
+	}
+
+	if (error)
+		goto out;
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(args.trans);
+
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	return error;
+
+out:
+	if (args.trans) {
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+STATIC int
+xfs_attr_shortform_addname(xfs_da_args_t *args)
+{
+	int newsize, forkoff, retval;
+
+	trace_xfs_attr_sf_addname(args);
+
+	retval = xfs_attr_shortform_lookup(args);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		return retval;
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			return retval;
+		retval = xfs_attr_shortform_remove(args);
+		ASSERT(retval == 0);
+	}
+
+	if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
+	    args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+		return ENOSPC;
+
+	newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
+	newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+
+	forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
+	if (!forkoff)
+		return ENOSPC;
+
+	xfs_attr_shortform_add(args, forkoff);
+	return 0;
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	struct xfs_buf *bp;
+	int retval, error, committed, forkoff;
+
+	trace_xfs_attr_leaf_addname(args);
+
+	/*
+	 * Read the (only) block in the attribute list in.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	/*
+	 * Look up the given attribute in the leaf block.  Figure out if
+	 * the given flags produce an error or call for an atomic rename.
+	 */
+	retval = xfs_attr3_leaf_lookup_int(bp, args);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		xfs_trans_brelse(args->trans, bp);
+		return retval;
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE) {	/* pure create op */
+			xfs_trans_brelse(args->trans, bp);
+			return retval;
+		}
+
+		trace_xfs_attr_leaf_replace(args);
+
+		/* save the attribute state for later removal*/
+		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
+	}
+
+	/*
+	 * Add the attribute to the leaf block, transitioning to a Btree
+	 * if required.
+	 */
+	retval = xfs_attr3_leaf_add(bp, args);
+	if (retval == ENOSPC) {
+		/*
+		 * Promote the attribute list to the Btree format, then
+		 * Commit that transaction so that the node_addname() call
+		 * can manage its own transactions.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_attr3_leaf_to_node(args);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		/*
+		 * Commit the current trans (including the inode) and start
+		 * a new one.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			return error;
+
+		/*
+		 * Fob the whole rest of the problem off on the Btree code.
+		 */
+		error = xfs_attr_node_addname(args);
+		return error;
+	}
+
+	/*
+	 * Commit the transaction that added the attr name so that
+	 * later routines can manage their own transactions.
+	 */
+	error = xfs_trans_roll(&args->trans, dp);
+	if (error)
+		return error;
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->op_flags & XFS_DA_OP_RENAME) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr3_leaf_flipflags(args);
+		if (error)
+			return error;
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return error;
+		}
+
+		/*
+		 * Read in the block containing the "old" attr, then
+		 * remove the "old" attr from that block (neat, huh!)
+		 */
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+					   -1, &bp);
+		if (error)
+			return error;
+
+		xfs_attr3_leaf_remove(bp, args);
+
+		/*
+		 * If the result is small enough, shrink it all into the inode.
+		 */
+		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				return error;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+		}
+
+		/*
+		 * Commit the remove and start the next trans in series.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr3_leaf_clearflag(args);
+	}
+	return error;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	struct xfs_buf *bp;
+	int error, committed, forkoff;
+
+	trace_xfs_attr_leaf_removename(args);
+
+	/*
+	 * Remove the attribute.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_lookup_int(bp, args);
+	if (error == ENOATTR) {
+		xfs_trans_brelse(args->trans, bp);
+		return error;
+	}
+
+	xfs_attr3_leaf_remove(bp, args);
+
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+		/* bp is gone due to xfs_da_shrink_inode */
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+	}
+	return 0;
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+	struct xfs_buf *bp;
+	int error;
+
+	trace_xfs_attr_leaf_get(args);
+
+	args->blkno = 0;
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_lookup_int(bp, args);
+	if (error != EEXIST)  {
+		xfs_trans_brelse(args->trans, bp);
+		return error;
+	}
+	error = xfs_attr3_leaf_getvalue(bp, args);
+	xfs_trans_brelse(args->trans, bp);
+	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
+		error = xfs_attr_rmtval_get(args);
+	}
+	return error;
+}
+
+/*========================================================================
+ * External routines when attribute list size > geo->blksize
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_mount_t *mp;
+	int committed, retval, error;
+
+	trace_xfs_attr_node_addname(args);
+
+	/*
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	dp = args->dp;
+	mp = dp->i_mount;
+restart:
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = mp;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error)
+		goto out;
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		goto out;
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			goto out;
+
+		trace_xfs_attr_node_replace(args);
+
+		/* save the attribute state for later removal*/
+		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
+	}
+
+	retval = xfs_attr3_leaf_add(blk->bp, state->args);
+	if (retval == ENOSPC) {
+		if (state->path.active == 1) {
+			/*
+			 * Its really a single leaf node, but it had
+			 * out-of-line values so it looked like it *might*
+			 * have been a b-tree.
+			 */
+			xfs_da_state_free(state);
+			state = NULL;
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_attr3_leaf_to_node(args);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+
+			/*
+			 * Commit the node conversion and start the next
+			 * trans in the chain.
+			 */
+			error = xfs_trans_roll(&args->trans, dp);
+			if (error)
+				goto out;
+
+			goto restart;
+		}
+
+		/*
+		 * Split as many Btree elements as required.
+		 * This code tracks the new and old attr's location
+		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
+		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_da3_split(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+	} else {
+		/*
+		 * Addition succeeded, update Btree hashvals.
+		 */
+		xfs_da3_fixhashpath(state, &state->path);
+	}
+
+	/*
+	 * Kill the state structure, we're done with it and need to
+	 * allow the buffers to come back later.
+	 */
+	xfs_da_state_free(state);
+	state = NULL;
+
+	/*
+	 * Commit the leaf addition or btree split and start the next
+	 * trans in the chain.
+	 */
+	error = xfs_trans_roll(&args->trans, dp);
+	if (error)
+		goto out;
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->op_flags & XFS_DA_OP_RENAME) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr3_leaf_flipflags(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return error;
+		}
+
+		/*
+		 * Re-find the "old" attribute entry after any split ops.
+		 * The INCOMPLETE flag means that we will find the "old"
+		 * attr, not the "new" one.
+		 */
+		args->flags |= XFS_ATTR_INCOMPLETE;
+		state = xfs_da_state_alloc();
+		state->args = args;
+		state->mp = mp;
+		state->inleaf = 0;
+		error = xfs_da3_node_lookup_int(state, &retval);
+		if (error)
+			goto out;
+
+		/*
+		 * Remove the name and update the hashvals in the tree.
+		 */
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+		error = xfs_attr3_leaf_remove(blk->bp, args);
+		xfs_da3_fixhashpath(state, &state->path);
+
+		/*
+		 * Check to see if the tree needs to be collapsed.
+		 */
+		if (retval && (state->path.active > 1)) {
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_da3_join(state);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+		}
+
+		/*
+		 * Commit and start the next trans in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			goto out;
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr3_leaf_clearflag(args);
+		if (error)
+			goto out;
+	}
+	retval = error = 0;
+
+out:
+	if (state)
+		xfs_da_state_free(state);
+	if (error)
+		return error;
+	return retval;
+}
+
+/*
+ * Remove a name from a B-tree attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	struct xfs_buf *bp;
+	int retval, error, committed, forkoff;
+
+	trace_xfs_attr_node_removename(args);
+
+	/*
+	 * Tie a string around our finger to remind us where we are.
+	 */
+	dp = args->dp;
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = dp->i_mount;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error || (retval != EEXIST)) {
+		if (error == 0)
+			error = retval;
+		goto out;
+	}
+
+	/*
+	 * If there is an out-of-line value, de-allocate the blocks.
+	 * This is done before we remove the attribute so that we don't
+	 * overflow the maximum size of a transaction and/or hit a deadlock.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->bp != NULL);
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if (args->rmtblkno > 0) {
+		/*
+		 * Fill in disk block numbers in the state structure
+		 * so that we can get the buffers back after we commit
+		 * several transactions in the following calls.
+		 */
+		error = xfs_attr_fillstate(state);
+		if (error)
+			goto out;
+
+		/*
+		 * Mark the attribute as INCOMPLETE, then bunmapi() the
+		 * remote value.
+		 */
+		error = xfs_attr3_leaf_setflag(args);
+		if (error)
+			goto out;
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Refill the state structure with buffers, the prior calls
+		 * released our buffers.
+		 */
+		error = xfs_attr_refillstate(state);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	retval = xfs_attr3_leaf_remove(blk->bp, args);
+	xfs_da3_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	if (retval && (state->path.active > 1)) {
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_da3_join(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		/*
+		 * Commit the Btree join operation and start a new trans.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * If the result is small enough, push it all into the inode.
+	 */
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		/*
+		 * Have to get rid of the copy of this dabuf in the state.
+		 */
+		ASSERT(state->path.active == 1);
+		ASSERT(state->path.blk[0].bp);
+		state->path.blk[0].bp = NULL;
+
+		error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
+		if (error)
+			goto out;
+
+		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+			xfs_bmap_init(args->flist, args->firstblock);
+			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp, 0);
+		} else
+			xfs_trans_brelse(args->trans, bp);
+	}
+	error = 0;
+
+out:
+	xfs_da_state_free(state);
+	return error;
+}
+
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level;
+
+	trace_xfs_attr_fillstate(state->args);
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level, error;
+
+	trace_xfs_attr_refillstate(state->args);
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da3_node_read(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return error;
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da3_node_read(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return error;
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and for single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ */
+STATIC int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int error, retval;
+	int i;
+
+	trace_xfs_attr_node_get(args);
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	} else if (retval == EEXIST) {
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->bp != NULL);
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+		/*
+		 * Get the value, local or "remote"
+		 */
+		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+		if (!retval && (args->rmtblkno > 0)
+		    && !(args->flags & ATTR_KERNOVAL)) {
+			retval = xfs_attr_rmtval_get(args);
+		}
+	}
+
+	/*
+	 * If not in a transaction, we have to release all the buffers.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return retval;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
new file mode 100644
index 000000000000..127d96aba845
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -0,0 +1,2697 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
+
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+				 xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+				   struct xfs_attr3_icleaf_hdr *ichdr,
+				   struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+				   struct xfs_attr3_icleaf_hdr *ichdr,
+				   struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
+						   xfs_da_state_blk_t *blk1,
+						   xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+			xfs_da_state_blk_t *leaf_blk_1,
+			struct xfs_attr3_icleaf_hdr *ichdr1,
+			xfs_da_state_blk_t *leaf_blk_2,
+			struct xfs_attr3_icleaf_hdr *ichdr2,
+			int *number_entries_in_blk1,
+			int *number_usedbytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+			struct xfs_attr_leafblock *src_leaf,
+			struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+			struct xfs_attr_leafblock *dst_leaf,
+			struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+			int move_count);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+
+void
+xfs_attr3_leaf_hdr_from_disk(
+	struct xfs_attr3_icleaf_hdr	*to,
+	struct xfs_attr_leafblock	*from)
+{
+	int	i;
+
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+	       from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+		to->back = be32_to_cpu(hdr3->info.hdr.back);
+		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+		to->count = be16_to_cpu(hdr3->count);
+		to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+		to->firstused = be16_to_cpu(hdr3->firstused);
+		to->holes = hdr3->holes;
+
+		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+			to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+			to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+		}
+		return;
+	}
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.count);
+	to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+	to->firstused = be16_to_cpu(from->hdr.firstused);
+	to->holes = from->hdr.holes;
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+		to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+	}
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+	struct xfs_attr_leafblock	*to,
+	struct xfs_attr3_icleaf_hdr	*from)
+{
+	int	i;
+
+	ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+	       from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+	if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+		hdr3->info.hdr.back = cpu_to_be32(from->back);
+		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+		hdr3->count = cpu_to_be16(from->count);
+		hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+		hdr3->firstused = cpu_to_be16(from->firstused);
+		hdr3->holes = from->holes;
+		hdr3->pad1 = 0;
+
+		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+			hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+			hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+		}
+		return;
+	}
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.count = cpu_to_be16(from->count);
+	to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+	to->hdr.firstused = cpu_to_be16(from->firstused);
+	to->hdr.holes = from->holes;
+	to->hdr.pad1 = 0;
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+		to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+	}
+}
+
+static bool
+xfs_attr3_leaf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_attr_leafblock *leaf = bp->b_addr;
+	struct xfs_attr3_icleaf_hdr ichdr;
+
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
+			return false;
+
+		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+			return false;
+	}
+	if (ichdr.count == 0)
+		return false;
+
+	/* XXX: need to range check rest of attr header values */
+	/* XXX: hash order check? */
+
+	return true;
+}
+
+static void
+xfs_attr3_leaf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_attr3_leaf_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_attr3_leaf_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_attr3_leaf_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+	.verify_read = xfs_attr3_leaf_read_verify,
+	.verify_write = xfs_attr3_leaf_write_verify,
+};
+
+int
+xfs_attr3_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+				XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+	return err;
+}
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If namespace bits don't match return 0.
+ * If all match then return 1.
+ */
+STATIC int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+}
+
+
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Query whether the requested number of additional bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * Returns zero if not, else the di_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * special case for dev/uuid inodes, they have fixed size data forks.
+ */
+int
+xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
+{
+	int offset;
+	int minforkoff;	/* lower limit on valid forkoff locations */
+	int maxforkoff;	/* upper limit on valid forkoff locations */
+	int dsize;
+	xfs_mount_t *mp = dp->i_mount;
+
+	/* rounded down */
+	offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+
+	switch (dp->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		return (offset >= minforkoff) ? minforkoff : 0;
+	case XFS_DINODE_FMT_UUID:
+		minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		return (offset >= minforkoff) ? minforkoff : 0;
+	}
+
+	/*
+	 * If the requested numbers of bytes is smaller or equal to the
+	 * current attribute fork size we can always proceed.
+	 *
+	 * Note that if_bytes in the data fork might actually be larger than
+	 * the current data fork size is due to delalloc extents. In that
+	 * case either the extent count will go down when they are converted
+	 * to real extents, or the delalloc conversion will take care of the
+	 * literal area rebalancing.
+	 */
+	if (bytes <= XFS_IFORK_ASIZE(dp))
+		return dp->i_d.di_forkoff;
+
+	/*
+	 * For attr2 we can try to move the forkoff if there is space in the
+	 * literal area, but for the old format we are done if there is no
+	 * space in the fixed attribute fork.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+		return 0;
+
+	dsize = dp->i_df.if_bytes;
+
+	switch (dp->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/*
+		 * If there is no attr fork and the data fork is extents, 
+		 * determine if creating the default attr fork will result
+		 * in the extents form migrating to btree. If so, the
+		 * minimum offset only needs to be the space required for
+		 * the btree root.
+		 */
+		if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+		    xfs_default_attroffset(dp))
+			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		/*
+		 * If we have a data btree then keep forkoff if we have one,
+		 * otherwise we are adding a new attr, so then we set
+		 * minforkoff to where the btree root can finish so we have
+		 * plenty of room for attrs
+		 */
+		if (dp->i_d.di_forkoff) {
+			if (offset < dp->i_d.di_forkoff)
+				return 0;
+			return dp->i_d.di_forkoff;
+		}
+		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+		break;
+	}
+
+	/*
+	 * A data fork btree root must have space for at least
+	 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+	 */
+	minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+	minforkoff = roundup(minforkoff, 8) >> 3;
+
+	/* attr fork btree root can have at least this many key/ptr pairs */
+	maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+			XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	maxforkoff = maxforkoff >> 3;	/* rounded down */
+
+	if (offset >= maxforkoff)
+		return maxforkoff;
+	if (offset >= minforkoff)
+		return offset;
+	return 0;
+}
+
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ */
+STATIC void
+xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+{
+	if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
+	    !(xfs_sb_version_hasattr2(&mp->m_sb))) {
+		spin_lock(&mp->m_sb_lock);
+		if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
+			xfs_sb_version_addattr2(&mp->m_sb);
+			spin_unlock(&mp->m_sb_lock);
+			xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+		} else
+			spin_unlock(&mp->m_sb_lock);
+	}
+}
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+void
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+	xfs_attr_sf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_create(args);
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	ifp = dp->i_afp;
+	ASSERT(ifp != NULL);
+	ASSERT(ifp->if_bytes == 0);
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+		ifp->if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+		ifp->if_flags |= XFS_IFINLINE;
+	} else {
+		ASSERT(ifp->if_flags & XFS_IFINLINE);
+	}
+	xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+	hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
+	hdr->count = 0;
+	hdr->totsize = cpu_to_be16(sizeof(*hdr));
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+void
+xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i, offset, size;
+	xfs_mount_t *mp;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_add(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	dp->i_d.di_forkoff = forkoff;
+
+	ifp = dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+#ifdef DEBUG
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		ASSERT(0);
+#endif
+	}
+
+	offset = (char *)sfe - (char *)sf;
+	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+	xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+
+	sfe->namelen = args->namelen;
+	sfe->valuelen = args->valuelen;
+	sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+	memcpy(sfe->nameval, args->name, args->namelen);
+	memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
+	sf->hdr.count++;
+	be16_add_cpu(&sf->hdr.totsize, size);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+	xfs_sbversion_add_attr2(mp, args->trans);
+}
+
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+STATIC void
+xfs_attr_fork_reset(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp)
+{
+	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	ip->i_d.di_forkoff = 0;
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT(ip->i_afp == NULL);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int base, size=0, end, totsize, i;
+	xfs_mount_t *mp;
+	xfs_inode_t *dp;
+
+	trace_xfs_attr_sf_remove(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	base = sizeof(xfs_attr_sf_hdr_t);
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	end = sf->hdr.count;
+	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+					base += size, i++) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		break;
+	}
+	if (i == end)
+		return ENOATTR;
+
+	/*
+	 * Fix up the attribute fork data, covering the hole
+	 */
+	end = base + size;
+	totsize = be16_to_cpu(sf->hdr.totsize);
+	if (end != totsize)
+		memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
+	sf->hdr.count--;
+	be16_add_cpu(&sf->hdr.totsize, -size);
+
+	/*
+	 * Fix up the start offset of the attribute fork
+	 */
+	totsize -= size;
+	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+	    (mp->m_flags & XFS_MOUNT_ATTR2) &&
+	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+	    !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+		xfs_attr_fork_reset(dp, args->trans);
+	} else {
+		xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+		dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+		ASSERT(dp->i_d.di_forkoff);
+		ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+				(args->op_flags & XFS_DA_OP_ADDNAME) ||
+				!(mp->m_flags & XFS_MOUNT_ATTR2) ||
+				dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+		xfs_trans_log_inode(args->trans, dp,
+					XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	}
+
+	xfs_sbversion_add_attr2(mp, args->trans);
+
+	return 0;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_lookup(args);
+
+	ifp = args->dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count;
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		return EEXIST;
+	}
+	return ENOATTR;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+
+	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count;
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = sfe->valuelen;
+			return EEXIST;
+		}
+		if (args->valuelen < sfe->valuelen) {
+			args->valuelen = sfe->valuelen;
+			return ERANGE;
+		}
+		args->valuelen = sfe->valuelen;
+		memcpy(args->value, &sfe->nameval[args->namelen],
+						    args->valuelen);
+		return EEXIST;
+	}
+	return ENOATTR;
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_da_args_t nargs;
+	char *tmpbuffer;
+	int error, i, size;
+	xfs_dablk_t blkno;
+	struct xfs_buf *bp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_to_leaf(args);
+
+	dp = args->dp;
+	ifp = dp->i_afp;
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	size = be16_to_cpu(sf->hdr.totsize);
+	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+	sf = (xfs_attr_shortform_t *)tmpbuffer;
+
+	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+	xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
+	bp = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		/*
+		 * If we hit an IO error middle of the transaction inside
+		 * grow_inode(), we may have inconsistent data. Bail out.
+		 */
+		if (error == EIO)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+		goto out;
+	}
+
+	ASSERT(blkno == 0);
+	error = xfs_attr3_leaf_create(args, blkno, &bp);
+	if (error) {
+		error = xfs_da_shrink_inode(args, 0, bp);
+		bp = NULL;
+		if (error)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+		goto out;
+	}
+
+	memset((char *)&nargs, 0, sizeof(nargs));
+	nargs.dp = dp;
+	nargs.geo = args->geo;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count; i++) {
+		nargs.name = sfe->nameval;
+		nargs.namelen = sfe->namelen;
+		nargs.value = &sfe->nameval[nargs.namelen];
+		nargs.valuelen = sfe->valuelen;
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
+						sfe->namelen);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
+		ASSERT(error == ENOATTR);
+		error = xfs_attr3_leaf_add(bp, &nargs);
+		ASSERT(error != ENOSPC);
+		if (error)
+			goto out;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer);
+	return error;
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	struct xfs_attr3_icleaf_hdr leafhdr;
+	int			bytes;
+	int			i;
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
+
+	bytes = sizeof(struct xfs_attr_sf_hdr);
+	for (i = 0; i < leafhdr.count; entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* don't copy partial entries */
+		if (!(entry->flags & XFS_ATTR_LOCAL))
+			return 0;
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
+		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return 0;
+		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return 0;
+		bytes += sizeof(struct xfs_attr_sf_entry) - 1
+				+ name_loc->namelen
+				+ be16_to_cpu(name_loc->valuelen);
+	}
+	if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+	    (bytes == sizeof(struct xfs_attr_sf_hdr)))
+		return -1;
+	return xfs_attr_shortform_bytesfit(dp, bytes);
+}
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ */
+int
+xfs_attr3_leaf_to_shortform(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args,
+	int			forkoff)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_da_args	nargs;
+	struct xfs_inode	*dp = args->dp;
+	char			*tmpbuffer;
+	int			error;
+	int			i;
+
+	trace_xfs_attr_leaf_to_sf(args);
+
+	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	if (!tmpbuffer)
+		return ENOMEM;
+
+	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+
+	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
+
+	/* XXX (dgc): buffer is about to be marked stale - why zero it? */
+	memset(bp->b_addr, 0, args->geo->blksize);
+
+	/*
+	 * Clean out the prior contents of the attribute list.
+	 */
+	error = xfs_da_shrink_inode(args, 0, bp);
+	if (error)
+		goto out;
+
+	if (forkoff == -1) {
+		ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
+		ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+		xfs_attr_fork_reset(dp, args->trans);
+		goto out;
+	}
+
+	xfs_attr_shortform_create(args);
+
+	/*
+	 * Copy the attributes
+	 */
+	memset((char *)&nargs, 0, sizeof(nargs));
+	nargs.geo = args->geo;
+	nargs.dp = dp;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+	for (i = 0; i < ichdr.count; entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;	/* don't copy partial entries */
+		if (!entry->nameidx)
+			continue;
+		ASSERT(entry->flags & XFS_ATTR_LOCAL);
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
+		nargs.name = name_loc->nameval;
+		nargs.namelen = name_loc->namelen;
+		nargs.value = &name_loc->nameval[nargs.namelen];
+		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+		nargs.hashval = be32_to_cpu(entry->hashval);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+		xfs_attr_shortform_add(&nargs, forkoff);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer);
+	return error;
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr3_leaf_to_node(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr icleafhdr;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr icnodehdr;
+	struct xfs_da_intnode	*node;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp1 = NULL;
+	struct xfs_buf		*bp2 = NULL;
+	xfs_dablk_t		blkno;
+	int			error;
+
+	trace_xfs_attr_leaf_to_node(args);
+
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		goto out;
+	error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
+	if (error)
+		goto out;
+
+	error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
+	if (error)
+		goto out;
+
+	/* copy leaf to new buffer, update identifiers */
+	xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
+	bp2->b_ops = bp1->b_ops;
+	memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+		hdr3->blkno = cpu_to_be64(bp2->b_bn);
+	}
+	xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	node = bp1->b_addr;
+	dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+	btree = dp->d_ops->node_tree_p(node);
+
+	leaf = bp2->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+
+	/* both on-disk, don't endian-flip twice */
+	btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+	btree[0].before = cpu_to_be32(blkno);
+	icnodehdr.count = 1;
+	dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+	xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
+	error = 0;
+out:
+	return error;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+STATIC int
+xfs_attr3_leaf_create(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		blkno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+
+	trace_xfs_attr_leaf_create(args);
+
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+					    XFS_ATTR_FORK);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_attr3_leaf_buf_ops;
+	xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+	leaf = bp->b_addr;
+	memset(leaf, 0, args->geo->blksize);
+
+	memset(&ichdr, 0, sizeof(ichdr));
+	ichdr.firstused = args->geo->blksize;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+		ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+		ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+	} else {
+		ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+		ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+	}
+	ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr3_leaf_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk)
+{
+	xfs_dablk_t blkno;
+	int error;
+
+	trace_xfs_attr_leaf_split(state->args);
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(state->args, &blkno);
+	if (error)
+		return error;
+	error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
+	if (error)
+		return error;
+	newblk->blkno = blkno;
+	newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 * NOTE: rebalance() currently depends on the 2nd block being empty.
+	 */
+	xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da3_blk_link(state, oldblk, newblk);
+	if (error)
+		return error;
+
+	/*
+	 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+	 * "new" attrs info.  Will need the "old" info to remove it later.
+	 *
+	 * Insert the "new" entry in the correct block.
+	 */
+	if (state->inleaf) {
+		trace_xfs_attr_leaf_add_old(state->args);
+		error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+	} else {
+		trace_xfs_attr_leaf_add_new(state->args);
+		error = xfs_attr3_leaf_add(newblk->bp, state->args);
+	}
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+	return error;
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr3_leaf_add(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	int			tablesize;
+	int			entsize;
+	int			sum;
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_add(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(args->index >= 0 && args->index <= ichdr.count);
+	entsize = xfs_attr_leaf_newentsize(args, NULL);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 */
+	tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+		if (tablesize > ichdr.firstused) {
+			sum += ichdr.freemap[i].size;
+			continue;
+		}
+		if (!ichdr.freemap[i].size)
+			continue;	/* no space in this map */
+		tmp = entsize;
+		if (ichdr.freemap[i].base < ichdr.firstused)
+			tmp += sizeof(xfs_attr_leaf_entry_t);
+		if (ichdr.freemap[i].size >= tmp) {
+			tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+			goto out_log_hdr;
+		}
+		sum += ichdr.freemap[i].size;
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!ichdr.holes && sum < entsize)
+		return ENOSPC;
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * This may change the hdr->count via dropping INCOMPLETE entries.
+	 */
+	xfs_attr3_leaf_compact(args, &ichdr, bp);
+
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0].  If it is not big enough, give up.
+	 */
+	if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+		tmp = ENOSPC;
+		goto out_log_hdr;
+	}
+
+	tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+
+out_log_hdr:
+	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+				xfs_attr3_leaf_hdr_size(leaf)));
+	return tmp;
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr3_leaf_add_work(
+	struct xfs_buf		*bp,
+	struct xfs_attr3_icleaf_hdr *ichdr,
+	struct xfs_da_args	*args,
+	int			mapindex)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_mount	*mp;
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_add_work(args);
+
+	leaf = bp->b_addr;
+	ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+	ASSERT(args->index >= 0 && args->index <= ichdr->count);
+
+	/*
+	 * Force open some space in the entry array and fill it in.
+	 */
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+	if (args->index < ichdr->count) {
+		tmp  = ichdr->count - args->index;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		memmove(entry + 1, entry, tmp);
+		xfs_trans_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+	}
+	ichdr->count++;
+
+	/*
+	 * Allocate space for the new string (at the end of the run).
+	 */
+	mp = args->trans->t_mountp;
+	ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+	ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+	ASSERT(ichdr->freemap[mapindex].size >=
+		xfs_attr_leaf_newentsize(args, NULL));
+	ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+	ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+	ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+
+	entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+				     ichdr->freemap[mapindex].size);
+	entry->hashval = cpu_to_be32(args->hashval);
+	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+	entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+	if (args->op_flags & XFS_DA_OP_RENAME) {
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		if ((args->blkno2 == args->blkno) &&
+		    (args->index2 <= args->index)) {
+			args->index2++;
+		}
+	}
+	xfs_trans_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	ASSERT((args->index == 0) ||
+	       (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+	ASSERT((args->index == ichdr->count - 1) ||
+	       (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
+
+	/*
+	 * For "remote" attribute values, simply note that we need to
+	 * allocate space for the "remote" value.  We can't actually
+	 * allocate the extents in this transaction, and we can't decide
+	 * which blocks they should be as we might allocate more blocks
+	 * as part of this transaction (a split operation for example).
+	 */
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+		name_loc->namelen = args->namelen;
+		name_loc->valuelen = cpu_to_be16(args->valuelen);
+		memcpy((char *)name_loc->nameval, args->name, args->namelen);
+		memcpy((char *)&name_loc->nameval[args->namelen], args->value,
+				   be16_to_cpu(name_loc->valuelen));
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		name_rmt->namelen = args->namelen;
+		memcpy((char *)name_rmt->name, args->name, args->namelen);
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		/* just in case */
+		name_rmt->valuelen = 0;
+		name_rmt->valueblk = 0;
+		args->rmtblkno = 1;
+		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+		args->rmtvaluelen = args->valuelen;
+	}
+	xfs_trans_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+				   xfs_attr_leaf_entsize(leaf, args->index)));
+
+	/*
+	 * Update the control info for this leaf node
+	 */
+	if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+		ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+	ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf));
+	tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		if (ichdr->freemap[i].base == tmp) {
+			ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+			ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
+		}
+	}
+	ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+	return 0;
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr3_leaf_compact(
+	struct xfs_da_args	*args,
+	struct xfs_attr3_icleaf_hdr *ichdr_dst,
+	struct xfs_buf		*bp)
+{
+	struct xfs_attr_leafblock *leaf_src;
+	struct xfs_attr_leafblock *leaf_dst;
+	struct xfs_attr3_icleaf_hdr ichdr_src;
+	struct xfs_trans	*trans = args->trans;
+	char			*tmpbuffer;
+
+	trace_xfs_attr_leaf_compact(args);
+
+	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+	memset(bp->b_addr, 0, args->geo->blksize);
+	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+	leaf_dst = bp->b_addr;
+
+	/*
+	 * Copy the on-disk header back into the destination buffer to ensure
+	 * all the information in the header that is not part of the incore
+	 * header structure is preserved.
+	 */
+	memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+	/* Initialise the incore headers */
+	ichdr_src = *ichdr_dst;	/* struct copy */
+	ichdr_dst->firstused = args->geo->blksize;
+	ichdr_dst->usedbytes = 0;
+	ichdr_dst->count = 0;
+	ichdr_dst->holes = 0;
+	ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+	ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+						ichdr_dst->freemap[0].base;
+
+	/* write the header back to initialise the underlying buffer */
+	xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate name/value pairs packed and in sequence.
+	 */
+	xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+				leaf_dst, ichdr_dst, 0, ichdr_src.count);
+	/*
+	 * this logs the entire buffer, but the caller must write the header
+	 * back to the buffer when it is finished modifying it.
+	 */
+	xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
+
+	kmem_free(tmpbuffer);
+}
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+	struct xfs_buf	*leaf1_bp,
+	struct xfs_attr3_icleaf_hdr *leaf1hdr,
+	struct xfs_buf	*leaf2_bp,
+	struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+	struct xfs_attr_leaf_entry *entries1;
+	struct xfs_attr_leaf_entry *entries2;
+
+	entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+	entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+	if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+	    ((be32_to_cpu(entries2[0].hashval) <
+	      be32_to_cpu(entries1[0].hashval)) ||
+	     (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+	      be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+		return 1;
+	}
+	return 0;
+}
+
+int
+xfs_attr_leaf_order(
+	struct xfs_buf	*leaf1_bp,
+	struct xfs_buf	*leaf2_bp)
+{
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+
+	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
+	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+	return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block.  At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block.  Those
+ * values are used in "atomic rename" operations on attributes.  Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr3_leaf_rebalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
+{
+	struct xfs_da_args	*args;
+	struct xfs_attr_leafblock *leaf1;
+	struct xfs_attr_leafblock *leaf2;
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+	struct xfs_attr_leaf_entry *entries1;
+	struct xfs_attr_leaf_entry *entries2;
+	int			count;
+	int			totallen;
+	int			max;
+	int			space;
+	int			swap;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+	leaf1 = blk1->bp->b_addr;
+	leaf2 = blk2->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+	ASSERT(ichdr2.count == 0);
+	args = state->args;
+
+	trace_xfs_attr_leaf_rebalance(args);
+
+	/*
+	 * Check ordering of blocks, reverse if it makes things simpler.
+	 *
+	 * NOTE: Given that all (current) callers pass in an empty
+	 * second block, this code should never set "swap".
+	 */
+	swap = 0;
+	if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+		struct xfs_da_state_blk	*tmp_blk;
+		struct xfs_attr3_icleaf_hdr tmp_ichdr;
+
+		tmp_blk = blk1;
+		blk1 = blk2;
+		blk2 = tmp_blk;
+
+		/* struct copies to swap them rather than reconverting */
+		tmp_ichdr = ichdr1;
+		ichdr1 = ichdr2;
+		ichdr2 = tmp_ichdr;
+
+		leaf1 = blk1->bp->b_addr;
+		leaf2 = blk2->bp->b_addr;
+		swap = 1;
+	}
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.  Then get
+	 * the direction to copy and the number of elements to move.
+	 *
+	 * "inleaf" is true if the new entry should be inserted into blk1.
+	 * If "swap" is also true, then reverse the sense of "inleaf".
+	 */
+	state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+						      blk2, &ichdr2,
+						      &count, &totallen);
+	if (swap)
+		state->inleaf = !state->inleaf;
+
+	/*
+	 * Move any entries required from leaf to leaf:
+	 */
+	if (count < ichdr1.count) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count = ichdr1.count - count;
+		space  = ichdr1.usedbytes - totallen;
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf2 is the destination, compact it if it looks tight.
+		 */
+		max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+		max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
+		if (space > max)
+			xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
+
+		/*
+		 * Move high entries from leaf1 to low end of leaf2.
+		 */
+		xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+				ichdr1.count - count, leaf2, &ichdr2, 0, count);
+
+	} else if (count > ichdr1.count) {
+		/*
+		 * I assert that since all callers pass in an empty
+		 * second buffer, this code should never execute.
+		 */
+		ASSERT(0);
+
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count -= ichdr1.count;
+		space  = totallen - ichdr1.usedbytes;
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf1 is the destination, compact it if it looks tight.
+		 */
+		max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+		max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
+		if (space > max)
+			xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
+
+		/*
+		 * Move low entries from leaf2 to high end of leaf1.
+		 */
+		xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+					ichdr1.count, count);
+	}
+
+	xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
+	xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+	xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+	xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	entries1 = xfs_attr3_leaf_entryp(leaf1);
+	entries2 = xfs_attr3_leaf_entryp(leaf2);
+	blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+	blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 * NOTE: this code depends on the (current) situation that the
+	 * second block was originally empty.
+	 *
+	 * If the insertion point moved to the 2nd block, we must adjust
+	 * the index.  We must also track the entry just following the
+	 * new entry for use in an "atomic rename" operation, that entry
+	 * is always the "old" entry and the "new" entry is what we are
+	 * inserting.  The index/blkno fields refer to the "old" entry,
+	 * while the index2/blkno2 fields refer to the "new" entry.
+	 */
+	if (blk1->index > ichdr1.count) {
+		ASSERT(state->inleaf == 0);
+		blk2->index = blk1->index - ichdr1.count;
+		args->index = args->index2 = blk2->index;
+		args->blkno = args->blkno2 = blk2->blkno;
+	} else if (blk1->index == ichdr1.count) {
+		if (state->inleaf) {
+			args->index = blk1->index;
+			args->blkno = blk1->blkno;
+			args->index2 = 0;
+			args->blkno2 = blk2->blkno;
+		} else {
+			/*
+			 * On a double leaf split, the original attr location
+			 * is already stored in blkno2/index2, so don't
+			 * overwrite it overwise we corrupt the tree.
+			 */
+			blk2->index = blk1->index - ichdr1.count;
+			args->index = blk2->index;
+			args->blkno = blk2->blkno;
+			if (!state->extravalid) {
+				/*
+				 * set the new attr location to match the old
+				 * one and let the higher level split code
+				 * decide where in the leaf to place it.
+				 */
+				args->index2 = blk2->index;
+				args->blkno2 = blk2->blkno;
+			}
+		}
+	} else {
+		ASSERT(state->inleaf == 1);
+		args->index = args->index2 = blk1->index;
+		args->blkno = args->blkno2 = blk1->blkno;
+	}
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr3_leaf_figure_balance(
+	struct xfs_da_state		*state,
+	struct xfs_da_state_blk		*blk1,
+	struct xfs_attr3_icleaf_hdr	*ichdr1,
+	struct xfs_da_state_blk		*blk2,
+	struct xfs_attr3_icleaf_hdr	*ichdr2,
+	int				*countarg,
+	int				*usedbytesarg)
+{
+	struct xfs_attr_leafblock	*leaf1 = blk1->bp->b_addr;
+	struct xfs_attr_leafblock	*leaf2 = blk2->bp->b_addr;
+	struct xfs_attr_leaf_entry	*entry;
+	int				count;
+	int				max;
+	int				index;
+	int				totallen = 0;
+	int				half;
+	int				lastdelta;
+	int				foundit = 0;
+	int				tmp;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.
+	 */
+	max = ichdr1->count + ichdr2->count;
+	half = (max + 1) * sizeof(*entry);
+	half += ichdr1->usedbytes + ichdr2->usedbytes +
+			xfs_attr_leaf_newentsize(state->args, NULL);
+	half /= 2;
+	lastdelta = state->args->geo->blksize;
+	entry = xfs_attr3_leaf_entryp(leaf1);
+	for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A)	(((A) < 0) ? -(A) : (A))
+		/*
+		 * The new entry is in the first block, account for it.
+		 */
+		if (count == blk1->index) {
+			tmp = totallen + sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args, NULL);
+			if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+				break;
+			lastdelta = XFS_ATTR_ABS(half - tmp);
+			totallen = tmp;
+			foundit = 1;
+		}
+
+		/*
+		 * Wrap around into the second block if necessary.
+		 */
+		if (count == ichdr1->count) {
+			leaf1 = leaf2;
+			entry = xfs_attr3_leaf_entryp(leaf1);
+			index = 0;
+		}
+
+		/*
+		 * Figure out if next leaf entry would be too much.
+		 */
+		tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+									index);
+		if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+			break;
+		lastdelta = XFS_ATTR_ABS(half - tmp);
+		totallen = tmp;
+#undef XFS_ATTR_ABS
+	}
+
+	/*
+	 * Calculate the number of usedbytes that will end up in lower block.
+	 * If new entry not in lower block, fix up the count.
+	 */
+	totallen -= count * sizeof(*entry);
+	if (foundit) {
+		totallen -= sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args, NULL);
+	}
+
+	*countarg = count;
+	*usedbytesarg = totallen;
+	return foundit;
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr3_leaf_toosmall(
+	struct xfs_da_state	*state,
+	int			*action)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		blkno;
+	int			bytes;
+	int			forward;
+	int			error;
+	int			retval;
+	int			i;
+
+	trace_xfs_attr_leaf_toosmall(state->args);
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	leaf = blk->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	bytes = xfs_attr3_leaf_hdr_size(leaf) +
+		ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+		ichdr.usedbytes;
+	if (bytes > (state->args->geo->blksize >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return 0;
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (ichdr.count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (ichdr.forw != 0);
+		memcpy(&state->altpath, &state->path, sizeof(state->path));
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return error;
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return 0;
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink an attribute list over time.
+	 */
+	/* start with smaller blk num */
+	forward = ichdr.forw < ichdr.back;
+	for (i = 0; i < 2; forward = !forward, i++) {
+		struct xfs_attr3_icleaf_hdr ichdr2;
+		if (forward)
+			blkno = ichdr.forw;
+		else
+			blkno = ichdr.back;
+		if (blkno == 0)
+			continue;
+		error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
+					blkno, -1, &bp);
+		if (error)
+			return error;
+
+		xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+
+		bytes = state->args->geo->blksize -
+			(state->args->geo->blksize >> 2) -
+			ichdr.usedbytes - ichdr2.usedbytes -
+			((ichdr.count + ichdr2.count) *
+					sizeof(xfs_attr_leaf_entry_t)) -
+			xfs_attr3_leaf_hdr_size(leaf);
+
+		xfs_trans_brelse(state->args->trans, bp);
+		if (bytes >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	memcpy(&state->altpath, &state->path, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da3_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return error;
+	if (retval) {
+		*action = 0;
+	} else {
+		*action = 1;
+	}
+	return 0;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_attr3_leaf_remove(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	int			before;
+	int			after;
+	int			smallest;
+	int			entsize;
+	int			tablesize;
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_remove(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+	ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+	ASSERT(args->index >= 0 && args->index < ichdr.count);
+	ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+					xfs_attr3_leaf_hdr_size(leaf));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+	ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+	ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+	/*
+	 * Scan through free region table:
+	 *    check for adjacency of free'd entry with an existing one,
+	 *    find smallest free region in case we need to replace it,
+	 *    adjust any map that borders the entry table,
+	 */
+	tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	tmp = ichdr.freemap[0].size;
+	before = after = -1;
+	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+	entsize = xfs_attr_leaf_entsize(leaf, args->index);
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+		ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+		if (ichdr.freemap[i].base == tablesize) {
+			ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+			ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
+		}
+
+		if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+				be16_to_cpu(entry->nameidx)) {
+			before = i;
+		} else if (ichdr.freemap[i].base ==
+				(be16_to_cpu(entry->nameidx) + entsize)) {
+			after = i;
+		} else if (ichdr.freemap[i].size < tmp) {
+			tmp = ichdr.freemap[i].size;
+			smallest = i;
+		}
+	}
+
+	/*
+	 * Coalesce adjacent freemap regions,
+	 * or replace the smallest region.
+	 */
+	if ((before >= 0) || (after >= 0)) {
+		if ((before >= 0) && (after >= 0)) {
+			ichdr.freemap[before].size += entsize;
+			ichdr.freemap[before].size += ichdr.freemap[after].size;
+			ichdr.freemap[after].base = 0;
+			ichdr.freemap[after].size = 0;
+		} else if (before >= 0) {
+			ichdr.freemap[before].size += entsize;
+		} else {
+			ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[after].size += entsize;
+		}
+	} else {
+		/*
+		 * Replace smallest region (if it is smaller than free'd entry)
+		 */
+		if (ichdr.freemap[smallest].size < entsize) {
+			ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[smallest].size = entsize;
+		}
+	}
+
+	/*
+	 * Did we remove the first entry?
+	 */
+	if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
+		smallest = 1;
+	else
+		smallest = 0;
+
+	/*
+	 * Compress the remaining entries and zero out the removed stuff.
+	 */
+	memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+	ichdr.usedbytes -= entsize;
+	xfs_trans_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+				   entsize));
+
+	tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+	memmove(entry, entry + 1, tmp);
+	ichdr.count--;
+	xfs_trans_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+	memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
+
+	/*
+	 * If we removed the first entry, re-find the first used byte
+	 * in the name area.  Note that if the entry was the "firstused",
+	 * then we don't have a "hole" in our block resulting from
+	 * removing the name.
+	 */
+	if (smallest) {
+		tmp = args->geo->blksize;
+		entry = xfs_attr3_leaf_entryp(leaf);
+		for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+			ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+			ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+			if (be16_to_cpu(entry->nameidx) < tmp)
+				tmp = be16_to_cpu(entry->nameidx);
+		}
+		ichdr.firstused = tmp;
+		if (!ichdr.firstused)
+			ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
+	} else {
+		ichdr.holes = 1;	/* mark as needing compaction */
+	}
+	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+					  xfs_attr3_leaf_hdr_size(leaf)));
+
+	/*
+	 * Check if leaf is less than 50% full, caller may want to
+	 * "join" the leaf with a sibling if so.
+	 */
+	tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+	      ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+	return tmp < args->geo->magicpct; /* leaf is < 37% full */
+}
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr3_leaf_unbalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
+{
+	struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+	struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+	struct xfs_attr3_icleaf_hdr drophdr;
+	struct xfs_attr3_icleaf_hdr savehdr;
+	struct xfs_attr_leaf_entry *entry;
+
+	trace_xfs_attr_leaf_unbalance(state->args);
+
+	drop_leaf = drop_blk->bp->b_addr;
+	save_leaf = save_blk->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+	xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+	entry = xfs_attr3_leaf_entryp(drop_leaf);
+
+	/*
+	 * Save last hashval from dying block for later Btree fixup.
+	 */
+	drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
+
+	/*
+	 * Check if we need a temp buffer, or can we do it in place.
+	 * Note that we don't check "leaf" for holes because we will
+	 * always be dropping it, toosmall() decided that for us already.
+	 */
+	if (savehdr.holes == 0) {
+		/*
+		 * dest leaf has no holes, so we add there.  May need
+		 * to make some room in the entry array.
+		 */
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr, 0,
+						drophdr.count);
+		} else {
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr,
+						savehdr.count, drophdr.count);
+		}
+	} else {
+		/*
+		 * Destination has holes, so we make a temporary copy
+		 * of the leaf and add them both to that.
+		 */
+		struct xfs_attr_leafblock *tmp_leaf;
+		struct xfs_attr3_icleaf_hdr tmphdr;
+
+		tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+
+		/*
+		 * Copy the header into the temp leaf so that all the stuff
+		 * not in the incore header is present and gets copied back in
+		 * once we've moved all the entries.
+		 */
+		memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+		memset(&tmphdr, 0, sizeof(tmphdr));
+		tmphdr.magic = savehdr.magic;
+		tmphdr.forw = savehdr.forw;
+		tmphdr.back = savehdr.back;
+		tmphdr.firstused = state->args->geo->blksize;
+
+		/* write the header to the temp buffer to initialise it */
+		xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						drophdr.count);
+			xfs_attr3_leaf_moveents(state->args,
+						save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						savehdr.count);
+		} else {
+			xfs_attr3_leaf_moveents(state->args,
+						save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						savehdr.count);
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						drophdr.count);
+		}
+		memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+		savehdr = tmphdr; /* struct copy */
+		kmem_free(tmp_leaf);
+	}
+
+	xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
+	xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+					   state->args->geo->blksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	entry = xfs_attr3_leaf_entryp(save_leaf);
+	save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr3_leaf_lookup_int(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	xfs_dahash_t		hashval;
+	int			probe;
+	int			span;
+
+	trace_xfs_attr_leaf_lookup(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+	ASSERT(ichdr.count < args->geo->blksize / 8);
+
+	/*
+	 * Binary search.  (note: small blocks will skip this loop)
+	 */
+	hashval = args->hashval;
+	probe = span = ichdr.count / 2;
+	for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
+		span /= 2;
+		if (be32_to_cpu(entry->hashval) < hashval)
+			probe += span;
+		else if (be32_to_cpu(entry->hashval) > hashval)
+			probe -= span;
+		else
+			break;
+	}
+	ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+	ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
+
+	/*
+	 * Since we may have duplicate hashval's, find the first matching
+	 * hashval in the leaf.
+	 */
+	while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
+		entry--;
+		probe--;
+	}
+	while (probe < ichdr.count &&
+	       be32_to_cpu(entry->hashval) < hashval) {
+		entry++;
+		probe++;
+	}
+	if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
+		args->index = probe;
+		return ENOATTR;
+	}
+
+	/*
+	 * Duplicate keys may be present, so search all of them for a match.
+	 */
+	for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
+			entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+		/*
+		 * If we are looking for INCOMPLETE entries, show only those.
+		 * If we are looking for complete entries, show only those.
+		 */
+		if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+		    (entry->flags & XFS_ATTR_INCOMPLETE)) {
+			continue;
+		}
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = xfs_attr3_leaf_name_local(leaf, probe);
+			if (name_loc->namelen != args->namelen)
+				continue;
+			if (memcmp(args->name, name_loc->nameval,
+							args->namelen) != 0)
+				continue;
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
+				continue;
+			args->index = probe;
+			return EEXIST;
+		} else {
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
+			if (name_rmt->namelen != args->namelen)
+				continue;
+			if (memcmp(args->name, name_rmt->name,
+							args->namelen) != 0)
+				continue;
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
+				continue;
+			args->index = probe;
+			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+			args->rmtblkcnt = xfs_attr3_rmt_blocks(
+							args->dp->i_mount,
+							args->rmtvaluelen);
+			return EEXIST;
+		}
+	}
+	args->index = probe;
+	return ENOATTR;
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ */
+int
+xfs_attr3_leaf_getvalue(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	int			valuelen;
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(ichdr.count < args->geo->blksize / 8);
+	ASSERT(args->index < ichdr.count);
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+		ASSERT(name_loc->namelen == args->namelen);
+		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+		valuelen = be16_to_cpu(name_loc->valuelen);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = valuelen;
+			return 0;
+		}
+		if (args->valuelen < valuelen) {
+			args->valuelen = valuelen;
+			return ERANGE;
+		}
+		args->valuelen = valuelen;
+		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		ASSERT(name_rmt->namelen == args->namelen);
+		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+						       args->rmtvaluelen);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = args->rmtvaluelen;
+			return 0;
+		}
+		if (args->valuelen < args->rmtvaluelen) {
+			args->valuelen = args->rmtvaluelen;
+			return ERANGE;
+		}
+		args->valuelen = args->rmtvaluelen;
+	}
+	return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr3_leaf_moveents(
+	struct xfs_da_args		*args,
+	struct xfs_attr_leafblock	*leaf_s,
+	struct xfs_attr3_icleaf_hdr	*ichdr_s,
+	int				start_s,
+	struct xfs_attr_leafblock	*leaf_d,
+	struct xfs_attr3_icleaf_hdr	*ichdr_d,
+	int				start_d,
+	int				count)
+{
+	struct xfs_attr_leaf_entry	*entry_s;
+	struct xfs_attr_leaf_entry	*entry_d;
+	int				desti;
+	int				tmp;
+	int				i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+	       ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+	ASSERT(ichdr_s->magic == ichdr_d->magic);
+	ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+	ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+					+ xfs_attr3_leaf_hdr_size(leaf_s));
+	ASSERT(ichdr_d->count < args->geo->blksize / 8);
+	ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+					+ xfs_attr3_leaf_hdr_size(leaf_d));
+
+	ASSERT(start_s < ichdr_s->count);
+	ASSERT(start_d <= ichdr_d->count);
+	ASSERT(count <= ichdr_s->count);
+
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 */
+	if (start_d < ichdr_d->count) {
+		tmp  = ichdr_d->count - start_d;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+		memmove(entry_d, entry_s, tmp);
+	}
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate attribute info packed and in sequence.
+	 */
+	entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+	entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+	desti = start_d;
+	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+		ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
+		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+		/*
+		 * Code to drop INCOMPLETE entries.  Difficult to use as we
+		 * may also need to change the insertion index.  Code turned
+		 * off for 6.2, should be revisited later.
+		 */
+		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_s->count -= 1;
+			entry_d--;	/* to compensate for ++ in loop hdr */
+			desti--;
+			if ((start_s + i) < offset)
+				result++;	/* insertion index adjustment */
+		} else {
+#endif /* GROT */
+			ichdr_d->firstused -= tmp;
+			/* both on-disk, don't endian flip twice */
+			entry_d->hashval = entry_s->hashval;
+			entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
+			entry_d->flags = entry_s->flags;
+			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+							<= args->geo->blksize);
+			memmove(xfs_attr3_leaf_name(leaf_d, desti),
+				xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+							<= args->geo->blksize);
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_d->usedbytes += tmp;
+			ichdr_s->count -= 1;
+			ichdr_d->count += 1;
+			tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf_d);
+			ASSERT(ichdr_d->firstused >= tmp);
+#ifdef GROT
+		}
+#endif /* GROT */
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 */
+	if (start_s == ichdr_s->count) {
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + args->geo->blksize));
+		memset(entry_s, 0, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 */
+		tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+		memmove(entry_d, entry_s, tmp);
+
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + args->geo->blksize));
+		memset(entry_s, 0, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information
+	 */
+	ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+	ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+	ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+	ichdr_d->freemap[1].base = 0;
+	ichdr_d->freemap[2].base = 0;
+	ichdr_d->freemap[1].size = 0;
+	ichdr_d->freemap[2].size = 0;
+	ichdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(
+	struct xfs_buf	*bp,
+	int		*count)
+{
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entries;
+
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+	entries = xfs_attr3_leaf_entryp(bp->b_addr);
+	if (count)
+		*count = ichdr.count;
+	if (!ichdr.count)
+		return 0;
+	return be32_to_cpu(entries[ichdr.count - 1].hashval);
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote only calculate bytes in this block).
+ */
+STATIC int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+	struct xfs_attr_leaf_entry *entries;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int size;
+
+	entries = xfs_attr3_leaf_entryp(leaf);
+	if (entries[index].flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, index);
+		size = xfs_attr_leaf_entsize_local(name_loc->namelen,
+						   be16_to_cpu(name_loc->valuelen));
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
+		size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
+	}
+	return size;
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote only calculate bytes in this block).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(
+	struct xfs_da_args	*args,
+	int			*local)
+{
+	int			size;
+
+	size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+	if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+		if (local)
+			*local = 1;
+		return size;
+	}
+	if (local)
+		*local = 0;
+	return xfs_attr_leaf_entsize_remote(args->namelen);
+}
+
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_clearflag(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp;
+	int			error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr;
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen;
+	char *name;
+#endif /* DEBUG */
+
+	trace_xfs_attr_leaf_clearflag(args);
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	leaf = bp->b_addr;
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(args->index < ichdr.count);
+	ASSERT(args->index >= 0);
+
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+		namelen = name_loc->namelen;
+		name = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		namelen = name_rmt->namelen;
+		name = (char *)name_rmt->name;
+	}
+	ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
+	ASSERT(namelen == args->namelen);
+	ASSERT(memcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+	entry->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+	if (args->rmtblkno) {
+		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+		xfs_trans_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr3_leaf_setflag(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp;
+	int error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr;
+#endif
+
+	trace_xfs_attr_leaf_setflag(args);
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
+
+	leaf = bp->b_addr;
+#ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(args->index < ichdr.count);
+	ASSERT(args->index >= 0);
+#endif
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+	entry->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp,
+			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+		name_rmt->valueblk = 0;
+		name_rmt->valuelen = 0;
+		xfs_trans_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr3_leaf_flipflags(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf1;
+	struct xfs_attr_leafblock *leaf2;
+	struct xfs_attr_leaf_entry *entry1;
+	struct xfs_attr_leaf_entry *entry2;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp1;
+	struct xfs_buf		*bp2;
+	int error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen1, namelen2;
+	char *name1, *name2;
+#endif /* DEBUG */
+
+	trace_xfs_attr_leaf_flipflags(args);
+
+	/*
+	 * Read the block containing the "old" attr
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+	if (error)
+		return error;
+
+	/*
+	 * Read the block containing the "new" attr, if it is different
+	 */
+	if (args->blkno2 != args->blkno) {
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+					   -1, &bp2);
+		if (error)
+			return error;
+	} else {
+		bp2 = bp1;
+	}
+
+	leaf1 = bp1->b_addr;
+	entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+
+	leaf2 = bp2->b_addr;
+	entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+
+#ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+	ASSERT(args->index < ichdr1.count);
+	ASSERT(args->index >= 0);
+
+	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+	ASSERT(args->index2 < ichdr2.count);
+	ASSERT(args->index2 >= 0);
+
+	if (entry1->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
+		namelen1 = name_loc->namelen;
+		name1 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+		namelen1 = name_rmt->namelen;
+		name1 = (char *)name_rmt->name;
+	}
+	if (entry2->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
+		namelen2 = name_loc->namelen;
+		name2 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+		namelen2 = name_rmt->namelen;
+		name2 = (char *)name_rmt->name;
+	}
+	ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+	ASSERT(namelen1 == namelen2);
+	ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp1,
+			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+	if (args->rmtblkno) {
+		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+		xfs_trans_log_buf(args->trans, bp1,
+			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+	}
+
+	entry2->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp2,
+			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+		name_rmt->valueblk = 0;
+		name_rmt->valuelen = 0;
+		xfs_trans_log_buf(args->trans, bp2,
+			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_trans_roll(&args->trans, args->dp);
+
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
new file mode 100644
index 000000000000..a8bbc562ff35
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+#define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+	struct xfs_mount *mp,
+	int		attrlen)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+		return (attrlen + buflen - 1) / buflen;
+	}
+	return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (bno != be64_to_cpu(rmt->rm_blkno))
+		return false;
+	if (offset != be32_to_cpu(rmt->rm_offset))
+		return false;
+	if (size != be32_to_cpu(rmt->rm_bytes))
+		return false;
+	if (ino != be64_to_cpu(rmt->rm_owner))
+		return false;
+
+	/* ok */
+	return true;
+}
+
+static bool
+xfs_attr3_rmt_verify(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	int			fsbsize,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+		return false;
+	if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be64_to_cpu(rmt->rm_blkno) != bno)
+		return false;
+	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+		return false;
+	if (be32_to_cpu(rmt->rm_offset) +
+				be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+		return false;
+	if (rmt->rm_owner == 0)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	char		*ptr;
+	int		len;
+	xfs_daddr_t	bno;
+	int		blksize = mp->m_attr_geo->blksize;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	ptr = bp->b_addr;
+	bno = bp->b_bn;
+	len = BBTOB(bp->b_length);
+	ASSERT(len >= blksize);
+
+	while (len > 0) {
+		if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+			xfs_buf_ioerror(bp, EFSBADCRC);
+			break;
+		}
+		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			break;
+		}
+		len -= blksize;
+		ptr += blksize;
+		bno += BTOBB(blksize);
+	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+	else
+		ASSERT(len == 0);
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	char		*ptr;
+	int		len;
+	xfs_daddr_t	bno;
+	int		blksize = mp->m_attr_geo->blksize;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	ptr = bp->b_addr;
+	bno = bp->b_bn;
+	len = BBTOB(bp->b_length);
+	ASSERT(len >= blksize);
+
+	while (len > 0) {
+		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			xfs_verifier_error(bp);
+			return;
+		}
+		if (bip) {
+			struct xfs_attr3_rmt_hdr *rmt;
+
+			rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+			rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+		}
+		xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+
+		len -= blksize;
+		ptr += blksize;
+		bno += BTOBB(blksize);
+	}
+	ASSERT(len == 0);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+	.verify_read = xfs_attr3_rmt_read_verify,
+	.verify_write = xfs_attr3_rmt_write_verify,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+	rmt->rm_offset = cpu_to_be32(offset);
+	rmt->rm_bytes = cpu_to_be32(size);
+	uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+	rmt->rm_owner = cpu_to_be64(ino);
+	rmt->rm_blkno = cpu_to_be64(bno);
+
+	return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the one disk extents
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	__uint8_t	**dst)
+{
+	char		*src = bp->b_addr;
+	xfs_daddr_t	bno = bp->b_bn;
+	int		len = BBTOB(bp->b_length);
+	int		blksize = mp->m_attr_geo->blksize;
+
+	ASSERT(len >= blksize);
+
+	while (len > 0 && *valuelen > 0) {
+		int hdr_size = 0;
+		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+		byte_cnt = min(*valuelen, byte_cnt);
+
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+						  byte_cnt, bno)) {
+				xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+					bno, *offset, byte_cnt, ino);
+				return EFSCORRUPTED;
+			}
+			hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+		}
+
+		memcpy(*dst, src + hdr_size, byte_cnt);
+
+		/* roll buffer forwards */
+		len -= blksize;
+		src += blksize;
+		bno += BTOBB(blksize);
+
+		/* roll attribute data forwards */
+		*valuelen -= byte_cnt;
+		*dst += byte_cnt;
+		*offset += byte_cnt;
+	}
+	return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	__uint8_t	**src)
+{
+	char		*dst = bp->b_addr;
+	xfs_daddr_t	bno = bp->b_bn;
+	int		len = BBTOB(bp->b_length);
+	int		blksize = mp->m_attr_geo->blksize;
+
+	ASSERT(len >= blksize);
+
+	while (len > 0 && *valuelen > 0) {
+		int hdr_size;
+		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+		byte_cnt = min(*valuelen, byte_cnt);
+		hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+						 byte_cnt, bno);
+
+		memcpy(dst + hdr_size, *src, byte_cnt);
+
+		/*
+		 * If this is the last block, zero the remainder of it.
+		 * Check that we are actually the last block, too.
+		 */
+		if (byte_cnt + hdr_size < blksize) {
+			ASSERT(*valuelen - byte_cnt == 0);
+			ASSERT(len == blksize);
+			memset(dst + hdr_size + byte_cnt, 0,
+					blksize - hdr_size - byte_cnt);
+		}
+
+		/* roll buffer forwards */
+		len -= blksize;
+		dst += blksize;
+		bno += BTOBB(blksize);
+
+		/* roll attribute data forwards */
+		*valuelen -= byte_cnt;
+		*src += byte_cnt;
+		*offset += byte_cnt;
+	}
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+int
+xfs_attr_rmtval_get(
+	struct xfs_da_args	*args)
+{
+	struct xfs_bmbt_irec	map[ATTR_RMTVALUE_MAPSIZE];
+	struct xfs_mount	*mp = args->dp->i_mount;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		lblkno = args->rmtblkno;
+	__uint8_t		*dst = args->value;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			blkcnt = args->rmtblkcnt;
+	int			i;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_get(args);
+
+	ASSERT(!(args->flags & ATTR_KERNOVAL));
+	ASSERT(args->rmtvaluelen == args->valuelen);
+
+	valuelen = args->rmtvaluelen;
+	while (valuelen > 0) {
+		nmap = ATTR_RMTVALUE_MAPSIZE;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap >= 1);
+
+		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+			xfs_daddr_t	dblkno;
+			int		dblkcnt;
+
+			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+			       (map[i].br_startblock != HOLESTARTBLOCK));
+			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+						   dblkno, dblkcnt, 0, &bp,
+						   &xfs_attr3_rmt_buf_ops);
+			if (error)
+				return error;
+
+			error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+							&offset, &valuelen,
+							&dst);
+			xfs_buf_relse(bp);
+			if (error)
+				return error;
+
+			/* roll attribute extent map forwards */
+			lblkno += map[i].br_blockcount;
+			blkcnt -= map[i].br_blockcount;
+		}
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_bmbt_irec	map;
+	xfs_dablk_t		lblkno;
+	xfs_fileoff_t		lfileoff = 0;
+	__uint8_t		*src = args->value;
+	int			blkcnt;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_set(args);
+
+	/*
+	 * Find a "hole" in the attribute address space large enough for
+	 * us to drop the new attribute's value into. Because CRC enable
+	 * attributes have headers, we can't just do a straight byte to FSB
+	 * conversion and have to take the header space into account.
+	 */
+	blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+						   XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkcnt = blkcnt;
+
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		int	committed;
+
+		/*
+		 * Allocate a single extent, up to the size of the value.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+				  blkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  args->firstblock, args->total, &map, &nmap,
+				  args->flist);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+
+		/*
+		 * Start the next trans in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Roll through the "value", copying the attribute value to the
+	 * already-allocated blocks.  Blocks are written synchronously
+	 * so that we can know they are all on disk before we turn off
+	 * the INCOMPLETE flag.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	valuelen = args->rmtvaluelen;
+	while (valuelen > 0) {
+		struct xfs_buf	*bp;
+		xfs_daddr_t	dblkno;
+		int		dblkcnt;
+
+		ASSERT(blkcnt > 0);
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+		if (!bp)
+			return ENOMEM;
+		bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+		xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+				       &valuelen, &src);
+
+		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
+
+
+		/* roll attribute extent map forwards */
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+	struct xfs_da_args	*args)
+{
+	struct xfs_mount	*mp = args->dp->i_mount;
+	xfs_dablk_t		lblkno;
+	int			blkcnt;
+	int			error;
+	int			done;
+
+	trace_xfs_attr_rmtval_remove(args);
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's blocks.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	while (blkcnt > 0) {
+		struct xfs_bmbt_irec	map;
+		struct xfs_buf		*bp;
+		xfs_daddr_t		dblkno;
+		int			dblkcnt;
+		int			nmap;
+
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		nmap = 1;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		/*
+		 * If the "remote" value is in the cache, remove it.
+		 */
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+		if (bp) {
+			xfs_buf_stale(bp);
+			xfs_buf_relse(bp);
+			bp = NULL;
+		}
+
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+
+	/*
+	 * Keep de-allocating extents until the remote-value region is gone.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	done = 0;
+	while (!done) {
+		int committed;
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				    1, args->firstblock, args->flist,
+				    &done);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, args->dp, 0);
+
+		/*
+		 * Close out trans and start the next one in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, args->dp);
+		if (error)
+			return error;
+	}
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644
index 000000000000..b44d63189dab
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -0,0 +1,5609 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
+
+
+kmem_zone_t		*xfs_bmap_free_item_zone;
+
+/*
+ * Miscellaneous helper functions
+ */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.  Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	int		whichfork)	/* data or attr fork */
+{
+	int		level;		/* btree level */
+	uint		maxblocks;	/* max blocks at this level */
+	uint		maxleafents;	/* max leaf entries possible */
+	int		maxrootrecs;	/* max records in root block */
+	int		minleafrecs;	/* min records in leaf block */
+	int		minnoderecs;	/* min records in node block */
+	int		sz;		/* root block size */
+
+	/*
+	 * The maximum number of extents in a file, hence the maximum
+	 * number of leaf entries, is controlled by the type of di_nextents
+	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+	 * (a signed 16-bit number, xfs_aextnum_t).
+	 *
+	 * Note that we can no longer assume that if we are in ATTR1 that
+	 * the fork offset of all the inodes will be
+	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+	 * with ATTR2 and then mounted back with ATTR1, keeping the
+	 * di_forkoff's fixed but probably at various positions. Therefore,
+	 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+	 * of a minimum size available.
+	 */
+	if (whichfork == XFS_DATA_FORK) {
+		maxleafents = MAXEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+	} else {
+		maxleafents = MAXAEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	}
+	maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+	minleafrecs = mp->m_bmap_dmnr[0];
+	minnoderecs = mp->m_bmap_dmnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++) {
+		if (maxblocks <= maxrootrecs)
+			maxblocks = 1;
+		else
+			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	}
+	mp->m_bm_maxlevels[whichfork] = level;
+}
+
+STATIC int				/* error */
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int				/* error */
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) >
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) <=
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	xfs_exntst_t		state)
+{
+	union xfs_btree_rec	rec;
+
+	xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_filblks_t	len)		/* delayed extent length */
+{
+	int		level;		/* btree level number */
+	int		maxrecs;	/* maximum record count at this level */
+	xfs_mount_t	*mp;		/* mount structure */
+	xfs_filblks_t	rval;		/* return value */
+
+	mp = ip->i_mount;
+	maxrecs = mp->m_bmap_dmxr[0];
+	for (level = 0, rval = 0;
+	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+	     level++) {
+		len += maxrecs - 1;
+		do_div(len, maxrecs);
+		rval += len;
+		if (len == 1)
+			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				level - 1;
+		if (level == 0)
+			maxrecs = mp->m_bmap_dmxr[1];
+	}
+	return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			offset;
+
+	if (mp->m_sb.sb_inodesize == 256) {
+		offset = XFS_LITINO(mp, ip->i_d.di_version) -
+				XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	} else {
+		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+	}
+
+	ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+	return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	if (whichfork == XFS_ATTR_FORK &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+		if (dfl_forkoff > ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = dfl_forkoff;
+	}
+}
+
+/*
+ * Debug/sanity checking code
+ */
+
+STATIC int
+xfs_bmap_sanity_check(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	int			level)
+{
+	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+
+	if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
+	    block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
+		return 0;
+
+	if (be16_to_cpu(block->bb_level) != level ||
+	    be16_to_cpu(block->bb_numrecs) == 0 ||
+	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return 0;
+
+	return 1;
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+	struct xfs_btree_cur	*cur,
+	xfs_fsblock_t		bno)
+{
+	struct xfs_log_item_desc *lidp;
+	int			i;
+
+	if (!cur)
+		return NULL;
+
+	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+		if (!cur->bc_bufs[i])
+			break;
+		if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+			return cur->bc_bufs[i];
+	}
+
+	/* Chase down all the log items to see if the bp is there */
+	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+		struct xfs_buf_log_item	*bip;
+		bip = (struct xfs_buf_log_item *)lidp->lid_item;
+		if (bip->bli_item.li_type == XFS_LI_BUF &&
+		    XFS_BUF_ADDR(bip->bli_buf) == bno)
+			return bip->bli_buf;
+	}
+
+	return NULL;
+}
+
+STATIC void
+xfs_check_block(
+	struct xfs_btree_block	*block,
+	xfs_mount_t		*mp,
+	int			root,
+	short			sz)
+{
+	int			i, j, dmxr;
+	__be64			*pp, *thispa;	/* pointer to block address */
+	xfs_bmbt_key_t		*prevp, *keyp;
+
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+	prevp = NULL;
+	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+		dmxr = mp->m_bmap_dmxr[0];
+		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+		if (prevp) {
+			ASSERT(be64_to_cpu(prevp->br_startoff) <
+			       be64_to_cpu(keyp->br_startoff));
+		}
+		prevp = keyp;
+
+		/*
+		 * Compare the block numbers to see if there are dups.
+		 */
+		if (root)
+			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+		else
+			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+			if (root)
+				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+			else
+				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+			if (*thispa == *pp) {
+				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+					__func__, j, i,
+					(unsigned long long)be64_to_cpu(*thispa));
+				panic("%s: ptrs are equal in node\n",
+					__func__);
+			}
+		}
+	}
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+	xfs_btree_cur_t		*cur,	/* btree cursor or null */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_extnum_t		i=0, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
+	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
+	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
+	int			bp_release = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+		return;
+	}
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		/* See if buf is in cur first */
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, level),
+			error0);
+		if (level == 0)
+			break;
+
+		/*
+		 * Check this block for basic sanity (increasing keys and
+		 * no duplicate blocks).
+		 */
+
+		xfs_check_block(block, mp, 0, 0);
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+	}
+
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	i = 0;
+
+	/*
+	 * Loop over all leaf nodes checking that all extents are in the right order.
+	 */
+	for (;;) {
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = xfs_btree_get_numrecs(block);
+
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+		/*
+		 * Check all the extents to make sure they are OK.
+		 * If we had a previous block, the last entry should
+		 * conform with the first entry in this one.
+		 */
+
+		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+		if (i) {
+			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+			       xfs_bmbt_disk_get_blockcount(&last) <=
+			       xfs_bmbt_disk_get_startoff(ep));
+		}
+		for (j = 1; j < num_recs; j++) {
+			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+			       xfs_bmbt_disk_get_blockcount(ep) <=
+			       xfs_bmbt_disk_get_startoff(nextp));
+			ep = nextp;
+		}
+
+		last = *ep;
+		i += num_recs;
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	if (bp_release) {
+		bp_release = 0;
+		xfs_trans_brelse(NULL, bp);
+	}
+	return;
+
+error0:
+	xfs_warn(mp, "%s: at error0", __func__);
+	if (bp_release)
+		xfs_trans_brelse(NULL, bp);
+error_norelse:
+	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+		__func__, i);
+	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+	return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */
+void
+xfs_bmap_trace_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	cnt,		/* count of entries in the list */
+	int		whichfork,	/* data or attr fork */
+	unsigned long	caller_ip)
+{
+	xfs_extnum_t	idx;		/* extent record index */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		state = 0;
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	for (idx = 0; idx < cnt; idx++)
+		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap)
+{
+	int			i;		/* index to map values */
+
+	ASSERT(ret_nmap <= nmap);
+
+	for (i = 0; i < ret_nmap; i++) {
+		ASSERT(mval[i].br_blockcount > 0);
+		if (!(flags & XFS_BMAPI_ENTIRE)) {
+			ASSERT(mval[i].br_startoff >= bno);
+			ASSERT(mval[i].br_blockcount <= len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+			       bno + len);
+		} else {
+			ASSERT(mval[i].br_startoff < bno + len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+			       bno);
+		}
+		ASSERT(i == 0 ||
+		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+		       mval[i].br_startoff);
+		ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+		       mval[i].br_startblock != HOLESTARTBLOCK);
+		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+		       mval[i].br_state == XFS_EXT_UNWRITTEN);
+	}
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)		do { } while (0)
+#define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+/*
+ * bmap free list manipulation functions
+ */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	xfs_mount_t		*mp)		/* mount point structure */
+{
+	xfs_bmap_free_item_t	*cur;		/* current (next) element */
+	xfs_bmap_free_item_t	*new;		/* new element */
+	xfs_bmap_free_item_t	*prev;		/* previous element */
+#ifdef DEBUG
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(len > 0);
+	ASSERT(len <= MAXEXTLEN);
+	ASSERT(!isnullstartblock(bno));
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(agbno < mp->m_sb.sb_agblocks);
+	ASSERT(len < mp->m_sb.sb_agblocks);
+	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new->xbfi_startblock = bno;
+	new->xbfi_blockcount = (xfs_extlen_t)len;
+	for (prev = NULL, cur = flist->xbf_first;
+	     cur != NULL;
+	     prev = cur, cur = cur->xbfi_next) {
+		if (cur->xbfi_startblock >= bno)
+			break;
+	}
+	if (prev)
+		prev->xbfi_next = new;
+	else
+		flist->xbf_first = new;
+	new->xbfi_next = cur;
+	flist->xbf_count++;
+}
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist,	/* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free)	/* list item to be freed */
+{
+	if (prev)
+		prev->xbfi_next = free->xbfi_next;
+	else
+		flist->xbf_first = free->xbfi_next;
+	flist->xbf_count--;
+	kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
+{
+	xfs_bmap_free_item_t	*free;	/* free list item */
+	xfs_bmap_free_item_t	*next;
+
+	if (flist->xbf_count == 0)
+		return;
+	ASSERT(flist->xbf_first != NULL);
+	for (free = flist->xbf_first; free; free = next) {
+		next = free->xbfi_next;
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork)  /* data or attr fork */
+{
+	/* REFERENCED */
+	struct xfs_btree_block	*cblock;/* child btree block */
+	xfs_fsblock_t		cbno;	/* child block number */
+	xfs_buf_t		*cbp;	/* child block's buffer */
+	int			error;	/* error return value */
+	xfs_ifork_t		*ifp;	/* inode fork data */
+	xfs_mount_t		*mp;	/* mount point structure */
+	__be64			*pp;	/* ptr to block address */
+	struct xfs_btree_block	*rblock;/* root btree block */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+	rblock = ifp->if_broot;
+	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+	cbno = be64_to_cpu(*pp);
+	*logflagsp = 0;
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+		return error;
+#endif
+	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+				&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	cblock = XFS_BUF_TO_BLOCK(cbp);
+	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+		return error;
+	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+	ip->i_d.di_nblocks--;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, cbp);
+	if (cur->bc_bufs[0] == cbp)
+		cur->bc_bufs[0] = NULL;
+	xfs_iroot_realloc(ip, -1, whichfork);
+	ASSERT(ifp->if_broot == NULL);
+	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+	return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	struct xfs_btree_block	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i, cnt;		/* extent record index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* number of file extents */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+	/*
+	 * Make space in the inode incore.
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+
+	/*
+	 * Fill in the root.
+	 */
+	block = ifp->if_broot;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = mp;
+	args.firstblock = *firstblock;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	if ((error = xfs_alloc_vextent(&args))) {
+		xfs_iroot_realloc(ip, -1, whichfork);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block.
+	 */
+	abp->b_ops = &xfs_bmbt_buf_ops;
+	ablock = XFS_BUF_TO_BLOCK(abp);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS);
+
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (cnt = i = 0; i < nextents; i++) {
+		ep = xfs_iext_get_ext(ifp, i);
+		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+			arp->l0 = cpu_to_be64(ep->l0);
+			arp->l1 = cpu_to_be64(ep->l1);
+			arp++; cnt++;
+		}
+	}
+	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_btree_set_numrecs(ablock, cnt);
+
+	/*
+	 * Fill in the root key and pointer.
+	 */
+	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+						be16_to_cpu(block->bb_level)));
+	*pp = cpu_to_be64(args.fsbno);
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+	ASSERT(*curp == NULL);
+	*curp = cur;
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+	return 0;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+void
+xfs_bmap_local_to_extents_empty(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	ASSERT(ifp->if_bytes == 0);
+	ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+
+	xfs_bmap_forkoff_reset(ip, whichfork);
+	ifp->if_flags &= ~XFS_IFINLINE;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+
+
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork,
+	void		(*init_fn)(struct xfs_trans *tp,
+				   struct xfs_buf *bp,
+				   struct xfs_inode *ip,
+				   struct xfs_ifork *ifp))
+{
+	int		error = 0;
+	int		flags;		/* logging flags returned */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_alloc_arg_t	args;		/* allocation arguments */
+	xfs_buf_t	*bp;		/* buffer for extent block */
+	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
+
+	/*
+	 * We don't want to deal with the case of keeping inode data inline yet.
+	 * So sending the data fork of a regular inode is invalid.
+	 */
+	ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
+	if (!ifp->if_bytes) {
+		xfs_bmap_local_to_extents_empty(ip, whichfork);
+		flags = XFS_ILOG_CORE;
+		goto done;
+	}
+
+	flags = 0;
+	error = 0;
+	ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+								XFS_IFINLINE);
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = ip->i_mount;
+	args.firstblock = *firstblock;
+	/*
+	 * Allocate a block.  We know we need only one, since the
+	 * file currently fits in an inode.
+	 */
+	if (*firstblock == NULLFSBLOCK) {
+		args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else {
+		args.fsbno = *firstblock;
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	}
+	args.total = total;
+	args.minlen = args.maxlen = args.prod = 1;
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		goto done;
+
+	/* Can't fail, the space was reserved. */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(args.len == 1);
+	*firstblock = args.fsbno;
+	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+	/* initialise the block and copy the data */
+	init_fn(tp, bp, ip, ifp);
+
+	/* account for the change in fork size and log everything */
+	xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+	xfs_bmap_local_to_extents_empty(ip, whichfork);
+	flags |= XFS_ILOG_CORE;
+
+	xfs_iext_add(ifp, 0, 1);
+	ep = xfs_iext_get_ext(ifp, 0);
+	xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+	trace_xfs_bmap_post_update(ip, 0,
+			whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+			_THIS_IP_);
+	XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+	ip->i_d.di_nblocks = 1;
+	xfs_trans_mod_dquot_byino(tp, ip,
+		XFS_TRANS_DQ_BCOUNT, 1L);
+	flags |= xfs_ilog_fext(whichfork);
+
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* btree cursor */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* file system mount struct */
+	int			stat;		/* newroot status */
+
+	mp = ip->i_mount;
+	if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
+		*flags |= XFS_ILOG_DBROOT;
+	else {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.firstblock = *firstblock;
+		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+			goto error0;
+		/* must be at least one entry */
+		XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+		if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
+			goto error0;
+		if (stat == 0) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+			return ENOSPC;
+		}
+		*firstblock = cur->bc_private.b.firstblock;
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	}
+	return 0;
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_extents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	int			error;		/* error return value */
+
+	if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+		return 0;
+	cur = NULL;
+	error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+		flags, XFS_DATA_FORK);
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formating, others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_local(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_da_args_t		dargs;		/* args for dir/attr code */
+
+	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+		return 0;
+
+	if (S_ISDIR(ip->i_d.di_mode)) {
+		memset(&dargs, 0, sizeof(dargs));
+		dargs.geo = ip->i_mount->m_dir_geo;
+		dargs.dp = ip;
+		dargs.firstblock = firstblock;
+		dargs.flist = flist;
+		dargs.total = dargs.geo->fsbcount;
+		dargs.whichfork = XFS_DATA_FORK;
+		dargs.trans = tp;
+		return xfs_dir2_sf_to_block(&dargs);
+	}
+
+	if (S_ISLNK(ip->i_d.di_mode))
+		return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+						 flags, XFS_DATA_FORK,
+						 xfs_symlink_local_to_remote);
+
+	/* should only be called for types that support local format data */
+	ASSERT(0);
+	return EFSCORRUPTED;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int						/* error code */
+xfs_bmap_add_attrfork(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			size,		/* space new attribute needs */
+	int			rsvd)		/* xact may use reserved blks */
+{
+	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
+	xfs_bmap_free_t		flist;		/* freed extent records */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int			blks;		/* space reservation */
+	int			version = 1;	/* superblock attr version */
+	int			committed;	/* xaction was committed */
+	int			logflags;	/* logging flags */
+	int			error;		/* error return value */
+	int			cancel_flags = 0;
+
+	ASSERT(XFS_IFORK_Q(ip) == 0);
+
+	mp = ip->i_mount;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
+	if (rsvd)
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+			XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		goto trans_cancel;
+	cancel_flags |= XFS_TRANS_ABORT;
+	if (XFS_IFORK_Q(ip))
+		goto trans_cancel;
+	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+		/*
+		 * For inodes coming from pre-6.2 filesystems.
+		 */
+		ASSERT(ip->i_d.di_aformat == 0);
+		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	}
+	ASSERT(ip->i_d.di_anextents == 0);
+
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+		if (!ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+		else if (mp->m_flags & XFS_MOUNT_ATTR2)
+			version = 2;
+		break;
+	default:
+		ASSERT(0);
+		error = EINVAL;
+		goto trans_cancel;
+	}
+
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_flags = XFS_IFEXTENTS;
+	logflags = 0;
+	xfs_bmap_init(&flist, &firstblock);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+			&flist, &logflags);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	default:
+		error = 0;
+		break;
+	}
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (error)
+		goto bmap_cancel;
+	if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+	   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+		__int64_t sbfields = 0;
+
+		spin_lock(&mp->m_sb_lock);
+		if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+			xfs_sb_version_addattr(&mp->m_sb);
+			sbfields |= XFS_SB_VERSIONNUM;
+		}
+		if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+			xfs_sb_version_addattr2(&mp->m_sb);
+			sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+		}
+		if (sbfields) {
+			spin_unlock(&mp->m_sb_lock);
+			xfs_mod_sb(tp, sbfields);
+		} else
+			spin_unlock(&mp->m_sb_lock);
+	}
+
+	error = xfs_bmap_finish(&tp, &flist, &committed);
+	if (error)
+		goto bmap_cancel;
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+
+bmap_cancel:
+	xfs_bmap_cancel(&flist);
+trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int					/* error */
+xfs_bmap_read_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode */
+	int			whichfork) /* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
+	xfs_extnum_t		i, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	/* REFERENCED */
+	xfs_extnum_t		room;	/* number of entries there's room for */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+					XFS_EXTFMT_INODE(ip);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, level),
+			error0);
+		if (level == 0)
+			break;
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		xfs_trans_brelse(tp, bp);
+	}
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	i = 0;
+	/*
+	 * Loop over all leaf nodes.  Copy information to the extent records.
+	 */
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+		xfs_extnum_t	start;
+
+		num_recs = xfs_btree_get_numrecs(block);
+		if (unlikely(i + num_recs > room)) {
+			ASSERT(i + num_recs <= room);
+			xfs_warn(ip->i_mount,
+				"corrupt dinode %Lu, (btree extents).",
+				(unsigned long long) ip->i_ino);
+			XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+				XFS_ERRLEVEL_LOW, ip->i_mount, block);
+			goto error0;
+		}
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, 0),
+			error0);
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		if (nextbno != NULLFSBLOCK)
+			xfs_btree_reada_bufl(mp, nextbno, 1,
+					     &xfs_bmbt_buf_ops);
+		/*
+		 * Copy records into the extent records.
+		 */
+		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		start = i;
+		for (j = 0; j < num_recs; j++, i++, frp++) {
+			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+			trp->l0 = be64_to_cpu(frp->l0);
+			trp->l1 = be64_to_cpu(frp->l1);
+		}
+		if (exntf == XFS_EXTFMT_NOSTATE) {
+			/*
+			 * Check all attribute bmap btree records and
+			 * any "older" data bmap btree records for a
+			 * set bit in the "extent flag" position.
+			 */
+			if (unlikely(xfs_check_nostate_extents(ifp,
+					start, num_recs))) {
+				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+						 XFS_ERRLEVEL_LOW,
+						 ip->i_mount);
+				goto error0;
+			}
+		}
+		xfs_trans_brelse(tp, bp);
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+	return 0;
+error0:
+	xfs_trans_brelse(tp, bp);
+	return EFSCORRUPTED;
+}
+
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none).  Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t	*prevp)		/* out: previous extent entry found */
+{
+	xfs_bmbt_rec_host_t *ep;		/* extent record pointer */
+	xfs_extnum_t	lastx;		/* last extent index */
+
+	/*
+	 * Initialize the extent entry structure to catch access to
+	 * uninitialized br_startblock field.
+	 */
+	gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+	gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+	gotp->br_state = XFS_EXT_INVALID;
+#if XFS_BIG_BLKNOS
+	gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+#else
+	gotp->br_startblock = 0xffffa5a5;
+#endif
+	prevp->br_startoff = NULLFILEOFF;
+
+	ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+	if (lastx > 0) {
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+	}
+	if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+		xfs_bmbt_get_all(ep, gotp);
+		*eofp = 0;
+	} else {
+		if (lastx > 0) {
+			*gotp = *prevp;
+		}
+		*eofp = 1;
+		ep = NULL;
+	}
+	*lastxp = lastx;
+	return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t     *ip,            /* incore inode pointer */
+	xfs_fileoff_t   bno,            /* block number searched for */
+	int             fork,      	/* data or attr fork */
+	int             *eofp,          /* out: end of file found */
+	xfs_extnum_t    *lastxp,        /* out: last extent index */
+	xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
+	xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
+
+	XFS_STATS_INC(xs_look_exlist);
+	ifp = XFS_IFORK_PTR(ip, fork);
+
+	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+				"Access to block zero in inode %llu "
+				"start_block: %llx start_off: %llx "
+				"blkcnt: %llx extent-state: %x lastx: %x",
+			(unsigned long long)ip->i_ino,
+			(unsigned long long)gotp->br_startblock,
+			(unsigned long long)gotp->br_startoff,
+			(unsigned long long)gotp->br_blockcount,
+			gotp->br_state, *lastxp);
+		*lastxp = NULLEXTNUM;
+		*eofp = 1;
+		return NULL;
+	}
+	return ep;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_extlen_t	len,			/* size of hole to find */
+	xfs_fileoff_t	*first_unused,		/* unused block */
+	int		whichfork)		/* data or attr fork */
+{
+	int		error;			/* error return value */
+	int		idx;			/* extent record index */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;		/* last block number seen */
+	xfs_fileoff_t	lowest;			/* lowest useful block */
+	xfs_fileoff_t	max;			/* starting useful block */
+	xfs_fileoff_t	off;			/* offset for this block */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*first_unused = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	lowest = *first_unused;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+		off = xfs_bmbt_get_startoff(ep);
+		/*
+		 * See if the hole before this extent will work.
+		 */
+		if (off >= lowest + len && off - max >= len) {
+			*first_unused = max;
+			return 0;
+		}
+		lastaddr = off + xfs_bmbt_get_blockcount(ep);
+		max = XFS_FILEOFF_MAX(lastaddr, lowest);
+	}
+	*first_unused = max;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block - 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_fileoff_t	bno;			/* input file offset */
+	int		eof;			/* hit end of file */
+	xfs_bmbt_rec_host_t *ep;		/* pointer to last extent */
+	int		error;			/* error return value */
+	xfs_bmbt_irec_t	got;			/* current extent value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	lastx;			/* last extent used */
+	xfs_bmbt_irec_t	prev;			/* previous extent value */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return EIO;
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	bno = *last_block - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+		if (prev.br_startoff == NULLFILEOFF)
+			*last_block = 0;
+		else
+			*last_block = prev.br_startoff + prev.br_blockcount;
+	}
+	/*
+	 * Otherwise *last_block is already the right answer.
+	 */
+	return 0;
+}
+
+int
+xfs_bmap_last_extent(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*rec,
+	int			*is_empty)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			error;
+	int			nextents;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*is_empty = 1;
+		return 0;
+	}
+
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+	*is_empty = 0;
+	return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	bma->aeof = 0;
+	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+				     &is_empty);
+	if (error)
+		return error;
+
+	if (is_empty) {
+		bma->aeof = 1;
+		return 0;
+	}
+
+	/*
+	 * Check if we are allocation or past the last extent, or at least into
+	 * the last delayed allocated extent.
+	 */
+	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+		(bma->offset >= rec.br_startoff &&
+		 isnullstartblock(rec.br_startblock));
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*last_block,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	*last_block = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+		return 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+	       return EIO;
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+	if (error || is_empty)
+		return error;
+
+	*last_block = rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int					/* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_host_t *ep;	/* ptr to fork's extent */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		rval;		/* return value */
+	xfs_bmbt_irec_t	s;		/* internal version of extent */
+
+#ifndef DEBUG
+	if (whichfork == XFS_DATA_FORK)
+		return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif	/* !DEBUG */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+		return 0;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ep = xfs_iext_get_ext(ifp, 0);
+	xfs_bmbt_get_all(ep, &s);
+	rval = s.br_startoff == 0 && s.br_blockcount == 1;
+	if (rval && whichfork == XFS_DATA_FORK)
+		ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+	return rval;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
+ * Convert a delayed allocation to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_delay_real(
+	struct xfs_bmalloca	*bma)
+{
+	struct xfs_bmbt_irec	*new = &bma->got;
+	int			diff;	/* temp value */
+	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0;	/* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+	xfs_filblks_t		da_new; /* new count del alloc blocks used */
+	xfs_filblks_t		da_old; /* old count del alloc blocks used */
+	xfs_filblks_t		temp=0;	/* value for da_new calculations */
+	xfs_filblks_t		temp2=0;/* value for da_new calculations */
+	int			tmp_rval;	/* partial logging flags */
+
+	ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
+
+#define	LEFT		r[0]
+#define	RIGHT		r[1]
+#define	PREV		r[2]
+
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	ep = xfs_iext_get_ext(ifp, bma->idx);
+	xfs_bmbt_get_all(ep, &PREV);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+	da_old = startblockval(PREV.br_startblock);
+	da_new = 0;
+
+	/*
+	 * Set flags determining what part of the previous delayed allocation
+	 * extent is being replaced by a real allocation.
+	 */
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (bma->idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == new->br_state &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
+
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    new->br_state == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
+	error = 0;
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		bma->idx--;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+		bma->ip->i_d.di_nextents--;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		bma->idx--;
+
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			LEFT.br_blockcount + PREV.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount, LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
+					new->br_startblock,
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, PREV.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is contiguous.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+
+		temp = PREV.br_blockcount - new->br_blockcount;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					new->br_blockcount,
+					LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->idx--;
+		break;
+
+	case BMAP_LEFT_FILLING:
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is not contiguous.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist,
+					&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		break;
+
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
+			new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount,
+			RIGHT.br_state);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+					RIGHT.br_blockcount,
+					RIGHT.br_state);
+			if (error)
+				goto done;
+		}
+
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock));
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->idx++;
+		break;
+
+	case BMAP_RIGHT_FILLING:
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is not contiguous.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur, 1,
+				&tmp_rval, XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+			startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		ep = xfs_iext_get_ext(ifp, bma->idx);
+		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		bma->idx++;
+		break;
+
+	case 0:
+		/*
+		 * Filling in the middle part of a previous delayed allocation.
+		 * Contiguity is impossible here.
+		 * This case is avoided almost all the time.
+		 *
+		 * We start with a delayed allocation:
+		 *
+		 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+		 *  PREV @ idx
+		 *
+	         * and we are allocating:
+		 *                     +rrrrrrrrrrrrrrrrr+
+		 *			      new
+		 *
+		 * and we set it up for insertion as:
+		 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+		 *                            new
+		 *  PREV @ idx          LEFT              RIGHT
+		 *                      inserted at idx + 1
+		 */
+		temp = new->br_startoff - PREV.br_startoff;
+		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
+		LEFT = *new;
+		RIGHT.br_state = PREV.br_state;
+		RIGHT.br_startblock = nullstartblock(
+				(int)xfs_bmap_worst_indlen(bma->ip, temp2));
+		RIGHT.br_startoff = new_endoff;
+		RIGHT.br_blockcount = temp2;
+		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+		bma->ip->i_d.di_nextents++;
+		if (bma->cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+
+		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+					bma->firstblock, bma->flist, &bma->cur,
+					1, &tmp_rval, XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = xfs_bmap_worst_indlen(bma->ip, temp);
+		temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
+		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
+			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
+		if (diff > 0) {
+			error = xfs_icsb_modify_counters(bma->ip->i_mount,
+					XFS_SBS_FDBLOCKS,
+					-((int64_t)diff), 0);
+			ASSERT(!error);
+			if (error)
+				goto done;
+		}
+
+		ep = xfs_iext_get_ext(ifp, bma->idx);
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
+			nullstartblock((int)temp2));
+		trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+
+		bma->idx++;
+		da_new = temp + temp2;
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * These cases are all impossible.
+		 */
+		ASSERT(0);
+	}
+
+	/* convert to a btree if necessary */
+	if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* adjust for changes in reserved delayed indirect blocks */
+	if (da_old || da_new) {
+		temp = da_new;
+		if (bma->cur)
+			temp += bma->cur->bc_private.b.allocated;
+		ASSERT(temp <= da_old);
+		if (temp < da_old)
+			xfs_icsb_modify_counters(bma->ip->i_mount,
+					XFS_SBS_FDBLOCKS,
+					(int64_t)(da_old - temp), 0);
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+done:
+	bma->logflags |= rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+}
+
+/*
+ * Convert an unwritten allocation to a real allocation or vice versa.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_unwritten_real(
+	struct xfs_trans	*tp,
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
+	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
+	int			*logflagsp) /* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;	/* btree cursor */
+	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_exntst_t		newext;	/* new extent state */
+	xfs_exntst_t		oldext;	/* old extent state */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0;	/* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+
+	*logflagsp = 0;
+
+	cur = *curp;
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+	ASSERT(*idx >= 0);
+	ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+
+	XFS_STATS_INC(xs_add_exlist);
+
+#define	LEFT		r[0]
+#define	RIGHT		r[1]
+#define	PREV		r[2]
+
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	error = 0;
+	ep = xfs_iext_get_ext(ifp, *idx);
+	xfs_bmbt_get_all(ep, &PREV);
+	newext = new->br_state;
+	oldext = (newext == XFS_EXT_UNWRITTEN) ?
+		XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+	ASSERT(PREV.br_state == oldext);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+	/*
+	 * Set flags determining what part of the previous oldext allocation
+	 * extent is being replaced by a newext allocation.
+	 */
+	if (PREV.br_startoff == new->br_startoff)
+		state |= BMAP_LEFT_FILLING;
+	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+		state |= BMAP_RIGHT_FILLING;
+
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (*idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+
+		if (isnullstartblock(LEFT.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+	    LEFT.br_state == newext &&
+	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+		if (isnullstartblock(RIGHT.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new_endoff == RIGHT.br_startoff &&
+	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+	    newext == RIGHT.br_state &&
+	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING)) !=
+		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+		       BMAP_RIGHT_FILLING) ||
+	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+			<= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 2, state);
+		ip->i_d.di_nextents -= 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount +
+				RIGHT.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		--*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+			LEFT.br_blockcount + PREV.br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 1, state);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount,
+				LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		xfs_bmbt_set_state(ep, newext);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_delete(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_state(ep, newext);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock, new->br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is contiguous.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		--*idx;
+
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
+				goto done;
+			error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + new->br_blockcount,
+				LEFT.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_FILLING:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is not contiguous.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_insert(ip, *idx, 1, new, state);
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			cur->bc_rec.b = *new;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+		break;
+
+	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount, newext);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock,
+					PREV.br_blockcount, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_btree_increment(cur, 0, &i)))
+				goto done;
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case BMAP_RIGHT_FILLING:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is not contiguous.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 1, new, state);
+
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+		break;
+
+	case 0:
+		/*
+		 * Setting the middle part of a previous oldext extent to
+		 * newext.  Contiguity is impossible here.
+		 * One extent becomes three extents.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep,
+			new->br_startoff - PREV.br_startoff);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		r[0] = *new;
+		r[1].br_startoff = new_endoff;
+		r[1].br_blockcount =
+			PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		r[1].br_startblock = new->br_startblock + new->br_blockcount;
+		r[1].br_state = oldext;
+
+		++*idx;
+		xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
+		ip->i_d.di_nextents += 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			/* new right extent - oldext */
+			if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
+				r[1].br_startblock, r[1].br_blockcount,
+				r[1].br_state)))
+				goto done;
+			/* new left extent - oldext */
+			cur->bc_rec.b = PREV;
+			cur->bc_rec.b.br_blockcount =
+				new->br_startoff - PREV.br_startoff;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			/*
+			 * Reset the cursor to the position of the new extent
+			 * we are about to insert as we can't trust it after
+			 * the previous insert.
+			 */
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			/* new middle extent - newext */
+			cur->bc_rec.b.br_state = new->br_state;
+			if ((error = xfs_btree_insert(cur, &i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+		break;
+
+	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+	case BMAP_LEFT_CONTIG:
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * These cases are all impossible.
+		 */
+		ASSERT(0);
+	}
+
+	/* convert to a btree if necessary */
+	if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+				0, &tmp_logflags, XFS_DATA_FORK);
+		*logflagsp |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		*curp = cur;
+	}
+
+	xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
+done:
+	*logflagsp |= rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+}
+
+/*
+ * Convert a hole to a delayed allocation.
+ */
+STATIC void
+xfs_bmap_add_extent_hole_delay(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		*idx,	/* extent number to update/insert */
+	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
+{
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_filblks_t		newlen=0;	/* new indirect size */
+	xfs_filblks_t		oldlen=0;	/* old indirect size */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			state;  /* state bits, accessed thru macros */
+	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
+
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	state = 0;
+	ASSERT(isnullstartblock(new->br_startblock));
+
+	/*
+	 * Check and set flags if this segment has a left neighbor
+	 */
+	if (*idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	/*
+	 * Check and set flags if the current (right) segment exists.
+	 * If it doesn't exist, we're converting the hole at end-of-file.
+	 */
+	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	/*
+	 * Set contiguity flags on the left and right neighbors.
+	 * Don't let extents get too large, even if the pieces are contiguous.
+	 */
+	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     (left.br_blockcount + new->br_blockcount +
+	      right.br_blockcount <= MAXEXTLEN)))
+		state |= BMAP_RIGHT_CONTIG;
+
+	/*
+	 * Switch out based on the contiguity flags.
+	 */
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with delayed allocations
+		 * on the left and on the right.
+		 * Merge all three into a single extent record.
+		 */
+		--*idx;
+		temp = left.br_blockcount + new->br_blockcount +
+			right.br_blockcount;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+			nullstartblock((int)newlen));
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+		xfs_iext_remove(ip, *idx + 1, 1, state);
+		break;
+
+	case BMAP_LEFT_CONTIG:
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		--*idx;
+		temp = left.br_blockcount + new->br_blockcount;
+
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+			nullstartblock((int)newlen));
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		break;
+
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		temp = new->br_blockcount + right.br_blockcount;
+		oldlen = startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+			new->br_startoff,
+			nullstartblock((int)newlen), temp, right.br_state);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		break;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * delayed allocation.
+		 * Insert a new entry.
+		 */
+		oldlen = newlen = 0;
+		xfs_iext_insert(ip, *idx, 1, new, state);
+		break;
+	}
+	if (oldlen != newlen) {
+		ASSERT(oldlen > newlen);
+		xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+			(int64_t)(oldlen - newlen), 0);
+		/*
+		 * Nothing to do for disk quota accounting here.
+		 */
+	}
+}
+
+/*
+ * Convert a hole to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_real(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	*new = &bma->got;
+	int			error;	/* error return value */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			rval=0;	/* return value (logging flags) */
+	int			state;	/* state bits, accessed thru macros */
+
+	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
+	ASSERT(bma->idx >= 0);
+	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+	ASSERT(!isnullstartblock(new->br_startblock));
+	ASSERT(!bma->cur ||
+	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+	XFS_STATS_INC(xs_add_exlist);
+
+	state = 0;
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 */
+	if (bma->idx > 0) {
+		state |= BMAP_LEFT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+		if (isnullstartblock(left.br_startblock))
+			state |= BMAP_LEFT_DELAY;
+	}
+
+	/*
+	 * Check and set flags if this segment has a current value.
+	 * Not true if we're inserting into the "hole" at eof.
+	 */
+	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+		state |= BMAP_RIGHT_VALID;
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+		if (isnullstartblock(right.br_startblock))
+			state |= BMAP_RIGHT_DELAY;
+	}
+
+	/*
+	 * We're inserting a real allocation between "left" and "right".
+	 * Set the contiguity flags.  Don't let extents get too large.
+	 */
+	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+	    left.br_startoff + left.br_blockcount == new->br_startoff &&
+	    left.br_startblock + left.br_blockcount == new->br_startblock &&
+	    left.br_state == new->br_state &&
+	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+		state |= BMAP_LEFT_CONTIG;
+
+	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+	    new->br_startoff + new->br_blockcount == right.br_startoff &&
+	    new->br_startblock + new->br_blockcount == right.br_startblock &&
+	    new->br_state == right.br_state &&
+	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+	    (!(state & BMAP_LEFT_CONTIG) ||
+	     left.br_blockcount + new->br_blockcount +
+	     right.br_blockcount <= MAXEXTLEN))
+		state |= BMAP_RIGHT_CONTIG;
+
+	error = 0;
+	/*
+	 * Select which case we're in here, and implement it.
+	 */
+	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with real allocations on the
+		 * left and on the right.
+		 * Merge all three into a single extent record.
+		 */
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			left.br_blockcount + new->br_blockcount +
+			right.br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+		if (bma->cur == NULL) {
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+					right.br_startblock, right.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_btree_delete(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_btree_decrement(bma->cur, 0, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount +
+						right.br_blockcount,
+					left.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_LEFT_CONTIG:
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		--bma->idx;
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+			left.br_blockcount + new->br_blockcount);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		if (bma->cur == NULL) {
+			rval = xfs_ilog_fext(whichfork);
+		} else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+					left.br_startblock, left.br_blockcount,
+					&i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount,
+					left.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case BMAP_RIGHT_CONTIG:
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+			new->br_startoff, new->br_startblock,
+			new->br_blockcount + right.br_blockcount,
+			right.br_state);
+		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+		if (bma->cur == NULL) {
+			rval = xfs_ilog_fext(whichfork);
+		} else {
+			rval = 0;
+			error = xfs_bmbt_lookup_eq(bma->cur,
+					right.br_startoff,
+					right.br_startblock,
+					right.br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			error = xfs_bmbt_update(bma->cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+						right.br_blockcount,
+					right.br_state);
+			if (error)
+				goto done;
+		}
+		break;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * real allocation.
+		 * Insert a new entry.
+		 */
+		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+			XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+		if (bma->cur == NULL) {
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			error = xfs_bmbt_lookup_eq(bma->cur,
+					new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			bma->cur->bc_rec.b.br_state = new->br_state;
+			error = xfs_btree_insert(bma->cur, &i);
+			if (error)
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+		break;
+	}
+
+	/* convert to a btree if necessary */
+	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(bma->cur == NULL);
+		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+				bma->firstblock, bma->flist, &bma->cur,
+				0, &tmp_logflags, whichfork);
+		bma->logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+
+	/* clear out the allocated field, done with it now in any case. */
+	if (bma->cur)
+		bma->cur->bc_private.b.allocated = 0;
+
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+done:
+	bma->logflags |= rval;
+	return error;
+}
+
+/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
+ * Adjust the size of the new extent based on di_extsize and rt extsize.
+ */
+int
+xfs_bmap_extsize_align(
+	xfs_mount_t	*mp,
+	xfs_bmbt_irec_t	*gotp,		/* next extent pointer */
+	xfs_bmbt_irec_t	*prevp,		/* previous extent pointer */
+	xfs_extlen_t	extsz,		/* align to this extent size */
+	int		rt,		/* is this a realtime inode? */
+	int		eof,		/* is extent at end-of-file? */
+	int		delay,		/* creating delalloc extent? */
+	int		convert,	/* overwriting unwritten extent? */
+	xfs_fileoff_t	*offp,		/* in/out: aligned offset */
+	xfs_extlen_t	*lenp)		/* in/out: aligned length */
+{
+	xfs_fileoff_t	orig_off;	/* original offset */
+	xfs_extlen_t	orig_alen;	/* original length */
+	xfs_fileoff_t	orig_end;	/* original off+len */
+	xfs_fileoff_t	nexto;		/* next file offset */
+	xfs_fileoff_t	prevo;		/* previous file offset */
+	xfs_fileoff_t	align_off;	/* temp for offset */
+	xfs_extlen_t	align_alen;	/* temp for length */
+	xfs_extlen_t	temp;		/* temp for calculations */
+
+	if (convert)
+		return 0;
+
+	orig_off = align_off = *offp;
+	orig_alen = align_alen = *lenp;
+	orig_end = orig_off + orig_alen;
+
+	/*
+	 * If this request overlaps an existing extent, then don't
+	 * attempt to perform any additional alignment.
+	 */
+	if (!delay && !eof &&
+	    (orig_off >= gotp->br_startoff) &&
+	    (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
+		return 0;
+	}
+
+	/*
+	 * If the file offset is unaligned vs. the extent size
+	 * we need to align it.  This will be possible unless
+	 * the file was previously written with a kernel that didn't
+	 * perform this alignment, or if a truncate shot us in the
+	 * foot.
+	 */
+	temp = do_mod(orig_off, extsz);
+	if (temp) {
+		align_alen += temp;
+		align_off -= temp;
+	}
+	/*
+	 * Same adjustment for the end of the requested area.
+	 */
+	if ((temp = (align_alen % extsz))) {
+		align_alen += extsz - temp;
+	}
+	/*
+	 * If the previous block overlaps with this proposed allocation
+	 * then move the start forward without adjusting the length.
+	 */
+	if (prevp->br_startoff != NULLFILEOFF) {
+		if (prevp->br_startblock == HOLESTARTBLOCK)
+			prevo = prevp->br_startoff;
+		else
+			prevo = prevp->br_startoff + prevp->br_blockcount;
+	} else
+		prevo = 0;
+	if (align_off != orig_off && align_off < prevo)
+		align_off = prevo;
+	/*
+	 * If the next block overlaps with this proposed allocation
+	 * then move the start back without adjusting the length,
+	 * but not before offset 0.
+	 * This may of course make the start overlap previous block,
+	 * and if we hit the offset 0 limit then the next block
+	 * can still overlap too.
+	 */
+	if (!eof && gotp->br_startoff != NULLFILEOFF) {
+		if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
+		    (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
+			nexto = gotp->br_startoff + gotp->br_blockcount;
+		else
+			nexto = gotp->br_startoff;
+	} else
+		nexto = NULLFILEOFF;
+	if (!eof &&
+	    align_off + align_alen != orig_end &&
+	    align_off + align_alen > nexto)
+		align_off = nexto > align_alen ? nexto - align_alen : 0;
+	/*
+	 * If we're now overlapping the next or previous extent that
+	 * means we can't fit an extsz piece in this hole.  Just move
+	 * the start forward to the first valid spot and set
+	 * the length so we hit the end.
+	 */
+	if (align_off != orig_off && align_off < prevo)
+		align_off = prevo;
+	if (align_off + align_alen != orig_end &&
+	    align_off + align_alen > nexto &&
+	    nexto != NULLFILEOFF) {
+		ASSERT(nexto > prevo);
+		align_alen = nexto - align_off;
+	}
+
+	/*
+	 * If realtime, and the result isn't a multiple of the realtime
+	 * extent size we need to remove blocks until it is.
+	 */
+	if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+		/*
+		 * We're not covering the original request, or
+		 * we won't be able to once we fix the length.
+		 */
+		if (orig_off < align_off ||
+		    orig_end > align_off + align_alen ||
+		    align_alen - temp < orig_alen)
+			return EINVAL;
+		/*
+		 * Try to fix it by moving the start up.
+		 */
+		if (align_off + temp <= orig_off) {
+			align_alen -= temp;
+			align_off += temp;
+		}
+		/*
+		 * Try to fix it by moving the end in.
+		 */
+		else if (align_off + align_alen - temp >= orig_end)
+			align_alen -= temp;
+		/*
+		 * Set the start to the minimum then trim the length.
+		 */
+		else {
+			align_alen -= orig_off - align_off;
+			align_off = orig_off;
+			align_alen -= align_alen % mp->m_sb.sb_rextsize;
+		}
+		/*
+		 * Result doesn't cover the request, fail it.
+		 */
+		if (orig_off < align_off || orig_end > align_off + align_alen)
+			return EINVAL;
+	} else {
+		ASSERT(orig_off >= align_off);
+		ASSERT(orig_end <= align_off + align_alen);
+	}
+
+#ifdef DEBUG
+	if (!eof && gotp->br_startoff != NULLFILEOFF)
+		ASSERT(align_off + align_alen <= gotp->br_startoff);
+	if (prevp->br_startoff != NULLFILEOFF)
+		ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
+#endif
+
+	*lenp = align_alen;
+	*offp = align_off;
+	return 0;
+}
+
+#define XFS_ALLOC_GAP_UNITS	4
+
+void
+xfs_bmap_adjacent(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	xfs_fsblock_t	adjust;		/* adjustment to block numbers */
+	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		nullfb;		/* true if ap->firstblock isn't set */
+	int		rt;		/* true if inode is realtime */
+
+#define	ISVALID(x,y)	\
+	(rt ? \
+		(x) < mp->m_sb.sb_rblocks : \
+		XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+		XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+		XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+
+	mp = ap->ip->i_mount;
+	nullfb = *ap->firstblock == NULLFSBLOCK;
+	rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+	/*
+	 * If allocating at eof, and there's a previous real block,
+	 * try to use its last block as our starting point.
+	 */
+	if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+	    !isnullstartblock(ap->prev.br_startblock) &&
+	    ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+		    ap->prev.br_startblock)) {
+		ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
+		/*
+		 * Adjust for the gap between prevp and us.
+		 */
+		adjust = ap->offset -
+			(ap->prev.br_startoff + ap->prev.br_blockcount);
+		if (adjust &&
+		    ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+			ap->blkno += adjust;
+	}
+	/*
+	 * If not at eof, then compare the two neighbor blocks.
+	 * Figure out whether either one gives us a good starting point,
+	 * and pick the better one.
+	 */
+	else if (!ap->eof) {
+		xfs_fsblock_t	gotbno;		/* right side block number */
+		xfs_fsblock_t	gotdiff=0;	/* right side difference */
+		xfs_fsblock_t	prevbno;	/* left side block number */
+		xfs_fsblock_t	prevdiff=0;	/* left side difference */
+
+		/*
+		 * If there's a previous (left) block, select a requested
+		 * start block based on it.
+		 */
+		if (ap->prev.br_startoff != NULLFILEOFF &&
+		    !isnullstartblock(ap->prev.br_startblock) &&
+		    (prevbno = ap->prev.br_startblock +
+			       ap->prev.br_blockcount) &&
+		    ISVALID(prevbno, ap->prev.br_startblock)) {
+			/*
+			 * Calculate gap to end of previous block.
+			 */
+			adjust = prevdiff = ap->offset -
+				(ap->prev.br_startoff +
+				 ap->prev.br_blockcount);
+			/*
+			 * Figure the startblock based on the previous block's
+			 * end and the gap size.
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an invalid block
+			 * number, then just use the end of the previous block.
+			 */
+			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+			    ISVALID(prevbno + prevdiff,
+				    ap->prev.br_startblock))
+				prevbno += adjust;
+			else
+				prevdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+				prevbno = NULLFSBLOCK;
+		}
+		/*
+		 * No previous block or can't follow it, just default.
+		 */
+		else
+			prevbno = NULLFSBLOCK;
+		/*
+		 * If there's a following (right) block, select a requested
+		 * start block based on it.
+		 */
+		if (!isnullstartblock(ap->got.br_startblock)) {
+			/*
+			 * Calculate gap to start of next block.
+			 */
+			adjust = gotdiff = ap->got.br_startoff - ap->offset;
+			/*
+			 * Figure the startblock based on the next block's
+			 * start and the gap size.
+			 */
+			gotbno = ap->got.br_startblock;
+			/*
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an invalid block
+			 * number, then just use the start of the next block
+			 * offset by our length.
+			 */
+			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+			    ISVALID(gotbno - gotdiff, gotbno))
+				gotbno -= adjust;
+			else if (ISVALID(gotbno - ap->length, gotbno)) {
+				gotbno -= ap->length;
+				gotdiff += adjust - ap->length;
+			} else
+				gotdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+				gotbno = NULLFSBLOCK;
+		}
+		/*
+		 * No next block, just default.
+		 */
+		else
+			gotbno = NULLFSBLOCK;
+		/*
+		 * If both valid, pick the better one, else the only good
+		 * one, else ap->blkno is already set (to 0 or the inode block).
+		 */
+		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+			ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
+		else if (prevbno != NULLFSBLOCK)
+			ap->blkno = prevbno;
+		else if (gotbno != NULLFSBLOCK)
+			ap->blkno = gotbno;
+	}
+#undef ISVALID
+}
+
+static int
+xfs_bmap_longest_free_extent(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		ag,
+	xfs_extlen_t		*blen,
+	int			*notinit)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_perag	*pag;
+	xfs_extlen_t		longest;
+	int			error = 0;
+
+	pag = xfs_perag_get(mp, ag);
+	if (!pag->pagf_init) {
+		error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+		if (error)
+			goto out;
+
+		if (!pag->pagf_init) {
+			*notinit = 1;
+			goto out;
+		}
+	}
+
+	longest = xfs_alloc_longest_free_extent(mp, pag);
+	if (*blen < longest)
+		*blen = longest;
+
+out:
+	xfs_perag_put(pag);
+	return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+	struct xfs_bmalloca	*ap,
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		*blen,
+	int			notinit)
+{
+	if (notinit || *blen < ap->minlen) {
+		/*
+		 * Since we did a BUF_TRYLOCK above, it is possible that
+		 * there is space for this request.
+		 */
+		args->minlen = ap->minlen;
+	} else if (*blen < args->maxlen) {
+		/*
+		 * If the best seen length is less than the request length,
+		 * use the best as the minimum.
+		 */
+		args->minlen = *blen;
+	} else {
+		/*
+		 * Otherwise we've seen an extent as big as maxlen, use that
+		 * as the minimum.
+		 */
+		args->minlen = args->maxlen;
+	}
+}
+
+STATIC int
+xfs_bmap_btalloc_nullfb(
+	struct xfs_bmalloca	*ap,
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		*blen)
+{
+	struct xfs_mount	*mp = ap->ip->i_mount;
+	xfs_agnumber_t		ag, startag;
+	int			notinit = 0;
+	int			error;
+
+	args->type = XFS_ALLOCTYPE_START_BNO;
+	args->total = ap->total;
+
+	startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+	if (startag == NULLAGNUMBER)
+		startag = ag = 0;
+
+	while (*blen < args->maxlen) {
+		error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+						     &notinit);
+		if (error)
+			return error;
+
+		if (++ag == mp->m_sb.sb_agcount)
+			ag = 0;
+		if (ag == startag)
+			break;
+	}
+
+	xfs_bmap_select_minlen(ap, args, blen, notinit);
+	return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+	struct xfs_bmalloca	*ap,
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		*blen)
+{
+	struct xfs_mount	*mp = ap->ip->i_mount;
+	xfs_agnumber_t		ag;
+	int			notinit = 0;
+	int			error;
+
+	args->type = XFS_ALLOCTYPE_NEAR_BNO;
+	args->total = ap->total;
+
+	ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+	if (ag == NULLAGNUMBER)
+		ag = 0;
+
+	error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+	if (error)
+		return error;
+
+	if (*blen < args->maxlen) {
+		error = xfs_filestream_new_ag(ap, &ag);
+		if (error)
+			return error;
+
+		error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+						     &notinit);
+		if (error)
+			return error;
+
+	}
+
+	xfs_bmap_select_minlen(ap, args, blen, notinit);
+
+	/*
+	 * Set the failure fallback case to look in the selected AG as stream
+	 * may have moved.
+	 */
+	ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+	return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	xfs_mount_t	*mp;		/* mount point structure */
+	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
+	xfs_extlen_t	align;		/* minimum allocation alignment */
+	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
+	xfs_agnumber_t	ag;
+	xfs_alloc_arg_t	args;
+	xfs_extlen_t	blen;
+	xfs_extlen_t	nextminlen = 0;
+	int		nullfb;		/* true if ap->firstblock isn't set */
+	int		isaligned;
+	int		tryagain;
+	int		error;
+	int		stripe_align;
+
+	ASSERT(ap->length);
+
+	mp = ap->ip->i_mount;
+
+	/* stripe alignment for allocation is determined by mount parameters */
+	stripe_align = 0;
+	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+		stripe_align = mp->m_swidth;
+	else if (mp->m_dalign)
+		stripe_align = mp->m_dalign;
+
+	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+	if (unlikely(align)) {
+		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+						align, 0, ap->eof, 0, ap->conv,
+						&ap->offset, &ap->length);
+		ASSERT(!error);
+		ASSERT(ap->length);
+	}
+
+
+	nullfb = *ap->firstblock == NULLFSBLOCK;
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+	if (nullfb) {
+		if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+			ag = xfs_filestream_lookup_ag(ap->ip);
+			ag = (ag != NULLAGNUMBER) ? ag : 0;
+			ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
+		} else {
+			ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+		}
+	} else
+		ap->blkno = *ap->firstblock;
+
+	xfs_bmap_adjacent(ap);
+
+	/*
+	 * If allowed, use ap->blkno; otherwise must use firstblock since
+	 * it's in the right allocation group.
+	 */
+	if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
+		;
+	else
+		ap->blkno = *ap->firstblock;
+	/*
+	 * Normal allocation, done through xfs_alloc_vextent.
+	 */
+	tryagain = isaligned = 0;
+	memset(&args, 0, sizeof(args));
+	args.tp = ap->tp;
+	args.mp = mp;
+	args.fsbno = ap->blkno;
+
+	/* Trim the allocation back to the maximum an AG can fit. */
+	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+	args.firstblock = *ap->firstblock;
+	blen = 0;
+	if (nullfb) {
+		/*
+		 * Search for an allocation group with a single extent large
+		 * enough for the request.  If one isn't found, then adjust
+		 * the minimum allocation size to the largest space found.
+		 */
+		if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+			error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+		else
+			error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+		if (error)
+			return error;
+	} else if (ap->flist->xbf_low) {
+		if (xfs_inode_is_filestream(ap->ip))
+			args.type = XFS_ALLOCTYPE_FIRST_AG;
+		else
+			args.type = XFS_ALLOCTYPE_START_BNO;
+		args.total = args.minlen = ap->minlen;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.total = ap->total;
+		args.minlen = ap->minlen;
+	}
+	/* apply extent size hints if obtained earlier */
+	if (unlikely(align)) {
+		args.prod = align;
+		if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
+			args.mod = (xfs_extlen_t)(args.prod - args.mod);
+	} else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+		args.prod = 1;
+		args.mod = 0;
+	} else {
+		args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+		if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
+			args.mod = (xfs_extlen_t)(args.prod - args.mod);
+	}
+	/*
+	 * If we are not low on available data blocks, and the
+	 * underlying logical volume manager is a stripe, and
+	 * the file offset is zero then try to allocate data
+	 * blocks on stripe unit boundary.
+	 * NOTE: ap->aeof is only set if the allocation length
+	 * is >= the stripe unit and the allocation offset is
+	 * at the end of file.
+	 */
+	if (!ap->flist->xbf_low && ap->aeof) {
+		if (!ap->offset) {
+			args.alignment = stripe_align;
+			atype = args.type;
+			isaligned = 1;
+			/*
+			 * Adjust for alignment
+			 */
+			if (blen > args.alignment && blen <= args.maxlen)
+				args.minlen = blen - args.alignment;
+			args.minalignslop = 0;
+		} else {
+			/*
+			 * First try an exact bno allocation.
+			 * If it fails then do a near or start bno
+			 * allocation with alignment turned on.
+			 */
+			atype = args.type;
+			tryagain = 1;
+			args.type = XFS_ALLOCTYPE_THIS_BNO;
+			args.alignment = 1;
+			/*
+			 * Compute the minlen+alignment for the
+			 * next case.  Set slop so that the value
+			 * of minlen+alignment+slop doesn't go up
+			 * between the calls.
+			 */
+			if (blen > stripe_align && blen <= args.maxlen)
+				nextminlen = blen - stripe_align;
+			else
+				nextminlen = args.minlen;
+			if (nextminlen + stripe_align > args.minlen + 1)
+				args.minalignslop =
+					nextminlen + stripe_align -
+					args.minlen - 1;
+			else
+				args.minalignslop = 0;
+		}
+	} else {
+		args.alignment = 1;
+		args.minalignslop = 0;
+	}
+	args.minleft = ap->minleft;
+	args.wasdel = ap->wasdel;
+	args.isfl = 0;
+	args.userdata = ap->userdata;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+	if (tryagain && args.fsbno == NULLFSBLOCK) {
+		/*
+		 * Exact allocation failed. Now try with alignment
+		 * turned on.
+		 */
+		args.type = atype;
+		args.fsbno = ap->blkno;
+		args.alignment = stripe_align;
+		args.minlen = nextminlen;
+		args.minalignslop = 0;
+		isaligned = 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+	if (isaligned && args.fsbno == NULLFSBLOCK) {
+		/*
+		 * allocation failed, so turn off alignment and
+		 * try again.
+		 */
+		args.type = atype;
+		args.fsbno = ap->blkno;
+		args.alignment = 0;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+	if (args.fsbno == NULLFSBLOCK && nullfb &&
+	    args.minlen > ap->minlen) {
+		args.minlen = ap->minlen;
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = ap->blkno;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+	if (args.fsbno == NULLFSBLOCK && nullfb) {
+		args.fsbno = 0;
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+		args.total = ap->minlen;
+		args.minleft = 0;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+		ap->flist->xbf_low = 1;
+	}
+	if (args.fsbno != NULLFSBLOCK) {
+		/*
+		 * check the allocation happened at the same or higher AG than
+		 * the first block that was allocated.
+		 */
+		ASSERT(*ap->firstblock == NULLFSBLOCK ||
+		       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+		       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+		       (ap->flist->xbf_low &&
+			XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+			XFS_FSB_TO_AGNO(mp, args.fsbno)));
+
+		ap->blkno = args.fsbno;
+		if (*ap->firstblock == NULLFSBLOCK)
+			*ap->firstblock = args.fsbno;
+		ASSERT(nullfb || fb_agno == args.agno ||
+		       (ap->flist->xbf_low && fb_agno < args.agno));
+		ap->length = args.len;
+		ap->ip->i_d.di_nblocks += args.len;
+		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+		if (ap->wasdel)
+			ap->ip->i_delayed_blks -= args.len;
+		/*
+		 * Adjust the disk quota also. This was reserved
+		 * earlier.
+		 */
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+			ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
+					XFS_TRANS_DQ_BCOUNT,
+			(long) args.len);
+	} else {
+		ap->blkno = NULLFSBLOCK;
+		ap->length = 0;
+	}
+	return 0;
+}
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int
+xfs_bmap_alloc(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+		return xfs_bmap_rtalloc(ap);
+	return xfs_bmap_btalloc(ap);
+}
+
+/*
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+	struct xfs_bmbt_irec	*mval,
+	struct xfs_bmbt_irec	*got,
+	xfs_fileoff_t		*bno,
+	xfs_filblks_t		len,
+	xfs_fileoff_t		obno,
+	xfs_fileoff_t		end,
+	int			n,
+	int			flags)
+{
+	if ((flags & XFS_BMAPI_ENTIRE) ||
+	    got->br_startoff + got->br_blockcount <= obno) {
+		*mval = *got;
+		if (isnullstartblock(got->br_startblock))
+			mval->br_startblock = DELAYSTARTBLOCK;
+		return;
+	}
+
+	if (obno > *bno)
+		*bno = obno;
+	ASSERT((*bno >= obno) || (n == 0));
+	ASSERT(*bno < end);
+	mval->br_startoff = *bno;
+	if (isnullstartblock(got->br_startblock))
+		mval->br_startblock = DELAYSTARTBLOCK;
+	else
+		mval->br_startblock = got->br_startblock +
+					(*bno - got->br_startoff);
+	/*
+	 * Return the minimum of what we got and what we asked for for
+	 * the length.  We can use the len variable here because it is
+	 * modified below and we could have been there before coming
+	 * here if the first part of the allocation didn't overlap what
+	 * was asked for.
+	 */
+	mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+			got->br_blockcount - (*bno - got->br_startoff));
+	mval->br_state = got->br_state;
+	ASSERT(mval->br_blockcount <= len);
+	return;
+}
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+	struct xfs_bmbt_irec	**map,
+	xfs_fileoff_t		*bno,
+	xfs_filblks_t		*len,
+	xfs_fileoff_t		obno,
+	xfs_fileoff_t		end,
+	int			*n,
+	int			flags)
+{
+	xfs_bmbt_irec_t	*mval = *map;
+
+	ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+	       ((mval->br_startoff + mval->br_blockcount) <= end));
+	ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+	       (mval->br_startoff < obno));
+
+	*bno = mval->br_startoff + mval->br_blockcount;
+	*len = end - *bno;
+	if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+		/* update previous map with new information */
+		ASSERT(mval->br_startblock == mval[-1].br_startblock);
+		ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+		ASSERT(mval->br_state == mval[-1].br_state);
+		mval[-1].br_blockcount = mval->br_blockcount;
+		mval[-1].br_state = mval->br_state;
+	} else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock != DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock != HOLESTARTBLOCK &&
+		   mval->br_startblock == mval[-1].br_startblock +
+					  mval[-1].br_blockcount &&
+		   ((flags & XFS_BMAPI_IGSTATE) ||
+			mval[-1].br_state == mval->br_state)) {
+		ASSERT(mval->br_startoff ==
+		       mval[-1].br_startoff + mval[-1].br_blockcount);
+		mval[-1].br_blockcount += mval->br_blockcount;
+	} else if (*n > 0 &&
+		   mval->br_startblock == DELAYSTARTBLOCK &&
+		   mval[-1].br_startblock == DELAYSTARTBLOCK &&
+		   mval->br_startoff ==
+		   mval[-1].br_startoff + mval[-1].br_blockcount) {
+		mval[-1].br_blockcount += mval->br_blockcount;
+		mval[-1].br_state = mval->br_state;
+	} else if (!((*n == 0) &&
+		     ((mval->br_startoff + mval->br_blockcount) <=
+		      obno))) {
+		mval++;
+		(*n)++;
+	}
+	*map = mval;
+}
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
+int
+xfs_bmapi_read(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	struct xfs_bmbt_irec	*mval,
+	int			*nmap,
+	int			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp;
+	struct xfs_bmbt_irec	got;
+	struct xfs_bmbt_irec	prev;
+	xfs_fileoff_t		obno;
+	xfs_fileoff_t		end;
+	xfs_extnum_t		lastx;
+	int			error;
+	int			eof;
+	int			n = 0;
+	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
+			   XFS_BMAPI_IGSTATE)));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return EIO;
+
+	XFS_STATS_INC(xs_blk_mapr);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+	end = bno + len;
+	obno = bno;
+
+	while (bno < end && n < *nmap) {
+		/* Reading past eof, act as though there's a hole up to end. */
+		if (eof)
+			got.br_startoff = end;
+		if (got.br_startoff > bno) {
+			/* Reading in a hole.  */
+			mval->br_startoff = bno;
+			mval->br_startblock = HOLESTARTBLOCK;
+			mval->br_blockcount =
+				XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+			mval->br_state = XFS_EXT_NORM;
+			bno += mval->br_blockcount;
+			len -= mval->br_blockcount;
+			mval++;
+			n++;
+			continue;
+		}
+
+		/* set up the extent map to return. */
+		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/* If we're done, stop now. */
+		if (bno >= end || n >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+		else
+			eof = 1;
+	}
+	*nmap = n;
+	return 0;
+}
+
+STATIC int
+xfs_bmapi_reserve_delalloc(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		aoff,
+	xfs_filblks_t		len,
+	struct xfs_bmbt_irec	*got,
+	struct xfs_bmbt_irec	*prev,
+	xfs_extnum_t		*lastx,
+	int			eof)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	xfs_extlen_t		alen;
+	xfs_extlen_t		indlen;
+	char			rt = XFS_IS_REALTIME_INODE(ip);
+	xfs_extlen_t		extsz;
+	int			error;
+
+	alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+	if (!eof)
+		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+
+	/* Figure out the extent size, adjust alen */
+	extsz = xfs_get_extsz_hint(ip);
+	if (extsz) {
+		/*
+		 * Make sure we don't exceed a single extent length when we
+		 * align the extent by reducing length we are going to
+		 * allocate by the maximum amount extent size aligment may
+		 * require.
+		 */
+		alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
+		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+					       1, 0, &aoff, &alen);
+		ASSERT(!error);
+	}
+
+	if (rt)
+		extsz = alen / mp->m_sb.sb_rextsize;
+
+	/*
+	 * Make a transaction-less quota reservation for delayed allocation
+	 * blocks.  This number gets adjusted later.  We return if we haven't
+	 * allocated blocks already inside this loop.
+	 */
+	error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
+			rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		return error;
+
+	/*
+	 * Split changing sb for alen and indlen since they could be coming
+	 * from different places.
+	 */
+	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+	ASSERT(indlen > 0);
+
+	if (rt) {
+		error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+					  -((int64_t)extsz), 0);
+	} else {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+						 -((int64_t)alen), 0);
+	}
+
+	if (error)
+		goto out_unreserve_quota;
+
+	error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+					 -((int64_t)indlen), 0);
+	if (error)
+		goto out_unreserve_blocks;
+
+
+	ip->i_delayed_blks += alen;
+
+	got->br_startoff = aoff;
+	got->br_startblock = nullstartblock(indlen);
+	got->br_blockcount = alen;
+	got->br_state = XFS_EXT_NORM;
+	xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+
+	/*
+	 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+	 * might have merged it into one of the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+
+	ASSERT(got->br_startoff <= aoff);
+	ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+	ASSERT(isnullstartblock(got->br_startblock));
+	ASSERT(got->br_state == XFS_EXT_NORM);
+	return 0;
+
+out_unreserve_blocks:
+	if (rt)
+		xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+	else
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+out_unreserve_quota:
+	if (XFS_IS_QUOTA_ON(mp))
+		xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
+				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+	return error;
+}
+
+/*
+ * Map file blocks to filesystem blocks, adding delayed allocations as needed.
+ */
+int
+xfs_bmapi_delay(
+	struct xfs_inode	*ip,	/* incore inode */
+	xfs_fileoff_t		bno,	/* starting file offs. mapped */
+	xfs_filblks_t		len,	/* length to map in file */
+	struct xfs_bmbt_irec	*mval,	/* output: map values */
+	int			*nmap,	/* i/o: mval size/count */
+	int			flags)	/* XFS_BMAPI_... */
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_bmbt_irec	got;	/* current file extent record */
+	struct xfs_bmbt_irec	prev;	/* previous file extent record */
+	xfs_fileoff_t		obno;	/* old block number (offset) */
+	xfs_fileoff_t		end;	/* end of mapped file region */
+	xfs_extnum_t		lastx;	/* last useful extent number */
+	int			eof;	/* we've hit the end of extents */
+	int			n = 0;	/* current extent index */
+	int			error = 0;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return EIO;
+
+	XFS_STATS_INC(xs_blk_mapw);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			return error;
+	}
+
+	xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
+	end = bno + len;
+	obno = bno;
+
+	while (bno < end && n < *nmap) {
+		if (eof || got.br_startoff > bno) {
+			error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
+							   &prev, &lastx, eof);
+			if (error) {
+				if (n == 0) {
+					*nmap = 0;
+					return error;
+				}
+				break;
+			}
+		}
+
+		/* set up the extent map to return. */
+		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/* If we're done, stop now. */
+		if (bno >= end || n >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		prev = got;
+		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+		else
+			eof = 1;
+	}
+
+	*nmap = n;
+	return 0;
+}
+
+
+int
+__xfs_bmapi_allocate(
+	struct xfs_bmalloca	*bma)
+{
+	struct xfs_mount	*mp = bma->ip->i_mount;
+	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	int			tmp_logflags = 0;
+	int			error;
+
+	ASSERT(bma->length > 0);
+
+	/*
+	 * For the wasdelay case, we could also just allocate the stuff asked
+	 * for in this bmap call but that wouldn't be as good.
+	 */
+	if (bma->wasdel) {
+		bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+		bma->offset = bma->got.br_startoff;
+		if (bma->idx != NULLEXTNUM && bma->idx) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
+					 &bma->prev);
+		}
+	} else {
+		bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+		if (!bma->eof)
+			bma->length = XFS_FILBLKS_MIN(bma->length,
+					bma->got.br_startoff - bma->offset);
+	}
+
+	/*
+	 * Indicate if this is the first user data in the file, or just any
+	 * user data.
+	 */
+	if (!(bma->flags & XFS_BMAPI_METADATA)) {
+		bma->userdata = (bma->offset == 0) ?
+			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+	}
+
+	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+
+	/*
+	 * Only want to do the alignment at the eof if it is userdata and
+	 * allocation length is larger than a stripe unit.
+	 */
+	if (mp->m_dalign && bma->length >= mp->m_dalign &&
+	    !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+		error = xfs_bmap_isaeof(bma, whichfork);
+		if (error)
+			return error;
+	}
+
+	error = xfs_bmap_alloc(bma);
+	if (error)
+		return error;
+
+	if (bma->flist->xbf_low)
+		bma->minleft = 0;
+	if (bma->cur)
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+	if (bma->blkno == NULLFSBLOCK)
+		return 0;
+	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+		bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+		bma->cur->bc_private.b.flist = bma->flist;
+	}
+	/*
+	 * Bump the number of extents we've allocated
+	 * in this call.
+	 */
+	bma->nallocs++;
+
+	if (bma->cur)
+		bma->cur->bc_private.b.flags =
+			bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+
+	bma->got.br_startoff = bma->offset;
+	bma->got.br_startblock = bma->blkno;
+	bma->got.br_blockcount = bma->length;
+	bma->got.br_state = XFS_EXT_NORM;
+
+	/*
+	 * A wasdelay extent has been initialized, so shouldn't be flagged
+	 * as unwritten.
+	 */
+	if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+	    xfs_sb_version_hasextflgbit(&mp->m_sb))
+		bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+	if (bma->wasdel)
+		error = xfs_bmap_add_extent_delay_real(bma);
+	else
+		error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+
+	bma->logflags |= tmp_logflags;
+	if (error)
+		return error;
+
+	/*
+	 * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+	 * or xfs_bmap_add_extent_hole_real might have merged it into one of
+	 * the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+	ASSERT(bma->got.br_startoff <= bma->offset);
+	ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+	       bma->offset + bma->length);
+	ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+	       bma->got.br_state == XFS_EXT_UNWRITTEN);
+	return 0;
+}
+
+STATIC int
+xfs_bmapi_convert_unwritten(
+	struct xfs_bmalloca	*bma,
+	struct xfs_bmbt_irec	*mval,
+	xfs_filblks_t		len,
+	int			flags)
+{
+	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+						XFS_ATTR_FORK : XFS_DATA_FORK;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	int			tmp_logflags = 0;
+	int			error;
+
+	/* check if we need to do unwritten->real conversion */
+	if (mval->br_state == XFS_EXT_UNWRITTEN &&
+	    (flags & XFS_BMAPI_PREALLOC))
+		return 0;
+
+	/* check if we need to do real->unwritten conversion */
+	if (mval->br_state == XFS_EXT_NORM &&
+	    (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+		return 0;
+
+	/*
+	 * Modify (by adding) the state flag, if writing.
+	 */
+	ASSERT(mval->br_blockcount <= len);
+	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+		bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+					bma->ip, whichfork);
+		bma->cur->bc_private.b.firstblock = *bma->firstblock;
+		bma->cur->bc_private.b.flist = bma->flist;
+	}
+	mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+				? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+			&bma->cur, mval, bma->firstblock, bma->flist,
+			&tmp_logflags);
+	bma->logflags |= tmp_logflags;
+	if (error)
+		return error;
+
+	/*
+	 * Update our extent pointer, given that
+	 * xfs_bmap_add_extent_unwritten_real might have merged it into one
+	 * of the neighbouring ones.
+	 */
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+	/*
+	 * We may have combined previously unwritten space with written space,
+	 * so generate another request.
+	 */
+	if (mval->br_blockcount < len)
+		return EAGAIN;
+	return 0;
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary.  Details behaviour is controlled by the flags
+ * parameter.  Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ *
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int
+xfs_bmapi_write(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting file offs. mapped */
+	xfs_filblks_t		len,		/* length to map in file */
+	int			flags,		/* XFS_BMAPI_... */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_extlen_t		total,		/* total blocks needed */
+	struct xfs_bmbt_irec	*mval,		/* output: map values */
+	int			*nmap,		/* i/o: mval size/count */
+	struct xfs_bmap_free	*flist)		/* i/o: list extents to free */
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp;
+	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
+	xfs_fileoff_t		end;		/* end of mapped file region */
+	int			eof;		/* after the end of extents */
+	int			error;		/* error return */
+	int			n;		/* current extent index */
+	xfs_fileoff_t		obno;		/* old block number (offset) */
+	int			whichfork;	/* data or attr fork */
+	char			inhole;		/* current location is hole in file */
+	char			wasdelay;	/* old extent was delayed */
+
+#ifdef DEBUG
+	xfs_fileoff_t		orig_bno;	/* original block number value */
+	int			orig_flags;	/* original flags arg value */
+	xfs_filblks_t		orig_len;	/* original value of len arg */
+	struct xfs_bmbt_irec	*orig_mval;	/* original value of mval */
+	int			orig_nmap;	/* original value of *nmap */
+
+	orig_bno = bno;
+	orig_len = len;
+	orig_flags = flags;
+	orig_mval = mval;
+	orig_nmap = *nmap;
+#endif
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+
+	ASSERT(*nmap >= 1);
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+	ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+	ASSERT(tp != NULL);
+	ASSERT(len > 0);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return EIO;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	XFS_STATS_INC(xs_blk_mapw);
+
+	if (*firstblock == NULLFSBLOCK) {
+		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+		else
+			bma.minleft = 1;
+	} else {
+		bma.minleft = 0;
+	}
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			goto error0;
+	}
+
+	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
+				&bma.prev);
+	n = 0;
+	end = bno + len;
+	obno = bno;
+
+	bma.tp = tp;
+	bma.ip = ip;
+	bma.total = total;
+	bma.userdata = 0;
+	bma.flist = flist;
+	bma.firstblock = firstblock;
+
+	if (flags & XFS_BMAPI_STACK_SWITCH)
+		bma.stack_switch = 1;
+
+	while (bno < end && n < *nmap) {
+		inhole = eof || bma.got.br_startoff > bno;
+		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+
+		/*
+		 * First, deal with the hole before the allocated space
+		 * that we found, if any.
+		 */
+		if (inhole || wasdelay) {
+			bma.eof = eof;
+			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+			bma.wasdel = wasdelay;
+			bma.offset = bno;
+			bma.flags = flags;
+
+			/*
+			 * There's a 32/64 bit type mismatch between the
+			 * allocation length request (which can be 64 bits in
+			 * length) and the bma length request, which is
+			 * xfs_extlen_t and therefore 32 bits. Hence we have to
+			 * check for 32-bit overflows and handle them here.
+			 */
+			if (len > (xfs_filblks_t)MAXEXTLEN)
+				bma.length = MAXEXTLEN;
+			else
+				bma.length = len;
+
+			ASSERT(len > 0);
+			ASSERT(bma.length > 0);
+			error = xfs_bmapi_allocate(&bma);
+			if (error)
+				goto error0;
+			if (bma.blkno == NULLFSBLOCK)
+				break;
+		}
+
+		/* Deal with the allocated space we found.  */
+		xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+							end, n, flags);
+
+		/* Execute unwritten extent conversion if necessary */
+		error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+		if (error == EAGAIN)
+			continue;
+		if (error)
+			goto error0;
+
+		/* update the extent map to return */
+		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+		/*
+		 * If we're done, stop now.  Stop when we've allocated
+		 * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
+		 * the transaction may get too big.
+		 */
+		if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
+			break;
+
+		/* Else go on to the next record. */
+		bma.prev = bma.got;
+		if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+					 &bma.got);
+		} else
+			eof = 1;
+	}
+	*nmap = n;
+
+	/*
+	 * Transform from btree to extents, give it cur.
+	 */
+	if (xfs_bmap_wants_extents(ip, whichfork)) {
+		int		tmp_logflags = 0;
+
+		ASSERT(bma.cur);
+		error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
+			&tmp_logflags, whichfork);
+		bma.logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_NEXTENTS(ip, whichfork) >
+		XFS_IFORK_MAXEXT(ip, whichfork));
+	error = 0;
+error0:
+	/*
+	 * Log everything.  Do this after conversion, there's no point in
+	 * logging the extent records if we've converted to btree format.
+	 */
+	if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		bma.logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+	/*
+	 * Log whatever the flags say, even if error.  Otherwise we might miss
+	 * detecting a case where the data is changed, there's an error,
+	 * and it's not logged so we don't shutdown when we should.
+	 */
+	if (bma.logflags)
+		xfs_trans_log_inode(tp, ip, bma.logflags);
+
+	if (bma.cur) {
+		if (!error) {
+			ASSERT(*firstblock == NULLFSBLOCK ||
+			       XFS_FSB_TO_AGNO(mp, *firstblock) ==
+			       XFS_FSB_TO_AGNO(mp,
+				       bma.cur->bc_private.b.firstblock) ||
+			       (flist->xbf_low &&
+				XFS_FSB_TO_AGNO(mp, *firstblock) <
+				XFS_FSB_TO_AGNO(mp,
+					bma.cur->bc_private.b.firstblock)));
+			*firstblock = bma.cur->bc_private.b.firstblock;
+		}
+		xfs_btree_del_cursor(bma.cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	if (!error)
+		xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+			orig_nmap, *nmap);
+	return error;
+}
+
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_del_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_trans_t		*tp,	/* current transaction pointer */
+	xfs_extnum_t		*idx,	/* extent number to update/delete */
+	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
+	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
+	xfs_fsblock_t		del_endblock=0;	/* first block past del */
+	xfs_fileoff_t		del_endoff;	/* first offset past del */
+	int			delay;	/* current block is delayed allocated */
+	int			do_fx;	/* free extent at end of routine */
+	xfs_bmbt_rec_host_t	*ep;	/* current extent entry pointer */
+	int			error;	/* error return value */
+	int			flags;	/* inode logging flags */
+	xfs_bmbt_irec_t		got;	/* current extent entry */
+	xfs_fileoff_t		got_endoff;	/* first offset past got */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_filblks_t		nblks;	/* quota/sb block count */
+	xfs_bmbt_irec_t		new;	/* new record to be inserted */
+	/* REFERENCED */
+	uint			qfield;	/* quota field to update */
+	xfs_filblks_t		temp;	/* for indirect length calculations */
+	xfs_filblks_t		temp2;	/* for indirect length calculations */
+	int			state = 0;
+
+	XFS_STATS_INC(xs_del_exlist);
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+		(uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(del->br_blockcount > 0);
+	ep = xfs_iext_get_ext(ifp, *idx);
+	xfs_bmbt_get_all(ep, &got);
+	ASSERT(got.br_startoff <= del->br_startoff);
+	del_endoff = del->br_startoff + del->br_blockcount;
+	got_endoff = got.br_startoff + got.br_blockcount;
+	ASSERT(got_endoff >= del_endoff);
+	delay = isnullstartblock(got.br_startblock);
+	ASSERT(isnullstartblock(del->br_startblock) == delay);
+	flags = 0;
+	qfield = 0;
+	error = 0;
+	/*
+	 * If deleting a real allocation, must free up the disk space.
+	 */
+	if (!delay) {
+		flags = XFS_ILOG_CORE;
+		/*
+		 * Realtime allocation.  Free it and record di_nblocks update.
+		 */
+		if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+			xfs_fsblock_t	bno;
+			xfs_filblks_t	len;
+
+			ASSERT(do_mod(del->br_blockcount,
+				      mp->m_sb.sb_rextsize) == 0);
+			ASSERT(do_mod(del->br_startblock,
+				      mp->m_sb.sb_rextsize) == 0);
+			bno = del->br_startblock;
+			len = del->br_blockcount;
+			do_div(bno, mp->m_sb.sb_rextsize);
+			do_div(len, mp->m_sb.sb_rextsize);
+			error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+			if (error)
+				goto done;
+			do_fx = 0;
+			nblks = len * mp->m_sb.sb_rextsize;
+			qfield = XFS_TRANS_DQ_RTBCOUNT;
+		}
+		/*
+		 * Ordinary allocation.
+		 */
+		else {
+			do_fx = 1;
+			nblks = del->br_blockcount;
+			qfield = XFS_TRANS_DQ_BCOUNT;
+		}
+		/*
+		 * Set up del_endblock and cur for later.
+		 */
+		del_endblock = del->br_startblock + del->br_blockcount;
+		if (cur) {
+			if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+					got.br_startblock, got.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+		da_old = da_new = 0;
+	} else {
+		da_old = startblockval(got.br_startblock);
+		da_new = 0;
+		nblks = 0;
+		do_fx = 0;
+	}
+	/*
+	 * Set flag value to use in switch statement.
+	 * Left-contig is 2, right-contig is 1.
+	 */
+	switch (((got.br_startoff == del->br_startoff) << 1) |
+		(got_endoff == del_endoff)) {
+	case 3:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		xfs_iext_remove(ip, *idx, 1,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+		--*idx;
+		if (delay)
+			break;
+
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		flags |= XFS_ILOG_CORE;
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_btree_delete(cur, &i)))
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		break;
+
+	case 2:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_startoff(ep, del_endoff);
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		xfs_bmbt_set_startblock(ep, del_endblock);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 1:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		temp = got.br_blockcount - del->br_blockcount;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+			flags |= XFS_ILOG_CORE;
+			if (cur) {
+				if ((error = xfs_bmbt_update(cur,
+						got.br_startoff,
+						got.br_startblock, temp,
+						got.br_state)))
+					goto done;
+				if ((error = xfs_btree_increment(cur, 0, &i)))
+					goto done;
+				cur->bc_rec.b = new;
+				error = xfs_btree_insert(cur, &i);
+				if (error && error != ENOSPC)
+					goto done;
+				/*
+				 * If get no-space back from btree insert,
+				 * it tried a split, and we have a zero
+				 * block reservation.
+				 * Fix up our state and return the error.
+				 */
+				if (error == ENOSPC) {
+					/*
+					 * Reset the cursor, don't trust
+					 * it after any insert operation.
+					 */
+					if ((error = xfs_bmbt_lookup_eq(cur,
+							got.br_startoff,
+							got.br_startblock,
+							temp, &i)))
+						goto done;
+					XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+					/*
+					 * Update the btree record back
+					 * to the original value.
+					 */
+					if ((error = xfs_bmbt_update(cur,
+							got.br_startoff,
+							got.br_startblock,
+							got.br_blockcount,
+							got.br_state)))
+						goto done;
+					/*
+					 * Reset the extent record back
+					 * to the original value.
+					 */
+					xfs_bmbt_set_blockcount(ep,
+						got.br_blockcount);
+					flags = 0;
+					error = ENOSPC;
+					goto done;
+				}
+				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			} else
+				flags |= xfs_ilog_fext(whichfork);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		} else {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = nullstartblock((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						nullstartblock((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						nullstartblock((int)temp2);
+				}
+			}
+		}
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+		++*idx;
+		break;
+	}
+	/*
+	 * If we need to, add to list of extents to delete.
+	 */
+	if (do_fx)
+		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+			mp);
+	/*
+	 * Adjust inode # blocks in the file.
+	 */
+	if (nblks)
+		ip->i_d.di_nblocks -= nblks;
+	/*
+	 * Adjust quota data.
+	 */
+	if (qfield)
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new) {
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+			(int64_t)(da_old - da_new), 0);
+	}
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.  If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int						/* error */
+xfs_bunmapi(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting offset to unmap */
+	xfs_filblks_t		len,		/* length to unmap in file */
+	int			flags,		/* misc flags */
+	xfs_extnum_t		nexts,		/* number of extents max */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*done)		/* set if not done yet */
+{
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_irec_t		del;		/* extent being deleted */
+	int			eof;		/* is deleting at eof */
+	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		extno;		/* extent number in list */
+	xfs_bmbt_irec_t		got;		/* current extent record */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	int			isrt;		/* freeing in rt area */
+	xfs_extnum_t		lastx;		/* last extent index used */
+	int			logflags;	/* transaction logging flags */
+	xfs_extlen_t		mod;		/* rt extent offset */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* number of file extents */
+	xfs_bmbt_irec_t		prev;		/* previous extent record */
+	xfs_fileoff_t		start;		/* first file offset deleted */
+	int			tmp_logflags;	/* partial logging flags */
+	int			wasdel;		/* was a delayed alloc extent */
+	int			whichfork;	/* data or attribute fork */
+	xfs_fsblock_t		sum;
+
+	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (unlikely(
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+		XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
+				 ip->i_mount);
+		return EFSCORRUPTED;
+	}
+	mp = ip->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return EIO;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(len > 0);
+	ASSERT(nexts >= 0);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*done = 1;
+		return 0;
+	}
+	XFS_STATS_INC(xs_blk_unmap);
+	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+	start = bno;
+	bno = start + len - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+
+	/*
+	 * Check to see if the given block number is past the end of the
+	 * file, back up to the last block if so...
+	 */
+	if (eof) {
+		ep = xfs_iext_get_ext(ifp, --lastx);
+		xfs_bmbt_get_all(ep, &got);
+		bno = got.br_startoff + got.br_blockcount - 1;
+	}
+	logflags = 0;
+	if (ifp->if_flags & XFS_IFBROOT) {
+		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else
+		cur = NULL;
+
+	if (isrt) {
+		/*
+		 * Synchronize by locking the bitmap inode.
+		 */
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+	}
+
+	extno = 0;
+	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+	       (nexts == 0 || extno < nexts)) {
+		/*
+		 * Is the found extent after a hole in which bno lives?
+		 * Just back up to the previous extent, if so.
+		 */
+		if (got.br_startoff > bno) {
+			if (--lastx < 0)
+				break;
+			ep = xfs_iext_get_ext(ifp, lastx);
+			xfs_bmbt_get_all(ep, &got);
+		}
+		/*
+		 * Is the last block of this extent before the range
+		 * we're supposed to delete?  If so, we're done.
+		 */
+		bno = XFS_FILEOFF_MIN(bno,
+			got.br_startoff + got.br_blockcount - 1);
+		if (bno < start)
+			break;
+		/*
+		 * Then deal with the (possibly delayed) allocated space
+		 * we found.
+		 */
+		ASSERT(ep != NULL);
+		del = got;
+		wasdel = isnullstartblock(del.br_startblock);
+		if (got.br_startoff < start) {
+			del.br_startoff = start;
+			del.br_blockcount -= start - got.br_startoff;
+			if (!wasdel)
+				del.br_startblock += start - got.br_startoff;
+		}
+		if (del.br_startoff + del.br_blockcount > bno + 1)
+			del.br_blockcount = bno + 1 - del.br_startoff;
+		sum = del.br_startblock + del.br_blockcount;
+		if (isrt &&
+		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent not lined up at the end.
+			 * The extent could have been split into written
+			 * and unwritten pieces, or we could just be
+			 * unmapping part of it.  But we can't really
+			 * get rid of part of a realtime extent.
+			 */
+			if (del.br_state == XFS_EXT_UNWRITTEN ||
+			    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+				/*
+				 * This piece is unwritten, or we're not
+				 * using unwritten extents.  Skip over it.
+				 */
+				ASSERT(bno >= mod);
+				bno -= mod > del.br_blockcount ?
+					del.br_blockcount : mod;
+				if (bno < got.br_startoff) {
+					if (--lastx >= 0)
+						xfs_bmbt_get_all(xfs_iext_get_ext(
+							ifp, lastx), &got);
+				}
+				continue;
+			}
+			/*
+			 * It's written, turn it unwritten.
+			 * This is better than zeroing it.
+			 */
+			ASSERT(del.br_state == XFS_EXT_NORM);
+			ASSERT(xfs_trans_get_block_res(tp) > 0);
+			/*
+			 * If this spans a realtime extent boundary,
+			 * chop it back to the start of the one we end at.
+			 */
+			if (del.br_blockcount > mod) {
+				del.br_startoff += del.br_blockcount - mod;
+				del.br_startblock += del.br_blockcount - mod;
+				del.br_blockcount = mod;
+			}
+			del.br_state = XFS_EXT_UNWRITTEN;
+			error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+					&lastx, &cur, &del, firstblock, flist,
+					&logflags);
+			if (error)
+				goto error0;
+			goto nodelete;
+		}
+		if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent is lined up at the end but not
+			 * at the front.  We'll get rid of full extents if
+			 * we can.
+			 */
+			mod = mp->m_sb.sb_rextsize - mod;
+			if (del.br_blockcount > mod) {
+				del.br_blockcount -= mod;
+				del.br_startoff += mod;
+				del.br_startblock += mod;
+			} else if ((del.br_startoff == start &&
+				    (del.br_state == XFS_EXT_UNWRITTEN ||
+				     xfs_trans_get_block_res(tp) == 0)) ||
+				   !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+				/*
+				 * Can't make it unwritten.  There isn't
+				 * a full extent here so just skip it.
+				 */
+				ASSERT(bno >= del.br_blockcount);
+				bno -= del.br_blockcount;
+				if (got.br_startoff > bno) {
+					if (--lastx >= 0) {
+						ep = xfs_iext_get_ext(ifp,
+								      lastx);
+						xfs_bmbt_get_all(ep, &got);
+					}
+				}
+				continue;
+			} else if (del.br_state == XFS_EXT_UNWRITTEN) {
+				/*
+				 * This one is already unwritten.
+				 * It must have a written left neighbor.
+				 * Unwrite the killed part of that one and
+				 * try again.
+				 */
+				ASSERT(lastx > 0);
+				xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						lastx - 1), &prev);
+				ASSERT(prev.br_state == XFS_EXT_NORM);
+				ASSERT(!isnullstartblock(prev.br_startblock));
+				ASSERT(del.br_startblock ==
+				       prev.br_startblock + prev.br_blockcount);
+				if (prev.br_startoff < start) {
+					mod = start - prev.br_startoff;
+					prev.br_blockcount -= mod;
+					prev.br_startblock += mod;
+					prev.br_startoff = start;
+				}
+				prev.br_state = XFS_EXT_UNWRITTEN;
+				lastx--;
+				error = xfs_bmap_add_extent_unwritten_real(tp,
+						ip, &lastx, &cur, &prev,
+						firstblock, flist, &logflags);
+				if (error)
+					goto error0;
+				goto nodelete;
+			} else {
+				ASSERT(del.br_state == XFS_EXT_NORM);
+				del.br_state = XFS_EXT_UNWRITTEN;
+				error = xfs_bmap_add_extent_unwritten_real(tp,
+						ip, &lastx, &cur, &del,
+						firstblock, flist, &logflags);
+				if (error)
+					goto error0;
+				goto nodelete;
+			}
+		}
+		if (wasdel) {
+			ASSERT(startblockval(del.br_startblock) > 0);
+			/* Update realtime/data freespace, unreserve quota */
+			if (isrt) {
+				xfs_filblks_t rtexts;
+
+				rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
+				do_div(rtexts, mp->m_sb.sb_rextsize);
+				xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+						(int64_t)rtexts, 0);
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
+					XFS_QMOPT_RES_RTBLKS);
+			} else {
+				xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+						(int64_t)del.br_blockcount, 0);
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
+					XFS_QMOPT_RES_REGBLKS);
+			}
+			ip->i_delayed_blks -= del.br_blockcount;
+			if (cur)
+				cur->bc_private.b.flags |=
+					XFS_BTCUR_BPRV_WASDEL;
+		} else if (cur)
+			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
+		/*
+		 * If it's the case where the directory code is running
+		 * with no block reservation, and the deleted block is in
+		 * the middle of its extent, and the resulting insert
+		 * of an extent would cause transformation to btree format,
+		 * then reject it.  The calling code will then swap
+		 * blocks around instead.
+		 * We have to do this now, rather than waiting for the
+		 * conversion to btree format, since the transaction
+		 * will be dirty.
+		 */
+		if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
+		    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		    XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+			XFS_IFORK_MAXEXT(ip, whichfork) &&
+		    del.br_startoff > got.br_startoff &&
+		    del.br_startoff + del.br_blockcount <
+		    got.br_startoff + got.br_blockcount) {
+			error = ENOSPC;
+			goto error0;
+		}
+		error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+				&tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+		bno = del.br_startoff - 1;
+nodelete:
+		/*
+		 * If not done go on to the next (previous) record.
+		 */
+		if (bno != (xfs_fileoff_t)-1 && bno >= start) {
+			if (lastx >= 0) {
+				ep = xfs_iext_get_ext(ifp, lastx);
+				if (xfs_bmbt_get_startoff(ep) > bno) {
+					if (--lastx >= 0)
+						ep = xfs_iext_get_ext(ifp,
+								      lastx);
+				}
+				xfs_bmbt_get_all(ep, &got);
+			}
+			extno++;
+		}
+	}
+	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+			&cur, 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from btree to extents, give it cur
+	 */
+	else if (xfs_bmap_wants_extents(ip, whichfork)) {
+		ASSERT(cur != NULL);
+		error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+			whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from extents to local?
+	 */
+	error = 0;
+error0:
+	/*
+	 * Log everything.  Do this after conversion, there's no point in
+	 * logging the extent records if we've converted to btree format.
+	 */
+	if ((logflags & xfs_ilog_fext(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		logflags &= ~xfs_ilog_fbroot(whichfork);
+	/*
+	 * Log inode even in the error case, if the transaction
+	 * is dirty we'll need to shut down the filesystem.
+	 */
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (cur) {
+		if (!error) {
+			*firstblock = cur->bc_private.b.firstblock;
+			cur->bc_private.b.allocated = 0;
+		}
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this will be considered invalid operation and we abort immediately.
+ */
+int
+xfs_bmap_shift_extents(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		offset_shift_fsb,
+	xfs_extnum_t		*current_ext,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	int			num_exts)
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec            got;
+	struct xfs_bmbt_irec		left;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_extnum_t			nexts = 0;
+	xfs_fileoff_t			startoff;
+	int				error = 0;
+	int				i;
+	int				whichfork = XFS_DATA_FORK;
+	int				logflags;
+	xfs_filblks_t			blockcount = 0;
+	int				total_extents;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+				 XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return EIO;
+
+	ASSERT(current_ext != NULL);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If *current_ext is 0, we would need to lookup the extent
+	 * from where we would start shifting and store it in gotp.
+	 */
+	if (!*current_ext) {
+		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+		/*
+		 * gotp can be null in 2 cases: 1) if there are no extents
+		 * or 2) start_fsb lies in a hole beyond which there are
+		 * no extents. Either way, we are done.
+		 */
+		if (!gotp) {
+			*done = 1;
+			return 0;
+		}
+	}
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	/*
+	 * There may be delalloc extents in the data fork before the range we
+	 * are collapsing out, so we cannot
+	 * use the count of real extents here. Instead we have to calculate it
+	 * from the incore fork.
+	 */
+	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	while (nexts++ < num_exts && *current_ext < total_extents) {
+
+		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		xfs_bmbt_get_all(gotp, &got);
+		startoff = got.br_startoff - offset_shift_fsb;
+
+		/*
+		 * Before shifting extent into hole, make sure that the hole
+		 * is large enough to accomodate the shift.
+		 */
+		if (*current_ext) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						*current_ext - 1), &left);
+
+			if (startoff < left.br_startoff + left.br_blockcount)
+				error = EINVAL;
+		} else if (offset_shift_fsb > got.br_startoff) {
+			/*
+			 * When first extent is shifted, offset_shift_fsb
+			 * should be less than the stating offset of
+			 * the first extent.
+			 */
+			error = EINVAL;
+		}
+
+		if (error)
+			goto del_cursor;
+
+		if (cur) {
+			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+						   got.br_startblock,
+						   got.br_blockcount,
+						   &i);
+			if (error)
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+
+		/* Check if we can merge 2 adjacent extents */
+		if (*current_ext &&
+		    left.br_startoff + left.br_blockcount == startoff &&
+		    left.br_startblock + left.br_blockcount ==
+				got.br_startblock &&
+		    left.br_state == got.br_state &&
+		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+			blockcount = left.br_blockcount +
+				got.br_blockcount;
+			xfs_iext_remove(ip, *current_ext, 1, 0);
+			if (cur) {
+				error = xfs_btree_delete(cur, &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+			gotp = xfs_iext_get_ext(ifp, --*current_ext);
+			xfs_bmbt_get_all(gotp, &got);
+
+			/* Make cursor point to the extent we will update */
+			if (cur) {
+				error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+							   got.br_startblock,
+							   got.br_blockcount,
+							   &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+
+			xfs_bmbt_set_blockcount(gotp, blockcount);
+			got.br_blockcount = blockcount;
+		} else {
+			/* We have to update the startoff */
+			xfs_bmbt_set_startoff(gotp, startoff);
+			got.br_startoff = startoff;
+		}
+
+		if (cur) {
+			error = xfs_bmbt_update(cur, got.br_startoff,
+						got.br_startblock,
+						got.br_blockcount,
+						got.br_state);
+			if (error)
+				goto del_cursor;
+		}
+
+		(*current_ext)++;
+		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	}
+
+	/* Check if we are done */
+	if (*current_ext == total_extents)
+		*done = 1;
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	xfs_trans_log_inode(tp, ip, logflags);
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644
index 000000000000..de65bb8bab04
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -0,0 +1,967 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+
+/*
+ * Determine the extent state.
+ */
+/* ARGSUSED */
+STATIC xfs_exntst_t
+xfs_extent_state(
+	xfs_filblks_t		blks,
+	int			extent_flag)
+{
+	if (extent_flag) {
+		ASSERT(blks != 0);	/* saved for DMIG */
+		return XFS_EXT_UNWRITTEN;
+	}
+	return XFS_EXT_NORM;
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+	struct xfs_inode	*ip,
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen,
+	struct xfs_btree_block	*rblock,
+	int			rblocklen)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	__be64			*fpp;
+	xfs_bmbt_key_t		*tkp;
+	__be64			*tpp;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
+	rblock->bb_level = dblock->bb_level;
+	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+	rblock->bb_numrecs = dblock->bb_numrecs;
+	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+	fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+	tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+	dmxr = be16_to_cpu(dblock->bb_numrecs);
+	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ */
+STATIC void
+__xfs_bmbt_get_all(
+		__uint64_t l0,
+		__uint64_t l1,
+		xfs_bmbt_irec_t *s)
+{
+	int	ext_flag;
+	xfs_exntst_t st;
+
+	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
+	s->br_startoff = ((xfs_fileoff_t)l0 &
+			   xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+#if XFS_BIG_BLKNOS
+	s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
+			   (((xfs_fsblock_t)l1) >> 21);
+#else
+#ifdef DEBUG
+	{
+		xfs_dfsbno_t	b;
+
+		b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
+		    (((xfs_dfsbno_t)l1) >> 21);
+		ASSERT((b >> 32) == 0 || isnulldstartblock(b));
+		s->br_startblock = (xfs_fsblock_t)b;
+	}
+#else	/* !DEBUG */
+	s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
+#endif	/* DEBUG */
+#endif	/* XFS_BIG_BLKNOS */
+	s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
+	/* This is xfs_extent_state() in-line */
+	if (ext_flag) {
+		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
+		st = XFS_EXT_UNWRITTEN;
+	} else
+		st = XFS_EXT_NORM;
+	s->br_state = st;
+}
+
+void
+xfs_bmbt_get_all(
+	xfs_bmbt_rec_host_t *r,
+	xfs_bmbt_irec_t *s)
+{
+	__xfs_bmbt_get_all(r->l0, r->l1, s);
+}
+
+/*
+ * Extract the blockcount field from an in memory bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+	xfs_bmbt_rec_host_t	*r)
+{
+	return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startblock field from an in memory bmap extent record.
+ */
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+	xfs_bmbt_rec_host_t	*r)
+{
+#if XFS_BIG_BLKNOS
+	return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
+	       (((xfs_fsblock_t)r->l1) >> 21);
+#else
+#ifdef DEBUG
+	xfs_dfsbno_t	b;
+
+	b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
+	    (((xfs_dfsbno_t)r->l1) >> 21);
+	ASSERT((b >> 32) == 0 || isnulldstartblock(b));
+	return (xfs_fsblock_t)b;
+#else	/* !DEBUG */
+	return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
+#endif	/* DEBUG */
+#endif	/* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Extract the startoff field from an in memory bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+	xfs_bmbt_rec_host_t	*r)
+{
+	return ((xfs_fileoff_t)r->l0 &
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+	xfs_bmbt_rec_host_t	*r)
+{
+	int	ext_flag;
+
+	ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
+	return xfs_extent_state(xfs_bmbt_get_blockcount(r),
+				ext_flag);
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+	xfs_bmbt_rec_t	*r)
+{
+	return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+	xfs_bmbt_rec_t	*r)
+{
+	return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+
+/*
+ * Set all the fields in a bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_set_allf(
+	xfs_bmbt_rec_host_t	*r,
+	xfs_fileoff_t		startoff,
+	xfs_fsblock_t		startblock,
+	xfs_filblks_t		blockcount,
+	xfs_exntst_t		state)
+{
+	int		extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+
+#if XFS_BIG_BLKNOS
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+	r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		((xfs_bmbt_rec_base_t)startoff << 9) |
+		((xfs_bmbt_rec_base_t)startblock >> 43);
+	r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+		((xfs_bmbt_rec_base_t)blockcount &
+		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+#else	/* !XFS_BIG_BLKNOS */
+	if (isnullstartblock(startblock)) {
+		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)startoff << 9) |
+			 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+		r->l1 = xfs_mask64hi(11) |
+			  ((xfs_bmbt_rec_base_t)startblock << 21) |
+			  ((xfs_bmbt_rec_base_t)blockcount &
+			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+	} else {
+		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)startoff << 9);
+		r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+			 ((xfs_bmbt_rec_base_t)blockcount &
+			 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+	}
+#endif	/* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_set_all(
+	xfs_bmbt_rec_host_t *r,
+	xfs_bmbt_irec_t	*s)
+{
+	xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
+			     s->br_blockcount, s->br_state);
+}
+
+
+/*
+ * Set all the fields in a disk format bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_disk_set_allf(
+	xfs_bmbt_rec_t		*r,
+	xfs_fileoff_t		startoff,
+	xfs_fsblock_t		startblock,
+	xfs_filblks_t		blockcount,
+	xfs_exntst_t		state)
+{
+	int			extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+
+#if XFS_BIG_BLKNOS
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+	r->l0 = cpu_to_be64(
+		((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		 ((xfs_bmbt_rec_base_t)startoff << 9) |
+		 ((xfs_bmbt_rec_base_t)startblock >> 43));
+	r->l1 = cpu_to_be64(
+		((xfs_bmbt_rec_base_t)startblock << 21) |
+		 ((xfs_bmbt_rec_base_t)blockcount &
+		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+#else	/* !XFS_BIG_BLKNOS */
+	if (isnullstartblock(startblock)) {
+		r->l0 = cpu_to_be64(
+			((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			 ((xfs_bmbt_rec_base_t)startoff << 9) |
+			  (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+		r->l1 = cpu_to_be64(xfs_mask64hi(11) |
+			  ((xfs_bmbt_rec_base_t)startblock << 21) |
+			  ((xfs_bmbt_rec_base_t)blockcount &
+			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+	} else {
+		r->l0 = cpu_to_be64(
+			((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			 ((xfs_bmbt_rec_base_t)startoff << 9));
+		r->l1 = cpu_to_be64(
+			((xfs_bmbt_rec_base_t)startblock << 21) |
+			 ((xfs_bmbt_rec_base_t)blockcount &
+			  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+	}
+#endif	/* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+STATIC void
+xfs_bmbt_disk_set_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
+				  s->br_blockcount, s->br_state);
+}
+
+/*
+ * Set the blockcount field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_blockcount(
+	xfs_bmbt_rec_host_t *r,
+	xfs_filblks_t	v)
+{
+	ASSERT((v & xfs_mask64hi(43)) == 0);
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
+		  (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
+}
+
+/*
+ * Set the startblock field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startblock(
+	xfs_bmbt_rec_host_t *r,
+	xfs_fsblock_t	v)
+{
+#if XFS_BIG_BLKNOS
+	ASSERT((v & xfs_mask64hi(12)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
+		  (xfs_bmbt_rec_base_t)(v >> 43);
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
+		  (xfs_bmbt_rec_base_t)(v << 21);
+#else	/* !XFS_BIG_BLKNOS */
+	if (isnullstartblock(v)) {
+		r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+		r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
+			  ((xfs_bmbt_rec_base_t)v << 21) |
+			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+	} else {
+		r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+		r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
+			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+	}
+#endif	/* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Set the startoff field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startoff(
+	xfs_bmbt_rec_host_t *r,
+	xfs_fileoff_t	v)
+{
+	ASSERT((v & xfs_mask64hi(9)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
+		((xfs_bmbt_rec_base_t)v << 9) |
+		  (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+}
+
+/*
+ * Set the extent state field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_state(
+	xfs_bmbt_rec_host_t *r,
+	xfs_exntst_t	v)
+{
+	ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
+	if (v == XFS_EXT_NORM)
+		r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
+	else
+		r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*rblock,
+	int			rblocklen,
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen)
+{
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	__be64			*fpp;
+	xfs_bmbt_key_t		*tkp;
+	__be64			*tpp;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+		ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+		ASSERT(rblock->bb_u.l.bb_blkno ==
+		       cpu_to_be64(XFS_BUF_DADDR_NULL));
+	} else
+		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
+	ASSERT(rblock->bb_level != 0);
+	dblock->bb_level = rblock->bb_level;
+	dblock->bb_numrecs = rblock->bb_numrecs;
+	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+	fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+	tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+	fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+	tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+	dmxr = be16_to_cpu(dblock->bb_numrecs);
+	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Check extent records, which have just been read, for
+ * any bit in the extent flag field. ASSERT on debug
+ * kernels, as this condition should not occur.
+ * Return an error condition (1) if any flags found,
+ * otherwise return 0.
+ */
+
+int
+xfs_check_nostate_extents(
+	xfs_ifork_t		*ifp,
+	xfs_extnum_t		idx,
+	xfs_extnum_t		num)
+{
+	for (; num > 0; num--, idx++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+		if ((ep->l0 >>
+		     (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
+			ASSERT(0);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_btree_cur	*new;
+
+	new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+	/*
+	 * Copy the firstblock, flist, and flags values,
+	 * since init cursor doesn't get them.
+	 */
+	new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+	new->bc_private.b.flist = cur->bc_private.b.flist;
+	new->bc_private.b.flags = cur->bc_private.b.flags;
+
+	return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+	struct xfs_btree_cur	*src,
+	struct xfs_btree_cur	*dst)
+{
+	ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+	       (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+	ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+	dst->bc_private.b.allocated += src->bc_private.b.allocated;
+	dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+	src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	xfs_alloc_arg_t		args;		/* block allocation args */
+	int			error;		/* error return value */
+
+	memset(&args, 0, sizeof(args));
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.fsbno = cur->bc_private.b.firstblock;
+	args.firstblock = args.fsbno;
+
+	if (args.fsbno == NULLFSBLOCK) {
+		args.fsbno = be64_to_cpu(start->l);
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		/*
+		 * Make sure there is sufficient room left in the AG to
+		 * complete a full tree split for an extent insert.  If
+		 * we are converting the middle part of an extent then
+		 * we may need space for two tree splits.
+		 *
+		 * We are relying on the caller to make the correct block
+		 * reservation for this operation to succeed.  If the
+		 * reservation amount is insufficient then we may fail a
+		 * block allocation here and corrupt the filesystem.
+		 */
+		args.minleft = xfs_trans_get_block_res(args.tp);
+	} else if (cur->bc_private.b.flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	}
+
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+		error = ENOSPC;
+		goto error0;
+	}
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		goto error0;
+
+	if (args.fsbno == NULLFSBLOCK && args.minleft) {
+		/*
+		 * Could not find an AG with enough free space to satisfy
+		 * a full btree split.  Try again without minleft and if
+		 * successful activate the lowspace algorithm.
+		 */
+		args.fsbno = 0;
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+		args.minleft = 0;
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			goto error0;
+		cur->bc_private.b.flist->xbf_low = 1;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	cur->bc_private.b.ip->i_d.di_nblocks++;
+	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+	xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+			XFS_TRANS_DQ_BCOUNT, 1L);
+
+	new->l = cpu_to_be64(args.fsbno);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+ error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_inode	*ip = cur->bc_private.b.ip;
+	struct xfs_trans	*tp = cur->bc_tp;
+	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+	xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+	ip->i_d.di_nblocks--;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, bp);
+	return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp;
+
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+				    cur->bc_private.b.whichfork);
+
+		return xfs_bmbt_maxrecs(cur->bc_mp,
+					ifp->if_broot_bytes, level == 0) / 2;
+	}
+
+	return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp;
+
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+				    cur->bc_private.b.whichfork);
+
+		return xfs_bmbt_maxrecs(cur->bc_mp,
+					ifp->if_broot_bytes, level == 0);
+	}
+
+	return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it.  After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	if (level != cur->bc_nlevels - 1)
+		return cur->bc_mp->m_bmap_dmxr[level != 0];
+	return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	key->bmbt.br_startoff =
+		cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(key->bmbt.br_startoff != 0);
+
+	xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+			       0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+				      cur->bc_rec.b.br_startoff;
+}
+
+static bool
+xfs_bmbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	unsigned int		level;
+
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
+			return false;
+		/*
+		 * XXX: need a better way of verifying the owner here. Right now
+		 * just make sure there has been one set.
+		 */
+		if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_BMAP_MAGIC):
+		break;
+	default:
+		return false;
+	}
+
+	/*
+	 * numrecs and level verification.
+	 *
+	 * We don't know what fork we belong to, so just verify that the level
+	 * is less than the maximum of the two. Later checks will be more
+	 * precise.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+		return false;
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.l.bb_leftsib ||
+	    (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
+	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
+		return false;
+	if (!block->bb_u.l.bb_rightsib ||
+	    (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
+	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
+		return false;
+
+	return true;
+}
+
+static void
+xfs_bmbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_lblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_bmbt_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
+	}
+}
+
+static void
+xfs_bmbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_bmbt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+	xfs_btree_lblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+	.verify_read = xfs_bmbt_read_verify,
+	.verify_write = xfs_bmbt_write_verify,
+};
+
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_bmbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return be64_to_cpu(k1->bmbt.br_startoff) <
+		be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+		xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+		xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif	/* DEBUG */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+	.rec_len		= sizeof(xfs_bmbt_rec_t),
+	.key_len		= sizeof(xfs_bmbt_key_t),
+
+	.dup_cursor		= xfs_bmbt_dup_cursor,
+	.update_cursor		= xfs_bmbt_update_cursor,
+	.alloc_block		= xfs_bmbt_alloc_block,
+	.free_block		= xfs_bmbt_free_block,
+	.get_maxrecs		= xfs_bmbt_get_maxrecs,
+	.get_minrecs		= xfs_bmbt_get_minrecs,
+	.get_dmaxrecs		= xfs_bmbt_get_dmaxrecs,
+	.init_key_from_rec	= xfs_bmbt_init_key_from_rec,
+	.init_rec_from_key	= xfs_bmbt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_bmbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
+	.key_diff		= xfs_bmbt_key_diff,
+	.buf_ops		= &xfs_bmbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_bmbt_keys_inorder,
+	.recs_inorder		= xfs_bmbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur *				/* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* inode owning the btree */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_btree_cur	*cur;
+
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+	cur->bc_btnum = XFS_BTNUM_BMAP;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+	cur->bc_ops = &xfs_bmbt_ops;
+	cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+	cur->bc_private.b.ip = ip;
+	cur->bc_private.b.firstblock = NULLFSBLOCK;
+	cur->bc_private.b.flist = NULL;
+	cur->bc_private.b.allocated = 0;
+	cur->bc_private.b.flags = 0;
+	cur->bc_private.b.whichfork = whichfork;
+
+	return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_bmbt_rec_t);
+	return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= sizeof(xfs_bmdr_block_t);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_bmdr_rec_t);
+	return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
+
+/*
+ * Change the owner of a btree format fork fo the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_ino_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	ASSERT(tp || buffer_list);
+	ASSERT(!(tp && buffer_list));
+	if (whichfork == XFS_DATA_FORK)
+		ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+	else
+		ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+	if (!cur)
+		return ENOMEM;
+
+	error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
new file mode 100644
index 000000000000..036b4fd34bf7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -0,0 +1,3989 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+
+/*
+ * Cursor allocation zone.
+ */
+kmem_zone_t	*xfs_btree_cur_zone;
+
+/*
+ * Btree magic numbers.
+ */
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+	  XFS_FIBT_MAGIC },
+	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+};
+#define xfs_btree_magic(cur) \
+	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
+
+
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree long form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer for block, if any */
+{
+	int			lblock_ok = 1; /* block passes checks */
+	struct xfs_mount	*mp;	/* file system mount point */
+
+	mp = cur->bc_mp;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		lblock_ok = lblock_ok &&
+			uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+			block->bb_u.l.bb_blkno == cpu_to_be64(
+				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+	}
+
+	lblock_ok = lblock_ok &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be16_to_cpu(block->bb_level) == level &&
+		be16_to_cpu(block->bb_numrecs) <=
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		block->bb_u.l.bb_leftsib &&
+		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+		block->bb_u.l.bb_rightsib &&
+		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
+		if (bp)
+			trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+	return 0;
+}
+
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree short form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer containing block */
+{
+	struct xfs_mount	*mp;	/* file system mount point */
+	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
+	struct xfs_agf		*agf;	/* ag. freespace structure */
+	xfs_agblock_t		agflen;	/* native ag. freespace length */
+	int			sblock_ok = 1; /* block passes checks */
+
+	mp = cur->bc_mp;
+	agbp = cur->bc_private.a.agbp;
+	agf = XFS_BUF_TO_AGF(agbp);
+	agflen = be32_to_cpu(agf->agf_length);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		sblock_ok = sblock_ok &&
+			uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+			block->bb_u.s.bb_blkno == cpu_to_be64(
+				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+	}
+
+	sblock_ok = sblock_ok &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be16_to_cpu(block->bb_level) == level &&
+		be16_to_cpu(block->bb_numrecs) <=
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
+			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
+		if (bp)
+			trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+	return 0;
+}
+
+/*
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer containing block, if any */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_check_lblock(cur, block, level, bp);
+	else
+		return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_dfsbno_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
+{
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		bno != NULLDFSBNO &&
+		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
+{
+	xfs_agblock_t		agblocks = cur->bc_mp->m_sb.sb_agblocks;
+
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		bno != NULLAGBLOCK &&
+		bno != 0 &&
+		bno < agblocks);
+	return 0;
+}
+
+/*
+ * Check that block ptr is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	union xfs_btree_ptr	*ptr,	/* btree block disk address */
+	int			index,	/* offset from ptr to check */
+	int			level)	/* btree block level */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		return xfs_btree_check_lptr(cur,
+				be64_to_cpu((&ptr->l)[index]), level);
+	} else {
+		return xfs_btree_check_sptr(cur,
+				be32_to_cpu((&ptr->s)[index]), level);
+	}
+}
+#endif
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return;
+	if (bip)
+		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+	struct xfs_buf		*bp)
+{
+	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+
+	return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return;
+	if (bip)
+		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+	struct xfs_buf		*bp)
+{
+	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+
+	return true;
+}
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t	*cur,		/* btree cursor */
+	int		error)		/* del because of error */
+{
+	int		i;		/* btree level */
+
+	/*
+	 * Clear the buffer pointers, and release the buffers.
+	 * If we're doing this in the face of an error, we
+	 * need to make sure to inspect all of the entries
+	 * in the bc_bufs array for buffers to be unlocked.
+	 * This is because some of the btree code works from
+	 * level n down to 0, and if we get an error along
+	 * the way we won't have initialized all the entries
+	 * down to 0.
+	 */
+	for (i = 0; i < cur->bc_nlevels; i++) {
+		if (cur->bc_bufs[i])
+			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+		else if (!error)
+			break;
+	}
+	/*
+	 * Can't free a bmap cursor without having dealt with the
+	 * allocated indirect blocks' accounting.
+	 */
+	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+	       cur->bc_private.b.allocated == 0);
+	/*
+	 * Free the cursor.
+	 */
+	kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int					/* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t	*cur,		/* input cursor */
+	xfs_btree_cur_t	**ncur)		/* output cursor */
+{
+	xfs_buf_t	*bp;		/* btree block's buffer pointer */
+	int		error;		/* error return value */
+	int		i;		/* level number of btree block */
+	xfs_mount_t	*mp;		/* mount structure for filesystem */
+	xfs_btree_cur_t	*new;		/* new cursor value */
+	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
+
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+
+	/*
+	 * Allocate a new cursor like the old one.
+	 */
+	new = cur->bc_ops->dup_cursor(cur);
+
+	/*
+	 * Copy the record currently in the cursor.
+	 */
+	new->bc_rec = cur->bc_rec;
+
+	/*
+	 * For each level current, re-get the buffer and copy the ptr value.
+	 */
+	for (i = 0; i < new->bc_nlevels; i++) {
+		new->bc_ptrs[i] = cur->bc_ptrs[i];
+		new->bc_ra[i] = cur->bc_ra[i];
+		bp = cur->bc_bufs[i];
+		if (bp) {
+			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+						   XFS_BUF_ADDR(bp), mp->m_bsize,
+						   0, &bp,
+						   cur->bc_ops->buf_ops);
+			if (error) {
+				xfs_btree_del_cursor(new, error);
+				*ncur = NULL;
+				return error;
+			}
+		}
+		new->bc_bufs[i] = bp;
+	}
+	*ncur = new;
+	return 0;
+}
+
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * The leaf record start with a header then followed by records containing
+ * the values.  A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ *
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ *		+--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers.  The record and key structures are defined by the btree instances
+ * and opaque to the btree core.  The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+			return XFS_BTREE_LBLOCK_CRC_LEN;
+		return XFS_BTREE_LBLOCK_LEN;
+	}
+	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+		return XFS_BTREE_SBLOCK_CRC_LEN;
+	return XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+		sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+	struct xfs_btree_cur	*cur,
+	int			n)
+{
+	return xfs_btree_block_len(cur) +
+		(n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+	struct xfs_btree_cur	*cur,
+	int			n)
+{
+	return xfs_btree_block_len(cur) +
+		(n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	int			level)
+{
+	return xfs_btree_block_len(cur) +
+		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+		(n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	return (union xfs_btree_rec *)
+		((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	return (union xfs_btree_key *)
+		((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	int			level = xfs_btree_get_level(block);
+
+	ASSERT(block->bb_level != 0);
+
+	return (union xfs_btree_ptr *)
+		((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Get the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_ifork        *ifp;
+
+       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+       return (struct xfs_btree_block *)ifp->if_broot;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be an inode btree root or from a buffer.
+ */
+STATIC struct xfs_btree_block *		/* generic btree block pointer */
+xfs_btree_get_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree */
+	struct xfs_buf		**bpp)	/* buffer containing the block */
+{
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1)) {
+		*bpp = NULL;
+		return xfs_btree_get_iroot(cur);
+	}
+
+	*bpp = cur->bc_bufs[level];
+	return XFS_BUF_TO_BLOCK(*bpp);
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+xfs_buf_t *				/* buffer for fsbno */
+xfs_btree_get_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_daddr_t		d;		/* real disk block address */
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+xfs_buf_t *				/* buffer for agno/agbno */
+xfs_btree_get_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_daddr_t		d;		/* real disk block address */
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+}
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int					/* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to check */
+{
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
+	else
+		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+STATIC int				/* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty, there is no such record.
+	 */
+	if (!block->bb_numrecs)
+		return 0;
+	/*
+	 * Set the ptr value to 1, that's the first record/key.
+	 */
+	cur->bc_ptrs[level] = 1;
+	return 1;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level.  Other levels are unaffected.
+ */
+STATIC int				/* success=1, failure=0 */
+xfs_btree_lastrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty, there is no such record.
+	 */
+	if (!block->bb_numrecs)
+		return 0;
+	/*
+	 * Set the ptr value to numrecs, that's the last record/key.
+	 */
+	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+	return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+	__int64_t	fields,		/* bitmask of fields */
+	const short	*offsets,	/* table of field offsets */
+	int		nbits,		/* number of bits to inspect */
+	int		*first,		/* output: first byte offset */
+	int		*last)		/* output: last byte offset */
+{
+	int		i;		/* current bit number */
+	__int64_t	imask;		/* mask for current bit number */
+
+	ASSERT(fields != 0);
+	/*
+	 * Find the lowest bit, so the first byte offset.
+	 */
+	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+		if (imask & fields) {
+			*first = offsets[i];
+			break;
+		}
+	}
+	/*
+	 * Find the highest bit, so the last byte offset.
+	 */
+	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+		if (imask & fields) {
+			*last = offsets[i + 1] - 1;
+			break;
+		}
+	}
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int
+xfs_btree_read_bufl(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	uint			lock,		/* lock flags for read_buf */
+	struct xfs_buf		**bpp,		/* buffer for fsbno */
+	int			refval,		/* ref count value for buffer */
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;		/* return value */
+	xfs_daddr_t		d;		/* real disk block address */
+	int			error;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+				   mp->m_bsize, lock, &bp, ops);
+	if (error)
+		return error;
+	if (bp)
+		xfs_buf_set_ref(bp, refval);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+	struct xfs_mount	*mp,		/* file system mount point */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	xfs_extlen_t		count,		/* count of filesystem blocks */
+	const struct xfs_buf_ops *ops)
+{
+	xfs_daddr_t		d;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+	struct xfs_mount	*mp,		/* file system mount point */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_agblock_t		agbno,		/* allocation group block number */
+	xfs_extlen_t		count,		/* count of filesystem blocks */
+	const struct xfs_buf_ops *ops)
+{
+	xfs_daddr_t		d;
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+STATIC int
+xfs_btree_readahead_lblock(
+	struct xfs_btree_cur	*cur,
+	int			lr,
+	struct xfs_btree_block	*block)
+{
+	int			rval = 0;
+	xfs_dfsbno_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfs_dfsbno_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+				     cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+				     cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	return rval;
+}
+
+STATIC int
+xfs_btree_readahead_sblock(
+	struct xfs_btree_cur	*cur,
+	int			lr,
+	struct xfs_btree_block *block)
+{
+	int			rval = 0;
+	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+
+	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				     left, 1, cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				     right, 1, cur->bc_ops->buf_ops);
+		rval++;
+	}
+
+	return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			lev,		/* level in btree */
+	int			lr)		/* left/right bits */
+{
+	struct xfs_btree_block	*block;
+
+	/*
+	 * No readahead needed if we are at the root level and the
+	 * btree root is stored in the inode.
+	 */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (lev == cur->bc_nlevels - 1))
+		return 0;
+
+	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+		return 0;
+
+	cur->bc_ra[lev] |= lr;
+	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_readahead_lblock(cur, lr, block);
+	return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+	} else {
+		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+					be32_to_cpu(ptr->s));
+	}
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	xfs_extlen_t		count)
+{
+	xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+			  xfs_btree_ptr_to_daddr(cur, ptr),
+			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+STATIC void
+xfs_btree_setbuf(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	xfs_buf_t		*bp)	/* new buffer to set */
+{
+	struct xfs_btree_block	*b;	/* btree block */
+
+	if (cur->bc_bufs[lev])
+		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
+	cur->bc_bufs[lev] = bp;
+	cur->bc_ra[lev] = 0;
+
+	b = XFS_BUF_TO_BLOCK(bp);
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	} else {
+		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	}
+}
+
+STATIC int
+xfs_btree_ptr_is_null(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return ptr->l == cpu_to_be64(NULLDFSBNO);
+	else
+		return ptr->s == cpu_to_be32(NULLAGBLOCK);
+}
+
+STATIC void
+xfs_btree_set_ptr_null(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		ptr->l = cpu_to_be64(NULLDFSBNO);
+	else
+		ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	union xfs_btree_ptr	*ptr,
+	int			lr)
+{
+	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (lr == XFS_BB_RIGHTSIB)
+			ptr->l = block->bb_u.l.bb_rightsib;
+		else
+			ptr->l = block->bb_u.l.bb_leftsib;
+	} else {
+		if (lr == XFS_BB_RIGHTSIB)
+			ptr->s = block->bb_u.s.bb_rightsib;
+		else
+			ptr->s = block->bb_u.s.bb_leftsib;
+	}
+}
+
+STATIC void
+xfs_btree_set_sibling(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	union xfs_btree_ptr	*ptr,
+	int			lr)
+{
+	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (lr == XFS_BB_RIGHTSIB)
+			block->bb_u.l.bb_rightsib = ptr->l;
+		else
+			block->bb_u.l.bb_leftsib = ptr->l;
+	} else {
+		if (lr == XFS_BB_RIGHTSIB)
+			block->bb_u.s.bb_rightsib = ptr->s;
+		else
+			block->bb_u.s.bb_leftsib = ptr->s;
+	}
+}
+
+void
+xfs_btree_init_block_int(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*buf,
+	xfs_daddr_t		blkno,
+	__u32			magic,
+	__u16			level,
+	__u16			numrecs,
+	__u64			owner,
+	unsigned int		flags)
+{
+	buf->bb_magic = cpu_to_be32(magic);
+	buf->bb_level = cpu_to_be16(level);
+	buf->bb_numrecs = cpu_to_be16(numrecs);
+
+	if (flags & XFS_BTREE_LONG_PTRS) {
+		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+		if (flags & XFS_BTREE_CRC_BLOCKS) {
+			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+			buf->bb_u.l.bb_pad = 0;
+			buf->bb_u.l.bb_lsn = 0;
+		}
+	} else {
+		/* owner is a 32 bit value on short blocks */
+		__u32 __owner = (__u32)owner;
+
+		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+		if (flags & XFS_BTREE_CRC_BLOCKS) {
+			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+			buf->bb_u.s.bb_lsn = 0;
+		}
+	}
+}
+
+void
+xfs_btree_init_block(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	__u32		magic,
+	__u16		level,
+	__u16		numrecs,
+	__u64		owner,
+	unsigned int	flags)
+{
+	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+				 magic, level, numrecs, owner, flags);
+}
+
+STATIC void
+xfs_btree_init_block_cur(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			level,
+	int			numrecs)
+{
+	__u64 owner;
+
+	/*
+	 * we can pull the owner from the cursor right now as the different
+	 * owners align directly with the pointer size of the btree. This may
+	 * change in future, but is safe for current users of the generic btree
+	 * code.
+	 */
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		owner = cur->bc_private.b.ip->i_ino;
+	else
+		owner = cur->bc_private.a.agno;
+
+	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+				 xfs_btree_magic(cur), level, numrecs,
+				 owner, cur->bc_flags);
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updates to this record.  The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	int			level)
+{
+	union xfs_btree_ptr	ptr;
+
+	if (level > 0)
+		return 0;
+	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+		return 0;
+
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+	if (!xfs_btree_ptr_is_null(cur, &ptr))
+		return 0;
+	return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+					XFS_BUF_ADDR(bp)));
+	else {
+		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+					XFS_BUF_ADDR(bp)));
+	}
+}
+
+STATIC void
+xfs_btree_set_refs(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	switch (cur->bc_btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
+		break;
+	case XFS_BTNUM_INO:
+	case XFS_BTNUM_FINO:
+		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
+		break;
+	case XFS_BTNUM_BMAP:
+		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+		break;
+	default:
+		ASSERT(0);
+	}
+}
+
+STATIC int
+xfs_btree_get_buf_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			flags,
+	struct xfs_btree_block	**block,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_daddr_t		d;
+
+	/* need to sort out how callers deal with failures first */
+	ASSERT(!(flags & XBF_TRYLOCK));
+
+	d = xfs_btree_ptr_to_daddr(cur, ptr);
+	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+				 mp->m_bsize, flags);
+
+	if (!*bpp)
+		return ENOMEM;
+
+	(*bpp)->b_ops = cur->bc_ops->buf_ops;
+	*block = XFS_BUF_TO_BLOCK(*bpp);
+	return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			flags,
+	struct xfs_btree_block	**block,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_daddr_t		d;
+	int			error;
+
+	/* need to sort out how callers deal with failures first */
+	ASSERT(!(flags & XBF_TRYLOCK));
+
+	d = xfs_btree_ptr_to_daddr(cur, ptr);
+	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+				   mp->m_bsize, flags, bpp,
+				   cur->bc_ops->buf_ops);
+	if (error)
+		return error;
+
+	xfs_btree_set_refs(cur, *bpp);
+	*block = XFS_BUF_TO_BLOCK(*bpp);
+	return 0;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*dst_key,
+	union xfs_btree_key	*src_key,
+	int			numkeys)
+{
+	ASSERT(numkeys >= 0);
+	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*dst_rec,
+	union xfs_btree_rec	*src_rec,
+	int			numrecs)
+{
+	ASSERT(numrecs >= 0);
+	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*dst_ptr,
+	union xfs_btree_ptr	*src_ptr,
+	int			numptrs)
+{
+	ASSERT(numptrs >= 0);
+	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key,
+	int			dir,
+	int			numkeys)
+{
+	char			*dst_key;
+
+	ASSERT(numkeys >= 0);
+	ASSERT(dir == 1 || dir == -1);
+
+	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec,
+	int			dir,
+	int			numrecs)
+{
+	char			*dst_rec;
+
+	ASSERT(numrecs >= 0);
+	ASSERT(dir == 1 || dir == -1);
+
+	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			dir,
+	int			numptrs)
+{
+	char			*dst_ptr;
+
+	ASSERT(numptrs >= 0);
+	ASSERT(dir == 1 || dir == -1);
+
+	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			first,
+	int			last)
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	if (bp) {
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp,
+				  xfs_btree_key_offset(cur, first),
+				  xfs_btree_key_offset(cur, last + 1) - 1);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			first,
+	int			last)
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+	xfs_trans_log_buf(cur->bc_tp, bp,
+			  xfs_btree_rec_offset(cur, first),
+			  xfs_btree_rec_offset(cur, last + 1) - 1);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_buf		*bp,	/* buffer containing btree block */
+	int			first,	/* index of first pointer to log */
+	int			last)	/* index of last pointer to log */
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	if (bp) {
+		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+		int			level = xfs_btree_get_level(block);
+
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp,
+				xfs_btree_ptr_offset(cur, first, level),
+				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_buf		*bp,	/* buffer containing btree block */
+	int			fields)	/* mask of fields: XFS_BB_... */
+{
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	static const short	soffsets[] = {	/* table of offsets (short) */
+		offsetof(struct xfs_btree_block, bb_magic),
+		offsetof(struct xfs_btree_block, bb_level),
+		offsetof(struct xfs_btree_block, bb_numrecs),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+		XFS_BTREE_SBLOCK_CRC_LEN
+	};
+	static const short	loffsets[] = {	/* table of offsets (long) */
+		offsetof(struct xfs_btree_block, bb_magic),
+		offsetof(struct xfs_btree_block, bb_level),
+		offsetof(struct xfs_btree_block, bb_numrecs),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+		XFS_BTREE_LBLOCK_CRC_LEN
+	};
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+	if (bp) {
+		int nbits;
+
+		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+			/*
+			 * We don't log the CRC when updating a btree
+			 * block but instead recreate it during log
+			 * recovery.  As the log buffers have checksums
+			 * of their own this is safe and avoids logging a crc
+			 * update in a lot of places.
+			 */
+			if (fields == XFS_BB_ALL_BITS)
+				fields = XFS_BB_ALL_BITS_CRC;
+			nbits = XFS_BB_NUM_BITS_CRC;
+		} else {
+			nbits = XFS_BB_NUM_BITS;
+		}
+		xfs_btree_offsets(fields,
+				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+					loffsets : soffsets,
+				  nbits, &first, &last);
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_btree_increment(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	struct xfs_btree_block	*block;
+	union xfs_btree_ptr	ptr;
+	struct xfs_buf		*bp;
+	int			error;		/* error return value */
+	int			lev;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	ASSERT(level < cur->bc_nlevels);
+
+	/* Read-ahead to the right at this level. */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+	/* Get a pointer to the btree block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* We're done if we remain in the block after the increment. */
+	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+		goto out1;
+
+	/* Fail if we just went off the right edge of the tree. */
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &ptr))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, increment);
+
+	/*
+	 * March up the tree incrementing pointers.
+	 * Stop when we don't go off the right edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+		error = xfs_btree_check_block(cur, block, lev, bp);
+		if (error)
+			goto error0;
+#endif
+
+		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+			break;
+
+		/* Read-ahead the right block for the next loop. */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+
+	/*
+	 * If we went off the root then we are either seriously
+	 * confused or have the tree root in an inode.
+	 */
+	if (lev == cur->bc_nlevels) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+			goto out0;
+		ASSERT(0);
+		error = EFSCORRUPTED;
+		goto error0;
+	}
+	ASSERT(lev < cur->bc_nlevels);
+
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+		union xfs_btree_ptr	*ptrp;
+
+		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+		--lev;
+		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+		if (error)
+			goto error0;
+
+		xfs_btree_setbuf(cur, lev, bp);
+		cur->bc_ptrs[lev] = 1;
+	}
+out1:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_btree_decrement(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	struct xfs_btree_block	*block;
+	xfs_buf_t		*bp;
+	int			error;		/* error return value */
+	int			lev;
+	union xfs_btree_ptr	ptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	ASSERT(level < cur->bc_nlevels);
+
+	/* Read-ahead to the left at this level. */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+	/* We're done if we remain in the block after the decrement. */
+	if (--cur->bc_ptrs[level] > 0)
+		goto out1;
+
+	/* Get a pointer to the btree block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* Fail if we just went off the left edge of the tree. */
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+	if (xfs_btree_ptr_is_null(cur, &ptr))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, decrement);
+
+	/*
+	 * March up the tree decrementing pointers.
+	 * Stop when we don't go off the left edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		/* Read-ahead the left block for the next loop. */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+
+	/*
+	 * If we went off the root then we are seriously confused.
+	 * or the root of the tree is in an inode.
+	 */
+	if (lev == cur->bc_nlevels) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+			goto out0;
+		ASSERT(0);
+		error = EFSCORRUPTED;
+		goto error0;
+	}
+	ASSERT(lev < cur->bc_nlevels);
+
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+		union xfs_btree_ptr	*ptrp;
+
+		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+		--lev;
+		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+		if (error)
+			goto error0;
+		xfs_btree_setbuf(cur, lev, bp);
+		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+	}
+out1:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+STATIC int
+xfs_btree_lookup_get_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in the btree */
+	union xfs_btree_ptr	*pp,	/* ptr to btree block */
+	struct xfs_btree_block	**blkp) /* return btree block */
+{
+	struct xfs_buf		*bp;	/* buffer pointer for btree block */
+	int			error = 0;
+
+	/* special case the root block if in an inode */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1)) {
+		*blkp = xfs_btree_get_iroot(cur);
+		return 0;
+	}
+
+	/*
+	 * If the old buffer at this level for the disk address we are
+	 * looking for re-use it.
+	 *
+	 * Otherwise throw it away and get a new one.
+	 */
+	bp = cur->bc_bufs[level];
+	if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+		*blkp = XFS_BUF_TO_BLOCK(bp);
+		return 0;
+	}
+
+	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+	if (error)
+		return error;
+
+	xfs_btree_setbuf(cur, level, bp);
+	return 0;
+}
+
+/*
+ * Get current search key.  For level 0 we don't actually have a key
+ * structure so we make one up from the record.  For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			keyno,
+	struct xfs_btree_block	*block,
+	union xfs_btree_key	*kp)
+{
+	if (level == 0) {
+		cur->bc_ops->init_key_from_rec(kp,
+				xfs_btree_rec_addr(cur, keyno, block));
+		return kp;
+	}
+
+	return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * stat is set to 0 if can't find any such record, 1 for success.
+ */
+int					/* error */
+xfs_btree_lookup(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* <=, ==, or >= */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	__int64_t		diff;	/* difference for the current key */
+	int			error;	/* error return value */
+	int			keyno;	/* current key number */
+	int			level;	/* level in the btree */
+	union xfs_btree_ptr	*pp;	/* ptr to btree block */
+	union xfs_btree_ptr	ptr;	/* ptr to btree block */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, dir);
+
+	XFS_BTREE_STATS_INC(cur, lookup);
+
+	block = NULL;
+	keyno = 0;
+
+	/* initialise start pointer from cursor */
+	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+	pp = &ptr;
+
+	/*
+	 * Iterate over each level in the btree, starting at the root.
+	 * For each level above the leaves, find the key we need, based
+	 * on the lookup record, then follow the corresponding block
+	 * pointer down to the next level.
+	 */
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		/* Get the block we need to do the lookup on. */
+		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+		if (error)
+			goto error0;
+
+		if (diff == 0) {
+			/*
+			 * If we already had a key match at a higher level, we
+			 * know we need to use the first entry in this block.
+			 */
+			keyno = 1;
+		} else {
+			/* Otherwise search this block. Do a binary search. */
+
+			int	high;	/* high entry number */
+			int	low;	/* low entry number */
+
+			/* Set low and high entry numbers, 1-based. */
+			low = 1;
+			high = xfs_btree_get_numrecs(block);
+			if (!high) {
+				/* Block is empty, must be an empty leaf. */
+				ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+				*stat = 0;
+				return 0;
+			}
+
+			/* Binary search the block. */
+			while (low <= high) {
+				union xfs_btree_key	key;
+				union xfs_btree_key	*kp;
+
+				XFS_BTREE_STATS_INC(cur, compare);
+
+				/* keyno is average of low and high. */
+				keyno = (low + high) >> 1;
+
+				/* Get current search key */
+				kp = xfs_lookup_get_search_key(cur, level,
+						keyno, block, &key);
+
+				/*
+				 * Compute difference to get next direction:
+				 *  - less than, move right
+				 *  - greater than, move left
+				 *  - equal, we're done
+				 */
+				diff = cur->bc_ops->key_diff(cur, kp);
+				if (diff < 0)
+					low = keyno + 1;
+				else if (diff > 0)
+					high = keyno - 1;
+				else
+					break;
+			}
+		}
+
+		/*
+		 * If there are more levels, set up for the next level
+		 * by getting the block number and filling in the cursor.
+		 */
+		if (level > 0) {
+			/*
+			 * If we moved left, need the previous key number,
+			 * unless there isn't one.
+			 */
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+			error = xfs_btree_check_ptr(cur, pp, 0, level);
+			if (error)
+				goto error0;
+#endif
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+
+	/* Done with the search. See if we need to adjust the results. */
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+		if (dir == XFS_LOOKUP_GE &&
+		    keyno > xfs_btree_get_numrecs(block) &&
+		    !xfs_btree_ptr_is_null(cur, &ptr)) {
+			int	i;
+
+			cur->bc_ptrs[0] = keyno;
+			error = xfs_btree_increment(cur, 0, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_RETURN(i == 1);
+			XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+			*stat = 1;
+			return 0;
+		}
+	} else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+
+	/* Return if we succeeded or not. */
+	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+		*stat = 0;
+	else if (dir != XFS_LOOKUP_EQ || diff == 0)
+		*stat = 1;
+	else
+		*stat = 0;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*keyp,
+	int			level)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	union xfs_btree_key	*kp;
+	int			ptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+	/*
+	 * Go up the tree from this level toward the root.
+	 * At each level, update the key value to the value input.
+	 * Stop when we reach a level where the cursor isn't pointing
+	 * at the first entry in the block.
+	 */
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+		int		error;
+#endif
+		block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+		error = xfs_btree_check_block(cur, block, level, bp);
+		if (error) {
+			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+			return error;
+		}
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = xfs_btree_key_addr(cur, ptr, block);
+		xfs_btree_copy_keys(cur, kp, keyp, 1);
+		xfs_btree_log_keys(cur, bp, ptr, ptr);
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	int			error;
+	int			ptr;
+	union xfs_btree_rec	*rp;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGR(cur, rec);
+
+	/* Pick up the current block. */
+	block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, 0, bp);
+	if (error)
+		goto error0;
+#endif
+	/* Get the address of the rec to be updated. */
+	ptr = cur->bc_ptrs[0];
+	rp = xfs_btree_rec_addr(cur, ptr, block);
+
+	/* Fill in the new contents and log them. */
+	xfs_btree_copy_recs(cur, rp, rec, 1);
+	xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+	/*
+	 * If we are tracking the last record in the tree and
+	 * we are at the far right edge of the tree, update it.
+	 */
+	if (xfs_btree_is_lastrec(cur, block, 0)) {
+		cur->bc_ops->update_lastrec(cur, block, rec,
+					    ptr, LASTREC_UPDATE);
+	}
+
+	/* Updating first rec in leaf. Pass new key value up to our parent. */
+	if (ptr == 1) {
+		union xfs_btree_key	key;
+
+		cur->bc_ops->init_key_from_rec(&key, rec);
+		error = xfs_btree_updkey(cur, &key, 1);
+		if (error)
+			goto error0;
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_btree_lshift(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	union xfs_btree_key	key;		/* btree key */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	int			lrecs;		/* left record count */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	int			rrecs;		/* right record count */
+	union xfs_btree_ptr	lptr;		/* left btree pointer */
+	union xfs_btree_key	*rkp = NULL;	/* right btree key */
+	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
+	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
+	int			error;		/* error return value */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1)
+		goto out0;
+
+	/* Set up variables for this block as "right". */
+	right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, right, level, rbp);
+	if (error)
+		goto error0;
+#endif
+
+	/* If we've got no left sibling then we can't shift an entry left. */
+	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+	if (xfs_btree_ptr_is_null(cur, &lptr))
+		goto out0;
+
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] <= 1)
+		goto out0;
+
+	/* Set up the left neighbor as "left". */
+	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+	if (error)
+		goto error0;
+
+	/* If it's full, it can't take another entry. */
+	lrecs = xfs_btree_get_numrecs(left);
+	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+		goto out0;
+
+	rrecs = xfs_btree_get_numrecs(right);
+
+	/*
+	 * We add one entry to the left side and remove one for the right side.
+	 * Account for it here, the changes will be updated on disk and logged
+	 * later.
+	 */
+	lrecs++;
+	rrecs--;
+
+	XFS_BTREE_STATS_INC(cur, lshift);
+	XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+	/*
+	 * If non-leaf, copy a key and a ptr to the left block.
+	 * Log the changes to the left block.
+	 */
+	if (level > 0) {
+		/* It's a non-leaf.  Move keys and pointers. */
+		union xfs_btree_key	*lkp;	/* left btree key */
+		union xfs_btree_ptr	*lpp;	/* left address pointer */
+
+		lkp = xfs_btree_key_addr(cur, lrecs, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+
+		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+		error = xfs_btree_check_ptr(cur, rpp, 0, level);
+		if (error)
+			goto error0;
+#endif
+		xfs_btree_copy_keys(cur, lkp, rkp, 1);
+		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+		ASSERT(cur->bc_ops->keys_inorder(cur,
+			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+	} else {
+		/* It's a leaf.  Move records.  */
+		union xfs_btree_rec	*lrp;	/* left record pointer */
+
+		lrp = xfs_btree_rec_addr(cur, lrecs, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_copy_recs(cur, lrp, rrp, 1);
+		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+		ASSERT(cur->bc_ops->recs_inorder(cur,
+			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+	}
+
+	xfs_btree_set_numrecs(left, lrecs);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+	xfs_btree_set_numrecs(right, rrecs);
+	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+	/*
+	 * Slide the contents of right down one entry.
+	 */
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+	if (level > 0) {
+		/* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+		int			i;		/* loop index */
+
+		for (i = 0; i < rrecs; i++) {
+			error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+			if (error)
+				goto error0;
+		}
+#endif
+		xfs_btree_shift_keys(cur,
+				xfs_btree_key_addr(cur, 2, right),
+				-1, rrecs);
+		xfs_btree_shift_ptrs(cur,
+				xfs_btree_ptr_addr(cur, 2, right),
+				-1, rrecs);
+
+		xfs_btree_log_keys(cur, rbp, 1, rrecs);
+		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+	} else {
+		/* It's a leaf. operate on records */
+		xfs_btree_shift_recs(cur,
+			xfs_btree_rec_addr(cur, 2, right),
+			-1, rrecs);
+		xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		cur->bc_ops->init_key_from_rec(&key,
+			xfs_btree_rec_addr(cur, 1, right));
+		rkp = &key;
+	}
+
+	/* Update the parent key values of right. */
+	error = xfs_btree_updkey(cur, rkp, level + 1);
+	if (error)
+		goto error0;
+
+	/* Slide the cursor value left one. */
+	cur->bc_ptrs[level]--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_btree_rshift(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	union xfs_btree_key	key;		/* btree key */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
+	union xfs_btree_ptr	rptr;		/* right block pointer */
+	union xfs_btree_key	*rkp;		/* right btree key */
+	int			rrecs;		/* right record count */
+	int			lrecs;		/* left record count */
+	int			error;		/* error return value */
+	int			i;		/* loop counter */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1))
+		goto out0;
+
+	/* Set up variables for this block as "left". */
+	left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, left, level, lbp);
+	if (error)
+		goto error0;
+#endif
+
+	/* If we've got no right sibling then we can't shift an entry right. */
+	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &rptr))
+		goto out0;
+
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	lrecs = xfs_btree_get_numrecs(left);
+	if (cur->bc_ptrs[level] >= lrecs)
+		goto out0;
+
+	/* Set up the right neighbor as "right". */
+	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+	if (error)
+		goto error0;
+
+	/* If it's full, it can't take another entry. */
+	rrecs = xfs_btree_get_numrecs(right);
+	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, rshift);
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+	/*
+	 * Make a hole at the start of the right neighbor block, then
+	 * copy the last left block entry to the hole.
+	 */
+	if (level > 0) {
+		/* It's a nonleaf. make a hole in the keys and ptrs */
+		union xfs_btree_key	*lkp;
+		union xfs_btree_ptr	*lpp;
+		union xfs_btree_ptr	*rpp;
+
+		lkp = xfs_btree_key_addr(cur, lrecs, left);
+		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+		for (i = rrecs - 1; i >= 0; i--) {
+			error = xfs_btree_check_ptr(cur, rpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+		error = xfs_btree_check_ptr(cur, lpp, 0, level);
+		if (error)
+			goto error0;
+#endif
+
+		/* Now put the new data in, and log it. */
+		xfs_btree_copy_keys(cur, rkp, lkp, 1);
+		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+			xfs_btree_key_addr(cur, 2, right)));
+	} else {
+		/* It's a leaf. make a hole in the records */
+		union xfs_btree_rec	*lrp;
+		union xfs_btree_rec	*rrp;
+
+		lrp = xfs_btree_rec_addr(cur, lrecs, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+		/* Now put the new data in, and log it. */
+		xfs_btree_copy_recs(cur, rrp, lrp, 1);
+		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+		cur->bc_ops->init_key_from_rec(&key, rrp);
+		rkp = &key;
+
+		ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+			xfs_btree_rec_addr(cur, 2, right)));
+	}
+
+	/*
+	 * Decrement and log left's numrecs, bump and log right's numrecs.
+	 */
+	xfs_btree_set_numrecs(left, --lrecs);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+	xfs_btree_set_numrecs(right, ++rrecs);
+	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+	/*
+	 * Using a temporary cursor, update the parent key values of the
+	 * block on the right.
+	 */
+	error = xfs_btree_dup_cursor(cur, &tcur);
+	if (error)
+		goto error0;
+	i = xfs_btree_lastrec(tcur, level);
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+	error = xfs_btree_increment(tcur, level, &i);
+	if (error)
+		goto error1;
+
+	error = xfs_btree_updkey(tcur, rkp, level + 1);
+	if (error)
+		goto error1;
+
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+
+error1:
+	XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int					/* error */
+xfs_btree_split(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	union xfs_btree_ptr	*ptrp,
+	union xfs_btree_key	*key,
+	struct xfs_btree_cur	**curp,
+	int			*stat)		/* success/failure */
+{
+	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
+	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
+	struct xfs_btree_block	*rrblock;	/* right-right btree block */
+	int			lrecs;
+	int			rrecs;
+	int			src_index;
+	int			error;		/* error return value */
+#ifdef DEBUG
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+	XFS_BTREE_STATS_INC(cur, split);
+
+	/* Set up left block (current one). */
+	left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, left, level, lbp);
+	if (error)
+		goto error0;
+#endif
+
+	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+	/* Allocate the new block. If we can't do it, we're toast. Give up. */
+	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0)
+		goto out0;
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Set up the new block as "right". */
+	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+	if (error)
+		goto error0;
+
+	/* Fill in the btree header for the new right block. */
+	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
+
+	/*
+	 * Split the entries between the old and the new block evenly.
+	 * Make sure that if there's an odd number of entries now, that
+	 * each new block will have the same number of entries.
+	 */
+	lrecs = xfs_btree_get_numrecs(left);
+	rrecs = lrecs / 2;
+	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+		rrecs++;
+	src_index = (lrecs - rrecs + 1);
+
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+	/*
+	 * Copy btree block entries from the left block over to the
+	 * new block, the right. Update the right block and log the
+	 * changes.
+	 */
+	if (level > 0) {
+		/* It's a non-leaf.  Move keys and pointers. */
+		union xfs_btree_key	*lkp;	/* left btree key */
+		union xfs_btree_ptr	*lpp;	/* left address pointer */
+		union xfs_btree_key	*rkp;	/* right btree key */
+		union xfs_btree_ptr	*rpp;	/* right address pointer */
+
+		lkp = xfs_btree_key_addr(cur, src_index, left);
+		lpp = xfs_btree_ptr_addr(cur, src_index, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+		for (i = src_index; i < rrecs; i++) {
+			error = xfs_btree_check_ptr(cur, lpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+		xfs_btree_log_keys(cur, rbp, 1, rrecs);
+		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+		/* Grab the keys to the entries moved to the right block */
+		xfs_btree_copy_keys(cur, key, rkp, 1);
+	} else {
+		/* It's a leaf.  Move records.  */
+		union xfs_btree_rec	*lrp;	/* left record pointer */
+		union xfs_btree_rec	*rrp;	/* right record pointer */
+
+		lrp = xfs_btree_rec_addr(cur, src_index, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+		xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+		cur->bc_ops->init_key_from_rec(key,
+			xfs_btree_rec_addr(cur, 1, right));
+	}
+
+
+	/*
+	 * Find the left block number by looking in the buffer.
+	 * Adjust numrecs, sibling pointers.
+	 */
+	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+	lrecs -= rrecs;
+	xfs_btree_set_numrecs(left, lrecs);
+	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+	/*
+	 * If there's a block to the new block's right, make that block
+	 * point back to right instead of to left.
+	 */
+	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+		error = xfs_btree_read_buf_block(cur, &rrptr,
+							0, &rrblock, &rrbp);
+		if (error)
+			goto error0;
+		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * If the cursor is really in the right block, move it there.
+	 * If it's just pointing past the last entry in left, then we'll
+	 * insert there, so don't change anything in that case.
+	 */
+	if (cur->bc_ptrs[level] > lrecs + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= lrecs;
+	}
+	/*
+	 * If there are more levels, we'll need another cursor which refers
+	 * the right block, no matter where this cursor was.
+	 */
+	if (level + 1 < cur->bc_nlevels) {
+		error = xfs_btree_dup_cursor(cur, curp);
+		if (error)
+			goto error0;
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*ptrp = rptr;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int						/* error */
+xfs_btree_new_iroot(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			*logflags,	/* logging flags for inode */
+	int			*stat)		/* return status - 0 fail */
+{
+	struct xfs_buf		*cbp;		/* buffer for cblock */
+	struct xfs_btree_block	*block;		/* btree block */
+	struct xfs_btree_block	*cblock;	/* child btree block */
+	union xfs_btree_key	*ckp;		/* child key pointer */
+	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
+	union xfs_btree_key	*kp;		/* pointer to btree key */
+	union xfs_btree_ptr	*pp;		/* pointer to block addr */
+	union xfs_btree_ptr	nptr;		/* new block addr */
+	int			level;		/* btree level */
+	int			error;		/* error return code */
+#ifdef DEBUG
+	int			i;		/* loop counter */
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, newroot);
+
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+	level = cur->bc_nlevels - 1;
+
+	block = xfs_btree_get_iroot(cur);
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+
+	/* Allocate the new block. If we can't do it, we're toast. Give up. */
+	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		return 0;
+	}
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Copy the root into a real block. */
+	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+	if (error)
+		goto error0;
+
+	/*
+	 * we can't just memcpy() the root in for CRC enabled btree blocks.
+	 * In that case have to also ensure the blkno remains correct
+	 */
+	memcpy(cblock, block, xfs_btree_block_len(cur));
+	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+		else
+			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+	}
+
+	be16_add_cpu(&block->bb_level, 1);
+	xfs_btree_set_numrecs(block, 1);
+	cur->bc_nlevels++;
+	cur->bc_ptrs[level + 1] = 1;
+
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+		error = xfs_btree_check_ptr(cur, pp, i, level);
+		if (error)
+			goto error0;
+	}
+#endif
+	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+	if (error)
+		goto error0;
+#endif
+	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+	xfs_iroot_realloc(cur->bc_private.b.ip,
+			  1 - xfs_btree_get_numrecs(cblock),
+			  cur->bc_private.b.whichfork);
+
+	xfs_btree_setbuf(cur, level, cbp);
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+	*logflags |=
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+	*stat = 1;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int				/* error */
+xfs_btree_new_root(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* one half of the old root block */
+	struct xfs_buf		*bp;	/* buffer containing block */
+	int			error;	/* error return value */
+	struct xfs_buf		*lbp;	/* left buffer pointer */
+	struct xfs_btree_block	*left;	/* left btree block */
+	struct xfs_buf		*nbp;	/* new (root) buffer */
+	struct xfs_btree_block	*new;	/* new (root) btree block */
+	int			nptr;	/* new value for key index, 1 or 2 */
+	struct xfs_buf		*rbp;	/* right buffer pointer */
+	struct xfs_btree_block	*right;	/* right btree block */
+	union xfs_btree_ptr	rptr;
+	union xfs_btree_ptr	lptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, newroot);
+
+	/* initialise our start point from the cursor */
+	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+	/* Allocate the new block. If we can't do it, we're toast. Give up. */
+	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0)
+		goto out0;
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Set up the new block. */
+	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+	if (error)
+		goto error0;
+
+	/* Set the root in the holding structure  increasing the level by 1. */
+	cur->bc_ops->set_root(cur, &lptr, 1);
+
+	/*
+	 * At the previous root level there are now two blocks: the old root,
+	 * and the new block generated when it was split.  We don't know which
+	 * one the cursor is pointing at, so we set up variables "left" and
+	 * "right" for each case.
+	 */
+	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+	if (error)
+		goto error0;
+#endif
+
+	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+		/* Our block is left, pick up the right block. */
+		lbp = bp;
+		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+		left = block;
+		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+		if (error)
+			goto error0;
+		bp = rbp;
+		nptr = 1;
+	} else {
+		/* Our block is right, pick up the left block. */
+		rbp = bp;
+		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+		right = block;
+		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+		if (error)
+			goto error0;
+		bp = lbp;
+		nptr = 2;
+	}
+	/* Fill in the new block's btree header and log it. */
+	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
+	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+			!xfs_btree_ptr_is_null(cur, &rptr));
+
+	/* Fill in the key data in the new root. */
+	if (xfs_btree_get_level(left) > 0) {
+		xfs_btree_copy_keys(cur,
+				xfs_btree_key_addr(cur, 1, new),
+				xfs_btree_key_addr(cur, 1, left), 1);
+		xfs_btree_copy_keys(cur,
+				xfs_btree_key_addr(cur, 2, new),
+				xfs_btree_key_addr(cur, 1, right), 1);
+	} else {
+		cur->bc_ops->init_key_from_rec(
+				xfs_btree_key_addr(cur, 1, new),
+				xfs_btree_rec_addr(cur, 1, left));
+		cur->bc_ops->init_key_from_rec(
+				xfs_btree_key_addr(cur, 2, new),
+				xfs_btree_rec_addr(cur, 1, right));
+	}
+	xfs_btree_log_keys(cur, nbp, 1, 2);
+
+	/* Fill in the pointer data in the new root. */
+	xfs_btree_copy_ptrs(cur,
+		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+	xfs_btree_copy_ptrs(cur,
+		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+	xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+	/* Fix up the cursor. */
+	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+	cur->bc_ptrs[cur->bc_nlevels] = nptr;
+	cur->bc_nlevels++;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* btree level */
+	int			numrecs,/* # of recs in block */
+	int			*oindex,/* old tree index */
+	int			*index,	/* new tree index */
+	union xfs_btree_ptr	*nptr,	/* new btree ptr */
+	struct xfs_btree_cur	**ncur,	/* new btree cursor */
+	union xfs_btree_rec	*nrec,	/* new record */
+	int			*stat)
+{
+	union xfs_btree_key	key;	/* new btree key value */
+	int			error = 0;
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1) {
+	    	struct xfs_inode *ip = cur->bc_private.b.ip;
+
+		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+			/* A root block that can be made bigger. */
+			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+		} else {
+			/* A root block that needs replacing */
+			int	logflags = 0;
+
+			error = xfs_btree_new_iroot(cur, &logflags, stat);
+			if (error || *stat == 0)
+				return error;
+
+			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+		}
+
+		return 0;
+	}
+
+	/* First, try shifting an entry to the right neighbor. */
+	error = xfs_btree_rshift(cur, level, stat);
+	if (error || *stat)
+		return error;
+
+	/* Next, try shifting an entry to the left neighbor. */
+	error = xfs_btree_lshift(cur, level, stat);
+	if (error)
+		return error;
+
+	if (*stat) {
+		*oindex = *index = cur->bc_ptrs[level];
+		return 0;
+	}
+
+	/*
+	 * Next, try splitting the current block in half.
+	 *
+	 * If this works we have to re-set our variables because we
+	 * could be in a different block now.
+	 */
+	error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+	if (error || *stat == 0)
+		return error;
+
+
+	*index = cur->bc_ptrs[level];
+	cur->bc_ops->init_rec_from_key(&key, nrec);
+	return 0;
+}
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level to insert record at */
+	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
+	union xfs_btree_rec	*recp,	/* i/o: record data inserted */
+	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* btree block */
+	struct xfs_buf		*bp;	/* buffer for block */
+	union xfs_btree_key	key;	/* btree key */
+	union xfs_btree_ptr	nptr;	/* new block ptr */
+	struct xfs_btree_cur	*ncur;	/* new btree cursor */
+	union xfs_btree_rec	nrec;	/* new record count */
+	int			optr;	/* old key/record index */
+	int			ptr;	/* key/record index */
+	int			numrecs;/* number of records */
+	int			error;	/* error return value */
+#ifdef DEBUG
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+	ncur = NULL;
+
+	/*
+	 * If we have an external root pointer, and we've made it to the
+	 * root level, allocate a new root block and we're done.
+	 */
+	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level >= cur->bc_nlevels)) {
+		error = xfs_btree_new_root(cur, stat);
+		xfs_btree_set_ptr_null(cur, ptrp);
+
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		return error;
+	}
+
+	/* If we're off the left edge, return failure. */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	/* Make a key out of the record data to be inserted, and save it. */
+	cur->bc_ops->init_key_from_rec(&key, recp);
+
+	optr = ptr;
+
+	XFS_BTREE_STATS_INC(cur, insrec);
+
+	/* Get pointers to the btree buffer and block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+
+	/* Check that the new entry is being inserted in the right place. */
+	if (ptr <= numrecs) {
+		if (level == 0) {
+			ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+				xfs_btree_rec_addr(cur, ptr, block)));
+		} else {
+			ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+				xfs_btree_key_addr(cur, ptr, block)));
+		}
+	}
+#endif
+
+	/*
+	 * If the block is full, we can't insert the new entry until we
+	 * make the block un-full.
+	 */
+	xfs_btree_set_ptr_null(cur, &nptr);
+	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+		error = xfs_btree_make_block_unfull(cur, level, numrecs,
+					&optr, &ptr, &nptr, &ncur, &nrec, stat);
+		if (error || *stat == 0)
+			goto error0;
+	}
+
+	/*
+	 * The current block may have changed if the block was
+	 * previously full and we have just made space in it.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		return error;
+#endif
+
+	/*
+	 * At this point we know there's room for our new entry in the block
+	 * we're pointing at.
+	 */
+	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+	if (level > 0) {
+		/* It's a nonleaf. make a hole in the keys and ptrs */
+		union xfs_btree_key	*kp;
+		union xfs_btree_ptr	*pp;
+
+		kp = xfs_btree_key_addr(cur, ptr, block);
+		pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+		for (i = numrecs - ptr; i >= 0; i--) {
+			error = xfs_btree_check_ptr(cur, pp, i, level);
+			if (error)
+				return error;
+		}
+#endif
+
+		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+		if (error)
+			goto error0;
+#endif
+
+		/* Now put the new data in, bump numrecs and log it. */
+		xfs_btree_copy_keys(cur, kp, &key, 1);
+		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+		numrecs++;
+		xfs_btree_set_numrecs(block, numrecs);
+		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+		xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+		if (ptr < numrecs) {
+			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+				xfs_btree_key_addr(cur, ptr + 1, block)));
+		}
+#endif
+	} else {
+		/* It's a leaf. make a hole in the records */
+		union xfs_btree_rec             *rp;
+
+		rp = xfs_btree_rec_addr(cur, ptr, block);
+
+		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+		/* Now put the new data in, bump numrecs and log it. */
+		xfs_btree_copy_recs(cur, rp, recp, 1);
+		xfs_btree_set_numrecs(block, ++numrecs);
+		xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+		if (ptr < numrecs) {
+			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+				xfs_btree_rec_addr(cur, ptr + 1, block)));
+		}
+#endif
+	}
+
+	/* Log the new number of records in the btree header. */
+	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+	/* If we inserted at the start of a block, update the parents' keys. */
+	if (optr == 1) {
+		error = xfs_btree_updkey(cur, &key, level + 1);
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * If we are tracking the last record in the tree and
+	 * we are at the far right edge of the tree, update it.
+	 */
+	if (xfs_btree_is_lastrec(cur, block, level)) {
+		cur->bc_ops->update_lastrec(cur, block, recp,
+					    ptr, LASTREC_INSREC);
+	}
+
+	/*
+	 * Return the new block number, if any.
+	 * If there is one, give back a record value and a cursor too.
+	 */
+	*ptrp = nptr;
+	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+		*recp = nrec;
+		*curp = ncur;
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+	struct xfs_btree_cur	*cur,
+	int			*stat)
+{
+	int			error;	/* error return value */
+	int			i;	/* result value, 0 for failure */
+	int			level;	/* current level number in btree */
+	union xfs_btree_ptr	nptr;	/* new block number (split result) */
+	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
+	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
+	union xfs_btree_rec	rec;	/* record to insert */
+
+	level = 0;
+	ncur = NULL;
+	pcur = cur;
+
+	xfs_btree_set_ptr_null(cur, &nptr);
+	cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+	/*
+	 * Loop going up the tree, starting at the leaf level.
+	 * Stop when we don't get a split block, that must mean that
+	 * the insert is finished with this level.
+	 */
+	do {
+		/*
+		 * Insert nrec/nptr into this level of the tree.
+		 * Note if we fail, nptr will be null.
+		 */
+		error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+		if (error) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			goto error0;
+		}
+
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		level++;
+
+		/*
+		 * See if the cursor we just used is trash.
+		 * Can't trash the caller's cursor, but otherwise we should
+		 * if ncur is a new cursor or we're about to be done.
+		 */
+		if (pcur != cur &&
+		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+			/* Save the state from the cursor before we trash it */
+			if (cur->bc_ops->update_cursor)
+				cur->bc_ops->update_cursor(pcur, cur);
+			cur->bc_nlevels = pcur->bc_nlevels;
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		/* If we got a new cursor, switch to it. */
+		if (ncur) {
+			pcur = ncur;
+			ncur = NULL;
+		}
+	} while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = i;
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block.  But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+STATIC int
+xfs_btree_kill_iroot(
+	struct xfs_btree_cur	*cur)
+{
+	int			whichfork = cur->bc_private.b.whichfork;
+	struct xfs_inode	*ip = cur->bc_private.b.ip;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_btree_block	*block;
+	struct xfs_btree_block	*cblock;
+	union xfs_btree_key	*kp;
+	union xfs_btree_key	*ckp;
+	union xfs_btree_ptr	*pp;
+	union xfs_btree_ptr	*cpp;
+	struct xfs_buf		*cbp;
+	int			level;
+	int			index;
+	int			numrecs;
+#ifdef DEBUG
+	union xfs_btree_ptr	ptr;
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+	ASSERT(cur->bc_nlevels > 1);
+
+	/*
+	 * Don't deal with the root block needs to be a leaf case.
+	 * We're just going to turn the thing back into extents anyway.
+	 */
+	level = cur->bc_nlevels - 1;
+	if (level == 1)
+		goto out0;
+
+	/*
+	 * Give up if the root has multiple children.
+	 */
+	block = xfs_btree_get_iroot(cur);
+	if (xfs_btree_get_numrecs(block) != 1)
+		goto out0;
+
+	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+	numrecs = xfs_btree_get_numrecs(cblock);
+
+	/*
+	 * Only do this if the next level will fit.
+	 * Then the data must be copied up to the inode,
+	 * instead of freeing the root you free the next level.
+	 */
+	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+	if (index) {
+		xfs_iroot_realloc(cur->bc_private.b.ip, index,
+				  cur->bc_private.b.whichfork);
+		block = ifp->if_broot;
+	}
+
+	be16_add_cpu(&block->bb_numrecs, index);
+	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+	pp = xfs_btree_ptr_addr(cur, 1, block);
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+	for (i = 0; i < numrecs; i++) {
+		int		error;
+
+		error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+		if (error) {
+			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+			return error;
+		}
+	}
+#endif
+	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+	cur->bc_ops->free_block(cur, cbp);
+	XFS_BTREE_STATS_INC(cur, free);
+
+	cur->bc_bufs[level - 1] = NULL;
+	be16_add_cpu(&block->bb_level, -1);
+	xfs_trans_log_inode(cur->bc_tp, ip,
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	cur->bc_nlevels--;
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
+/*
+ * Kill the current root node, and replace it with it's only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			level,
+	union xfs_btree_ptr	*newroot)
+{
+	int			error;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, killroot);
+
+	/*
+	 * Update the root pointer, decreasing the level by 1 and then
+	 * free the old root.
+	 */
+	cur->bc_ops->set_root(cur, newroot, -1);
+
+	error = cur->bc_ops->free_block(cur, bp);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+
+	XFS_BTREE_STATS_INC(cur, free);
+
+	cur->bc_bufs[level] = NULL;
+	cur->bc_ra[level] = 0;
+	cur->bc_nlevels--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)
+{
+	int			error;
+	int			i;
+
+	if (level > 0) {
+		error = xfs_btree_decrement(cur, level, &i);
+		if (error)
+			return error;
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int					/* error */
+xfs_btree_delrec(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			level,		/* level removing record from */
+	int			*stat)		/* fail/done/go-on */
+{
+	struct xfs_btree_block	*block;		/* btree block */
+	union xfs_btree_ptr	cptr;		/* current block ptr */
+	struct xfs_buf		*bp;		/* buffer for block */
+	int			error;		/* error return value */
+	int			i;		/* loop counter */
+	union xfs_btree_key	key;		/* storage for keyp */
+	union xfs_btree_key	*keyp = &key;	/* passed to the next level */
+	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	int			lrecs = 0;	/* left record count */
+	int			ptr;		/* key/record index */
+	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	struct xfs_btree_block	*rrblock;	/* right-right btree block */
+	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
+	int			rrecs = 0;	/* right record count */
+	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
+	int			numrecs;	/* temporary numrec count */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	tcur = NULL;
+
+	/* Get the index of the entry being deleted, check for nothing there. */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	/* Get the buffer & block containing the record or key/ptr. */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* Fail if we're off the end of the block. */
+	if (ptr > numrecs) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	XFS_BTREE_STATS_INC(cur, delrec);
+	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+	/* Excise the entries being deleted. */
+	if (level > 0) {
+		/* It's a nonleaf. operate on keys and ptrs */
+		union xfs_btree_key	*lkp;
+		union xfs_btree_ptr	*lpp;
+
+		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+		for (i = 0; i < numrecs - ptr; i++) {
+			error = xfs_btree_check_ptr(cur, lpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		if (ptr < numrecs) {
+			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+		}
+
+		/*
+		 * If it's the first record in the block, we'll need to pass a
+		 * key up to the next level (updkey).
+		 */
+		if (ptr == 1)
+			keyp = xfs_btree_key_addr(cur, 1, block);
+	} else {
+		/* It's a leaf. operate on records */
+		if (ptr < numrecs) {
+			xfs_btree_shift_recs(cur,
+				xfs_btree_rec_addr(cur, ptr + 1, block),
+				-1, numrecs - ptr);
+			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+		}
+
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		if (ptr == 1) {
+			cur->bc_ops->init_key_from_rec(&key,
+					xfs_btree_rec_addr(cur, 1, block));
+			keyp = &key;
+		}
+	}
+
+	/*
+	 * Decrement and log the number of entries in the block.
+	 */
+	xfs_btree_set_numrecs(block, --numrecs);
+	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+	/*
+	 * If we are tracking the last record in the tree and
+	 * we are at the far right edge of the tree, update it.
+	 */
+	if (xfs_btree_is_lastrec(cur, block, level)) {
+		cur->bc_ops->update_lastrec(cur, block, NULL,
+					    ptr, LASTREC_DELREC);
+	}
+
+	/*
+	 * We're at the root level.  First, shrink the root block in-memory.
+	 * Try to get rid of the next level down.  If we can't then there's
+	 * nothing left to do.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+					  cur->bc_private.b.whichfork);
+
+			error = xfs_btree_kill_iroot(cur);
+			if (error)
+				goto error0;
+
+			error = xfs_btree_dec_cursor(cur, level, stat);
+			if (error)
+				goto error0;
+			*stat = 1;
+			return 0;
+		}
+
+		/*
+		 * If this is the root level, and there's only one entry left,
+		 * and it's NOT the leaf level, then we can get rid of this
+		 * level.
+		 */
+		if (numrecs == 1 && level > 0) {
+			union xfs_btree_ptr	*pp;
+			/*
+			 * pp is still set to the first pointer in the block.
+			 * Make it the new root of the btree.
+			 */
+			pp = xfs_btree_ptr_addr(cur, 1, block);
+			error = xfs_btree_kill_root(cur, bp, level, pp);
+			if (error)
+				goto error0;
+		} else if (level > 0) {
+			error = xfs_btree_dec_cursor(cur, level, stat);
+			if (error)
+				goto error0;
+		}
+		*stat = 1;
+		return 0;
+	}
+
+	/*
+	 * If we deleted the leftmost entry in the block, update the
+	 * key values above us in the tree.
+	 */
+	if (ptr == 1) {
+		error = xfs_btree_updkey(cur, keyp, level + 1);
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * If the number of records remaining in the block is at least
+	 * the minimum, we're done.
+	 */
+	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+		error = xfs_btree_dec_cursor(cur, level, stat);
+		if (error)
+			goto error0;
+		return 0;
+	}
+
+	/*
+	 * Otherwise, we have to move some records around to keep the
+	 * tree balanced.  Look at the left and right sibling blocks to
+	 * see if we can re-balance by moving only one record.
+	 */
+	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+		/*
+		 * One child of root, need to get a chance to copy its contents
+		 * into the root and delete it. Can't go up to next level,
+		 * there's nothing to delete there.
+		 */
+		if (xfs_btree_ptr_is_null(cur, &rptr) &&
+		    xfs_btree_ptr_is_null(cur, &lptr) &&
+		    level == cur->bc_nlevels - 2) {
+			error = xfs_btree_kill_iroot(cur);
+			if (!error)
+				error = xfs_btree_dec_cursor(cur, level, stat);
+			if (error)
+				goto error0;
+			return 0;
+		}
+	}
+
+	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+	       !xfs_btree_ptr_is_null(cur, &lptr));
+
+	/*
+	 * Duplicate the cursor so our btree manipulations here won't
+	 * disrupt the next level up.
+	 */
+	error = xfs_btree_dup_cursor(cur, &tcur);
+	if (error)
+		goto error0;
+
+	/*
+	 * If there's a right sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+		/*
+		 * Move the temp cursor to the last entry in the next block.
+		 * Actually any entry but the first would suffice.
+		 */
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		error = xfs_btree_increment(tcur, level, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		/* Grab a pointer to the block. */
+		right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+		error = xfs_btree_check_block(tcur, right, level, rbp);
+		if (error)
+			goto error0;
+#endif
+		/* Grab the current block number, for future use. */
+		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+		/*
+		 * If right block is full enough so that removing one entry
+		 * won't make it too empty, and left-shifting an entry out
+		 * of right to us works, we're done.
+		 */
+		if (xfs_btree_get_numrecs(right) - 1 >=
+		    cur->bc_ops->get_minrecs(tcur, level)) {
+			error = xfs_btree_lshift(tcur, level, &i);
+			if (error)
+				goto error0;
+			if (i) {
+				ASSERT(xfs_btree_get_numrecs(block) >=
+				       cur->bc_ops->get_minrecs(tcur, level));
+
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+
+				error = xfs_btree_dec_cursor(cur, level, stat);
+				if (error)
+					goto error0;
+				return 0;
+			}
+		}
+
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference, and fix up the temp cursor to point
+		 * to our block again (last record).
+		 */
+		rrecs = xfs_btree_get_numrecs(right);
+		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+			i = xfs_btree_firstrec(tcur, level);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+			error = xfs_btree_decrement(tcur, level, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		}
+	}
+
+	/*
+	 * If there's a left sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+		/*
+		 * Move the temp cursor to the first entry in the
+		 * previous block.
+		 */
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		error = xfs_btree_decrement(tcur, level, &i);
+		if (error)
+			goto error0;
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		/* Grab a pointer to the block. */
+		left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+		error = xfs_btree_check_block(cur, left, level, lbp);
+		if (error)
+			goto error0;
+#endif
+		/* Grab the current block number, for future use. */
+		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+		/*
+		 * If left block is full enough so that removing one entry
+		 * won't make it too empty, and right-shifting an entry out
+		 * of left to us works, we're done.
+		 */
+		if (xfs_btree_get_numrecs(left) - 1 >=
+		    cur->bc_ops->get_minrecs(tcur, level)) {
+			error = xfs_btree_rshift(tcur, level, &i);
+			if (error)
+				goto error0;
+			if (i) {
+				ASSERT(xfs_btree_get_numrecs(block) >=
+				       cur->bc_ops->get_minrecs(tcur, level));
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+				*stat = 1;
+				return 0;
+			}
+		}
+
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference.
+		 */
+		lrecs = xfs_btree_get_numrecs(left);
+	}
+
+	/* Delete the temp cursor, we're done with it. */
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	tcur = NULL;
+
+	/* If here, we need to do a join to keep the tree balanced. */
+	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+	    lrecs + xfs_btree_get_numrecs(block) <=
+			cur->bc_ops->get_maxrecs(cur, level)) {
+		/*
+		 * Set "right" to be the starting block,
+		 * "left" to be the left neighbor.
+		 */
+		rptr = cptr;
+		right = block;
+		rbp = bp;
+		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+		if (error)
+			goto error0;
+
+	/*
+	 * If that won't work, see if we can join with the right neighbor block.
+	 */
+	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+		   rrecs + xfs_btree_get_numrecs(block) <=
+			cur->bc_ops->get_maxrecs(cur, level)) {
+		/*
+		 * Set "left" to be the starting block,
+		 * "right" to be the right neighbor.
+		 */
+		lptr = cptr;
+		left = block;
+		lbp = bp;
+		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+		if (error)
+			goto error0;
+
+	/*
+	 * Otherwise, we can't fix the imbalance.
+	 * Just return.  This is probably a logic error, but it's not fatal.
+	 */
+	} else {
+		error = xfs_btree_dec_cursor(cur, level, stat);
+		if (error)
+			goto error0;
+		return 0;
+	}
+
+	rrecs = xfs_btree_get_numrecs(right);
+	lrecs = xfs_btree_get_numrecs(left);
+
+	/*
+	 * We're now going to join "left" and "right" by moving all the stuff
+	 * in "right" to "left" and deleting "right".
+	 */
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+	if (level > 0) {
+		/* It's a non-leaf.  Move keys and pointers. */
+		union xfs_btree_key	*lkp;	/* left btree key */
+		union xfs_btree_ptr	*lpp;	/* left address pointer */
+		union xfs_btree_key	*rkp;	/* right btree key */
+		union xfs_btree_ptr	*rpp;	/* right address pointer */
+
+		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+		for (i = 1; i < rrecs; i++) {
+			error = xfs_btree_check_ptr(cur, rpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+	} else {
+		/* It's a leaf.  Move records.  */
+		union xfs_btree_rec	*lrp;	/* left record pointer */
+		union xfs_btree_rec	*rrp;	/* right record pointer */
+
+		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+	}
+
+	XFS_BTREE_STATS_INC(cur, join);
+
+	/*
+	 * Fix up the number of records and right block pointer in the
+	 * surviving block, and log it.
+	 */
+	xfs_btree_set_numrecs(left, lrecs + rrecs);
+	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+	/* If there is a right sibling, point it to the remaining block. */
+	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
+		if (error)
+			goto error0;
+		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+
+	/* Free the deleted block. */
+	error = cur->bc_ops->free_block(cur, rbp);
+	if (error)
+		goto error0;
+	XFS_BTREE_STATS_INC(cur, free);
+
+	/*
+	 * If we joined with the left neighbor, set the buffer in the
+	 * cursor to the left block, and fix up the index.
+	 */
+	if (bp != lbp) {
+		cur->bc_bufs[level] = lbp;
+		cur->bc_ptrs[level] += lrecs;
+		cur->bc_ra[level] = 0;
+	}
+	/*
+	 * If we joined with the right neighbor and there's a level above
+	 * us, increment the cursor at that level.
+	 */
+	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+		   (level + 1 < cur->bc_nlevels)) {
+		error = xfs_btree_increment(cur, level + 1, &i);
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * Readjust the ptr at this level if it's not a leaf, since it's
+	 * still pointing at the deletion point, which makes the cursor
+	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+	 * We can't use decrement because it would change the next level up.
+	 */
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	/* Return value means the next level up has something to do. */
+	*stat = 2;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	if (tcur)
+		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_btree_delete(
+	struct xfs_btree_cur	*cur,
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			level;
+	int			i;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	/*
+	 * Go up the tree, starting at leaf level.
+	 *
+	 * If 2 is returned then a join was done; go to the next level.
+	 * Otherwise we are done.
+	 */
+	for (level = 0, i = 2; i == 2; level++) {
+		error = xfs_btree_delrec(cur, level, &i);
+		if (error)
+			goto error0;
+	}
+
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				error = xfs_btree_decrement(cur, level, &i);
+				if (error)
+					goto error0;
+				break;
+			}
+		}
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = i;
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_btree_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	union xfs_btree_rec	**recp,	/* output: btree record */
+	int			*stat)	/* output: success/failure */
+{
+	struct xfs_btree_block	*block;	/* btree block */
+	struct xfs_buf		*bp;	/* buffer pointer */
+	int			ptr;	/* record number */
+#ifdef DEBUG
+	int			error;	/* error return value */
+#endif
+
+	ptr = cur->bc_ptrs[0];
+	block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, 0, bp);
+	if (error)
+		return error;
+#endif
+
+	/*
+	 * Off the right end or left end, return failure.
+	 */
+	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+
+	/*
+	 * Point to the record and extract its data.
+	 */
+	*recp = xfs_btree_rec_addr(cur, ptr, block);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction.  If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	__uint64_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	union xfs_btree_ptr     rptr;
+
+	/* do right sibling readahead */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+	/* modify the owner */
+	block = xfs_btree_get_block(cur, level, &bp);
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+	else
+		block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+	/*
+	 * If the block is a root block hosted in an inode, we might not have a
+	 * buffer pointer here and we shouldn't attempt to log the change as the
+	 * information is already held in the inode and discarded when the root
+	 * block is formatted into the on-disk inode fork. We still change it,
+	 * though, so everything is consistent in memory.
+	 */
+	if (bp) {
+		if (cur->bc_tp) {
+			xfs_trans_ordered_buf(cur->bc_tp, bp);
+			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+		} else {
+			xfs_buf_delwri_queue(bp, buffer_list);
+		}
+	} else {
+		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+		ASSERT(level == cur->bc_nlevels - 1);
+	}
+
+	/* now read rh sibling block for next iteration */
+	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &rptr))
+		return ENOENT;
+
+	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+	struct xfs_btree_cur	*cur,
+	__uint64_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	union xfs_btree_ptr     lptr;
+	int			level;
+	struct xfs_btree_block	*block = NULL;
+	int			error = 0;
+
+	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+	/* for each level */
+	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+		/* grab the left hand block */
+		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+		if (error)
+			return error;
+
+		/* readahead the left most block for the next level down */
+		if (level > 0) {
+			union xfs_btree_ptr     *ptr;
+
+			ptr = xfs_btree_ptr_addr(cur, 1, block);
+			xfs_btree_readahead_ptr(cur, ptr, 1);
+
+			/* save for the next iteration of the loop */
+			lptr = *ptr;
+		}
+
+		/* for each buffer in the level */
+		do {
+			error = xfs_btree_block_change_owner(cur, level,
+							     new_owner,
+							     buffer_list);
+		} while (!error);
+
+		if (error != ENOENT)
+			return error;
+	}
+
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
new file mode 100644
index 000000000000..a1a4e3e47a1e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -0,0 +1,2665 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_root,
+					    xfs_da_state_blk_t *new_child);
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_blk,
+					    xfs_da_state_blk_t *split_blk,
+					    xfs_da_state_blk_t *blk_to_add,
+					    int treelevel,
+					    int *result);
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *node_blk_1,
+					 xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
+				   xfs_da_state_blk_t *old_node_blk,
+				   xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
+					      xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *src_node_blk,
+					 xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC int	xfs_da3_blk_unlink(xfs_da_state_t *state,
+				  xfs_da_state_blk_t *drop_blk,
+				  xfs_da_state_blk_t *save_blk);
+
+
+kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+	int	i;
+
+	for (i = 0; i < state->altpath.active; i++)
+		state->altpath.blk[i].bp = NULL;
+	state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+	xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+	memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+	kmem_zone_free(xfs_da_state_zone, state);
+}
+
+static bool
+xfs_da3_node_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_da_intnode	*hdr = bp->b_addr;
+	struct xfs_da3_icnode_hdr ichdr;
+	const struct xfs_dir_ops *ops;
+
+	ops = xfs_dir_get_ops(mp, NULL);
+
+	ops->node_hdr_from_disk(&ichdr, hdr);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+			return false;
+
+		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (ichdr.magic != XFS_DA_NODE_MAGIC)
+			return false;
+	}
+	if (ichdr.level == 0)
+		return false;
+	if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+		return false;
+	if (ichdr.count == 0)
+		return false;
+
+	/*
+	 * we don't know if the node is for and attribute or directory tree,
+	 * so only fail if the count is outside both bounds
+	 */
+	if (ichdr.count > mp->m_dir_geo->node_ents &&
+	    ichdr.count > mp->m_attr_geo->node_ents)
+		return false;
+
+	/* XXX: hash order check? */
+
+	return true;
+}
+
+static void
+xfs_da3_node_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_da3_node_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_da3_node_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+
+	switch (be16_to_cpu(info->magic)) {
+		case XFS_DA3_NODE_MAGIC:
+			if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+				xfs_buf_ioerror(bp, EFSBADCRC);
+				break;
+			}
+			/* fall through */
+		case XFS_DA_NODE_MAGIC:
+			if (!xfs_da3_node_verify(bp)) {
+				xfs_buf_ioerror(bp, EFSCORRUPTED);
+				break;
+			}
+			return;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			bp->b_ops = &xfs_attr3_leaf_buf_ops;
+			bp->b_ops->verify_read(bp);
+			return;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			bp->b_ops = &xfs_dir3_leafn_buf_ops;
+			bp->b_ops->verify_read(bp);
+			return;
+		default:
+			break;
+	}
+
+	/* corrupt block */
+	xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+	.verify_read = xfs_da3_node_read_verify,
+	.verify_write = xfs_da3_node_write_verify,
+};
+
+int
+xfs_da3_node_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp,
+	int			which_fork)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+					which_fork, &xfs_da3_node_buf_ops);
+	if (!err && tp) {
+		struct xfs_da_blkinfo	*info = (*bpp)->b_addr;
+		int			type;
+
+		switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+		case XFS_DA3_NODE_MAGIC:
+			type = XFS_BLFT_DA_NODE_BUF;
+			break;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			type = XFS_BLFT_ATTR_LEAF_BUF;
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			type = XFS_BLFT_DIR_LEAFN_BUF;
+			break;
+		default:
+			type = 0;
+			ASSERT(0);
+			break;
+		}
+		xfs_trans_buf_set_type(tp, *bpp, type);
+	}
+	return err;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da3_node_create(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		blkno,
+	int			level,
+	struct xfs_buf		**bpp,
+	int			whichfork)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_da3_icnode_hdr ichdr = {0};
+	struct xfs_buf		*bp;
+	int			error;
+	struct xfs_inode	*dp = args->dp;
+
+	trace_xfs_da_node_create(args);
+	ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
+
+	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_da3_node_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+	node = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		ichdr.magic = XFS_DA3_NODE_MAGIC;
+		hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+		hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+		uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+	} else {
+		ichdr.magic = XFS_DA_NODE_MAGIC;
+	}
+	ichdr.level = level;
+
+	dp->d_ops->node_hdr_to_disk(node, &ichdr);
+	xfs_trans_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int							/* error */
+xfs_da3_split(
+	struct xfs_da_state	*state)
+{
+	struct xfs_da_state_blk	*oldblk;
+	struct xfs_da_state_blk	*newblk;
+	struct xfs_da_state_blk	*addblk;
+	struct xfs_da_intnode	*node;
+	struct xfs_buf		*bp;
+	int			max;
+	int			action = 0;
+	int			error;
+	int			i;
+
+	trace_xfs_da_split(state->args);
+
+	/*
+	 * Walk back up the tree splitting/inserting/adjusting as necessary.
+	 * If we need to insert and there isn't room, split the node, then
+	 * decide which fragment to insert the new block from below into.
+	 * Note that we may split the root this way, but we need more fixup.
+	 */
+	max = state->path.active - 1;
+	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+	addblk = &state->path.blk[max];		/* initial dummy value */
+	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+		oldblk = &state->path.blk[i];
+		newblk = &state->altpath.blk[i];
+
+		/*
+		 * If a leaf node then
+		 *     Allocate a new leaf node, then rebalance across them.
+		 * else if an intermediate node then
+		 *     We split on the last layer, must we split the node?
+		 */
+		switch (oldblk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+			error = xfs_attr3_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != ENOSPC)) {
+				return error;	/* GROT: attr is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				trace_xfs_attr_leaf_split_before(state->args);
+				error = xfs_attr3_leaf_split(state, oldblk,
+							    &state->extrablk);
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				trace_xfs_attr_leaf_split_after(state->args);
+				error = xfs_attr3_leaf_split(state, newblk,
+							    &state->extrablk);
+			}
+			if (error)
+				return error;	/* GROT: attr inconsistent */
+			addblk = newblk;
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			error = xfs_dir2_leafn_split(state, oldblk, newblk);
+			if (error)
+				return error;
+			addblk = newblk;
+			break;
+		case XFS_DA_NODE_MAGIC:
+			error = xfs_da3_node_split(state, oldblk, newblk, addblk,
+							 max - i, &action);
+			addblk->bp = NULL;
+			if (error)
+				return error;	/* GROT: dir is inconsistent */
+			/*
+			 * Record the newly split block for the next time thru?
+			 */
+			if (action)
+				addblk = newblk;
+			else
+				addblk = NULL;
+			break;
+		}
+
+		/*
+		 * Update the btree to show the new hashval for this child.
+		 */
+		xfs_da3_fixhashpath(state, &state->path);
+	}
+	if (!addblk)
+		return 0;
+
+	/*
+	 * Split the root node.
+	 */
+	ASSERT(state->path.active == 0);
+	oldblk = &state->path.blk[0];
+	error = xfs_da3_root_split(state, oldblk, addblk);
+	if (error) {
+		addblk->bp = NULL;
+		return error;	/* GROT: dir is inconsistent */
+	}
+
+	/*
+	 * Update pointers to the node which used to be block 0 and
+	 * just got bumped because of the addition of a new root node.
+	 * There might be three blocks involved if a double split occurred,
+	 * and the original block 0 could be at any position in the list.
+	 *
+	 * Note: the magic numbers and sibling pointers are in the same
+	 * physical place for both v2 and v3 headers (by design). Hence it
+	 * doesn't matter which version of the xfs_da_intnode structure we use
+	 * here as the result will be the same using either structure.
+	 */
+	node = oldblk->bp->b_addr;
+	if (node->hdr.info.forw) {
+		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->b_addr;
+		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+		xfs_trans_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	node = oldblk->bp->b_addr;
+	if (node->hdr.info.back) {
+		if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->b_addr;
+		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+		xfs_trans_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	addblk->bp = NULL;
+	return 0;
+}
+
+/*
+ * Split the root.  We have to create a new root and point to the two
+ * parts (the split old root) that we just created.  Copy block zero to
+ * the EOF, extending the inode in process.
+ */
+STATIC int						/* error */
+xfs_da3_root_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da_intnode	*oldroot;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	struct xfs_inode	*dp;
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp;
+	struct xfs_dir2_leaf	*leaf;
+	xfs_dablk_t		blkno;
+	int			level;
+	int			error;
+	int			size;
+
+	trace_xfs_da_root_split(state->args);
+
+	/*
+	 * Copy the existing (incorrect) block from the root node position
+	 * to a free space somewhere.
+	 */
+	args = state->args;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		return error;
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = state->mp;
+	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+	if (error)
+		return error;
+	node = bp->b_addr;
+	oldroot = blk1->bp->b_addr;
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+		struct xfs_da3_icnode_hdr nodehdr;
+
+		dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
+		btree = dp->d_ops->node_tree_p(oldroot);
+		size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
+		level = nodehdr.level;
+
+		/*
+		 * we are about to copy oldroot to bp, so set up the type
+		 * of bp while we know exactly what it will be.
+		 */
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+	} else {
+		struct xfs_dir3_icleaf_hdr leafhdr;
+		struct xfs_dir2_leaf_entry *ents;
+
+		leaf = (xfs_dir2_leaf_t *)oldroot;
+		dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+		ents = dp->d_ops->leaf_ents_p(leaf);
+
+		ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+		       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+		size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+		level = 0;
+
+		/*
+		 * we are about to copy oldroot to bp, so set up the type
+		 * of bp while we know exactly what it will be.
+		 */
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+	}
+
+	/*
+	 * we can copy most of the information in the node from one block to
+	 * another, but for CRC enabled headers we have to make sure that the
+	 * block specific identifiers are kept intact. We update the buffer
+	 * directly for this.
+	 */
+	memcpy(node, oldroot, size);
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+		node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+	}
+	xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+	bp->b_ops = blk1->bp->b_ops;
+	xfs_trans_buf_copy_type(bp, blk1->bp);
+	blk1->bp = bp;
+	blk1->blkno = blkno;
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da3_node_create(args,
+		(args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+		level + 1, &bp, args->whichfork);
+	if (error)
+		return error;
+
+	node = bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	btree = dp->d_ops->node_tree_p(node);
+	btree[0].hashval = cpu_to_be32(blk1->hashval);
+	btree[0].before = cpu_to_be32(blk1->blkno);
+	btree[1].hashval = cpu_to_be32(blk2->hashval);
+	btree[1].before = cpu_to_be32(blk2->blkno);
+	nodehdr.count = 2;
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+
+#ifdef DEBUG
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		ASSERT(blk1->blkno >= args->geo->leafblk &&
+		       blk1->blkno < args->geo->freeblk);
+		ASSERT(blk2->blkno >= args->geo->leafblk &&
+		       blk2->blkno < args->geo->freeblk);
+	}
+#endif
+
+	/* Header is already logged by xfs_da_node_create */
+	xfs_trans_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
+
+	return 0;
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int						/* error */
+xfs_da3_node_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk,
+	struct xfs_da_state_blk	*addblk,
+	int			treelevel,
+	int			*result)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	xfs_dablk_t		blkno;
+	int			newcount;
+	int			error;
+	int			useextra;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_split(state->args);
+
+	node = oldblk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+
+	/*
+	 * With V2 dirs the extra block is data or freespace.
+	 */
+	useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
+	newcount = 1 + useextra;
+	/*
+	 * Do we have to split the node?
+	 */
+	if (nodehdr.count + newcount > state->args->geo->node_ents) {
+		/*
+		 * Allocate a new node, add to the doubly linked chain of
+		 * nodes, then move some of our excess entries into it.
+		 */
+		error = xfs_da_grow_inode(state->args, &blkno);
+		if (error)
+			return error;	/* GROT: dir is inconsistent */
+
+		error = xfs_da3_node_create(state->args, blkno, treelevel,
+					   &newblk->bp, state->args->whichfork);
+		if (error)
+			return error;	/* GROT: dir is inconsistent */
+		newblk->blkno = blkno;
+		newblk->magic = XFS_DA_NODE_MAGIC;
+		xfs_da3_node_rebalance(state, oldblk, newblk);
+		error = xfs_da3_blk_link(state, oldblk, newblk);
+		if (error)
+			return error;
+		*result = 1;
+	} else {
+		*result = 0;
+	}
+
+	/*
+	 * Insert the new entry(s) into the correct block
+	 * (updating last hashval in the process).
+	 *
+	 * xfs_da3_node_add() inserts BEFORE the given index,
+	 * and as a result of using node_lookup_int() we always
+	 * point to a valid entry (not after one), but a split
+	 * operation always results in a new block whose hashvals
+	 * FOLLOW the current block.
+	 *
+	 * If we had double-split op below us, then add the extra block too.
+	 */
+	node = oldblk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	if (oldblk->index <= nodehdr.count) {
+		oldblk->index++;
+		xfs_da3_node_add(state, oldblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				oldblk->index++;
+			xfs_da3_node_add(state, oldblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	} else {
+		newblk->index++;
+		xfs_da3_node_add(state, newblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				newblk->index++;
+			xfs_da3_node_add(state, newblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
+STATIC void
+xfs_da3_node_rebalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
+{
+	struct xfs_da_intnode	*node1;
+	struct xfs_da_intnode	*node2;
+	struct xfs_da_intnode	*tmpnode;
+	struct xfs_da_node_entry *btree1;
+	struct xfs_da_node_entry *btree2;
+	struct xfs_da_node_entry *btree_s;
+	struct xfs_da_node_entry *btree_d;
+	struct xfs_da3_icnode_hdr nodehdr1;
+	struct xfs_da3_icnode_hdr nodehdr2;
+	struct xfs_trans	*tp;
+	int			count;
+	int			tmp;
+	int			swap = 0;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_rebalance(state->args);
+
+	node1 = blk1->bp->b_addr;
+	node2 = blk2->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+	dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+	btree1 = dp->d_ops->node_tree_p(node1);
+	btree2 = dp->d_ops->node_tree_p(node2);
+
+	/*
+	 * Figure out how many entries need to move, and in which direction.
+	 * Swap the nodes around if that makes it simpler.
+	 */
+	if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+	     (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+			be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
+		tmpnode = node1;
+		node1 = node2;
+		node2 = tmpnode;
+		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+		btree1 = dp->d_ops->node_tree_p(node1);
+		btree2 = dp->d_ops->node_tree_p(node2);
+		swap = 1;
+	}
+
+	count = (nodehdr1.count - nodehdr2.count) / 2;
+	if (count == 0)
+		return;
+	tp = state->args->trans;
+	/*
+	 * Two cases: high-to-low and low-to-high.
+	 */
+	if (count > 0) {
+		/*
+		 * Move elements in node2 up to make a hole.
+		 */
+		tmp = nodehdr2.count;
+		if (tmp > 0) {
+			tmp *= (uint)sizeof(xfs_da_node_entry_t);
+			btree_s = &btree2[0];
+			btree_d = &btree2[count];
+			memmove(btree_d, btree_s, tmp);
+		}
+
+		/*
+		 * Move the req'd B-tree elements from high in node1 to
+		 * low in node2.
+		 */
+		nodehdr2.count += count;
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &btree1[nodehdr1.count - count];
+		btree_d = &btree2[0];
+		memcpy(btree_d, btree_s, tmp);
+		nodehdr1.count -= count;
+	} else {
+		/*
+		 * Move the req'd B-tree elements from low in node2 to
+		 * high in node1.
+		 */
+		count = -count;
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &btree2[0];
+		btree_d = &btree1[nodehdr1.count];
+		memcpy(btree_d, btree_s, tmp);
+		nodehdr1.count += count;
+
+		xfs_trans_log_buf(tp, blk1->bp,
+			XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+		/*
+		 * Move elements in node2 down to fill the hole.
+		 */
+		tmp  = nodehdr2.count - count;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &btree2[count];
+		btree_d = &btree2[0];
+		memmove(btree_d, btree_s, tmp);
+		nodehdr2.count -= count;
+	}
+
+	/*
+	 * Log header of node 1 and all current bits of node 2.
+	 */
+	dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
+	xfs_trans_log_buf(tp, blk1->bp,
+		XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+
+	dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
+	xfs_trans_log_buf(tp, blk2->bp,
+		XFS_DA_LOGRANGE(node2, &node2->hdr,
+				dp->d_ops->node_hdr_size +
+				(sizeof(btree2[0]) * nodehdr2.count)));
+
+	/*
+	 * Record the last hashval from each block for upward propagation.
+	 * (note: don't use the swapped node pointers)
+	 */
+	if (swap) {
+		node1 = blk1->bp->b_addr;
+		node2 = blk2->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+		btree1 = dp->d_ops->node_tree_p(node1);
+		btree2 = dp->d_ops->node_tree_p(node2);
+	}
+	blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+	blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (blk1->index >= nodehdr1.count) {
+		blk2->index = blk1->index - nodehdr1.count;
+		blk1->index = nodehdr1.count + 1;	/* make it invalid */
+	}
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da3_node_add(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
+	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_add(state->args);
+
+	node = oldblk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	btree = dp->d_ops->node_tree_p(node);
+
+	ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
+	ASSERT(newblk->blkno != 0);
+	if (state->args->whichfork == XFS_DATA_FORK)
+		ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+		       newblk->blkno < state->args->geo->freeblk);
+
+	/*
+	 * We may need to make some room before we insert the new node.
+	 */
+	tmp = 0;
+	if (oldblk->index < nodehdr.count) {
+		tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+		memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
+	}
+	btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+	btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
+	xfs_trans_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+				tmp + sizeof(*btree)));
+
+	nodehdr.count += 1;
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+	xfs_trans_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+	/*
+	 * Copy the last hash value from the oldblk to propagate upwards.
+	 */
+	oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da3_join(
+	struct xfs_da_state	*state)
+{
+	struct xfs_da_state_blk	*drop_blk;
+	struct xfs_da_state_blk	*save_blk;
+	int			action = 0;
+	int			error;
+
+	trace_xfs_da_join(state->args);
+
+	drop_blk = &state->path.blk[ state->path.active-1 ];
+	save_blk = &state->altpath.blk[ state->path.active-1 ];
+	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+	       drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+	/*
+	 * Walk back up the tree joining/deallocating as necessary.
+	 * When we stop dropping blocks, break out.
+	 */
+	for (  ; state->path.active >= 2; drop_blk--, save_blk--,
+		 state->path.active--) {
+		/*
+		 * See if we can combine the block with a neighbor.
+		 *   (action == 0) => no options, just leave
+		 *   (action == 1) => coalesce, then unlink
+		 *   (action == 2) => block empty, unlink it
+		 */
+		switch (drop_blk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+			error = xfs_attr3_leaf_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			error = xfs_dir2_leafn_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DA_NODE_MAGIC:
+			/*
+			 * Remove the offending node, fixup hashvals,
+			 * check for a toosmall neighbor.
+			 */
+			xfs_da3_node_remove(state, drop_blk);
+			xfs_da3_fixhashpath(state, &state->path);
+			error = xfs_da3_node_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_da3_node_unbalance(state, drop_blk, save_blk);
+			break;
+		}
+		xfs_da3_fixhashpath(state, &state->altpath);
+		error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
+		xfs_da_state_kill_altpath(state);
+		if (error)
+			return error;
+		error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+							 drop_blk->bp);
+		drop_blk->bp = NULL;
+		if (error)
+			return error;
+	}
+	/*
+	 * We joined all the way to the top.  If it turns out that
+	 * we only have one entry in the root, make the child block
+	 * the new root.
+	 */
+	xfs_da3_node_remove(state, drop_blk);
+	xfs_da3_fixhashpath(state, &state->path);
+	error = xfs_da3_root_join(state, &state->path.blk[0]);
+	return error;
+}
+
+#ifdef	DEBUG
+static void
+xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
+{
+	__be16	magic = blkinfo->magic;
+
+	if (level == 1) {
+		ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+		       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+		       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+	} else {
+		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+		       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+	}
+	ASSERT(!blkinfo->forw);
+	ASSERT(!blkinfo->back);
+}
+#else	/* !DEBUG */
+#define	xfs_da_blkinfo_onlychild_validate(blkinfo, level)
+#endif	/* !DEBUG */
+
+/*
+ * We have only one entry in the root.  Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da3_root_join(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*root_blk)
+{
+	struct xfs_da_intnode	*oldroot;
+	struct xfs_da_args	*args;
+	xfs_dablk_t		child;
+	struct xfs_buf		*bp;
+	struct xfs_da3_icnode_hdr oldroothdr;
+	struct xfs_da_node_entry *btree;
+	int			error;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_root_join(state->args);
+
+	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+	args = state->args;
+	oldroot = root_blk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+	ASSERT(oldroothdr.forw == 0);
+	ASSERT(oldroothdr.back == 0);
+
+	/*
+	 * If the root has more than one child, then don't do anything.
+	 */
+	if (oldroothdr.count > 1)
+		return 0;
+
+	/*
+	 * Read in the (only) child block, then copy those bytes into
+	 * the root block's buffer and free the original child block.
+	 */
+	btree = dp->d_ops->node_tree_p(oldroot);
+	child = be32_to_cpu(btree[0].before);
+	ASSERT(child != 0);
+	error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
+					     args->whichfork);
+	if (error)
+		return error;
+	xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
+
+	/*
+	 * This could be copying a leaf back into the root block in the case of
+	 * there only being a single leaf block left in the tree. Hence we have
+	 * to update the b_ops pointer as well to match the buffer type change
+	 * that could occur. For dir3 blocks we also need to update the block
+	 * number in the buffer header.
+	 */
+	memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
+	root_blk->bp->b_ops = bp->b_ops;
+	xfs_trans_buf_copy_type(root_blk->bp, bp);
+	if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+		struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+		da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+	}
+	xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+			  args->geo->blksize - 1);
+	error = xfs_da_shrink_inode(args, child, bp);
+	return error;
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+STATIC int
+xfs_da3_node_toosmall(
+	struct xfs_da_state	*state,
+	int			*action)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*info;
+	xfs_dablk_t		blkno;
+	struct xfs_buf		*bp;
+	struct xfs_da3_icnode_hdr nodehdr;
+	int			count;
+	int			forward;
+	int			error;
+	int			retval;
+	int			i;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_toosmall(state->args);
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->b_addr;
+	node = (xfs_da_intnode_t *)info;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
+		*action = 0;	/* blk over 50%, don't try to join */
+		return 0;	/* blk over 50%, don't try to join */
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (nodehdr.count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (info->forw != 0);
+		memcpy(&state->altpath, &state->path, sizeof(state->path));
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return error;
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return 0;
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	count  = state->args->geo->node_ents;
+	count -= state->args->geo->node_ents >> 2;
+	count -= nodehdr.count;
+
+	/* start with smaller blk num */
+	forward = nodehdr.forw < nodehdr.back;
+	for (i = 0; i < 2; forward = !forward, i++) {
+		struct xfs_da3_icnode_hdr thdr;
+		if (forward)
+			blkno = nodehdr.forw;
+		else
+			blkno = nodehdr.back;
+		if (blkno == 0)
+			continue;
+		error = xfs_da3_node_read(state->args->trans, dp,
+					blkno, -1, &bp, state->args->whichfork);
+		if (error)
+			return error;
+
+		node = bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&thdr, node);
+		xfs_trans_brelse(state->args->trans, bp);
+
+		if (count - thdr.count >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	memcpy(&state->altpath, &state->path, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da3_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return error;
+	if (retval) {
+		*action = 0;
+		return 0;
+	}
+	*action = 1;
+	return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp,
+	int			*count)
+{
+	struct xfs_da_intnode	 *node;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+
+	node = bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	if (count)
+		*count = nodehdr.count;
+	if (!nodehdr.count)
+		return 0;
+	btree = dp->d_ops->node_tree_p(node);
+	return be32_to_cpu(btree[nodehdr.count - 1].hashval);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary,
+ * when we stop making changes, return.
+ */
+void
+xfs_da3_fixhashpath(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path)
+{
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_node_entry *btree;
+	xfs_dahash_t		lasthash=0;
+	int			level;
+	int			count;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_fixhashpath(state->args);
+
+	level = path->active-1;
+	blk = &path->blk[ level ];
+	switch (blk->magic) {
+	case XFS_ATTR_LEAF_MAGIC:
+		lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DA_NODE_MAGIC:
+		lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	}
+	for (blk--, level--; level >= 0; blk--, level--) {
+		struct xfs_da3_icnode_hdr nodehdr;
+
+		node = blk->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+		btree = dp->d_ops->node_tree_p(node);
+		if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
+			break;
+		blk->hashval = lasthash;
+		btree[blk->index].hashval = cpu_to_be32(lasthash);
+		xfs_trans_log_buf(state->args->trans, blk->bp,
+				  XFS_DA_LOGRANGE(node, &btree[blk->index],
+						  sizeof(*btree)));
+
+		lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+	}
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da3_node_remove(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk)
+{
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
+	int			index;
+	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_remove(state->args);
+
+	node = drop_blk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+	ASSERT(drop_blk->index < nodehdr.count);
+	ASSERT(drop_blk->index >= 0);
+
+	/*
+	 * Copy over the offending entry, or just zero it out.
+	 */
+	index = drop_blk->index;
+	btree = dp->d_ops->node_tree_p(node);
+	if (index < nodehdr.count - 1) {
+		tmp  = nodehdr.count - index - 1;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		memmove(&btree[index], &btree[index + 1], tmp);
+		xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+		    XFS_DA_LOGRANGE(node, &btree[index], tmp));
+		index = nodehdr.count - 1;
+	}
+	memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
+	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+	nodehdr.count -= 1;
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+
+	/*
+	 * Copy the last hash value from the block to propagate upwards.
+	 */
+	drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
+}
+
+/*
+ * Unbalance the elements between two intermediate nodes,
+ * move all Btree elements from one node into another.
+ */
+STATIC void
+xfs_da3_node_unbalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
+{
+	struct xfs_da_intnode	*drop_node;
+	struct xfs_da_intnode	*save_node;
+	struct xfs_da_node_entry *drop_btree;
+	struct xfs_da_node_entry *save_btree;
+	struct xfs_da3_icnode_hdr drop_hdr;
+	struct xfs_da3_icnode_hdr save_hdr;
+	struct xfs_trans	*tp;
+	int			sindex;
+	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_node_unbalance(state->args);
+
+	drop_node = drop_blk->bp->b_addr;
+	save_node = save_blk->bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+	dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
+	drop_btree = dp->d_ops->node_tree_p(drop_node);
+	save_btree = dp->d_ops->node_tree_p(save_node);
+	tp = state->args->trans;
+
+	/*
+	 * If the dying block has lower hashvals, then move all the
+	 * elements in the remaining block up to make a hole.
+	 */
+	if ((be32_to_cpu(drop_btree[0].hashval) <
+			be32_to_cpu(save_btree[0].hashval)) ||
+	    (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+			be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+		/* XXX: check this - is memmove dst correct? */
+		tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+		memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+		sindex = 0;
+		xfs_trans_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, &save_btree[0],
+				(save_hdr.count + drop_hdr.count) *
+						sizeof(xfs_da_node_entry_t)));
+	} else {
+		sindex = save_hdr.count;
+		xfs_trans_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+				drop_hdr.count * sizeof(xfs_da_node_entry_t)));
+	}
+
+	/*
+	 * Move all the B-tree elements from drop_blk to save_blk.
+	 */
+	tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+	memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+	save_hdr.count += drop_hdr.count;
+
+	dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
+	xfs_trans_log_buf(tp, save_blk->bp,
+		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+				dp->d_ops->node_hdr_size));
+
+	/*
+	 * Save the last hashval in the remaining block for upward propagation.
+	 */
+	save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend.  This is a
+ * pruned depth-first tree search.
+ */
+int							/* error */
+xfs_da3_node_lookup_int(
+	struct xfs_da_state	*state,
+	int			*result)
+{
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*curr;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_args	*args;
+	xfs_dablk_t		blkno;
+	xfs_dahash_t		hashval;
+	xfs_dahash_t		btreehashval;
+	int			probe;
+	int			span;
+	int			max;
+	int			error;
+	int			retval;
+	struct xfs_inode	*dp = state->args->dp;
+
+	args = state->args;
+
+	/*
+	 * Descend thru the B-tree searching each level for the right
+	 * node to use, until the right hashval is found.
+	 */
+	blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
+	for (blk = &state->path.blk[0], state->path.active = 1;
+			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
+			 blk++, state->path.active++) {
+		/*
+		 * Read the next node down in the tree.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da3_node_read(args->trans, args->dp, blkno,
+					-1, &blk->bp, args->whichfork);
+		if (error) {
+			blk->blkno = 0;
+			state->path.active--;
+			return error;
+		}
+		curr = blk->bp->b_addr;
+		blk->magic = be16_to_cpu(curr->magic);
+
+		if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+		    blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+			blk->magic = XFS_ATTR_LEAF_MAGIC;
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		}
+
+		if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+		    blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+			blk->magic = XFS_DIR2_LEAFN_MAGIC;
+			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+							       blk->bp, NULL);
+			break;
+		}
+
+		blk->magic = XFS_DA_NODE_MAGIC;
+
+
+		/*
+		 * Search an intermediate node for a match.
+		 */
+		node = blk->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+		btree = dp->d_ops->node_tree_p(node);
+
+		max = nodehdr.count;
+		blk->hashval = be32_to_cpu(btree[max - 1].hashval);
+
+		/*
+		 * Binary search.  (note: small blocks will skip loop)
+		 */
+		probe = span = max / 2;
+		hashval = args->hashval;
+		while (span > 4) {
+			span /= 2;
+			btreehashval = be32_to_cpu(btree[probe].hashval);
+			if (btreehashval < hashval)
+				probe += span;
+			else if (btreehashval > hashval)
+				probe -= span;
+			else
+				break;
+		}
+		ASSERT((probe >= 0) && (probe < max));
+		ASSERT((span <= 4) ||
+			(be32_to_cpu(btree[probe].hashval) == hashval));
+
+		/*
+		 * Since we may have duplicate hashval's, find the first
+		 * matching hashval in the node.
+		 */
+		while (probe > 0 &&
+		       be32_to_cpu(btree[probe].hashval) >= hashval) {
+			probe--;
+		}
+		while (probe < max &&
+		       be32_to_cpu(btree[probe].hashval) < hashval) {
+			probe++;
+		}
+
+		/*
+		 * Pick the right block to descend on.
+		 */
+		if (probe == max) {
+			blk->index = max - 1;
+			blkno = be32_to_cpu(btree[max - 1].before);
+		} else {
+			blk->index = probe;
+			blkno = be32_to_cpu(btree[probe].before);
+		}
+	}
+
+	/*
+	 * A leaf block that ends in the hashval that we are interested in
+	 * (final hashval == search hashval) means that the next block may
+	 * contain more entries with the same hashval, shift upward to the
+	 * next leaf and keep searching.
+	 */
+	for (;;) {
+		if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
+							&blk->index, state);
+		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+			retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
+			blk->index = args->index;
+			args->blkno = blk->blkno;
+		} else {
+			ASSERT(0);
+			return EFSCORRUPTED;
+		}
+		if (((retval == ENOENT) || (retval == ENOATTR)) &&
+		    (blk->hashval == args->hashval)) {
+			error = xfs_da3_path_shift(state, &state->path, 1, 1,
+							 &retval);
+			if (error)
+				return error;
+			if (retval == 0) {
+				continue;
+			} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+				/* path_shift() gives ENOENT */
+				retval = ENOATTR;
+			}
+		}
+		break;
+	}
+	*result = retval;
+	return 0;
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+	struct xfs_inode *dp,
+	struct xfs_buf	*node1_bp,
+	struct xfs_buf	*node2_bp)
+{
+	struct xfs_da_intnode	*node1;
+	struct xfs_da_intnode	*node2;
+	struct xfs_da_node_entry *btree1;
+	struct xfs_da_node_entry *btree2;
+	struct xfs_da3_icnode_hdr node1hdr;
+	struct xfs_da3_icnode_hdr node2hdr;
+
+	node1 = node1_bp->b_addr;
+	node2 = node2_bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+	dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
+	btree1 = dp->d_ops->node_tree_p(node1);
+	btree2 = dp->d_ops->node_tree_p(node2);
+
+	if (node1hdr.count > 0 && node2hdr.count > 0 &&
+	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+	     (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+	      be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int							/* error */
+xfs_da3_blk_link(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*old_blk,
+	struct xfs_da_state_blk	*new_blk)
+{
+	struct xfs_da_blkinfo	*old_info;
+	struct xfs_da_blkinfo	*new_info;
+	struct xfs_da_blkinfo	*tmp_info;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	int			before = 0;
+	int			error;
+	struct xfs_inode	*dp = state->args->dp;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	old_info = old_blk->bp->b_addr;
+	new_info = new_blk->bp->b_addr;
+	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+	       old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+	switch (old_blk->magic) {
+	case XFS_ATTR_LEAF_MAGIC:
+		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DA_NODE_MAGIC:
+		before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
+		break;
+	}
+
+	/*
+	 * Link blocks in appropriate order.
+	 */
+	if (before) {
+		/*
+		 * Link new block in before existing block.
+		 */
+		trace_xfs_da_link_before(args);
+		new_info->forw = cpu_to_be32(old_blk->blkno);
+		new_info->back = old_info->back;
+		if (old_info->back) {
+			error = xfs_da3_node_read(args->trans, dp,
+						be32_to_cpu(old_info->back),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == old_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
+			tmp_info->forw = cpu_to_be32(new_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+		}
+		old_info->back = cpu_to_be32(new_blk->blkno);
+	} else {
+		/*
+		 * Link new block in after existing block.
+		 */
+		trace_xfs_da_link_after(args);
+		new_info->forw = old_info->forw;
+		new_info->back = cpu_to_be32(old_blk->blkno);
+		if (old_info->forw) {
+			error = xfs_da3_node_read(args->trans, dp,
+						be32_to_cpu(old_info->forw),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == old_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
+			tmp_info->back = cpu_to_be32(new_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+		}
+		old_info->forw = cpu_to_be32(new_blk->blkno);
+	}
+
+	xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+	xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+	return 0;
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+STATIC int						/* error */
+xfs_da3_blk_unlink(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
+{
+	struct xfs_da_blkinfo	*drop_info;
+	struct xfs_da_blkinfo	*save_info;
+	struct xfs_da_blkinfo	*tmp_info;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	int			error;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	save_info = save_blk->bp->b_addr;
+	drop_info = drop_blk->bp->b_addr;
+	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+	       save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == drop_blk->magic);
+	ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
+	       (be32_to_cpu(save_info->back) == drop_blk->blkno));
+	ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
+	       (be32_to_cpu(drop_info->back) == save_blk->blkno));
+
+	/*
+	 * Unlink the leaf block from the doubly linked chain of leaves.
+	 */
+	if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+		trace_xfs_da_unlink_back(args);
+		save_info->back = drop_info->back;
+		if (drop_info->back) {
+			error = xfs_da3_node_read(args->trans, args->dp,
+						be32_to_cpu(drop_info->back),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == save_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
+			tmp_info->forw = cpu_to_be32(save_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+		}
+	} else {
+		trace_xfs_da_unlink_forward(args);
+		save_info->forw = drop_info->forw;
+		if (drop_info->forw) {
+			error = xfs_da3_node_read(args->trans, args->dp,
+						be32_to_cpu(drop_info->forw),
+						-1, &bp, args->whichfork);
+			if (error)
+				return error;
+			ASSERT(bp != NULL);
+			tmp_info = bp->b_addr;
+			ASSERT(tmp_info->magic == save_info->magic);
+			ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
+			tmp_info->back = cpu_to_be32(save_blk->blkno);
+			xfs_trans_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+		}
+	}
+
+	xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+	return 0;
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+int							/* error */
+xfs_da3_path_shift(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path,
+	int			forward,
+	int			release,
+	int			*result)
+{
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*info;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_args	*args;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	xfs_dablk_t		blkno = 0;
+	int			level;
+	int			error;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_path_shift(state->args);
+
+	/*
+	 * Roll up the Btree looking for the first block where our
+	 * current index is not at the edge of the block.  Note that
+	 * we skip the bottom layer because we want the sibling block.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(path != NULL);
+	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	level = (path->active-1) - 1;	/* skip bottom layer in path */
+	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+		node = blk->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+		btree = dp->d_ops->node_tree_p(node);
+
+		if (forward && (blk->index < nodehdr.count - 1)) {
+			blk->index++;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		} else if (!forward && (blk->index > 0)) {
+			blk->index--;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		}
+	}
+	if (level < 0) {
+		*result = ENOENT;	/* we're out of our tree */
+		ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+		return 0;
+	}
+
+	/*
+	 * Roll down the edge of the subtree until we reach the
+	 * same depth we were at originally.
+	 */
+	for (blk++, level++; level < path->active; blk++, level++) {
+		/*
+		 * Release the old block.
+		 * (if it's dirty, trans won't actually let go)
+		 */
+		if (release)
+			xfs_trans_brelse(args->trans, blk->bp);
+
+		/*
+		 * Read the next child block.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da3_node_read(args->trans, dp, blkno, -1,
+					&blk->bp, args->whichfork);
+		if (error)
+			return error;
+		info = blk->bp->b_addr;
+		ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+		/*
+		 * Note: we flatten the magic number to a single type so we
+		 * don't have to compare against crc/non-crc types elsewhere.
+		 */
+		switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+		case XFS_DA3_NODE_MAGIC:
+			blk->magic = XFS_DA_NODE_MAGIC;
+			node = (xfs_da_intnode_t *)info;
+			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+			btree = dp->d_ops->node_tree_p(node);
+			blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+			if (forward)
+				blk->index = 0;
+			else
+				blk->index = nodehdr.count - 1;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			blk->magic = XFS_ATTR_LEAF_MAGIC;
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			blk->magic = XFS_DIR2_LEAFN_MAGIC;
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+							       blk->bp, NULL);
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
+	}
+	*result = 0;
+	return 0;
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(const __uint8_t *name, int namelen)
+{
+	xfs_dahash_t hash;
+
+	/*
+	 * Do four characters at a time as long as we can.
+	 */
+	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
+		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+		       (name[3] << 0) ^ rol32(hash, 7 * 4);
+
+	/*
+	 * Now do the rest of the characters.
+	 */
+	switch (namelen) {
+	case 3:
+		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+		       rol32(hash, 7 * 3);
+	case 2:
+		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
+	case 1:
+		return (name[0] << 0) ^ rol32(hash, 7 * 1);
+	default: /* case 0: */
+		return hash;
+	}
+}
+
+enum xfs_dacmp
+xfs_da_compname(
+	struct xfs_da_args *args,
+	const unsigned char *name,
+	int		len)
+{
+	return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+					XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+static xfs_dahash_t
+xfs_default_hashname(
+	struct xfs_name	*name)
+{
+	return xfs_da_hashname(name->name, name->len);
+}
+
+const struct xfs_nameops xfs_default_nameops = {
+	.hashname	= xfs_default_hashname,
+	.compname	= xfs_da_compname
+};
+
+int
+xfs_da_grow_inode_int(
+	struct xfs_da_args	*args,
+	xfs_fileoff_t		*bno,
+	int			count)
+{
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_inode	*dp = args->dp;
+	int			w = args->whichfork;
+	xfs_drfsbno_t		nblks = dp->i_d.di_nblocks;
+	struct xfs_bmbt_irec	map, *mapp;
+	int			nmap, error, got, i, mapi;
+
+	/*
+	 * Find a spot in the file space to put the new block.
+	 */
+	error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+	if (error)
+		return error;
+
+	/*
+	 * Try mapping it in one filesystem block.
+	 */
+	nmap = 1;
+	ASSERT(args->firstblock != NULL);
+	error = xfs_bmapi_write(tp, dp, *bno, count,
+			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+			args->firstblock, args->total, &map, &nmap,
+			args->flist);
+	if (error)
+		return error;
+
+	ASSERT(nmap <= 1);
+	if (nmap == 1) {
+		mapp = &map;
+		mapi = 1;
+	} else if (nmap == 0 && count > 1) {
+		xfs_fileoff_t		b;
+		int			c;
+
+		/*
+		 * If we didn't get it and the block might work if fragmented,
+		 * try without the CONTIG flag.  Loop until we get it all.
+		 */
+		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		for (b = *bno, mapi = 0; b < *bno + count; ) {
+			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+			c = (int)(*bno + count - b);
+			error = xfs_bmapi_write(tp, dp, b, c,
+					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+					args->firstblock, args->total,
+					&mapp[mapi], &nmap, args->flist);
+			if (error)
+				goto out_free_map;
+			if (nmap < 1)
+				break;
+			mapi += nmap;
+			b = mapp[mapi - 1].br_startoff +
+			    mapp[mapi - 1].br_blockcount;
+		}
+	} else {
+		mapi = 0;
+		mapp = NULL;
+	}
+
+	/*
+	 * Count the blocks we got, make sure it matches the total.
+	 */
+	for (i = 0, got = 0; i < mapi; i++)
+		got += mapp[i].br_blockcount;
+	if (got != count || mapp[0].br_startoff != *bno ||
+	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+	    *bno + count) {
+		error = ENOSPC;
+		goto out_free_map;
+	}
+
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
+
+out_free_map:
+	if (mapp != &map)
+		kmem_free(mapp);
+	return error;
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		*new_blkno)
+{
+	xfs_fileoff_t		bno;
+	int			error;
+
+	trace_xfs_da_grow_inode(args);
+
+	bno = args->geo->leafblk;
+	error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
+	if (!error)
+		*new_blkno = (xfs_dablk_t)bno;
+	return error;
+}
+
+/*
+ * Ick.  We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file.  The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ */
+STATIC int
+xfs_da3_swap_lastblock(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		*dead_blknop,
+	struct xfs_buf		**dead_bufp)
+{
+	struct xfs_da_blkinfo	*dead_info;
+	struct xfs_da_blkinfo	*sib_info;
+	struct xfs_da_intnode	*par_node;
+	struct xfs_da_intnode	*dead_node;
+	struct xfs_dir2_leaf	*dead_leaf2;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr par_hdr;
+	struct xfs_inode	*dp;
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp;
+	struct xfs_buf		*dead_buf;
+	struct xfs_buf		*last_buf;
+	struct xfs_buf		*sib_buf;
+	struct xfs_buf		*par_buf;
+	xfs_dahash_t		dead_hash;
+	xfs_fileoff_t		lastoff;
+	xfs_dablk_t		dead_blkno;
+	xfs_dablk_t		last_blkno;
+	xfs_dablk_t		sib_blkno;
+	xfs_dablk_t		par_blkno;
+	int			error;
+	int			w;
+	int			entno;
+	int			level;
+	int			dead_level;
+
+	trace_xfs_da_swap_lastblock(args);
+
+	dead_buf = *dead_bufp;
+	dead_blkno = *dead_blknop;
+	tp = args->trans;
+	dp = args->dp;
+	w = args->whichfork;
+	ASSERT(w == XFS_DATA_FORK);
+	mp = dp->i_mount;
+	lastoff = args->geo->freeblk;
+	error = xfs_bmap_last_before(tp, dp, &lastoff, w);
+	if (error)
+		return error;
+	if (unlikely(lastoff == 0)) {
+		XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
+				 mp);
+		return EFSCORRUPTED;
+	}
+	/*
+	 * Read the last block in the btree space.
+	 */
+	last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+	error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+	if (error)
+		return error;
+	/*
+	 * Copy the last block into the dead buffer and log it.
+	 */
+	memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+	xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+	dead_info = dead_buf->b_addr;
+	/*
+	 * Get values from the moved block.
+	 */
+	if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	    dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_dir3_icleaf_hdr leafhdr;
+		struct xfs_dir2_leaf_entry *ents;
+
+		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+		dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+		ents = dp->d_ops->leaf_ents_p(dead_leaf2);
+		dead_level = 0;
+		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
+	} else {
+		struct xfs_da3_icnode_hdr deadhdr;
+
+		dead_node = (xfs_da_intnode_t *)dead_info;
+		dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+		btree = dp->d_ops->node_tree_p(dead_node);
+		dead_level = deadhdr.level;
+		dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
+	}
+	sib_buf = par_buf = NULL;
+	/*
+	 * If the moved block has a left sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+		if (error)
+			goto done;
+		sib_info = sib_buf->b_addr;
+		if (unlikely(
+		    be32_to_cpu(sib_info->forw) != last_blkno ||
+		    sib_info->magic != dead_info->magic)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = EFSCORRUPTED;
+			goto done;
+		}
+		sib_info->forw = cpu_to_be32(dead_blkno);
+		xfs_trans_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+					sizeof(sib_info->forw)));
+		sib_buf = NULL;
+	}
+	/*
+	 * If the moved block has a right sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+		if (error)
+			goto done;
+		sib_info = sib_buf->b_addr;
+		if (unlikely(
+		       be32_to_cpu(sib_info->back) != last_blkno ||
+		       sib_info->magic != dead_info->magic)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = EFSCORRUPTED;
+			goto done;
+		}
+		sib_info->back = cpu_to_be32(dead_blkno);
+		xfs_trans_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+					sizeof(sib_info->back)));
+		sib_buf = NULL;
+	}
+	par_blkno = args->geo->leafblk;
+	level = -1;
+	/*
+	 * Walk down the tree looking for the parent of the moved block.
+	 */
+	for (;;) {
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+		if (error)
+			goto done;
+		par_node = par_buf->b_addr;
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+		if (level >= 0 && level != par_hdr.level + 1) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = EFSCORRUPTED;
+			goto done;
+		}
+		level = par_hdr.level;
+		btree = dp->d_ops->node_tree_p(par_node);
+		for (entno = 0;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].hashval) < dead_hash;
+		     entno++)
+			continue;
+		if (entno == par_hdr.count) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = EFSCORRUPTED;
+			goto done;
+		}
+		par_blkno = be32_to_cpu(btree[entno].before);
+		if (level == dead_level + 1)
+			break;
+		xfs_trans_brelse(tp, par_buf);
+		par_buf = NULL;
+	}
+	/*
+	 * We're in the right parent block.
+	 * Look for the right entry.
+	 */
+	for (;;) {
+		for (;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].before) != last_blkno;
+		     entno++)
+			continue;
+		if (entno < par_hdr.count)
+			break;
+		par_blkno = par_hdr.forw;
+		xfs_trans_brelse(tp, par_buf);
+		par_buf = NULL;
+		if (unlikely(par_blkno == 0)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = EFSCORRUPTED;
+			goto done;
+		}
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+		if (error)
+			goto done;
+		par_node = par_buf->b_addr;
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+		if (par_hdr.level != level) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = EFSCORRUPTED;
+			goto done;
+		}
+		btree = dp->d_ops->node_tree_p(par_node);
+		entno = 0;
+	}
+	/*
+	 * Update the parent entry pointing to the moved block.
+	 */
+	btree[entno].before = cpu_to_be32(dead_blkno);
+	xfs_trans_log_buf(tp, par_buf,
+		XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+				sizeof(btree[entno].before)));
+	*dead_blknop = last_blkno;
+	*dead_bufp = last_buf;
+	return 0;
+done:
+	if (par_buf)
+		xfs_trans_brelse(tp, par_buf);
+	if (sib_buf)
+		xfs_trans_brelse(tp, sib_buf);
+	xfs_trans_brelse(tp, last_buf);
+	return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(
+	xfs_da_args_t	*args,
+	xfs_dablk_t	dead_blkno,
+	struct xfs_buf	*dead_buf)
+{
+	xfs_inode_t *dp;
+	int done, error, w, count;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+
+	trace_xfs_da_shrink_inode(args);
+
+	dp = args->dp;
+	w = args->whichfork;
+	tp = args->trans;
+	mp = dp->i_mount;
+	count = args->geo->fsbcount;
+	for (;;) {
+		/*
+		 * Remove extents.  If we get ENOSPC for a dir we have to move
+		 * the last block to the place we want to kill.
+		 */
+		error = xfs_bunmapi(tp, dp, dead_blkno, count,
+				    xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+				    0, args->firstblock, args->flist, &done);
+		if (error == ENOSPC) {
+			if (w != XFS_DATA_FORK)
+				break;
+			error = xfs_da3_swap_lastblock(args, &dead_blkno,
+						      &dead_buf);
+			if (error)
+				break;
+		} else {
+			break;
+		}
+	}
+	xfs_trans_binval(tp, dead_buf);
+	return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+	int		nmap,
+	xfs_bmbt_irec_t	*mapp,
+	xfs_dablk_t	bno,
+	int		count)
+{
+	int		i;
+	xfs_fileoff_t	off;
+
+	for (i = 0, off = bno; i < nmap; i++) {
+		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
+			return 0;
+		}
+		if (off != mapp[i].br_startoff) {
+			return 0;
+		}
+		off += mapp[i].br_blockcount;
+	}
+	return off == bno + count;
+}
+
+/*
+ * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
+ *
+ * For the single map case, it is assumed that the caller has provided a pointer
+ * to a valid xfs_buf_map.  For the multiple map case, this function will
+ * allocate the xfs_buf_map to hold all the maps and replace the caller's single
+ * map pointer with the allocated map.
+ */
+static int
+xfs_buf_map_from_irec(
+	struct xfs_mount	*mp,
+	struct xfs_buf_map	**mapp,
+	int			*nmaps,
+	struct xfs_bmbt_irec	*irecs,
+	int			nirecs)
+{
+	struct xfs_buf_map	*map;
+	int			i;
+
+	ASSERT(*nmaps == 1);
+	ASSERT(nirecs >= 1);
+
+	if (nirecs > 1) {
+		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+				  KM_SLEEP | KM_NOFS);
+		if (!map)
+			return ENOMEM;
+		*mapp = map;
+	}
+
+	*nmaps = nirecs;
+	map = *mapp;
+	for (i = 0; i < *nmaps; i++) {
+		ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
+		       irecs[i].br_startblock != HOLESTARTBLOCK);
+		map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+		map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+	}
+	return 0;
+}
+
+/*
+ * Map the block we are given ready for reading. There are three possible return
+ * values:
+ *	-1 - will be returned if we land in a hole and mappedbno == -2 so the
+ *	     caller knows not to execute a subsequent read.
+ *	 0 - if we mapped the block successfully
+ *	>0 - positive error number if there was an error.
+ */
+static int
+xfs_dabuf_map(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	int			whichfork,
+	struct xfs_buf_map	**map,
+	int			*nmaps)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	int			nfsb;
+	int			error = 0;
+	struct xfs_bmbt_irec	irec;
+	struct xfs_bmbt_irec	*irecs = &irec;
+	int			nirecs;
+
+	ASSERT(map && *map);
+	ASSERT(*nmaps == 1);
+
+	if (whichfork == XFS_DATA_FORK)
+		nfsb = mp->m_dir_geo->fsbcount;
+	else
+		nfsb = mp->m_attr_geo->fsbcount;
+
+	/*
+	 * Caller doesn't have a mapping.  -2 means don't complain
+	 * if we land in a hole.
+	 */
+	if (mappedbno == -1 || mappedbno == -2) {
+		/*
+		 * Optimize the one-block case.
+		 */
+		if (nfsb != 1)
+			irecs = kmem_zalloc(sizeof(irec) * nfsb,
+					    KM_SLEEP | KM_NOFS);
+
+		nirecs = nfsb;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
+				       &nirecs, xfs_bmapi_aflag(whichfork));
+		if (error)
+			goto out;
+	} else {
+		irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+		irecs->br_startoff = (xfs_fileoff_t)bno;
+		irecs->br_blockcount = nfsb;
+		irecs->br_state = 0;
+		nirecs = 1;
+	}
+
+	if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
+		error = mappedbno == -2 ? -1 : EFSCORRUPTED;
+		if (unlikely(error == EFSCORRUPTED)) {
+			if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+				int i;
+				xfs_alert(mp, "%s: bno %lld dir: inode %lld",
+					__func__, (long long)bno,
+					(long long)dp->i_ino);
+				for (i = 0; i < *nmaps; i++) {
+					xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+						i,
+						(long long)irecs[i].br_startoff,
+						(long long)irecs[i].br_startblock,
+						(long long)irecs[i].br_blockcount,
+						irecs[i].br_state);
+				}
+			}
+			XFS_ERROR_REPORT("xfs_da_do_buf(1)",
+					 XFS_ERRLEVEL_LOW, mp);
+		}
+		goto out;
+	}
+	error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
+out:
+	if (irecs != &irec)
+		kmem_free(irecs);
+	return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+	struct xfs_trans	*trans,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp,
+	int			whichfork)
+{
+	struct xfs_buf		*bp;
+	struct xfs_buf_map	map;
+	struct xfs_buf_map	*mapp;
+	int			nmap;
+	int			error;
+
+	*bpp = NULL;
+	mapp = &map;
+	nmap = 1;
+	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+				&mapp, &nmap);
+	if (error) {
+		/* mapping a hole is not an error, but we don't continue */
+		if (error == -1)
+			error = 0;
+		goto out_free;
+	}
+
+	bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
+				    mapp, nmap, 0);
+	error = bp ? bp->b_error : EIO;
+	if (error) {
+		xfs_trans_brelse(trans, bp);
+		goto out_free;
+	}
+
+	*bpp = bp;
+
+out_free:
+	if (mapp != &map)
+		kmem_free(mapp);
+
+	return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+	struct xfs_trans	*trans,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp,
+	int			whichfork,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+	struct xfs_buf_map	map;
+	struct xfs_buf_map	*mapp;
+	int			nmap;
+	int			error;
+
+	*bpp = NULL;
+	mapp = &map;
+	nmap = 1;
+	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+				&mapp, &nmap);
+	if (error) {
+		/* mapping a hole is not an error, but we don't continue */
+		if (error == -1)
+			error = 0;
+		goto out_free;
+	}
+
+	error = xfs_trans_read_buf_map(dp->i_mount, trans,
+					dp->i_mount->m_ddev_targp,
+					mapp, nmap, 0, &bp, ops);
+	if (error)
+		goto out_free;
+
+	if (whichfork == XFS_ATTR_FORK)
+		xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+	else
+		xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+	*bpp = bp;
+out_free:
+	if (mapp != &map)
+		kmem_free(mapp);
+
+	return error;
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	int			whichfork,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf_map	map;
+	struct xfs_buf_map	*mapp;
+	int			nmap;
+	int			error;
+
+	mapp = &map;
+	nmap = 1;
+	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+				&mapp, &nmap);
+	if (error) {
+		/* mapping a hole is not an error, but we don't continue */
+		if (error == -1)
+			error = 0;
+		goto out_free;
+	}
+
+	mappedbno = mapp[0].bm_bn;
+	xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
+
+out_free:
+	if (mapp != &map)
+		kmem_free(mapp);
+
+	if (error)
+		return -1;
+	return mappedbno;
+}
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
new file mode 100644
index 000000000000..c9aee52a37e2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+
+/*
+ * Shortform directory ops
+ */
+static int
+xfs_dir2_sf_entsize(
+	struct xfs_dir2_sf_hdr	*hdr,
+	int			len)
+{
+	int count = sizeof(struct xfs_dir2_sf_entry);	/* namelen + offset */
+
+	count += len;					/* name */
+	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+				sizeof(xfs_dir2_ino4_t); /* ino # */
+	return count;
+}
+
+static int
+xfs_dir3_sf_entsize(
+	struct xfs_dir2_sf_hdr	*hdr,
+	int			len)
+{
+	return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
+}
+
+
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name.  Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enable directories, the type is always
+ * unknown and we never store the value.
+ */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sfe_put_ftype(
+	struct xfs_dir2_sf_entry *sfep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+	struct xfs_dir2_sf_entry *sfep)
+{
+	__uint8_t	ftype;
+
+	ftype = sfep->name[sfep->namelen];
+	if (ftype >= XFS_DIR3_FT_MAX)
+		return XFS_DIR3_FT_UNKNOWN;
+	return ftype;
+}
+
+static void
+xfs_dir3_sfe_put_ftype(
+	struct xfs_dir2_sf_entry *sfep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+	sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide.  These helpers deal with the
+ * two forms transparently by looking at the headers i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_dir2_inou_t		*from)
+{
+	if (hdr->i8count)
+		return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+	else
+		return get_unaligned_be32(&from->i4.i);
+}
+
+static void
+xfs_dir2_sf_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_dir2_inou_t		*to,
+	xfs_ino_t		ino)
+{
+	ASSERT((ino & 0xff00000000000000ULL) == 0);
+
+	if (hdr->i8count)
+		put_unaligned_be64(ino, &to->i8.i);
+	else
+		put_unaligned_be32(ino, &to->i4.i);
+}
+
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+	struct xfs_dir2_sf_hdr	*hdr)
+{
+	return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+}
+
+static void
+xfs_dir2_sf_put_parent_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return xfs_dir2_sf_get_ino(hdr,
+				(xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+}
+
+static void
+xfs_dir2_sfe_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr,
+			    (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+}
+
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return xfs_dir2_sf_get_ino(hdr,
+			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+}
+
+static void
+xfs_dir3_sfe_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr,
+			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+}
+
+
+/*
+ * Directory data block operations
+ */
+
+/*
+ * For special situations, the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and "..", and
+ * therefore we know that they are a fixed size and hence their offsets are
+ * constant, as is the first entry.
+ *
+ * Hence, this calculation is written as a macro to be able to be calculated at
+ * compile time and so certain offsets can be calculated directly in the
+ * structure initaliser via the macro. There are two macros - one for dirents
+ * with ftype and without so there are no unresolvable conditionals in the
+ * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
+ * of 2 and the compiler doesn't reject it (unlike roundup()).
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n)					\
+	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
+		 sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+
+#define XFS_DIR3_DATA_ENTSIZE(n)					\
+	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
+		 sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)),	\
+		XFS_DIR2_DATA_ALIGN)
+
+static int
+xfs_dir2_data_entsize(
+	int			n)
+{
+	return XFS_DIR2_DATA_ENTSIZE(n);
+}
+
+static int
+xfs_dir3_data_entsize(
+	int			n)
+{
+	return XFS_DIR3_DATA_ENTSIZE(n);
+}
+
+static __uint8_t
+xfs_dir2_data_get_ftype(
+	struct xfs_dir2_data_entry *dep)
+{
+	return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_data_put_ftype(
+	struct xfs_dir2_data_entry *dep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_data_get_ftype(
+	struct xfs_dir2_data_entry *dep)
+{
+	__uint8_t	ftype = dep->name[dep->namelen];
+
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+	if (ftype >= XFS_DIR3_FT_MAX)
+		return XFS_DIR3_FT_UNKNOWN;
+	return ftype;
+}
+
+static void
+xfs_dir3_data_put_ftype(
+	struct xfs_dir2_data_entry *dep,
+	__uint8_t		type)
+{
+	ASSERT(type < XFS_DIR3_FT_MAX);
+	ASSERT(dep->namelen != 0);
+
+	dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+	struct xfs_dir2_data_entry *dep)
+{
+	return (__be16 *)((char *)dep +
+		xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+	struct xfs_dir2_data_entry *dep)
+{
+	return (__be16 *)((char *)dep +
+		xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1) +
+				XFS_DIR2_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return hdr->bestfree;
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+
+/*
+ * Directory Leaf block operations
+ */
+static int
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
+		(uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	return lp->__ents;
+}
+
+static int
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
+		(uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	return ((struct xfs_dir3_leaf *)lp)->__ents;
+}
+
+static void
+xfs_dir2_leaf_hdr_from_disk(
+	struct xfs_dir3_icleaf_hdr	*to,
+	struct xfs_dir2_leaf		*from)
+{
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.count);
+	to->stale = be16_to_cpu(from->hdr.stale);
+
+	ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir2_leaf_hdr_to_disk(
+	struct xfs_dir2_leaf		*to,
+	struct xfs_dir3_icleaf_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.count = cpu_to_be16(from->count);
+	to->hdr.stale = cpu_to_be16(from->stale);
+}
+
+static void
+xfs_dir3_leaf_hdr_from_disk(
+	struct xfs_dir3_icleaf_hdr	*to,
+	struct xfs_dir2_leaf		*from)
+{
+	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+	to->back = be32_to_cpu(hdr3->info.hdr.back);
+	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+	to->count = be16_to_cpu(hdr3->count);
+	to->stale = be16_to_cpu(hdr3->stale);
+
+	ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leaf_hdr_to_disk(
+	struct xfs_dir2_leaf		*to,
+	struct xfs_dir3_icleaf_hdr	*from)
+{
+	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+	ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+	hdr3->info.hdr.back = cpu_to_be32(from->back);
+	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+	hdr3->count = cpu_to_be16(from->count);
+	hdr3->stale = cpu_to_be16(from->stale);
+}
+
+
+/*
+ * Directory/Attribute Node block operations
+ */
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+	return dap->__btree;
+}
+
+static struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+	return ((struct xfs_da3_intnode *)dap)->__btree;
+}
+
+static void
+xfs_da2_node_hdr_from_disk(
+	struct xfs_da3_icnode_hdr	*to,
+	struct xfs_da_intnode		*from)
+{
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.__count);
+	to->level = be16_to_cpu(from->hdr.__level);
+}
+
+static void
+xfs_da2_node_hdr_to_disk(
+	struct xfs_da_intnode		*to,
+	struct xfs_da3_icnode_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.__count = cpu_to_be16(from->count);
+	to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static void
+xfs_da3_node_hdr_from_disk(
+	struct xfs_da3_icnode_hdr	*to,
+	struct xfs_da_intnode		*from)
+{
+	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+	to->back = be32_to_cpu(hdr3->info.hdr.back);
+	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+	to->count = be16_to_cpu(hdr3->__count);
+	to->level = be16_to_cpu(hdr3->__level);
+}
+
+static void
+xfs_da3_node_hdr_to_disk(
+	struct xfs_da_intnode		*to,
+	struct xfs_da3_icnode_hdr	*from)
+{
+	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+	ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+	hdr3->info.hdr.back = cpu_to_be32(from->back);
+	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+	hdr3->__count = cpu_to_be16(from->count);
+	hdr3->__level = cpu_to_be16(from->level);
+}
+
+
+/*
+ * Directory free space block operations
+ */
+static int
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
+		sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+			(db / xfs_dir2_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return db % xfs_dir2_free_max_bests(geo);
+}
+
+static int
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
+{
+	return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
+		sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+			(db / xfs_dir3_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return db % xfs_dir3_free_max_bests(geo);
+}
+
+static void
+xfs_dir2_free_hdr_from_disk(
+	struct xfs_dir3_icfree_hdr	*to,
+	struct xfs_dir2_free		*from)
+{
+	to->magic = be32_to_cpu(from->hdr.magic);
+	to->firstdb = be32_to_cpu(from->hdr.firstdb);
+	to->nvalid = be32_to_cpu(from->hdr.nvalid);
+	to->nused = be32_to_cpu(from->hdr.nused);
+	ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+	struct xfs_dir2_free		*to,
+	struct xfs_dir3_icfree_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+	to->hdr.magic = cpu_to_be32(from->magic);
+	to->hdr.firstdb = cpu_to_be32(from->firstdb);
+	to->hdr.nvalid = cpu_to_be32(from->nvalid);
+	to->hdr.nused = cpu_to_be32(from->nused);
+}
+
+static void
+xfs_dir3_free_hdr_from_disk(
+	struct xfs_dir3_icfree_hdr	*to,
+	struct xfs_dir2_free		*from)
+{
+	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+	to->magic = be32_to_cpu(hdr3->hdr.magic);
+	to->firstdb = be32_to_cpu(hdr3->firstdb);
+	to->nvalid = be32_to_cpu(hdr3->nvalid);
+	to->nused = be32_to_cpu(hdr3->nused);
+
+	ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+	struct xfs_dir2_free		*to,
+	struct xfs_dir3_icfree_hdr	*from)
+{
+	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+	ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+	hdr3->hdr.magic = cpu_to_be32(from->magic);
+	hdr3->firstdb = cpu_to_be32(from->firstdb);
+	hdr3->nvalid = cpu_to_be32(from->nvalid);
+	hdr3->nused = cpu_to_be32(from->nused);
+}
+
+static const struct xfs_dir_ops xfs_dir2_ops = {
+	.sf_entsize = xfs_dir2_sf_entsize,
+	.sf_nextentry = xfs_dir2_sf_nextentry,
+	.sf_get_ftype = xfs_dir2_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir2_sfe_put_ftype,
+	.sf_get_ino = xfs_dir2_sfe_get_ino,
+	.sf_put_ino = xfs_dir2_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir2_data_entsize,
+	.data_get_ftype = xfs_dir2_data_get_ftype,
+	.data_put_ftype = xfs_dir2_data_put_ftype,
+	.data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1) +
+				XFS_DIR2_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir2_data_first_entry_p,
+	.data_entry_p = xfs_dir2_data_entry_p,
+	.data_unused_p = xfs_dir2_data_unused_p,
+
+	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+	.leaf_max_ents = xfs_dir2_max_leaf_ents,
+	.leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+	.node_tree_p = xfs_da2_node_tree_p,
+
+	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+	.free_max_bests = xfs_dir2_free_max_bests,
+	.free_bests_p = xfs_dir2_free_bests_p,
+	.db_to_fdb = xfs_dir2_db_to_fdb,
+	.db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+	.sf_entsize = xfs_dir3_sf_entsize,
+	.sf_nextentry = xfs_dir3_sf_nextentry,
+	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
+	.sf_get_ino = xfs_dir3_sfe_get_ino,
+	.sf_put_ino = xfs_dir3_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir3_data_entsize,
+	.data_get_ftype = xfs_dir3_data_get_ftype,
+	.data_put_ftype = xfs_dir3_data_put_ftype,
+	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
+	.data_entry_p = xfs_dir2_data_entry_p,
+	.data_unused_p = xfs_dir2_data_unused_p,
+
+	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+	.leaf_max_ents = xfs_dir2_max_leaf_ents,
+	.leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+	.node_tree_p = xfs_da2_node_tree_p,
+
+	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+	.free_max_bests = xfs_dir2_free_max_bests,
+	.free_bests_p = xfs_dir2_free_bests_p,
+	.db_to_fdb = xfs_dir2_db_to_fdb,
+	.db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir3_ops = {
+	.sf_entsize = xfs_dir3_sf_entsize,
+	.sf_nextentry = xfs_dir3_sf_nextentry,
+	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
+	.sf_get_ino = xfs_dir3_sfe_get_ino,
+	.sf_put_ino = xfs_dir3_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir3_data_entsize,
+	.data_get_ftype = xfs_dir3_data_get_ftype,
+	.data_put_ftype = xfs_dir3_data_put_ftype,
+	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir3_data_bestfree_p,
+
+	.data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
+
+	.data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir3_data_first_entry_p,
+	.data_entry_p = xfs_dir3_data_entry_p,
+	.data_unused_p = xfs_dir3_data_unused_p,
+
+	.leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
+	.leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
+	.leaf_max_ents = xfs_dir3_max_leaf_ents,
+	.leaf_ents_p = xfs_dir3_leaf_ents_p,
+
+	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+	.node_tree_p = xfs_da3_node_tree_p,
+
+	.free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
+	.free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+	.free_max_bests = xfs_dir3_free_max_bests,
+	.free_bests_p = xfs_dir3_free_bests_p,
+	.db_to_fdb = xfs_dir3_db_to_fdb,
+	.db_to_fdindex = xfs_dir3_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+	.node_tree_p = xfs_da2_node_tree_p,
+};
+
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+	.node_tree_p = xfs_da3_node_tree_p,
+};
+
+/*
+ * Return the ops structure according to the current config.  If we are passed
+ * an inode, then that overrides the default config we use which is based on
+ * feature bits.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp)
+{
+	if (dp)
+		return dp->d_ops;
+	if (mp->m_dir_inode_ops)
+		return mp->m_dir_inode_ops;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return &xfs_dir3_ops;
+	if (xfs_sb_version_hasftype(&mp->m_sb))
+		return &xfs_dir2_ftype_ops;
+	return &xfs_dir2_ops;
+}
+
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp)
+{
+	if (dp)
+		return dp->d_ops;
+	if (mp->m_nondir_inode_ops)
+		return mp->m_nondir_inode_ops;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return &xfs_dir3_nondir_ops;
+	return &xfs_dir2_nondir_ops;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
new file mode 100644
index 000000000000..a0aca734199b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+
+
+/*
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
+ */
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+	struct xfs_name	*name)
+{
+	xfs_dahash_t	hash;
+	int		i;
+
+	for (i = 0, hash = 0; i < name->len; i++)
+		hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+	return hash;
+}
+
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+	struct xfs_da_args *args,
+	const unsigned char *name,
+	int		len)
+{
+	enum xfs_dacmp	result;
+	int		i;
+
+	if (args->namelen != len)
+		return XFS_CMP_DIFFERENT;
+
+	result = XFS_CMP_EXACT;
+	for (i = 0; i < len; i++) {
+		if (args->name[i] == name[i])
+			continue;
+		if (tolower(args->name[i]) != tolower(name[i]))
+			return XFS_CMP_DIFFERENT;
+		result = XFS_CMP_CASE;
+	}
+
+	return result;
+}
+
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+	.hashname	= xfs_ascii_ci_hashname,
+	.compname	= xfs_ascii_ci_compname,
+};
+
+int
+xfs_da_mount(
+	struct xfs_mount	*mp)
+{
+	struct xfs_da_geometry	*dageo;
+	int			nodehdr_size;
+
+
+	ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
+	       XFS_MAX_BLOCKSIZE);
+
+	mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+	mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+
+	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
+	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+				    KM_SLEEP | KM_MAYFAIL);
+	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+				     KM_SLEEP | KM_MAYFAIL);
+	if (!mp->m_dir_geo || !mp->m_attr_geo) {
+		kmem_free(mp->m_dir_geo);
+		kmem_free(mp->m_attr_geo);
+		return ENOMEM;
+	}
+
+	/* set up directory geometry */
+	dageo = mp->m_dir_geo;
+	dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+	dageo->fsblog = mp->m_sb.sb_blocklog;
+	dageo->blksize = 1 << dageo->blklog;
+	dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+
+	/*
+	 * Now we've set up the block conversion variables, we can calculate the
+	 * segment block constants using the geometry structure.
+	 */
+	dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+	dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+	dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+	dageo->node_ents = (dageo->blksize - nodehdr_size) /
+				(uint)sizeof(xfs_da_node_entry_t);
+	dageo->magicpct = (dageo->blksize * 37) / 100;
+
+	/* set up attribute geometry - single fsb only */
+	dageo = mp->m_attr_geo;
+	dageo->blklog = mp->m_sb.sb_blocklog;
+	dageo->fsblog = mp->m_sb.sb_blocklog;
+	dageo->blksize = 1 << dageo->blklog;
+	dageo->fsbcount = 1;
+	dageo->node_ents = (dageo->blksize - nodehdr_size) /
+				(uint)sizeof(xfs_da_node_entry_t);
+	dageo->magicpct = (dageo->blksize * 37) / 100;
+
+	if (xfs_sb_version_hasasciici(&mp->m_sb))
+		mp->m_dirnameops = &xfs_ascii_ci_nameops;
+	else
+		mp->m_dirnameops = &xfs_default_nameops;
+
+	return 0;
+}
+
+void
+xfs_da_unmount(
+	struct xfs_mount	*mp)
+{
+	kmem_free(mp->m_dir_geo);
+	kmem_free(mp->m_attr_geo);
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+int
+xfs_dir_isempty(
+	xfs_inode_t	*dp)
+{
+	xfs_dir2_sf_hdr_t	*sfp;
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	if (dp->i_d.di_size == 0)	/* might happen during shutdown. */
+		return 1;
+	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+		return 0;
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	return !sfp->count;
+}
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+	xfs_mount_t	*mp,
+	xfs_ino_t	ino)
+{
+	xfs_agblock_t	agblkno;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		ino_ok;
+	int		ioff;
+
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agblkno = XFS_INO_TO_AGBNO(mp, ino);
+	ioff = XFS_INO_TO_OFFSET(mp, ino);
+	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+	ino_ok =
+		agno < mp->m_sb.sb_agcount &&
+		agblkno < mp->m_sb.sb_agblocks &&
+		agblkno != 0 &&
+		ioff < (1 << mp->m_sb.sb_inopblog) &&
+		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+			XFS_RANDOM_DIR_INO_VALIDATE))) {
+		xfs_warn(mp, "Invalid inode number 0x%Lx",
+				(unsigned long long) ino);
+		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+	return 0;
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+int
+xfs_dir_init(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	xfs_inode_t	*pdp)
+{
+	struct xfs_da_args *args;
+	int		error;
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+	if (error)
+		return error;
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->dp = dp;
+	args->trans = tp;
+	error = xfs_dir2_sf_create(args, pdp->i_ino);
+	kmem_free(args);
+	return error;
+}
+
+/*
+  Enter a name in a directory.
+ */
+int
+xfs_dir_createname(
+	xfs_trans_t		*tp,
+	xfs_inode_t		*dp,
+	struct xfs_name		*name,
+	xfs_ino_t		inum,		/* new entry inode number */
+	xfs_fsblock_t		*first,		/* bmap's firstblock */
+	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
+	xfs_extlen_t		total)		/* bmap's total block count */
+{
+	struct xfs_da_args	*args;
+	int			rval;
+	int			v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+	if (rval)
+		return rval;
+	XFS_STATS_INC(xs_dir_create);
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->inumber = inum;
+	args->dp = dp;
+	args->firstblock = first;
+	args->flist = flist;
+	args->total = total;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+	args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_addname(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_addname(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_addname(args);
+	else
+		rval = xfs_dir2_node_addname(args);
+
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+	struct xfs_da_args *args,
+	const unsigned char *name,
+	int		len)
+{
+	if (args->cmpresult == XFS_CMP_DIFFERENT)
+		return ENOENT;
+	if (args->cmpresult != XFS_CMP_CASE ||
+					!(args->op_flags & XFS_DA_OP_CILOOKUP))
+		return EEXIST;
+
+	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+	if (!args->value)
+		return ENOMEM;
+
+	memcpy(args->value, name, len);
+	args->valuelen = len;
+	return EEXIST;
+}
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
+ */
+
+int
+xfs_dir_lookup(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name,
+	xfs_ino_t	*inum,		/* out: inode number */
+	struct xfs_name *ci_name)	/* out: actual name if CI match */
+{
+	struct xfs_da_args *args;
+	int		rval;
+	int		v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	XFS_STATS_INC(xs_dir_lookup);
+
+	/*
+	 * We need to use KM_NOFS here so that lockdep will not throw false
+	 * positive deadlock warnings on a non-transactional lookup path. It is
+	 * safe to recurse into inode recalim in that case, but lockdep can't
+	 * easily be taught about it. Hence KM_NOFS avoids having to add more
+	 * lockdep Doing this avoids having to add a bunch of lockdep class
+	 * annotations into the reclaim path for the ilock.
+	 */
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->dp = dp;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+	args->op_flags = XFS_DA_OP_OKNOENT;
+	if (ci_name)
+		args->op_flags |= XFS_DA_OP_CILOOKUP;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_lookup(args);
+		goto out_check_rval;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_lookup(args);
+		goto out_check_rval;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_lookup(args);
+	else
+		rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
+	if (rval == EEXIST)
+		rval = 0;
+	if (!rval) {
+		*inum = args->inumber;
+		if (ci_name) {
+			ci_name->name = args->value;
+			ci_name->len = args->valuelen;
+		}
+	}
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+int
+xfs_dir_removename(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name,
+	xfs_ino_t	ino,
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	struct xfs_da_args *args;
+	int		rval;
+	int		v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+	XFS_STATS_INC(xs_dir_remove);
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->inumber = ino;
+	args->dp = dp;
+	args->firstblock = first;
+	args->flist = flist;
+	args->total = total;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_removename(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_removename(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_removename(args);
+	else
+		rval = xfs_dir2_node_removename(args);
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+int
+xfs_dir_replace(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name,		/* name of entry to replace */
+	xfs_ino_t	inum,		/* new inode number */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	struct xfs_da_args *args;
+	int		rval;
+	int		v;		/* type-checking value */
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+
+	rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+	if (rval)
+		return rval;
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->inumber = inum;
+	args->dp = dp;
+	args->firstblock = first;
+	args->flist = flist;
+	args->total = total;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_replace(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_replace(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_replace(args);
+	else
+		rval = xfs_dir2_node_replace(args);
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ * First checks that the caller couldn't reserve enough space (resblks = 0).
+ */
+int
+xfs_dir_canenter(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	struct xfs_name	*name,		/* name of entry to add */
+	uint		resblks)
+{
+	struct xfs_da_args *args;
+	int		rval;
+	int		v;		/* type-checking value */
+
+	if (resblks)
+		return 0;
+
+	ASSERT(S_ISDIR(dp->i_d.di_mode));
+
+	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	if (!args)
+		return ENOMEM;
+
+	args->geo = dp->i_mount->m_dir_geo;
+	args->name = name->name;
+	args->namelen = name->len;
+	args->filetype = name->type;
+	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+	args->dp = dp;
+	args->whichfork = XFS_DATA_FORK;
+	args->trans = tp;
+	args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+							XFS_DA_OP_OKNOENT;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		rval = xfs_dir2_sf_addname(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isblock(args, &v);
+	if (rval)
+		goto out_free;
+	if (v) {
+		rval = xfs_dir2_block_addname(args);
+		goto out_free;
+	}
+
+	rval = xfs_dir2_isleaf(args, &v);
+	if (rval)
+		goto out_free;
+	if (v)
+		rval = xfs_dir2_leaf_addname(args);
+	else
+		rval = xfs_dir2_node_addname(args);
+out_free:
+	kmem_free(args);
+	return rval;
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ *
+ * This routine is for data and free blocks, not leaf/node blocks which are
+ * handled by xfs_da_grow_inode.
+ */
+int
+xfs_dir2_grow_inode(
+	struct xfs_da_args	*args,
+	int			space,	/* v2 dir's space XFS_DIR2_xxx_SPACE */
+	xfs_dir2_db_t		*dbp)	/* out: block number added */
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	xfs_fileoff_t		bno;	/* directory offset of new block */
+	int			count;	/* count of filesystem blocks */
+	int			error;
+
+	trace_xfs_dir2_grow_inode(args, space);
+
+	/*
+	 * Set lowest possible block in the space requested.
+	 */
+	bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+	count = args->geo->fsbcount;
+
+	error = xfs_da_grow_inode_int(args, &bno, count);
+	if (error)
+		return error;
+
+	*dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
+
+	/*
+	 * Update file's size if this is the data space and it grew.
+	 */
+	if (space == XFS_DIR2_DATA_SPACE) {
+		xfs_fsize_t	size;		/* directory file (data) size */
+
+		size = XFS_FSB_TO_B(mp, bno + count);
+		if (size > dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+}
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int
+xfs_dir2_isblock(
+	struct xfs_da_args	*args,
+	int			*vp)	/* out: 1 is block, 0 is not block */
+{
+	xfs_fileoff_t		last;	/* last file offset */
+	int			rval;
+
+	if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
+		return rval;
+	rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
+	ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
+	*vp = rval;
+	return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int
+xfs_dir2_isleaf(
+	struct xfs_da_args	*args,
+	int			*vp)	/* out: 1 is block, 0 is not block */
+{
+	xfs_fileoff_t		last;	/* last file offset */
+	int			rval;
+
+	if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
+		return rval;
+	*vp = last == args->geo->leafblk + args->geo->fsbcount;
+	return 0;
+}
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+	xfs_da_args_t	*args,
+	xfs_dir2_db_t	db,
+	struct xfs_buf	*bp)
+{
+	xfs_fileoff_t	bno;		/* directory file offset */
+	xfs_dablk_t	da;		/* directory file offset */
+	int		done;		/* bunmap is finished */
+	xfs_inode_t	*dp;
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_trans_t	*tp;
+
+	trace_xfs_dir2_shrink_inode(args, db);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	da = xfs_dir2_db_to_da(args->geo, db);
+	/*
+	 * Unmap the fsblock(s).
+	 */
+	if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
+			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+			&done))) {
+		/*
+		 * ENOSPC actually can happen if we're in a removename with
+		 * no space reservation, and the resulting block removal
+		 * would cause a bmap btree split or conversion from extents
+		 * to btree.  This can only happen for un-fragmented
+		 * directory blocks, since you need to be punching out
+		 * the middle of an extent.
+		 * In this case we need to leave the block in the file,
+		 * and not binval it.
+		 * So the block has to be in a consistent empty state
+		 * and appropriately logged.
+		 * We don't free up the buffer, the caller can tell it
+		 * hasn't happened since it got an error back.
+		 */
+		return error;
+	}
+	ASSERT(done);
+	/*
+	 * Invalidate the buffer from the transaction.
+	 */
+	xfs_trans_binval(tp, bp);
+	/*
+	 * If it's not a data block, we're done.
+	 */
+	if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
+		return 0;
+	/*
+	 * If the block isn't the last one in the directory, we're done.
+	 */
+	if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+		return 0;
+	bno = da;
+	if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+		/*
+		 * This can't really happen unless there's kernel corruption.
+		 */
+		return error;
+	}
+	if (db == args->geo->datablk)
+		ASSERT(bno == 0);
+	else
+		ASSERT(bno > 0);
+	/*
+	 * Set the size to the new last block.
+	 */
+	dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
new file mode 100644
index 000000000000..ab0bffccf5c3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+				    int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
+				     int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+	xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
+}
+
+static bool
+xfs_dir3_block_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+			return false;
+	}
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
+}
+
+static void
+xfs_dir3_block_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_dir3_block_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_block_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_block_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+	.verify_read = xfs_dir3_block_read_verify,
+	.verify_write = xfs_dir3_block_write_verify,
+};
+
+int
+xfs_dir3_block_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+				XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+	return err;
+}
+
+static void
+xfs_dir3_block_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+	bp->b_ops = &xfs_dir3_block_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+		return;
+
+	}
+	hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+}
+
+static void
+xfs_dir2_block_need_space(
+	struct xfs_inode		*dp,
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,
+	__be16				**tagpp,
+	struct xfs_dir2_data_unused	**dupp,
+	struct xfs_dir2_data_unused	**enddupp,
+	int				*compact,
+	int				len)
+{
+	struct xfs_dir2_data_free	*bf;
+	__be16				*tagp = NULL;
+	struct xfs_dir2_data_unused	*dup = NULL;
+	struct xfs_dir2_data_unused	*enddup = NULL;
+
+	*compact = 0;
+	bf = dp->d_ops->data_bestfree_p(hdr);
+
+	/*
+	 * If there are stale entries we'll use one for the leaf.
+	 */
+	if (btp->stale) {
+		if (be16_to_cpu(bf[0].length) >= len) {
+			/*
+			 * The biggest entry enough to avoid compaction.
+			 */
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)hdr + be16_to_cpu(bf[0].offset));
+			goto out;
+		}
+
+		/*
+		 * Will need to compact to make this work.
+		 * Tag just before the first leaf entry.
+		 */
+		*compact = 1;
+		tagp = (__be16 *)blp - 1;
+
+		/* Data object just before the first leaf entry.  */
+		dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+		/*
+		 * If it's not free then the data will go where the
+		 * leaf data starts now, if it works at all.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+			    (uint)sizeof(*blp) < len)
+				dup = NULL;
+		} else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+			dup = NULL;
+		else
+			dup = (xfs_dir2_data_unused_t *)blp;
+		goto out;
+	}
+
+	/*
+	 * no stale entries, so just use free space.
+	 * Tag just before the first leaf entry.
+	 */
+	tagp = (__be16 *)blp - 1;
+
+	/* Data object just before the first leaf entry.  */
+	enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+	/*
+	 * If it's not free then can't do this add without cleaning up:
+	 * the space before the first leaf entry needs to be free so it
+	 * can be expanded to hold the pointer to the new entry.
+	 */
+	if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+		/*
+		 * Check out the biggest freespace and see if it's the same one.
+		 */
+		dup = (xfs_dir2_data_unused_t *)
+		      ((char *)hdr + be16_to_cpu(bf[0].offset));
+		if (dup != enddup) {
+			/*
+			 * Not the same free entry, just check its length.
+			 */
+			if (be16_to_cpu(dup->length) < len)
+				dup = NULL;
+			goto out;
+		}
+
+		/*
+		 * It is the biggest freespace, can it hold the leaf too?
+		 */
+		if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+			/*
+			 * Yes, use the second-largest entry instead if it works.
+			 */
+			if (be16_to_cpu(bf[1].length) >= len)
+				dup = (xfs_dir2_data_unused_t *)
+				      ((char *)hdr + be16_to_cpu(bf[1].offset));
+			else
+				dup = NULL;
+		}
+	}
+out:
+	*tagpp = tagp;
+	*dupp = dup;
+	*enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+	struct xfs_da_args		*args,
+	struct xfs_buf			*bp,
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,
+	int				*needlog,
+	int				*lfloghigh,
+	int				*lfloglow)
+{
+	int			fromidx;	/* source leaf index */
+	int			toidx;		/* target leaf index */
+	int			needscan = 0;
+	int			highstale;	/* high stale index */
+
+	fromidx = toidx = be32_to_cpu(btp->count) - 1;
+	highstale = *lfloghigh = -1;
+	for (; fromidx >= 0; fromidx--) {
+		if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+			if (highstale == -1)
+				highstale = toidx;
+			else {
+				if (*lfloghigh == -1)
+					*lfloghigh = toidx;
+				continue;
+			}
+		}
+		if (fromidx < toidx)
+			blp[toidx] = blp[fromidx];
+		toidx--;
+	}
+	*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+	*lfloghigh -= be32_to_cpu(btp->stale) - 1;
+	be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+	xfs_dir2_data_make_free(args, bp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+		(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+		needlog, &needscan);
+	btp->stale = cpu_to_be32(1);
+	/*
+	 * If we now need to rebuild the bestfree map, do so.
+	 * This needs to happen before the next call to use_free.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(args->dp, hdr, needlog);
+}
+
+/*
+ * Add an entry to a block directory.
+ */
+int						/* error */
+xfs_dir2_block_addname(
+	xfs_da_args_t		*args)		/* directory op arguments */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	int			compact;	/* need to compact leaf ents */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	int			error;		/* error return value */
+	xfs_dir2_data_unused_t	*enddup=NULL;	/* unused at end of data */
+	xfs_dahash_t		hash;		/* hash value of found entry */
+	int			high;		/* high index for binary srch */
+	int			highstale;	/* high stale index */
+	int			lfloghigh=0;	/* last final leaf to log */
+	int			lfloglow=0;	/* first final leaf to log */
+	int			len;		/* length of the new entry */
+	int			low;		/* low index for binary srch */
+	int			lowstale;	/* low stale index */
+	int			mid=0;		/* midpoint for binary srch */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log header */
+	int			needscan;	/* need to rescan freespace */
+	__be16			*tagp;		/* pointer to tag value */
+	xfs_trans_t		*tp;		/* transaction structure */
+
+	trace_xfs_dir2_block_addname(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+
+	/* Read the (one and only) directory block into bp. */
+	error = xfs_dir3_block_read(tp, dp, &bp);
+	if (error)
+		return error;
+
+	len = dp->d_ops->data_entsize(args->namelen);
+
+	/*
+	 * Set up pointers to parts of the block.
+	 */
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Find out if we can reuse stale entries or whether we need extra
+	 * space for entry and new leaf.
+	 */
+	xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+				  &enddup, &compact, len);
+
+	/*
+	 * Done everything we need for a space check now.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+		xfs_trans_brelse(tp, bp);
+		if (!dup)
+			return ENOSPC;
+		return 0;
+	}
+
+	/*
+	 * If we don't have space for the new entry & leaf ...
+	 */
+	if (!dup) {
+		/* Don't have a space reservation: return no-space.  */
+		if (args->total == 0)
+			return ENOSPC;
+		/*
+		 * Convert to the next larger format.
+		 * Then add the new entry in that format.
+		 */
+		error = xfs_dir2_block_to_leaf(args, bp);
+		if (error)
+			return error;
+		return xfs_dir2_leaf_addname(args);
+	}
+
+	needlog = needscan = 0;
+
+	/*
+	 * If need to compact the leaf entries, do it now.
+	 */
+	if (compact) {
+		xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+				      &lfloghigh, &lfloglow);
+		/* recalculate blp post-compaction */
+		blp = xfs_dir2_block_leaf_p(btp);
+	} else if (btp->stale) {
+		/*
+		 * Set leaf logging boundaries to impossible state.
+		 * For the no-stale case they're set explicitly.
+		 */
+		lfloglow = be32_to_cpu(btp->count);
+		lfloghigh = -1;
+	}
+
+	/*
+	 * Find the slot that's first lower than our hash value, -1 if none.
+	 */
+	for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+		mid--;
+	}
+	/*
+	 * No stale entries, will use enddup space to hold new leaf.
+	 */
+	if (!btp->stale) {
+		/*
+		 * Mark the space needed for the new leaf entry, now in use.
+		 */
+		xfs_dir2_data_use_free(args, bp, enddup,
+			(xfs_dir2_data_aoff_t)
+			((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
+			 sizeof(*blp)),
+			(xfs_dir2_data_aoff_t)sizeof(*blp),
+			&needlog, &needscan);
+		/*
+		 * Update the tail (entry count).
+		 */
+		be32_add_cpu(&btp->count, 1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(dp, hdr, &needlog);
+			needscan = 0;
+		}
+		/*
+		 * Adjust pointer to the first leaf entry, we're about to move
+		 * the table up one to open up space for the new leaf entry.
+		 * Then adjust our index to match.
+		 */
+		blp--;
+		mid++;
+		if (mid)
+			memmove(blp, &blp[1], mid * sizeof(*blp));
+		lfloglow = 0;
+		lfloghigh = mid;
+	}
+	/*
+	 * Use a stale leaf for our new entry.
+	 */
+	else {
+		for (lowstale = mid;
+		     lowstale >= 0 &&
+			blp[lowstale].address !=
+			cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+		     lowstale--)
+			continue;
+		for (highstale = mid + 1;
+		     highstale < be32_to_cpu(btp->count) &&
+			blp[highstale].address !=
+			cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
+			(lowstale < 0 || mid - lowstale > highstale - mid);
+		     highstale++)
+			continue;
+		/*
+		 * Move entries toward the low-numbered stale entry.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == be32_to_cpu(btp->count) ||
+		     mid - lowstale <= highstale - mid)) {
+			if (mid - lowstale)
+				memmove(&blp[lowstale], &blp[lowstale + 1],
+					(mid - lowstale) * sizeof(*blp));
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(mid, lfloghigh);
+		}
+		/*
+		 * Move entries toward the high-numbered stale entry.
+		 */
+		else {
+			ASSERT(highstale < be32_to_cpu(btp->count));
+			mid++;
+			if (highstale - mid)
+				memmove(&blp[mid + 1], &blp[mid],
+					(highstale - mid) * sizeof(*blp));
+			lfloglow = MIN(mid, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		be32_add_cpu(&btp->stale, -1);
+	}
+	/*
+	 * Point to the new data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	/*
+	 * Fill in the leaf entry.
+	 */
+	blp[mid].hashval = cpu_to_be32(args->hashval);
+	blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+	/*
+	 * Mark space for the data entry used.
+	 */
+	xfs_dir2_data_use_free(args, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+	/*
+	 * Create the new data entry.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, args->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	/*
+	 * Clean up the bestfree array and log the header, tail, and entry.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, bp);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+	xfs_trans_t		*tp,		/* transaction structure */
+	struct xfs_buf		*bp,		/* block buffer */
+	int			first,		/* index of first logged leaf */
+	int			last)		/* index of last logged leaf */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+	xfs_dir2_leaf_entry_t	*blp;
+	xfs_dir2_block_tail_t	*btp;
+
+	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+		(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+	xfs_trans_t		*tp,		/* transaction structure */
+	struct xfs_buf		*bp)		/* block buffer */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+	xfs_dir2_block_tail_t	*btp;
+
+	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+	xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+		(uint)((char *)(btp + 1) - (char *)hdr - 1));
+}
+
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int						/* error */
+xfs_dir2_block_lookup(
+	xfs_da_args_t		*args)		/* dir lookup arguments */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	trace_xfs_dir2_block_lookup(args);
+
+	/*
+	 * Get the buffer, look up the entry.
+	 * If not found (ENOENT) then return, have no buffer.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+		return error;
+	dp = args->dp;
+	mp = dp->i_mount;
+	hdr = bp->b_addr;
+	xfs_dir3_data_check(dp, bp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Get the offset from the leaf entry, to point to the data.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	/*
+	 * Fill in inode number, CI name if appropriate, release the block.
+	 */
+	args->inumber = be64_to_cpu(dep->inumber);
+	args->filetype = dp->d_ops->data_get_ftype(dep);
+	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	xfs_trans_brelse(args->trans, bp);
+	return error;
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int					/* error */
+xfs_dir2_block_lookup_int(
+	xfs_da_args_t		*args,		/* dir lookup arguments */
+	struct xfs_buf		**bpp,		/* returned block buffer */
+	int			*entno)		/* returned entry number */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			error;		/* error return value */
+	xfs_dahash_t		hash;		/* found hash value */
+	int			high;		/* binary search high index */
+	int			low;		/* binary search low index */
+	int			mid;		/* binary search current idx */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	enum xfs_dacmp		cmp;		/* comparison result */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+
+	error = xfs_dir3_block_read(tp, dp, &bp);
+	if (error)
+		return error;
+
+	hdr = bp->b_addr;
+	xfs_dir3_data_check(dp, bp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Loop doing a binary search for our hash value.
+	 * Find our entry, ENOENT if it's not there.
+	 */
+	for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+		ASSERT(low <= high);
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+		if (low > high) {
+			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+			xfs_trans_brelse(tp, bp);
+			return ENOENT;
+		}
+	}
+	/*
+	 * Back up to the first one with the right hash value.
+	 */
+	while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+		mid--;
+	}
+	/*
+	 * Now loop forward through all the entries with the
+	 * right hash value looking for our name.
+	 */
+	do {
+		if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get pointer to the entry from the leaf.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
+		/*
+		 * Compare name and if it's an exact match, return the index
+		 * and buffer. If it's the first case-insensitive match, store
+		 * the index and buffer and continue looking for an exact match.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			*bpp = bp;
+			*entno = mid;
+			if (cmp == XFS_CMP_EXACT)
+				return 0;
+		}
+	} while (++mid < be32_to_cpu(btp->count) &&
+			be32_to_cpu(blp[mid].hashval) == hash);
+
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or replace).
+	 * If a case-insensitive match was found earlier, return success.
+	 */
+	if (args->cmpresult == XFS_CMP_CASE)
+		return 0;
+	/*
+	 * No match, release the buffer and return ENOENT.
+	 */
+	xfs_trans_brelse(tp, bp);
+	return ENOENT;
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int						/* error */
+xfs_dir2_block_removename(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* block leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to fixup bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* shortform size */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	trace_xfs_dir2_block_removename(args);
+
+	/*
+	 * Look up the entry in the block.  Gets the buffer and entry index.
+	 * It will always be there, the vnodeops level does a lookup first.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Point to the data entry using the leaf entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	/*
+	 * Mark the data entry's space free.
+	 */
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(args, bp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * Fix up the block tail.
+	 */
+	be32_add_cpu(&btp->stale, 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	/*
+	 * Remove the leaf entry by marking it stale.
+	 */
+	blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+	/*
+	 * Fix up bestfree, log the header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, bp);
+	xfs_dir3_data_check(dp, bp);
+	/*
+	 * See if the size as a shortform is good enough.
+	 */
+	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+	if (size > XFS_IFORK_DSIZE(dp))
+		return 0;
+
+	/*
+	 * If it works, do the conversion.
+	 */
+	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int						/* error */
+xfs_dir2_block_replace(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	trace_xfs_dir2_block_replace(args);
+
+	/*
+	 * Lookup the entry in the directory.  Get buffer and entry index.
+	 * This will always succeed since the caller has already done a lookup.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	mp = dp->i_mount;
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Point to the data entry we need to change.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+	/*
+	 * Change the inode number to the new value.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int					/* sort order */
+xfs_dir2_block_sort(
+	const void			*a,	/* first leaf entry */
+	const void			*b)	/* second leaf entry */
+{
+	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
+	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */
+
+	la = a;
+	lb = b;
+	return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+		(be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int						/* error */
+xfs_dir2_leaf_to_block(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp,		/* leaf buffer */
+	struct xfs_buf		*dbp)		/* data buffer */
+{
+	__be16			*bestsp;	/* leaf bests table */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	int			error;		/* error return value */
+	int			from;		/* leaf from index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to scan for bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* bytes used */
+	__be16			*tagp;		/* end of entry (tag) */
+	int			to;		/* block/leaf to index */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_leaf_to_block(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+	/*
+	 * If there are data blocks other than the first one, take this
+	 * opportunity to remove trailing empty data blocks that may have
+	 * been left behind during no-space-reservation operations.
+	 * These will show up in the leaf bests table.
+	 */
+	while (dp->i_d.di_size > args->geo->blksize) {
+		int hdrsz;
+
+		hdrsz = dp->d_ops->data_entry_offset;
+		bestsp = xfs_dir2_leaf_bests_p(ltp);
+		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+					    args->geo->blksize - hdrsz) {
+			if ((error =
+			    xfs_dir2_leaf_trim_data(args, lbp,
+				    (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+				return error;
+		} else
+			return 0;
+	}
+	/*
+	 * Read the data block if we don't already have it, give up if it fails.
+	 */
+	if (!dbp) {
+		error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+		if (error)
+			return error;
+	}
+	hdr = dbp->b_addr;
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
+	/*
+	 * Size of the "leaf" area in the block.
+	 */
+	size = (uint)sizeof(xfs_dir2_block_tail_t) +
+	       (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
+	/*
+	 * Look at the last data entry.
+	 */
+	tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+	dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+	/*
+	 * If it's not free or is too short we can't do it.
+	 */
+	if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+	    be16_to_cpu(dup->length) < size)
+		return 0;
+
+	/*
+	 * Start converting it to block form.
+	 */
+	xfs_dir3_block_init(mp, tp, dbp, dp);
+
+	needlog = 1;
+	needscan = 0;
+	/*
+	 * Use up the space at the end of the block (blp/btp).
+	 */
+	xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
+		&needlog, &needscan);
+	/*
+	 * Initialize the block tail.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
+	btp->stale = 0;
+	xfs_dir2_block_log_tail(tp, dbp);
+	/*
+	 * Initialize the block leaf area.  We compact out stale entries.
+	 */
+	lep = xfs_dir2_block_leaf_p(btp);
+	for (from = to = 0; from < leafhdr.count; from++) {
+		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			continue;
+		lep[to++] = ents[from];
+	}
+	ASSERT(to == be32_to_cpu(btp->count));
+	xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+	/*
+	 * Scan the bestfree if we need it and log the data block header.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	/*
+	 * Pitch the old leaf block.
+	 */
+	error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+	if (error)
+		return error;
+
+	/*
+	 * Now see if the resulting block can be shrunken to shortform.
+	 */
+	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+	if (size > XFS_IFORK_DSIZE(dp))
+		return 0;
+
+	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int						/* error */
+xfs_dir2_sf_to_block(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			dummy;		/* trash */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	int			endoffset;	/* end of data objects */
+	int			error;		/* error return value */
+	int			i;		/* index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to scan block freespc */
+	int			newoffset;	/* offset from current entry */
+	int			offset;		/* target block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* sf entry pointer */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* old shortform header  */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform header  */
+	__be16			*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_name		name;
+	struct xfs_ifork	*ifp;
+
+	trace_xfs_dir2_sf_to_block(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	/*
+	 * Bomb out if the shortform directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		return EIO;
+	}
+
+	oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+
+	ASSERT(ifp->if_bytes == dp->i_d.di_size);
+	ASSERT(ifp->if_u1.if_data != NULL);
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+	ASSERT(dp->i_d.di_nextents == 0);
+
+	/*
+	 * Copy the directory into a temporary buffer.
+	 * Then pitch the incore inode data so we can make extents.
+	 */
+	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+	memcpy(sfp, oldsfp, ifp->if_bytes);
+
+	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+	xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+	dp->i_d.di_size = 0;
+
+	/*
+	 * Add block 0 to the inode.
+	 */
+	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+	if (error) {
+		kmem_free(sfp);
+		return error;
+	}
+	/*
+	 * Initialize the data block, then convert it to block format.
+	 */
+	error = xfs_dir3_data_init(args, blkno, &bp);
+	if (error) {
+		kmem_free(sfp);
+		return error;
+	}
+	xfs_dir3_block_init(mp, tp, bp, dp);
+	hdr = bp->b_addr;
+
+	/*
+	 * Compute size of block "tail" area.
+	 */
+	i = (uint)sizeof(*btp) +
+	    (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+	/*
+	 * The whole thing is initialized to free by the init routine.
+	 * Say we're using the leaf and tail area.
+	 */
+	dup = dp->d_ops->data_unused_p(hdr);
+	needlog = needscan = 0;
+	xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+			       i, &needlog, &needscan);
+	ASSERT(needscan == 0);
+	/*
+	 * Fill in the tail.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	btp->count = cpu_to_be32(sfp->count + 2);	/* ., .. */
+	btp->stale = 0;
+	blp = xfs_dir2_block_leaf_p(btp);
+	endoffset = (uint)((char *)blp - (char *)hdr);
+	/*
+	 * Remove the freespace, we'll manage it.
+	 */
+	xfs_dir2_data_use_free(args, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+		be16_to_cpu(dup->length), &needlog, &needscan);
+	/*
+	 * Create entry for .
+	 */
+	dep = dp->d_ops->data_dot_entry_p(hdr);
+	dep->inumber = cpu_to_be64(dp->i_ino);
+	dep->namelen = 1;
+	dep->name[0] = '.';
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+	blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	/*
+	 * Create entry for ..
+	 */
+	dep = dp->d_ops->data_dotdot_entry_p(hdr);
+	dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
+	dep->namelen = 2;
+	dep->name[0] = dep->name[1] = '.';
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	offset = dp->d_ops->data_first_offset;
+	/*
+	 * Loop over existing entries, stuff them in.
+	 */
+	i = 0;
+	if (!sfp->count)
+		sfep = NULL;
+	else
+		sfep = xfs_dir2_sf_firstentry(sfp);
+	/*
+	 * Need to preserve the existing offset values in the sf directory.
+	 * Insert holes (unused entries) where necessary.
+	 */
+	while (offset < endoffset) {
+		/*
+		 * sfep is null when we reach the end of the list.
+		 */
+		if (sfep == NULL)
+			newoffset = endoffset;
+		else
+			newoffset = xfs_dir2_sf_get_offset(sfep);
+		/*
+		 * There should be a hole here, make one.
+		 */
+		if (offset < newoffset) {
+			dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+			dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+			dup->length = cpu_to_be16(newoffset - offset);
+			*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
+				((char *)dup - (char *)hdr));
+			xfs_dir2_data_log_unused(args, bp, dup);
+			xfs_dir2_data_freeinsert(hdr,
+						 dp->d_ops->data_bestfree_p(hdr),
+						 dup, &dummy);
+			offset += be16_to_cpu(dup->length);
+			continue;
+		}
+		/*
+		 * Copy a real entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
+		dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
+		dep->namelen = sfep->namelen;
+		dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
+		memcpy(dep->name, sfep->name, dep->namelen);
+		tagp = dp->d_ops->data_entry_tag_p(dep);
+		*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+		xfs_dir2_data_log_entry(args, bp, dep);
+		name.name = sfep->name;
+		name.len = sfep->namelen;
+		blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+							hashname(&name));
+		blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+						 (char *)dep - (char *)hdr));
+		offset = (int)((char *)(tagp + 1) - (char *)hdr);
+		if (++i == sfp->count)
+			sfep = NULL;
+		else
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+	}
+	/* Done with the temporary buffer */
+	kmem_free(sfp);
+	/*
+	 * Sort the leaf entries by hash value.
+	 */
+	xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+	/*
+	 * Log the leaf entry area and tail.
+	 * Already logged the header in data_init, ignore needlog.
+	 */
+	ASSERT(needscan == 0);
+	xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644
index 000000000000..8c2f6422648e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Return 0 is the buffer is good, otherwise an error.
+ */
+int
+__xfs_dir3_data_check(
+	struct xfs_inode	*dp,		/* incore inode pointer */
+	struct xfs_buf		*bp)		/* data block's buffer */
+{
+	xfs_dir2_dataptr_t	addr;		/* addr for leaf lookup */
+	xfs_dir2_data_free_t	*bf;		/* bestfree table */
+	xfs_dir2_block_tail_t	*btp=NULL;	/* block tail */
+	int			count;		/* count of entries found */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	char			*endp;		/* end of useful data */
+	int			freeseen;	/* mask of bestfrees seen */
+	xfs_dahash_t		hash;		/* hash of current name */
+	int			i;		/* leaf index */
+	int			lastfree;	/* last entry was unused */
+	xfs_dir2_leaf_entry_t	*lep=NULL;	/* block leaf entries */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*p;		/* current data position */
+	int			stale;		/* count of stale leaves */
+	struct xfs_name		name;
+	const struct xfs_dir_ops *ops;
+	struct xfs_da_geometry	*geo;
+
+	mp = bp->b_target->bt_mount;
+	geo = mp->m_dir_geo;
+
+	/*
+	 * We can be passed a null dp here from a verifier, so we need to go the
+	 * hard way to get them.
+	 */
+	ops = xfs_dir_get_ops(mp, dp);
+
+	hdr = bp->b_addr;
+	p = (char *)ops->data_entry_p(hdr);
+
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+		btp = xfs_dir2_block_tail_p(geo, hdr);
+		lep = xfs_dir2_block_leaf_p(btp);
+		endp = (char *)lep;
+
+		/*
+		 * The number of leaf entries is limited by the size of the
+		 * block and the amount of space used by the data entries.
+		 * We don't know how much space is used by the data entries yet,
+		 * so just ensure that the count falls somewhere inside the
+		 * block right now.
+		 */
+		XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+			((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
+		break;
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+		endp = (char *)hdr + geo->blksize;
+		break;
+	default:
+		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
+	}
+
+	/*
+	 * Account for zero bestfree entries.
+	 */
+	bf = ops->data_bestfree_p(hdr);
+	count = lastfree = freeseen = 0;
+	if (!bf[0].length) {
+		XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
+		freeseen |= 1 << 0;
+	}
+	if (!bf[1].length) {
+		XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
+		freeseen |= 1 << 1;
+	}
+	if (!bf[2].length) {
+		XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
+		freeseen |= 1 << 2;
+	}
+
+	XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+						be16_to_cpu(bf[1].length));
+	XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+						be16_to_cpu(bf[2].length));
+	/*
+	 * Loop over the data/unused entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's unused, look for the space in the bestfree table.
+		 * If we find it, account for that, else make sure it
+		 * doesn't need to be there.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+			XFS_WANT_CORRUPTED_RETURN(
+				be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+					       (char *)dup - (char *)hdr);
+			dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+			if (dfp) {
+				i = (int)(dfp - bf);
+				XFS_WANT_CORRUPTED_RETURN(
+					(freeseen & (1 << i)) == 0);
+				freeseen |= 1 << i;
+			} else {
+				XFS_WANT_CORRUPTED_RETURN(
+					be16_to_cpu(dup->length) <=
+						be16_to_cpu(bf[2].length));
+			}
+			p += be16_to_cpu(dup->length);
+			lastfree = 1;
+			continue;
+		}
+		/*
+		 * It's a real entry.  Validate the fields.
+		 * If this is a block directory then make sure it's
+		 * in the leaf section of the block.
+		 * The linear search is crude but this is DEBUG code.
+		 */
+		dep = (xfs_dir2_data_entry_t *)p;
+		XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+		XFS_WANT_CORRUPTED_RETURN(
+			!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+		XFS_WANT_CORRUPTED_RETURN(
+			be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
+					       (char *)dep - (char *)hdr);
+		XFS_WANT_CORRUPTED_RETURN(
+				ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
+		count++;
+		lastfree = 0;
+		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+			addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+						(xfs_dir2_data_aoff_t)
+						((char *)dep - (char *)hdr));
+			name.name = dep->name;
+			name.len = dep->namelen;
+			hash = mp->m_dirnameops->hashname(&name);
+			for (i = 0; i < be32_to_cpu(btp->count); i++) {
+				if (be32_to_cpu(lep[i].address) == addr &&
+				    be32_to_cpu(lep[i].hashval) == hash)
+					break;
+			}
+			XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
+		}
+		p += ops->data_entsize(dep->namelen);
+	}
+	/*
+	 * Need to have seen all the entries and all the bestfree slots.
+	 */
+	XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+			if (lep[i].address ==
+			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+				stale++;
+			if (i > 0)
+				XFS_WANT_CORRUPTED_RETURN(
+					be32_to_cpu(lep[i].hashval) >=
+						be32_to_cpu(lep[i - 1].hashval));
+		}
+		XFS_WANT_CORRUPTED_RETURN(count ==
+			be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+		XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
+	}
+	return 0;
+}
+
+static bool
+xfs_dir3_data_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+			return false;
+	}
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir3_data_reada_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+		bp->b_ops = &xfs_dir3_block_buf_ops;
+		bp->b_ops->verify_read(bp);
+		return;
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+		xfs_dir3_data_verify(bp);
+		return;
+	default:
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		break;
+	}
+}
+
+static void
+xfs_dir3_data_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+		 xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_dir3_data_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_data_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_data_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+	.verify_read = xfs_dir3_data_read_verify,
+	.verify_write = xfs_dir3_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+	.verify_read = xfs_dir3_data_reada_verify,
+	.verify_write = xfs_dir3_data_write_verify,
+};
+
+
+int
+xfs_dir3_data_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mapped_bno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+	return err;
+}
+
+int
+xfs_dir3_data_readahead(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mapped_bno)
+{
+	return xfs_da_reada_buf(dp, bno, mapped_bno,
+				XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
+}
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry if any that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
+	struct xfs_dir2_data_unused *dup)	/* unused space */
+{
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_aoff_t	off;		/* offset value needed */
+#ifdef DEBUG
+	int			matched;	/* matched the value */
+	int			seenzero;	/* saw a 0 bestfree entry */
+#endif
+
+	off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+#ifdef DEBUG
+	/*
+	 * Validate some consistency in the bestfree table.
+	 * Check order, non-overlapping entries, and if we find the
+	 * one we're looking for it has to be exact.
+	 */
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+	for (dfp = &bf[0], seenzero = matched = 0;
+	     dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
+	     dfp++) {
+		if (!dfp->offset) {
+			ASSERT(!dfp->length);
+			seenzero = 1;
+			continue;
+		}
+		ASSERT(seenzero == 0);
+		if (be16_to_cpu(dfp->offset) == off) {
+			matched = 1;
+			ASSERT(dfp->length == dup->length);
+		} else if (off < be16_to_cpu(dfp->offset))
+			ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
+		else
+			ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
+		ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
+		if (dfp > &bf[0])
+			ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
+	}
+#endif
+	/*
+	 * If this is smaller than the smallest bestfree entry,
+	 * it can't be there since they're sorted.
+	 */
+	if (be16_to_cpu(dup->length) <
+	    be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+		return NULL;
+	/*
+	 * Look at the three bestfree entries for our guy.
+	 */
+	for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+		if (!dfp->offset)
+			return NULL;
+		if (be16_to_cpu(dfp->offset) == off)
+			return dfp;
+	}
+	/*
+	 * Didn't find it.  This only happens if there are duplicate lengths.
+	 */
+	return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t *				/* entry inserted */
+xfs_dir2_data_freeinsert(
+	struct xfs_dir2_data_hdr *hdr,		/* data block pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree table pointer */
+	struct xfs_dir2_data_unused *dup,	/* unused space */
+	int			*loghead)	/* log the data header (out) */
+{
+	xfs_dir2_data_free_t	new;		/* new bestfree entry */
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	new.length = dup->length;
+	new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+
+	/*
+	 * Insert at position 0, 1, or 2; or not at all.
+	 */
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
+		dfp[2] = dfp[1];
+		dfp[1] = dfp[0];
+		dfp[0] = new;
+		*loghead = 1;
+		return &dfp[0];
+	}
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
+		dfp[2] = dfp[1];
+		dfp[1] = new;
+		*loghead = 1;
+		return &dfp[1];
+	}
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
+		dfp[2] = new;
+		*loghead = 1;
+		return &dfp[2];
+	}
+	return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree entry pointer */
+	int			*loghead)	/* out: log data header */
+{
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * It's the first entry, slide the next 2 up.
+	 */
+	if (dfp == &bf[0]) {
+		bf[0] = bf[1];
+		bf[1] = bf[2];
+	}
+	/*
+	 * It's the second entry, slide the 3rd entry up.
+	 */
+	else if (dfp == &bf[1])
+		bf[1] = bf[2];
+	/*
+	 * Must be the last entry.
+	 */
+	else
+		ASSERT(dfp == &bf[2]);
+	/*
+	 * Clear the 3rd entry, must be zero now.
+	 */
+	bf[2].length = 0;
+	bf[2].offset = 0;
+	*loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+	struct xfs_inode	*dp,
+	struct xfs_dir2_data_hdr *hdr,
+	int			*loghead)
+{
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* active data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	struct xfs_dir2_data_free *bf;
+	char			*endp;		/* end of block's data */
+	char			*p;		/* current entry pointer */
+	struct xfs_da_geometry	*geo = dp->i_mount->m_dir_geo;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * Start by clearing the table.
+	 */
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
+	*loghead = 1;
+	/*
+	 * Set up pointers.
+	 */
+	p = (char *)dp->d_ops->data_entry_p(hdr);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		btp = xfs_dir2_block_tail_p(geo, hdr);
+		endp = (char *)xfs_dir2_block_leaf_p(btp);
+	} else
+		endp = (char *)hdr + geo->blksize;
+	/*
+	 * Loop over the block's entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's a free entry, insert it.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			ASSERT((char *)dup - (char *)hdr ==
+			       be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+			xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+			p += be16_to_cpu(dup->length);
+		}
+		/*
+		 * For active entries, check their tags and skip them.
+		 */
+		else {
+			dep = (xfs_dir2_data_entry_t *)p;
+			ASSERT((char *)dep - (char *)hdr ==
+			       be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+			p += dp->d_ops->data_entsize(dep->namelen);
+		}
+	}
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int						/* error */
+xfs_dir3_data_init(
+	xfs_da_args_t		*args,		/* directory operation args */
+	xfs_dir2_db_t		blkno,		/* logical dir block number */
+	struct xfs_buf		**bpp)		/* output block buffer */
+{
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	struct xfs_dir2_data_free *bf;
+	int			error;		/* error return value */
+	int			i;		/* bestfree index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int                     t;              /* temp */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Get the buffer set up for the block.
+	 */
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+			       -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+
+	/*
+	 * Initialize the header.
+	 */
+	hdr = bp->b_addr;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+	} else
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
+	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+		bf[i].length = 0;
+		bf[i].offset = 0;
+	}
+
+	/*
+	 * Set up an unused entry for the block's body.
+	 */
+	dup = dp->d_ops->data_unused_p(hdr);
+	dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+
+	t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
+	bf[0].length = cpu_to_be16(t);
+	dup->length = cpu_to_be16(t);
+	*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+	/*
+	 * Log it and return it.
+	 */
+	xfs_dir2_data_log_header(args, bp);
+	xfs_dir2_data_log_unused(args, bp, dup);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_entry_t	*dep)		/* data entry pointer */
+{
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+		(uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+		       (char *)hdr - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+#ifdef DEBUG
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+
+	xfs_trans_log_buf(args->trans, bp, 0,
+			  args->dp->d_ops->data_entry_offset - 1);
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_unused_t	*dup)		/* data unused pointer */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * Log the first part of the unused entry.
+	 */
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
+		(uint)((char *)&dup->length + sizeof(dup->length) -
+		       1 - (char *)hdr));
+	/*
+	 * Log the end (tag) of the unused entry.
+	 */
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
+		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
+		       sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_aoff_t	offset,		/* starting byte offset */
+	xfs_dir2_data_aoff_t	len,		/* length in bytes */
+	int			*needlogp,	/* out: log header */
+	int			*needscanp)	/* out: regen bestfree */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block pointer */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	char			*endptr;	/* end of data area */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*postdup;	/* unused entry after us */
+	xfs_dir2_data_unused_t	*prevdup;	/* unused entry before us */
+	struct xfs_dir2_data_free *bf;
+
+	hdr = bp->b_addr;
+
+	/*
+	 * Figure out where the end of the data area is.
+	 */
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+		endptr = (char *)hdr + args->geo->blksize;
+	else {
+		xfs_dir2_block_tail_t	*btp;	/* block tail */
+
+		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+			hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+		btp = xfs_dir2_block_tail_p(args->geo, hdr);
+		endptr = (char *)xfs_dir2_block_leaf_p(btp);
+	}
+	/*
+	 * If this isn't the start of the block, then back up to
+	 * the previous entry and see if it's free.
+	 */
+	if (offset > args->dp->d_ops->data_entry_offset) {
+		__be16			*tagp;	/* tag just before us */
+
+		tagp = (__be16 *)((char *)hdr + offset) - 1;
+		prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+		if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+			prevdup = NULL;
+	} else
+		prevdup = NULL;
+	/*
+	 * If this isn't the end of the block, see if the entry after
+	 * us is free.
+	 */
+	if ((char *)hdr + offset + len < endptr) {
+		postdup =
+			(xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+			postdup = NULL;
+	} else
+		postdup = NULL;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * Previous and following entries are both free,
+	 * merge everything into a single free entry.
+	 */
+	bf = args->dp->d_ops->data_bestfree_p(hdr);
+	if (prevdup && postdup) {
+		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
+
+		/*
+		 * See if prevdup and/or postdup are in bestfree table.
+		 */
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+		dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
+		/*
+		 * We need a rescan unless there are exactly 2 free entries
+		 * namely our two.  Then we know what's happening, otherwise
+		 * since the third bestfree is there, there might be more
+		 * entries.
+		 */
+		needscan = (bf[2].length != 0);
+		/*
+		 * Fix up the new big freespace.
+		 */
+		be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+		*xfs_dir2_data_unused_tag_p(prevdup) =
+			cpu_to_be16((char *)prevdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, prevdup);
+		if (!needscan) {
+			/*
+			 * Has to be the case that entries 0 and 1 are
+			 * dfp and dfp2 (don't know which is which), and
+			 * entry 2 is empty.
+			 * Remove entry 1 first then entry 0.
+			 */
+			ASSERT(dfp && dfp2);
+			if (dfp == &bf[1]) {
+				dfp = &bf[0];
+				ASSERT(dfp2 == dfp);
+				dfp2 = &bf[1];
+			}
+			xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			/*
+			 * Now insert the new entry.
+			 */
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+						       needlogp);
+			ASSERT(dfp == &bf[0]);
+			ASSERT(dfp->length == prevdup->length);
+			ASSERT(!dfp[1].length);
+			ASSERT(!dfp[2].length);
+		}
+	}
+	/*
+	 * The entry before us is free, merge with it.
+	 */
+	else if (prevdup) {
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+		be16_add_cpu(&prevdup->length, len);
+		*xfs_dir2_data_unused_tag_p(prevdup) =
+			cpu_to_be16((char *)prevdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, prevdup);
+		/*
+		 * If the previous entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else {
+			needscan = be16_to_cpu(prevdup->length) >
+				   be16_to_cpu(bf[2].length);
+		}
+	}
+	/*
+	 * The following entry is free, merge with it.
+	 */
+	else if (postdup) {
+		dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If the following entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else {
+			needscan = be16_to_cpu(newdup->length) >
+				   be16_to_cpu(bf[2].length);
+		}
+	}
+	/*
+	 * Neither neighbor is free.  Make a new entry.
+	 */
+	else {
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(len);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+	}
+	*needscanp = needscan;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+void
+xfs_dir2_data_use_free(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_unused_t	*dup,		/* unused entry */
+	xfs_dir2_data_aoff_t	offset,		/* starting offset to use */
+	xfs_dir2_data_aoff_t	len,		/* length to use */
+	int			*needlogp,	/* out: need to log header */
+	int			*needscanp)	/* out: need regen bestfree */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	int			matchback;	/* matches end of freespace */
+	int			matchfront;	/* matches start of freespace */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*newdup2;	/* another new unused entry */
+	int			oldlen;		/* old unused entry's length */
+	struct xfs_dir2_data_free *bf;
+
+	hdr = bp->b_addr;
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+	ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
+	ASSERT(offset >= (char *)dup - (char *)hdr);
+	ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
+	ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+	/*
+	 * Look up the entry in the bestfree table.
+	 */
+	oldlen = be16_to_cpu(dup->length);
+	bf = args->dp->d_ops->data_bestfree_p(hdr);
+	dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+	ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
+	/*
+	 * Check for alignment with front and back of the entry.
+	 */
+	matchfront = (char *)dup - (char *)hdr == offset;
+	matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * If we matched it exactly we just need to get rid of it from
+	 * the bestfree table.
+	 */
+	if (matchfront && matchback) {
+		if (dfp) {
+			needscan = (bf[2].offset != 0);
+			if (!needscan)
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
+		}
+	}
+	/*
+	 * We match the first part of the entry.
+	 * Make a new entry with the remaining freespace.
+	 */
+	else if (matchfront) {
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(oldlen - len);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(dfp->length == newdup->length);
+			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &bf[2];
+		}
+	}
+	/*
+	 * We match the last part of the entry.
+	 * Trim the allocated space off the tail of the entry.
+	 */
+	else if (matchback) {
+		newdup = dup;
+		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(dfp->length == newdup->length);
+			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &bf[2];
+		}
+	}
+	/*
+	 * Poking out the middle of an entry.
+	 * Make two new entries.
+	 */
+	else {
+		newdup = dup;
+		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+		*xfs_dir2_data_unused_tag_p(newdup2) =
+			cpu_to_be16((char *)newdup2 - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup2);
+		/*
+		 * If the old entry was in the table, we need to scan
+		 * if the 3rd entry was valid, since these entries
+		 * are smaller than the old one.
+		 * If we don't need to scan that means there were 1 or 2
+		 * entries in the table, and removing the old and adding
+		 * the 2 new will work.
+		 */
+		if (dfp) {
+			needscan = (bf[2].length != 0);
+			if (!needscan) {
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+							 needlogp);
+			}
+		}
+	}
+	*needscanp = needscan;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000000..78b411bfc543
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -0,0 +1,1831 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Local function declarations.
+ */
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+				    int *indexp, struct xfs_buf **dbpp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+				    struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+				   struct xfs_buf *bp);
+
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+#ifdef DEBUG
+#define	xfs_dir3_leaf_check(dp, bp) \
+do { \
+	if (!xfs_dir3_leaf1_check((dp), (bp))) \
+		ASSERT(0); \
+} while (0);
+
+STATIC bool
+xfs_dir3_leaf1_check(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+		return false;
+
+	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define	xfs_dir3_leaf_check(dp, bp)
+#endif
+
+bool
+xfs_dir3_leaf_check_int(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp,
+	struct xfs_dir3_icleaf_hdr *hdr,
+	struct xfs_dir2_leaf	*leaf)
+{
+	struct xfs_dir2_leaf_entry *ents;
+	xfs_dir2_leaf_tail_t	*ltp;
+	int			stale;
+	int			i;
+	const struct xfs_dir_ops *ops;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+
+	/*
+	 * we can be passed a null dp here from a verifier, so we need to go the
+	 * hard way to get them.
+	 */
+	ops = xfs_dir_get_ops(mp, dp);
+
+	if (!hdr) {
+		ops->leaf_hdr_from_disk(&leafhdr, leaf);
+		hdr = &leafhdr;
+	}
+
+	ents = ops->leaf_ents_p(leaf);
+	ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+	/*
+	 * XXX (dgc): This value is not restrictive enough.
+	 * Should factor in the size of the bests table as well.
+	 * We can deduce a value for that from di_size.
+	 */
+	if (hdr->count > ops->leaf_max_ents(geo))
+		return false;
+
+	/* Leaves and bests don't overlap in leaf format. */
+	if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+	     hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+	    (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+		return false;
+
+	/* Check hash value order, count stale entries.  */
+	for (i = stale = 0; i < hdr->count; i++) {
+		if (i + 1 < hdr->count) {
+			if (be32_to_cpu(ents[i].hashval) >
+					be32_to_cpu(ents[i + 1].hashval))
+				return false;
+		}
+		if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			stale++;
+	}
+	if (hdr->stale != stale)
+		return false;
+	return true;
+}
+
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
+static bool
+xfs_dir3_leaf_verify(
+	struct xfs_buf		*bp,
+	__uint16_t		magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		__uint16_t		magic3;
+
+		magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
+							 : XFS_DIR3_LEAFN_MAGIC;
+
+		if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
+			return false;
+		if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (leaf->hdr.info.magic != cpu_to_be16(magic))
+			return false;
+	}
+
+	return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+}
+
+static void
+__read_verify(
+	struct xfs_buf  *bp,
+	__uint16_t	magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_dir3_leaf_verify(bp, magic))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+__write_verify(
+	struct xfs_buf  *bp,
+	__uint16_t	magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_leaf_verify(bp, magic)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+}
+
+static void
+xfs_dir3_leaf1_read_verify(
+	struct xfs_buf	*bp)
+{
+	__read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leaf1_write_verify(
+	struct xfs_buf	*bp)
+{
+	__write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_read_verify(
+	struct xfs_buf	*bp)
+{
+	__read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_write_verify(
+	struct xfs_buf	*bp)
+{
+	__write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+	.verify_read = xfs_dir3_leaf1_read_verify,
+	.verify_write = xfs_dir3_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+	.verify_read = xfs_dir3_leafn_read_verify,
+	.verify_write = xfs_dir3_leafn_write_verify,
+};
+
+static int
+xfs_dir3_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+	return err;
+}
+
+int
+xfs_dir3_leafn_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+	return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	xfs_ino_t		owner,
+	__uint16_t		type)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+		memset(leaf3, 0, sizeof(*leaf3));
+
+		leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+					 ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+					 : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+		leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+		leaf3->info.owner = cpu_to_be64(owner);
+		uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+	} else {
+		memset(leaf, 0, sizeof(*leaf));
+		leaf->hdr.info.magic = cpu_to_be16(type);
+	}
+
+	/*
+	 * If it's a leaf-format directory initialize the tail.
+	 * Caller is responsible for initialising the bests table.
+	 */
+	if (type == XFS_DIR2_LEAF1_MAGIC) {
+		struct xfs_dir2_leaf_tail *ltp;
+
+		ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+		ltp->bestcount = 0;
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+	} else {
+		bp->b_ops = &xfs_dir3_leafn_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+	}
+}
+
+int
+xfs_dir3_leaf_get_buf(
+	xfs_da_args_t		*args,
+	xfs_dir2_db_t		bno,
+	struct xfs_buf		**bpp,
+	__uint16_t		magic)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+
+	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+	       bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+			       -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+	xfs_dir3_leaf_log_header(args, bp);
+	if (magic == XFS_DIR2_LEAF1_MAGIC)
+		xfs_dir3_leaf_log_tail(args, bp);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_block_to_leaf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*dbp)		/* input block's buffer */
+{
+	__be16			*bestsp;	/* leaf's bestsp entries */
+	xfs_dablk_t		blkno;		/* leaf block's bno */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block's leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block's tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	struct xfs_buf		*lbp;		/* leaf block's buffer */
+	xfs_dir2_db_t		ldb;		/* leaf block's bno */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf's tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to rescan bestfree */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_block_to_leaf(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Add the leaf block to the inode.
+	 * This interface will only put blocks in the leaf/node range.
+	 * Since that's empty now, we'll get the root (block 0 in range).
+	 */
+	if ((error = xfs_da_grow_inode(args, &blkno))) {
+		return error;
+	}
+	ldb = xfs_dir2_da_to_db(args->geo, blkno);
+	ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
+	/*
+	 * Initialize the leaf block, get a buffer for it.
+	 */
+	error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+	if (error)
+		return error;
+
+	leaf = lbp->b_addr;
+	hdr = dbp->b_addr;
+	xfs_dir3_data_check(dp, dbp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	/*
+	 * Set the counts in the leaf header.
+	 */
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	leafhdr.count = be32_to_cpu(btp->count);
+	leafhdr.stale = be32_to_cpu(btp->stale);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+
+	/*
+	 * Could compact these but I think we always do the conversion
+	 * after squeezing out stale entries.
+	 */
+	memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
+	needscan = 0;
+	needlog = 1;
+	/*
+	 * Make the space formerly occupied by the leaf entries and block
+	 * tail be free.
+	 */
+	xfs_dir2_data_make_free(args, dbp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+		(xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
+				       (char *)blp),
+		&needlog, &needscan);
+	/*
+	 * Fix up the block header, make it a data block.
+	 */
+	dbp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+	else
+		hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	/*
+	 * Set up leaf tail and bests table.
+	 */
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ltp->bestcount = cpu_to_be32(1);
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	bestsp[0] =  bf[0].length;
+	/*
+	 * Log the data header and leaf bests table.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	xfs_dir3_leaf_check(dp, lbp);
+	xfs_dir3_data_check(dp, dbp);
+	xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
+	return 0;
+}
+
+STATIC void
+xfs_dir3_leaf_find_stale(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int			index,
+	int			*lowstale,
+	int			*highstale)
+{
+	/*
+	 * Find the first stale entry before our index, if any.
+	 */
+	for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
+		if (ents[*lowstale].address ==
+		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			break;
+	}
+
+	/*
+	 * Find the first stale entry at or after our index, if any.
+	 * Stop if the result would require moving more entries than using
+	 * lowstale.
+	 */
+	for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+		if (ents[*highstale].address ==
+		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			break;
+		if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+			break;
+	}
+}
+
+struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int			index,		/* leaf table position */
+	int			compact,	/* need to compact leaves */
+	int			lowstale,	/* index of prev stale leaf */
+	int			highstale,	/* index of next stale leaf */
+	int			*lfloglow,	/* low leaf logging index */
+	int			*lfloghigh)	/* high leaf logging index */
+{
+	if (!leafhdr->stale) {
+		xfs_dir2_leaf_entry_t	*lep;	/* leaf entry table pointer */
+
+		/*
+		 * Now we need to make room to insert the leaf entry.
+		 *
+		 * If there are no stale entries, just insert a hole at index.
+		 */
+		lep = &ents[index];
+		if (index < leafhdr->count)
+			memmove(lep + 1, lep,
+				(leafhdr->count - index) * sizeof(*lep));
+
+		/*
+		 * Record low and high logging indices for the leaf.
+		 */
+		*lfloglow = index;
+		*lfloghigh = leafhdr->count++;
+		return lep;
+	}
+
+	/*
+	 * There are stale entries.
+	 *
+	 * We will use one of them for the new entry.  It's probably not at
+	 * the right location, so we'll have to shift some up or down first.
+	 *
+	 * If we didn't compact before, we need to find the nearest stale
+	 * entries before and after our insertion point.
+	 */
+	if (compact == 0)
+		xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+					 &lowstale, &highstale);
+
+	/*
+	 * If the low one is better, use it.
+	 */
+	if (lowstale >= 0 &&
+	    (highstale == leafhdr->count ||
+	     index - lowstale - 1 < highstale - index)) {
+		ASSERT(index - lowstale - 1 >= 0);
+		ASSERT(ents[lowstale].address ==
+		       cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+		/*
+		 * Copy entries up to cover the stale entry and make room
+		 * for the new entry.
+		 */
+		if (index - lowstale - 1 > 0) {
+			memmove(&ents[lowstale], &ents[lowstale + 1],
+				(index - lowstale - 1) *
+					sizeof(xfs_dir2_leaf_entry_t));
+		}
+		*lfloglow = MIN(lowstale, *lfloglow);
+		*lfloghigh = MAX(index - 1, *lfloghigh);
+		leafhdr->stale--;
+		return &ents[index - 1];
+	}
+
+	/*
+	 * The high one is better, so use that one.
+	 */
+	ASSERT(highstale - index >= 0);
+	ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+	/*
+	 * Copy entries down to cover the stale entry and make room for the
+	 * new entry.
+	 */
+	if (highstale - index > 0) {
+		memmove(&ents[index + 1], &ents[index],
+			(highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
+	}
+	*lfloglow = MIN(index, *lfloglow);
+	*lfloghigh = MAX(highstale, *lfloghigh);
+	leafhdr->stale--;
+	return &ents[index];
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_leaf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	__be16			*bestsp;	/* freespace table in leaf */
+	int			compact;	/* need to compact leaves */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry */
+	int			error;		/* error return value */
+	int			grown;		/* allocated new data block */
+	int			highstale;	/* index of next stale leaf */
+	int			i;		/* temporary, index */
+	int			index;		/* leaf table position */
+	struct xfs_buf		*lbp;		/* leaf's buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length;		/* length of new entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry table pointer */
+	int			lfloglow;	/* low leaf logging index */
+	int			lfloghigh;	/* high leaf logging index */
+	int			lowstale;	/* index of prev stale leaf */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needbytes;	/* leaf block bytes needed */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data free */
+	__be16			*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	xfs_dir2_db_t		use_block;	/* data block number */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_leaf_addname(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+
+	error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+	if (error)
+		return error;
+
+	/*
+	 * Look up the entry by hash value and name.
+	 * We know it's not there, our caller has already done a lookup.
+	 * So the index is of the entry to insert in front of.
+	 * But if there are dup hash values the index is of the first of those.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	leaf = lbp->b_addr;
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	length = dp->d_ops->data_entsize(args->namelen);
+
+	/*
+	 * See if there are any entries with the same hash value
+	 * and space in their block for the new entry.
+	 * This is good because it puts multiple same-hash value entries
+	 * in a data block, improving the lookup of those entries.
+	 */
+	for (use_block = -1, lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     index++, lep++) {
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+		ASSERT(i < be32_to_cpu(ltp->bestcount));
+		ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
+		if (be16_to_cpu(bestsp[i]) >= length) {
+			use_block = i;
+			break;
+		}
+	}
+	/*
+	 * Didn't find a block yet, linear search all the data blocks.
+	 */
+	if (use_block == -1) {
+		for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
+			/*
+			 * Remember a block we see that's missing.
+			 */
+			if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
+			    use_block == -1)
+				use_block = i;
+			else if (be16_to_cpu(bestsp[i]) >= length) {
+				use_block = i;
+				break;
+			}
+		}
+	}
+	/*
+	 * How many bytes do we need in the leaf block?
+	 */
+	needbytes = 0;
+	if (!leafhdr.stale)
+		needbytes += sizeof(xfs_dir2_leaf_entry_t);
+	if (use_block == -1)
+		needbytes += sizeof(xfs_dir2_data_off_t);
+
+	/*
+	 * Now kill use_block if it refers to a missing block, so we
+	 * can use it as an indication of allocation needed.
+	 */
+	if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
+		use_block = -1;
+	/*
+	 * If we don't have enough free bytes but we can make enough
+	 * by compacting out stale entries, we'll do that.
+	 */
+	if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+	    leafhdr.stale > 1)
+		compact = 1;
+
+	/*
+	 * Otherwise if we don't have enough free bytes we need to
+	 * convert to node form.
+	 */
+	else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
+		/*
+		 * Just checking or no space reservation, give up.
+		 */
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+							args->total == 0) {
+			xfs_trans_brelse(tp, lbp);
+			return ENOSPC;
+		}
+		/*
+		 * Convert to node form.
+		 */
+		error = xfs_dir2_leaf_to_node(args, lbp);
+		if (error)
+			return error;
+		/*
+		 * Then add the new entry.
+		 */
+		return xfs_dir2_node_addname(args);
+	}
+	/*
+	 * Otherwise it will fit without compaction.
+	 */
+	else
+		compact = 0;
+	/*
+	 * If just checking, then it will fit unless we needed to allocate
+	 * a new data block.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+		xfs_trans_brelse(tp, lbp);
+		return use_block == -1 ? ENOSPC : 0;
+	}
+	/*
+	 * If no allocations are allowed, return now before we've
+	 * changed anything.
+	 */
+	if (args->total == 0 && use_block == -1) {
+		xfs_trans_brelse(tp, lbp);
+		return ENOSPC;
+	}
+	/*
+	 * Need to compact the leaf entries, removing stale ones.
+	 * Leave one stale entry behind - the one closest to our
+	 * insertion index - and we'll shift that one to our insertion
+	 * point later.
+	 */
+	if (compact) {
+		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+			&highstale, &lfloglow, &lfloghigh);
+	}
+	/*
+	 * There are stale entries, so we'll need log-low and log-high
+	 * impossibly bad values later.
+	 */
+	else if (leafhdr.stale) {
+		lfloglow = leafhdr.count;
+		lfloghigh = -1;
+	}
+	/*
+	 * If there was no data block space found, we need to allocate
+	 * a new one.
+	 */
+	if (use_block == -1) {
+		/*
+		 * Add the new data block.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+				&use_block))) {
+			xfs_trans_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * Initialize the block.
+		 */
+		if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
+			xfs_trans_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * If we're adding a new data block on the end we need to
+		 * extend the bests table.  Copy it up one entry.
+		 */
+		if (use_block >= be32_to_cpu(ltp->bestcount)) {
+			bestsp--;
+			memmove(&bestsp[0], &bestsp[1],
+				be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+			be32_add_cpu(&ltp->bestcount, 1);
+			xfs_dir3_leaf_log_tail(args, lbp);
+			xfs_dir3_leaf_log_bests(args, lbp, 0,
+						be32_to_cpu(ltp->bestcount) - 1);
+		}
+		/*
+		 * If we're filling in a previously empty block just log it.
+		 */
+		else
+			xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		bestsp[use_block] = bf[0].length;
+		grown = 1;
+	} else {
+		/*
+		 * Already had space in some data block.
+		 * Just read that one in.
+		 */
+		error = xfs_dir3_data_read(tp, dp,
+				   xfs_dir2_db_to_da(args->geo, use_block),
+				   -1, &dbp);
+		if (error) {
+			xfs_trans_brelse(tp, lbp);
+			return error;
+		}
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		grown = 0;
+	}
+	/*
+	 * Point to the biggest freespace in our data block.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
+	ASSERT(be16_to_cpu(dup->length) >= length);
+	needscan = needlog = 0;
+	/*
+	 * Mark the initial part of our freespace in use for the new entry.
+	 */
+	xfs_dir2_data_use_free(args, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+		&needlog, &needscan);
+	/*
+	 * Initialize our new entry (at last).
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, dep->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	/*
+	 * Need to scan fix up the bestfree table.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	/*
+	 * Need to log the data block's header.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	xfs_dir2_data_log_entry(args, dbp, dep);
+	/*
+	 * If the bests table needs to be changed, do it.
+	 * Log the change unless we've already done that.
+	 */
+	if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+		bestsp[use_block] = bf[0].length;
+		if (!grown)
+			xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+	}
+
+	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+				       highstale, &lfloglow, &lfloghigh);
+
+	/*
+	 * Fill in the new leaf entry.
+	 */
+	lep->hashval = cpu_to_be32(args->hashval);
+	lep->address = cpu_to_be32(
+				xfs_dir2_db_off_to_dataptr(args->geo, use_block,
+				be16_to_cpu(*tagp)));
+	/*
+	 * Log the leaf fields and give up the buffers.
+	 */
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+	xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(dp, lbp);
+	xfs_dir3_data_check(dp, dbp);
+	return 0;
+}
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
+ */
+void
+xfs_dir3_leaf_compact(
+	xfs_da_args_t	*args,		/* operation arguments */
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_buf	*bp)		/* leaf buffer */
+{
+	int		from;		/* source leaf index */
+	xfs_dir2_leaf_t	*leaf;		/* leaf structure */
+	int		loglow;		/* first leaf entry to log */
+	int		to;		/* target leaf index */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_inode *dp = args->dp;
+
+	leaf = bp->b_addr;
+	if (!leafhdr->stale)
+		return;
+
+	/*
+	 * Compress out the stale entries in place.
+	 */
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			continue;
+		/*
+		 * Only actually copy the entries that are different.
+		 */
+		if (from > to) {
+			if (loglow == -1)
+				loglow = to;
+			ents[to] = ents[from];
+		}
+		to++;
+	}
+	/*
+	 * Update and log the header, log the leaf entries.
+	 */
+	ASSERT(leafhdr->stale == from - to);
+	leafhdr->count -= leafhdr->stale;
+	leafhdr->stale = 0;
+
+	dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+	xfs_dir3_leaf_log_header(args, bp);
+	if (loglow != -1)
+		xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ */
+void
+xfs_dir3_leaf_compact_x1(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int		*indexp,	/* insertion index */
+	int		*lowstalep,	/* out: stale entry before us */
+	int		*highstalep,	/* out: stale entry after us */
+	int		*lowlogp,	/* out: low log index */
+	int		*highlogp)	/* out: high log index */
+{
+	int		from;		/* source copy index */
+	int		highstale;	/* stale entry at/after index */
+	int		index;		/* insertion index */
+	int		keepstale;	/* source index of kept stale */
+	int		lowstale;	/* stale entry before index */
+	int		newindex=0;	/* new insertion index */
+	int		to;		/* destination copy index */
+
+	ASSERT(leafhdr->stale > 1);
+	index = *indexp;
+
+	xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
+
+	/*
+	 * Pick the better of lowstale and highstale.
+	 */
+	if (lowstale >= 0 &&
+	    (highstale == leafhdr->count ||
+	     index - lowstale <= highstale - index))
+		keepstale = lowstale;
+	else
+		keepstale = highstale;
+	/*
+	 * Copy the entries in place, removing all the stale entries
+	 * except keepstale.
+	 */
+	for (from = to = 0; from < leafhdr->count; from++) {
+		/*
+		 * Notice the new value of index.
+		 */
+		if (index == from)
+			newindex = to;
+		if (from != keepstale &&
+		    ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+			if (from == to)
+				*lowlogp = to;
+			continue;
+		}
+		/*
+		 * Record the new keepstale value for the insertion.
+		 */
+		if (from == keepstale)
+			lowstale = highstale = to;
+		/*
+		 * Copy only the entries that have moved.
+		 */
+		if (from > to)
+			ents[to] = ents[from];
+		to++;
+	}
+	ASSERT(from > to);
+	/*
+	 * If the insertion point was past the last entry,
+	 * set the new insertion point accordingly.
+	 */
+	if (index == from)
+		newindex = to;
+	*indexp = newindex;
+	/*
+	 * Adjust the leaf header values.
+	 */
+	leafhdr->count -= from - to;
+	leafhdr->stale = 1;
+	/*
+	 * Remember the low/high stale value only in the "right"
+	 * direction.
+	 */
+	if (lowstale >= newindex)
+		lowstale = -1;
+	else
+		highstale = leafhdr->count;
+	*highlogp = leafhdr->count - 1;
+	*lowstalep = lowstale;
+	*highstalep = highstale;
+}
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+static void
+xfs_dir3_leaf_log_bests(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,		/* leaf buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	__be16			*firstb;	/* pointer to first entry */
+	__be16			*lastb;		/* pointer to last entry */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	firstb = xfs_dir2_leaf_bests_p(ltp) + first;
+	lastb = xfs_dir2_leaf_bests_p(ltp) + last;
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)firstb - (char *)leaf),
+		(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_ents(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	int			first,
+	int			last)
+{
+	xfs_dir2_leaf_entry_t	*firstlep;	/* pointer to first entry */
+	xfs_dir2_leaf_entry_t	*lastlep;	/* pointer to last entry */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents;
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+	ents = args->dp->d_ops->leaf_ents_p(leaf);
+	firstlep = &ents[first];
+	lastlep = &ents[last];
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)firstlep - (char *)leaf),
+		(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir3_leaf_log_header(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+	xfs_trans_log_buf(args->trans, bp,
+			  (uint)((char *)&leaf->hdr - (char *)leaf),
+			  args->dp->d_ops->leaf_hdr_size - 1);
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+STATIC void
+xfs_dir3_leaf_log_tail(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+		(uint)(args->geo->blksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* found entry index */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leaf_lookup(args);
+
+	/*
+	 * Look up name in the leaf block, returning both buffers and index.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	tp = args->trans;
+	dp = args->dp;
+	xfs_dir3_leaf_check(dp, lbp);
+	leaf = lbp->b_addr;
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	/*
+	 * Get to the leaf entry and contained data entry address.
+	 */
+	lep = &ents[index];
+
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->b_addr +
+	       xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+	/*
+	 * Return the found inode number & CI name if appropriate
+	 */
+	args->inumber = be64_to_cpu(dep->inumber);
+	args->filetype = dp->d_ops->data_get_ftype(dep);
+	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	xfs_trans_brelse(tp, dbp);
+	xfs_trans_brelse(tp, lbp);
+	return error;
+}
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found dbpp will be NULL, and ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int					/* error */
+xfs_dir2_leaf_lookup_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		**lbpp,		/* out: leaf buffer */
+	int			*indexp,	/* out: index in leaf block */
+	struct xfs_buf		**dbpp)		/* out: data buffer */
+{
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	struct xfs_buf		*dbp = NULL;	/* data buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index in leaf block */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	xfs_dir2_db_t		cidb = -1;	/* case match data block no. */
+	enum xfs_dacmp		cmp;		/* name compare result */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+
+	error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+	if (error)
+		return error;
+
+	*lbpp = lbp;
+	leaf = lbp->b_addr;
+	xfs_dir3_leaf_check(dp, lbp);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	/*
+	 * Look for the first leaf entry with our hash value.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	/*
+	 * Loop over all the entries with the right hash value
+	 * looking to match the name.
+	 */
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip over stale leaf entries.
+		 */
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get the new data block number.
+		 */
+		newdb = xfs_dir2_dataptr_to_db(args->geo,
+					       be32_to_cpu(lep->address));
+		/*
+		 * If it's not the same as the old data block number,
+		 * need to pitch the old one and read the new one.
+		 */
+		if (newdb != curdb) {
+			if (dbp)
+				xfs_trans_brelse(tp, dbp);
+			error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, newdb),
+					   -1, &dbp);
+			if (error) {
+				xfs_trans_brelse(tp, lbp);
+				return error;
+			}
+			curdb = newdb;
+		}
+		/*
+		 * Point to the data entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(lep->address)));
+		/*
+		 * Compare name and if it's an exact match, return the index
+		 * and buffer. If it's the first case-insensitive match, store
+		 * the index and buffer and continue looking for an exact match.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			*indexp = index;
+			/* case exact match: return the current buffer. */
+			if (cmp == XFS_CMP_EXACT) {
+				*dbpp = dbp;
+				return 0;
+			}
+			cidb = curdb;
+		}
+	}
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or remove).
+	 * If a case-insensitive match was found earlier, re-read the
+	 * appropriate data block if required and return it.
+	 */
+	if (args->cmpresult == XFS_CMP_CASE) {
+		ASSERT(cidb != -1);
+		if (cidb != curdb) {
+			xfs_trans_brelse(tp, dbp);
+			error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, cidb),
+					   -1, &dbp);
+			if (error) {
+				xfs_trans_brelse(tp, lbp);
+				return error;
+			}
+		}
+		*dbpp = dbp;
+		return 0;
+	}
+	/*
+	 * No match found, return ENOENT.
+	 */
+	ASSERT(cidb == -1);
+	if (dbp)
+		xfs_trans_brelse(tp, dbp);
+	xfs_trans_brelse(tp, lbp);
+	return ENOENT;
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int						/* error */
+xfs_dir2_leaf_removename(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	__be16			*bestsp;	/* leaf block best freespace */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_db_t		db;		/* data block number */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry structure */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_db_t		i;		/* temporary data block # */
+	int			index;		/* index into leaf entries */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_dir2_data_off_t	oldbest;	/* old value of best free */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	trace_xfs_dir2_leaf_removename(args);
+
+	/*
+	 * Lookup the leaf entry, get the leaf and data blocks read in.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->b_addr;
+	hdr = dbp->b_addr;
+	xfs_dir3_data_check(dp, dbp);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	/*
+	 * Point to the leaf entry, use that to point to the data entry.
+	 */
+	lep = &ents[index];
+	db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+		xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+	needscan = needlog = 0;
+	oldbest = be16_to_cpu(bf[0].length);
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
+	/*
+	 * Mark the former data entry unused.
+	 */
+	xfs_dir2_data_make_free(args, dbp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * We just mark the leaf entry stale by putting a null in it.
+	 */
+	leafhdr.stale++;
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+
+	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir3_leaf_log_ents(args, lbp, index, index);
+
+	/*
+	 * Scan the freespace in the data block again if necessary,
+	 * log the data block header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	/*
+	 * If the longest freespace in the data block has changed,
+	 * put the new value in the bests table and log that.
+	 */
+	if (be16_to_cpu(bf[0].length) != oldbest) {
+		bestsp[db] = bf[0].length;
+		xfs_dir3_leaf_log_bests(args, lbp, db, db);
+	}
+	xfs_dir3_data_check(dp, dbp);
+	/*
+	 * If the data block is now empty then get rid of the data block.
+	 */
+	if (be16_to_cpu(bf[0].length) ==
+			args->geo->blksize - dp->d_ops->data_entry_offset) {
+		ASSERT(db != args->geo->datablk);
+		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+			/*
+			 * Nope, can't get rid of it because it caused
+			 * allocation of a bmap btree block to do so.
+			 * Just go on, returning success, leaving the
+			 * empty block in place.
+			 */
+			if (error == ENOSPC && args->total == 0)
+				error = 0;
+			xfs_dir3_leaf_check(dp, lbp);
+			return error;
+		}
+		dbp = NULL;
+		/*
+		 * If this is the last data block then compact the
+		 * bests table by getting rid of entries.
+		 */
+		if (db == be32_to_cpu(ltp->bestcount) - 1) {
+			/*
+			 * Look for the last active entry (i).
+			 */
+			for (i = db - 1; i > 0; i--) {
+				if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
+					break;
+			}
+			/*
+			 * Copy the table down so inactive entries at the
+			 * end are removed.
+			 */
+			memmove(&bestsp[db - i], bestsp,
+				(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+			be32_add_cpu(&ltp->bestcount, -(db - i));
+			xfs_dir3_leaf_log_tail(args, lbp);
+			xfs_dir3_leaf_log_bests(args, lbp, 0,
+						be32_to_cpu(ltp->bestcount) - 1);
+		} else
+			bestsp[db] = cpu_to_be16(NULLDATAOFF);
+	}
+	/*
+	 * If the data block was not the first one, drop it.
+	 */
+	else if (db != args->geo->datablk)
+		dbp = NULL;
+
+	xfs_dir3_leaf_check(dp, lbp);
+	/*
+	 * See if we can convert to block form.
+	 */
+	return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int						/* error */
+xfs_dir2_leaf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index of leaf entry */
+	struct xfs_buf		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leaf_replace(args);
+
+	/*
+	 * Look up the entry.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	leaf = lbp->b_addr;
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	/*
+	 * Point to the leaf entry, get data address from it.
+	 */
+	lep = &ents[index];
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->b_addr +
+	       xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+	ASSERT(args->inumber != be64_to_cpu(dep->inumber));
+	/*
+	 * Put the new inode number in, log it.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tp = args->trans;
+	xfs_dir2_data_log_entry(args, dbp, dep);
+	xfs_dir3_leaf_check(dp, lbp);
+	xfs_trans_brelse(tp, lbp);
+	return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int						/* index value */
+xfs_dir2_leaf_search_hash(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp)		/* leaf buffer */
+{
+	xfs_dahash_t		hash=0;		/* hash from this entry */
+	xfs_dahash_t		hashwant;	/* hash value looking for */
+	int			high;		/* high leaf index */
+	int			low;		/* low leaf index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			mid=0;		/* current leaf index */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	leaf = lbp->b_addr;
+	ents = args->dp->d_ops->leaf_ents_p(leaf);
+	args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	/*
+	 * Note, the table cannot be empty, so we have to go through the loop.
+	 * Binary search the leaf entries looking for our hash value.
+	 */
+	for (lep = ents, low = 0, high = leafhdr.count - 1,
+		hashwant = args->hashval;
+	     low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
+			break;
+		if (hash < hashwant)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	/*
+	 * Found one, back up through all the equal hash values.
+	 */
+	if (hash == hashwant) {
+		while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
+			mid--;
+		}
+	}
+	/*
+	 * Need to point to an entry higher than ours.
+	 */
+	else if (hash < hashwant)
+		mid++;
+	return mid;
+}
+
+/*
+ * Trim off a trailing data block.  We know it's empty since the leaf
+ * freespace table says so.
+ */
+int						/* error */
+xfs_dir2_leaf_trim_data(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp,		/* leaf buffer */
+	xfs_dir2_db_t		db)		/* data block number */
+{
+	__be16			*bestsp;	/* leaf bests table */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Read the offending data block.  We need its buffer.
+	 */
+	error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+				   -1, &dbp);
+	if (error)
+		return error;
+
+	leaf = lbp->b_addr;
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+#ifdef DEBUG
+{
+	struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+	struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+	ASSERT(be16_to_cpu(bf[0].length) ==
+	       args->geo->blksize - dp->d_ops->data_entry_offset);
+	ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+}
+#endif
+
+	/*
+	 * Get rid of the data block.
+	 */
+	if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+		ASSERT(error != ENOSPC);
+		xfs_trans_brelse(tp, dbp);
+		return error;
+	}
+	/*
+	 * Eliminate the last bests entry from the table.
+	 */
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	be32_add_cpu(&ltp->bestcount, -1);
+	memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+	xfs_dir3_leaf_log_tail(args, lbp);
+	xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	return 0;
+}
+
+static inline size_t
+xfs_dir3_leaf_size(
+	struct xfs_dir3_icleaf_hdr	*hdr,
+	int				counts)
+{
+	int	entries;
+	int	hdrsize;
+
+	entries = hdr->count - hdr->stale;
+	if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+	    hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+		hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+	else
+		hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
+
+	return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+	               + counts * sizeof(xfs_dir2_data_off_t)
+		       + sizeof(xfs_dir2_leaf_tail_t);
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int						/* error */
+xfs_dir2_node_to_leaf(
+	xfs_da_state_t		*state)		/* directory operation state */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	struct xfs_buf		*fbp;		/* buffer for freespace block */
+	xfs_fileoff_t		fo;		/* freespace file offset */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	struct xfs_buf		*lbp;		/* buffer for leaf block */
+	xfs_dir2_leaf_tail_t	*ltp;		/* tail of leaf structure */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			rval;		/* successful free trim? */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir3_icfree_hdr freehdr;
+
+	/*
+	 * There's more than a leaf level in the btree, so there must
+	 * be multiple leafn blocks.  Give up.
+	 */
+	if (state->path.active > 1)
+		return 0;
+	args = state->args;
+
+	trace_xfs_dir2_node_to_leaf(args);
+
+	mp = state->mp;
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Get the last offset in the file.
+	 */
+	if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	fo -= args->geo->fsbcount;
+	/*
+	 * If there are freespace blocks other than the first one,
+	 * take this opportunity to remove trailing empty freespace blocks
+	 * that may have been left behind during no-space-reservation
+	 * operations.
+	 */
+	while (fo > args->geo->freeblk) {
+		if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+			return error;
+		}
+		if (rval)
+			fo -= args->geo->fsbcount;
+		else
+			return 0;
+	}
+	/*
+	 * Now find the block just before the freespace block.
+	 */
+	if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	/*
+	 * If it's not the single leaf block, give up.
+	 */
+	if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
+		return 0;
+	lbp = state->path.blk[0].bp;
+	leaf = lbp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+	/*
+	 * Read the freespace block.
+	 */
+	error = xfs_dir2_free_read(tp, dp,  args->geo->freeblk, &fbp);
+	if (error)
+		return error;
+	free = fbp->b_addr;
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+	ASSERT(!freehdr.firstdb);
+
+	/*
+	 * Now see if the leafn and free data will fit in a leaf1.
+	 * If not, release the buffer and give up.
+	 */
+	if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
+		xfs_trans_brelse(tp, fbp);
+		return 0;
+	}
+
+	/*
+	 * If the leaf has any stale entries in it, compress them out.
+	 */
+	if (leafhdr.stale)
+		xfs_dir3_leaf_compact(args, &leafhdr, lbp);
+
+	lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+	leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+					? XFS_DIR2_LEAF1_MAGIC
+					: XFS_DIR3_LEAF1_MAGIC;
+
+	/*
+	 * Set up the leaf tail from the freespace block.
+	 */
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
+	/*
+	 * Set up the leaf bests table.
+	 */
+	memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+		freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+	xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	xfs_dir3_leaf_log_tail(args, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
+
+	/*
+	 * Get rid of the freespace block.
+	 */
+	error = xfs_dir2_shrink_inode(args,
+			xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+			fbp);
+	if (error) {
+		/*
+		 * This can't fail here because it can only happen when
+		 * punching out the middle of an extent, and this is an
+		 * isolated block.
+		 */
+		ASSERT(error != ENOSPC);
+		return error;
+	}
+	fbp = NULL;
+	/*
+	 * Now see if we can convert the single-leaf directory
+	 * down to a block form directory.
+	 * This routine always kills the dabuf for the leaf, so
+	 * eliminate it from the path.
+	 */
+	error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+	state->path.blk[0].bp = NULL;
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
new file mode 100644
index 000000000000..4cf8b99d09a4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -0,0 +1,2284 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Function declarations.
+ */
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+			      int index);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+				     xfs_da_state_blk_t *blk1,
+				     xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
+				 int index, xfs_da_state_blk_t *dblk,
+				 int *rval);
+static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
+				     xfs_da_state_blk_t *fblk);
+
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+#define	xfs_dir3_leaf_check(dp, bp) \
+do { \
+	if (!xfs_dir3_leafn_check((dp), (bp))) \
+		ASSERT(0); \
+} while (0);
+
+static bool
+xfs_dir3_leafn_check(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+		return false;
+
+	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define	xfs_dir3_leaf_check(dp, bp)
+#endif
+
+static bool
+xfs_dir3_free_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
+			return false;
+	}
+
+	/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+	return true;
+}
+
+static void
+xfs_dir3_free_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_dir3_free_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_free_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_free_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+	.verify_read = xfs_dir3_free_read_verify,
+	.verify_write = xfs_dir3_free_write_verify,
+};
+
+
+static int
+__xfs_dir3_free_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+
+	/* try read returns without an error or *bpp if it lands in a hole */
+	if (!err && tp && *bpp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+	return err;
+}
+
+int
+xfs_dir2_free_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+
+static int
+xfs_dir3_free_get_buf(
+	xfs_da_args_t		*args,
+	xfs_dir2_db_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+	struct xfs_dir3_icfree_hdr hdr;
+
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+				   -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+	bp->b_ops = &xfs_dir3_free_buf_ops;
+
+	/*
+	 * Initialize the new block to be empty, and remember
+	 * its first slot as our empty slot.
+	 */
+	memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+	memset(&hdr, 0, sizeof(hdr));
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+		hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+		hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+		hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+	} else
+		hdr.magic = XFS_DIR2_FREE_MAGIC;
+	dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Log entries from a freespace block.
+ */
+STATIC void
+xfs_dir2_free_log_bests(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	__be16			*bests;
+
+	free = bp->b_addr;
+	bests = args->dp->d_ops->free_bests_p(free);
+	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)&bests[first] - (char *)free),
+		(uint)((char *)&bests[last] - (char *)free +
+		       sizeof(bests[0]) - 1));
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+#ifdef DEBUG
+	xfs_dir2_free_t		*free;		/* freespace structure */
+
+	free = bp->b_addr;
+	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+	xfs_trans_log_buf(args->trans, bp, 0,
+			  args->dp->d_ops->free_hdr_size - 1);
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int						/* error */
+xfs_dir2_leaf_to_node(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp)		/* leaf buffer */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	struct xfs_buf		*fbp;		/* freespace buffer */
+	xfs_dir2_db_t		fdb;		/* freespace block number */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	__be16			*from;		/* pointer to freespace entry */
+	int			i;		/* leaf freespace index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			n;		/* count of live freespc ents */
+	xfs_dir2_data_off_t	off;		/* freespace entry value */
+	__be16			*to;		/* pointer to freespace entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icfree_hdr freehdr;
+
+	trace_xfs_dir2_leaf_to_node(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Add a freespace block to the directory.
+	 */
+	if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+		return error;
+	}
+	ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+	/*
+	 * Get the buffer for the new freespace block.
+	 */
+	error = xfs_dir3_free_get_buf(args, fdb, &fbp);
+	if (error)
+		return error;
+
+	free = fbp->b_addr;
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+	leaf = lbp->b_addr;
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ASSERT(be32_to_cpu(ltp->bestcount) <=
+				(uint)dp->i_d.di_size / args->geo->blksize);
+
+	/*
+	 * Copy freespace entries from the leaf block to the new block.
+	 * Count active entries.
+	 */
+	from = xfs_dir2_leaf_bests_p(ltp);
+	to = dp->d_ops->free_bests_p(free);
+	for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+		if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
+			n++;
+		*to = cpu_to_be16(off);
+	}
+
+	/*
+	 * Now initialize the freespace block header.
+	 */
+	freehdr.nused = n;
+	freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+	dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+	xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+	xfs_dir2_free_log_header(args, fbp);
+
+	/*
+	 * Converting the leaf to a leafnode is just a matter of changing the
+	 * magic number and the ops. Do the change directly to the buffer as
+	 * it's less work (and less code) than decoding the header to host
+	 * format and back again.
+	 */
+	if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+	else
+		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+	lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+	xfs_dir3_leaf_log_header(args, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
+	return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int					/* error */
+xfs_dir2_leafn_add(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			index)		/* insertion pt for new entry */
+{
+	int			compact;	/* compacting stale leaves */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			highstale;	/* next stale entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			lfloghigh;	/* high leaf entry logging */
+	int			lfloglow;	/* low leaf entry logging */
+	int			lowstale;	/* previous stale entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leafn_add(args, index);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	/*
+	 * Quick check just to make sure we are not going to index
+	 * into other peoples memory
+	 */
+	if (index < 0)
+		return EFSCORRUPTED;
+
+	/*
+	 * If there are already the maximum number of leaf entries in
+	 * the block, if there are no stale entries it won't fit.
+	 * Caller will do a split.  If there are stale entries we'll do
+	 * a compact.
+	 */
+
+	if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+		if (!leafhdr.stale)
+			return ENOSPC;
+		compact = leafhdr.stale > 1;
+	} else
+		compact = 0;
+	ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+	ASSERT(index == leafhdr.count ||
+	       be32_to_cpu(ents[index].hashval) >= args->hashval);
+
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+		return 0;
+
+	/*
+	 * Compact out all but one stale leaf entry.  Leaves behind
+	 * the entry closest to index.
+	 */
+	if (compact)
+		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+					 &highstale, &lfloglow, &lfloghigh);
+	else if (leafhdr.stale) {
+		/*
+		 * Set impossible logging indices for this case.
+		 */
+		lfloglow = leafhdr.count;
+		lfloghigh = -1;
+	}
+
+	/*
+	 * Insert the new entry, log everything.
+	 */
+	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+				       highstale, &lfloglow, &lfloghigh);
+
+	lep->hashval = cpu_to_be32(args->hashval);
+	lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
+				args->blkno, args->index));
+
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, bp);
+	xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(dp, bp);
+	return 0;
+}
+
+#ifdef DEBUG
+static void
+xfs_dir2_free_hdr_check(
+	struct xfs_inode *dp,
+	struct xfs_buf	*bp,
+	xfs_dir2_db_t	db)
+{
+	struct xfs_dir3_icfree_hdr hdr;
+
+	dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+
+	ASSERT((hdr.firstdb %
+		dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+	ASSERT(hdr.firstdb <= db);
+	ASSERT(db < hdr.firstdb + hdr.nvalid);
+}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
+#endif	/* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t					/* hash value */
+xfs_dir2_leafn_lasthash(
+	struct xfs_inode *dp,
+	struct xfs_buf	*bp,			/* leaf buffer */
+	int		*count)			/* count of entries in leaf */
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
+	if (count)
+		*count = leafhdr.count;
+	if (!leafhdr.count)
+		return 0;
+
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	return be32_to_cpu(ents[leafhdr.count - 1].hashval);
+}
+
+/*
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	xfs_dir2_db_t		curfdb = -1;	/* current free block number */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			fi;		/* free entry index */
+	xfs_dir2_free_t		*free = NULL;	/* free block structure */
+	int			index;		/* leaf entry index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length;		/* length of new data entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_dir2_db_t		newfdb;		/* new free block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	xfs_dir3_leaf_check(dp, bp);
+	ASSERT(leafhdr.count > 0);
+
+	/*
+	 * Look up the hash value in the leaf entries.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, bp);
+	/*
+	 * Do we have a buffer coming in?
+	 */
+	if (state->extravalid) {
+		/* If so, it's a free block buffer, get the block number. */
+		curbp = state->extrablk.bp;
+		curfdb = state->extrablk.blkno;
+		free = curbp->b_addr;
+		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+		       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+	}
+	length = dp->d_ops->data_entsize(args->namelen);
+	/*
+	 * Loop over leaf entries with the right hash value.
+	 */
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip stale leaf entries.
+		 */
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Pull the data block number from the entry.
+		 */
+		newdb = xfs_dir2_dataptr_to_db(args->geo,
+					       be32_to_cpu(lep->address));
+		/*
+		 * For addname, we're looking for a place to put the new entry.
+		 * We want to use a data block with an entry of equal
+		 * hash value to ours if there is one with room.
+		 *
+		 * If this block isn't the data block we already have
+		 * in hand, take a look at it.
+		 */
+		if (newdb != curdb) {
+			__be16 *bests;
+
+			curdb = newdb;
+			/*
+			 * Convert the data block to the free block
+			 * holding its freespace information.
+			 */
+			newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
+			/*
+			 * If it's not the one we have in hand, read it in.
+			 */
+			if (newfdb != curfdb) {
+				/*
+				 * If we had one before, drop it.
+				 */
+				if (curbp)
+					xfs_trans_brelse(tp, curbp);
+
+				error = xfs_dir2_free_read(tp, dp,
+						xfs_dir2_db_to_da(args->geo,
+								  newfdb),
+						&curbp);
+				if (error)
+					return error;
+				free = curbp->b_addr;
+
+				xfs_dir2_free_hdr_check(dp, curbp, curdb);
+			}
+			/*
+			 * Get the index for our entry.
+			 */
+			fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
+			/*
+			 * If it has room, return it.
+			 */
+			bests = dp->d_ops->free_bests_p(free);
+			if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
+				XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+							XFS_ERRLEVEL_LOW, mp);
+				if (curfdb != newfdb)
+					xfs_trans_brelse(tp, curbp);
+				return EFSCORRUPTED;
+			}
+			curfdb = newfdb;
+			if (be16_to_cpu(bests[fi]) >= length)
+				goto out;
+		}
+	}
+	/* Didn't find any space */
+	fi = -1;
+out:
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	if (curbp) {
+		/* Giving back a free block. */
+		state->extravalid = 1;
+		state->extrablk.bp = curbp;
+		state->extrablk.index = fi;
+		state->extrablk.blkno = curfdb;
+
+		/*
+		 * Important: this magic number is not in the buffer - it's for
+		 * buffer type information and therefore only the free/data type
+		 * matters here, not whether CRCs are enabled or not.
+		 */
+		state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+	} else {
+		state->extravalid = 0;
+	}
+	/*
+	 * Return the index, that will be the insertion point.
+	 */
+	*indexp = index;
+	return ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			index;		/* leaf entry index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	enum xfs_dacmp		cmp;		/* comparison result */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	xfs_dir3_leaf_check(dp, bp);
+	ASSERT(leafhdr.count > 0);
+
+	/*
+	 * Look up the hash value in the leaf entries.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, bp);
+	/*
+	 * Do we have a buffer coming in?
+	 */
+	if (state->extravalid) {
+		curbp = state->extrablk.bp;
+		curdb = state->extrablk.blkno;
+	}
+	/*
+	 * Loop over leaf entries with the right hash value.
+	 */
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip stale leaf entries.
+		 */
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Pull the data block number from the entry.
+		 */
+		newdb = xfs_dir2_dataptr_to_db(args->geo,
+					       be32_to_cpu(lep->address));
+		/*
+		 * Not adding a new entry, so we really want to find
+		 * the name given to us.
+		 *
+		 * If it's a different data block, go get it.
+		 */
+		if (newdb != curdb) {
+			/*
+			 * If we had a block before that we aren't saving
+			 * for a CI name, drop it
+			 */
+			if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+						curdb != state->extrablk.blkno))
+				xfs_trans_brelse(tp, curbp);
+			/*
+			 * If needing the block that is saved with a CI match,
+			 * use it otherwise read in the new data block.
+			 */
+			if (args->cmpresult != XFS_CMP_DIFFERENT &&
+					newdb == state->extrablk.blkno) {
+				ASSERT(state->extravalid);
+				curbp = state->extrablk.bp;
+			} else {
+				error = xfs_dir3_data_read(tp, dp,
+						xfs_dir2_db_to_da(args->geo,
+								  newdb),
+						-1, &curbp);
+				if (error)
+					return error;
+			}
+			xfs_dir3_data_check(dp, curbp);
+			curdb = newdb;
+		}
+		/*
+		 * Point to the data entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(lep->address)));
+		/*
+		 * Compare the entry and if it's an exact match, return
+		 * EEXIST immediately. If it's the first case-insensitive
+		 * match, store the block & inode number and continue looking.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			/* If there is a CI match block, drop it */
+			if (args->cmpresult != XFS_CMP_DIFFERENT &&
+						curdb != state->extrablk.blkno)
+				xfs_trans_brelse(tp, state->extrablk.bp);
+			args->cmpresult = cmp;
+			args->inumber = be64_to_cpu(dep->inumber);
+			args->filetype = dp->d_ops->data_get_ftype(dep);
+			*indexp = index;
+			state->extravalid = 1;
+			state->extrablk.bp = curbp;
+			state->extrablk.blkno = curdb;
+			state->extrablk.index = (int)((char *)dep -
+							(char *)curbp->b_addr);
+			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_ops = &xfs_dir3_data_buf_ops;
+			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+			if (cmp == XFS_CMP_EXACT)
+				return EEXIST;
+		}
+	}
+	ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
+	if (curbp) {
+		if (args->cmpresult == XFS_CMP_DIFFERENT) {
+			/* Giving back last used data block. */
+			state->extravalid = 1;
+			state->extrablk.bp = curbp;
+			state->extrablk.index = -1;
+			state->extrablk.blkno = curdb;
+			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_ops = &xfs_dir3_data_buf_ops;
+			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+		} else {
+			/* If the curbp is not the CI match block, drop it */
+			if (state->extrablk.bp != curbp)
+				xfs_trans_brelse(tp, curbp);
+		}
+	} else {
+		state->extravalid = 0;
+	}
+	*indexp = index;
+	return ENOENT;
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+	struct xfs_buf		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	if (args->op_flags & XFS_DA_OP_ADDNAME)
+		return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+							state);
+	return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers.  Stale entries are preserved.
+ */
+static void
+xfs_dir3_leafn_moveents(
+	xfs_da_args_t			*args,	/* operation arguments */
+	struct xfs_buf			*bp_s,	/* source */
+	struct xfs_dir3_icleaf_hdr	*shdr,
+	struct xfs_dir2_leaf_entry	*sents,
+	int				start_s,/* source leaf index */
+	struct xfs_buf			*bp_d,	/* destination */
+	struct xfs_dir3_icleaf_hdr	*dhdr,
+	struct xfs_dir2_leaf_entry	*dents,
+	int				start_d,/* destination leaf index */
+	int				count)	/* count of leaves to copy */
+{
+	int				stale;	/* count stale leaves copied */
+
+	trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+
+	/*
+	 * Silently return if nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * If the destination index is not the end of the current
+	 * destination leaf entries, open up a hole in the destination
+	 * to hold the new entries.
+	 */
+	if (start_d < dhdr->count) {
+		memmove(&dents[start_d + count], &dents[start_d],
+			(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+				       count + dhdr->count - 1);
+	}
+	/*
+	 * If the source has stale leaves, count the ones in the copy range
+	 * so we can update the header correctly.
+	 */
+	if (shdr->stale) {
+		int	i;			/* temp leaf index */
+
+		for (i = start_s, stale = 0; i < start_s + count; i++) {
+			if (sents[i].address ==
+					cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+				stale++;
+		}
+	} else
+		stale = 0;
+	/*
+	 * Copy the leaf entries from source to destination.
+	 */
+	memcpy(&dents[start_d], &sents[start_s],
+		count * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+
+	/*
+	 * If there are source entries after the ones we copied,
+	 * delete the ones we copied by sliding the next ones down.
+	 */
+	if (start_s + count < shdr->count) {
+		memmove(&sents[start_s], &sents[start_s + count],
+			count * sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
+	}
+
+	/*
+	 * Update the headers and log them.
+	 */
+	shdr->count -= count;
+	shdr->stale -= stale;
+	dhdr->count += count;
+	dhdr->stale += stale;
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
+ */
+int						/* sort order */
+xfs_dir2_leafn_order(
+	struct xfs_inode	*dp,
+	struct xfs_buf		*leaf1_bp,		/* leaf1 buffer */
+	struct xfs_buf		*leaf2_bp)		/* leaf2 buffer */
+{
+	struct xfs_dir2_leaf	*leaf1 = leaf1_bp->b_addr;
+	struct xfs_dir2_leaf	*leaf2 = leaf2_bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents1;
+	struct xfs_dir2_leaf_entry *ents2;
+	struct xfs_dir3_icleaf_hdr hdr1;
+	struct xfs_dir3_icleaf_hdr hdr2;
+
+	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+	ents1 = dp->d_ops->leaf_ents_p(leaf1);
+	ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+	if (hdr1.count > 0 && hdr2.count > 0 &&
+	    (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+	     be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+				be32_to_cpu(ents1[hdr1.count - 1].hashval)))
+		return 1;
+	return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*blk1,		/* first btree block */
+	xfs_da_state_blk_t	*blk2)		/* second btree block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	int			count;		/* count (& direction) leaves */
+	int			isleft;		/* new goes in left leaf */
+	xfs_dir2_leaf_t		*leaf1;		/* first leaf structure */
+	xfs_dir2_leaf_t		*leaf2;		/* second leaf structure */
+	int			mid;		/* midpoint leaf index */
+#if defined(DEBUG) || defined(XFS_WARN)
+	int			oldstale;	/* old count of stale leaves */
+#endif
+	int			oldsum;		/* old total leaf count */
+	int			swap;		/* swapped leaf blocks */
+	struct xfs_dir2_leaf_entry *ents1;
+	struct xfs_dir2_leaf_entry *ents2;
+	struct xfs_dir3_icleaf_hdr hdr1;
+	struct xfs_dir3_icleaf_hdr hdr2;
+	struct xfs_inode	*dp = state->args->dp;
+
+	args = state->args;
+	/*
+	 * If the block order is wrong, swap the arguments.
+	 */
+	if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
+		xfs_da_state_blk_t	*tmp;	/* temp for block swap */
+
+		tmp = blk1;
+		blk1 = blk2;
+		blk2 = tmp;
+	}
+	leaf1 = blk1->bp->b_addr;
+	leaf2 = blk2->bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+	ents1 = dp->d_ops->leaf_ents_p(leaf1);
+	ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+	oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+	oldstale = hdr1.stale + hdr2.stale;
+#endif
+	mid = oldsum >> 1;
+
+	/*
+	 * If the old leaf count was odd then the new one will be even,
+	 * so we need to divide the new count evenly.
+	 */
+	if (oldsum & 1) {
+		xfs_dahash_t	midhash;	/* middle entry hash value */
+
+		if (mid >= hdr1.count)
+			midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
+		else
+			midhash = be32_to_cpu(ents1[mid].hashval);
+		isleft = args->hashval <= midhash;
+	}
+	/*
+	 * If the old count is even then the new count is odd, so there's
+	 * no preferred side for the new entry.
+	 * Pick the left one.
+	 */
+	else
+		isleft = 1;
+	/*
+	 * Calculate moved entry count.  Positive means left-to-right,
+	 * negative means right-to-left.  Then move the entries.
+	 */
+	count = hdr1.count - mid + (isleft == 0);
+	if (count > 0)
+		xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+					hdr1.count - count, blk2->bp,
+					&hdr2, ents2, 0, count);
+	else if (count < 0)
+		xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+					blk1->bp, &hdr1, ents1,
+					hdr1.count, count);
+
+	ASSERT(hdr1.count + hdr2.count == oldsum);
+	ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+	/* log the changes made when moving the entries */
+	dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+	dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+	xfs_dir3_leaf_log_header(args, blk1->bp);
+	xfs_dir3_leaf_log_header(args, blk2->bp);
+
+	xfs_dir3_leaf_check(dp, blk1->bp);
+	xfs_dir3_leaf_check(dp, blk2->bp);
+
+	/*
+	 * Mark whether we're inserting into the old or new leaf.
+	 */
+	if (hdr1.count < hdr2.count)
+		state->inleaf = swap;
+	else if (hdr1.count > hdr2.count)
+		state->inleaf = !swap;
+	else
+		state->inleaf = swap ^ (blk1->index <= hdr1.count);
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (!state->inleaf)
+		blk2->index = blk1->index - hdr1.count;
+
+	/*
+	 * Finally sanity check just to make sure we are not returning a
+	 * negative index
+	 */
+	if (blk2->index < 0) {
+		state->inleaf = 1;
+		blk2->index = 0;
+		xfs_alert(dp->i_mount,
+	"%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
+			__func__, blk1->index);
+	}
+}
+
+static int
+xfs_dir3_data_block_free(
+	xfs_da_args_t		*args,
+	struct xfs_dir2_data_hdr *hdr,
+	struct xfs_dir2_free	*free,
+	xfs_dir2_db_t		fdb,
+	int			findex,
+	struct xfs_buf		*fbp,
+	int			longest)
+{
+	int			logfree = 0;
+	__be16			*bests;
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_inode	*dp = args->dp;
+
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+	bests = dp->d_ops->free_bests_p(free);
+	if (hdr) {
+		/*
+		 * Data block is not empty, just set the free entry to the new
+		 * value.
+		 */
+		bests[findex] = cpu_to_be16(longest);
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+		return 0;
+	}
+
+	/* One less used entry in the free table. */
+	freehdr.nused--;
+
+	/*
+	 * If this was the last entry in the table, we can trim the table size
+	 * back.  There might be other entries at the end referring to
+	 * non-existent data blocks, get those too.
+	 */
+	if (findex == freehdr.nvalid - 1) {
+		int	i;		/* free entry index */
+
+		for (i = findex - 1; i >= 0; i--) {
+			if (bests[i] != cpu_to_be16(NULLDATAOFF))
+				break;
+		}
+		freehdr.nvalid = i + 1;
+		logfree = 0;
+	} else {
+		/* Not the last entry, just punch it out.  */
+		bests[findex] = cpu_to_be16(NULLDATAOFF);
+		logfree = 1;
+	}
+
+	dp->d_ops->free_hdr_to_disk(free, &freehdr);
+	xfs_dir2_free_log_header(args, fbp);
+
+	/*
+	 * If there are no useful entries left in the block, get rid of the
+	 * block if we can.
+	 */
+	if (!freehdr.nused) {
+		int error;
+
+		error = xfs_dir2_shrink_inode(args, fdb, fbp);
+		if (error == 0) {
+			fbp = NULL;
+			logfree = 0;
+		} else if (error != ENOSPC || args->total != 0)
+			return error;
+		/*
+		 * It's possible to get ENOSPC if there is no
+		 * space reservation.  In this case some one
+		 * else will eventually get rid of this block.
+		 */
+	}
+
+	/* Log the free entry that changed, unless we got rid of it.  */
+	if (logfree)
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+	return 0;
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int					/* error */
+xfs_dir2_leafn_remove(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*bp,		/* leaf buffer */
+	int			index,		/* leaf entry index */
+	xfs_da_state_blk_t	*dblk,		/* data block */
+	int			*rval)		/* resulting block needs join */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_db_t		db;		/* data block number */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			longest;	/* longest data free entry */
+	int			off;		/* data block entry offset */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
+
+	trace_xfs_dir2_leafn_remove(args, index);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	/*
+	 * Point to the entry we're removing.
+	 */
+	lep = &ents[index];
+
+	/*
+	 * Extract the data block and offset from the entry.
+	 */
+	db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+	ASSERT(dblk->blkno == db);
+	off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
+	ASSERT(dblk->index == off);
+
+	/*
+	 * Kill the leaf entry by marking it stale.
+	 * Log the leaf block changes.
+	 */
+	leafhdr.stale++;
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, bp);
+
+	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir3_leaf_log_ents(args, bp, index, index);
+
+	/*
+	 * Make the data entry free.  Keep track of the longest freespace
+	 * in the data block in case it changes.
+	 */
+	dbp = dblk->bp;
+	hdr = dbp->b_addr;
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	longest = be16_to_cpu(bf[0].length);
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(args, dbp, off,
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * Rescan the data block freespaces for bestfree.
+	 * Log the data block header if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	xfs_dir3_data_check(dp, dbp);
+	/*
+	 * If the longest data block freespace changes, need to update
+	 * the corresponding freeblock entry.
+	 */
+	if (longest < be16_to_cpu(bf[0].length)) {
+		int		error;		/* error return value */
+		struct xfs_buf	*fbp;		/* freeblock buffer */
+		xfs_dir2_db_t	fdb;		/* freeblock block number */
+		int		findex;		/* index in freeblock entries */
+		xfs_dir2_free_t	*free;		/* freeblock structure */
+
+		/*
+		 * Convert the data block number to a free block,
+		 * read in the free block.
+		 */
+		fdb = dp->d_ops->db_to_fdb(args->geo, db);
+		error = xfs_dir2_free_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, fdb),
+					   &fbp);
+		if (error)
+			return error;
+		free = fbp->b_addr;
+#ifdef DEBUG
+	{
+		struct xfs_dir3_icfree_hdr freehdr;
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+		ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+			(fdb - xfs_dir2_byte_to_db(args->geo,
+						   XFS_DIR2_FREE_OFFSET)));
+	}
+#endif
+		/*
+		 * Calculate which entry we need to fix.
+		 */
+		findex = dp->d_ops->db_to_fdindex(args->geo, db);
+		longest = be16_to_cpu(bf[0].length);
+		/*
+		 * If the data block is now empty we can get rid of it
+		 * (usually).
+		 */
+		if (longest == args->geo->blksize -
+			       dp->d_ops->data_entry_offset) {
+			/*
+			 * Try to punch out the data block.
+			 */
+			error = xfs_dir2_shrink_inode(args, db, dbp);
+			if (error == 0) {
+				dblk->bp = NULL;
+				hdr = NULL;
+			}
+			/*
+			 * We can get ENOSPC if there's no space reservation.
+			 * In this case just drop the buffer and some one else
+			 * will eventually get rid of the empty block.
+			 */
+			else if (!(error == ENOSPC && args->total == 0))
+				return error;
+		}
+		/*
+		 * If we got rid of the data block, we can eliminate that entry
+		 * in the free block.
+		 */
+		error = xfs_dir3_data_block_free(args, hdr, free,
+						 fdb, findex, fbp, longest);
+		if (error)
+			return error;
+	}
+
+	xfs_dir3_leaf_check(dp, bp);
+	/*
+	 * Return indication of whether this leaf block is empty enough
+	 * to justify trying to join it with a neighbor.
+	 */
+	*rval = (dp->d_ops->leaf_hdr_size +
+		 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
+		args->geo->magicpct;
+	return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int						/* error */
+xfs_dir2_leafn_split(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*oldblk,	/* original block */
+	xfs_da_state_blk_t	*newblk)	/* newly created block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dablk_t		blkno;		/* new leaf block number */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	struct xfs_inode	*dp;
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	args = state->args;
+	dp = args->dp;
+	mp = dp->i_mount;
+	ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Initialize the new leaf block.
+	 */
+	error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+				      &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+	if (error)
+		return error;
+
+	newblk->blkno = blkno;
+	newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+	/*
+	 * Rebalance the entries across the two leaves, link the new
+	 * block into the leaves.
+	 */
+	xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+	error = xfs_da3_blk_link(state, oldblk, newblk);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Insert the new entry in the correct block.
+	 */
+	if (state->inleaf)
+		error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+	else
+		error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+	newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+	xfs_dir3_leaf_check(dp, oldblk->bp);
+	xfs_dir3_leaf_check(dp, newblk->bp);
+	return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int						/* error */
+xfs_dir2_leafn_toosmall(
+	xfs_da_state_t		*state,		/* btree cursor */
+	int			*action)	/* resulting action to take */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dablk_t		blkno;		/* leaf block number */
+	struct xfs_buf		*bp;		/* leaf buffer */
+	int			bytes;		/* bytes in use */
+	int			count;		/* leaf live entry count */
+	int			error;		/* error return value */
+	int			forward;	/* sibling block direction */
+	int			i;		/* sibling counter */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			rval;		/* result from path_shift */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_inode	*dp = state->args->dp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	leaf = blk->bp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	xfs_dir3_leaf_check(dp, blk->bp);
+
+	count = leafhdr.count - leafhdr.stale;
+	bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+	if (bytes > (state->args->geo->blksize >> 1)) {
+		/*
+		 * Blk over 50%, don't try to join.
+		 */
+		*action = 0;
+		return 0;
+	}
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (leafhdr.forw != 0);
+		memcpy(&state->altpath, &state->path, sizeof(state->path));
+		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+		if (error)
+			return error;
+		*action = rval ? 2 : 0;
+		return 0;
+	}
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	forward = leafhdr.forw < leafhdr.back;
+	for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+		struct xfs_dir3_icleaf_hdr hdr2;
+
+		blkno = forward ? leafhdr.forw : leafhdr.back;
+		if (blkno == 0)
+			continue;
+		/*
+		 * Read the sibling leaf block.
+		 */
+		error = xfs_dir3_leafn_read(state->args->trans, dp,
+					    blkno, -1, &bp);
+		if (error)
+			return error;
+
+		/*
+		 * Count bytes in the two blocks combined.
+		 */
+		count = leafhdr.count - leafhdr.stale;
+		bytes = state->args->geo->blksize -
+			(state->args->geo->blksize >> 2);
+
+		leaf = bp->b_addr;
+		dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
+		ents = dp->d_ops->leaf_ents_p(leaf);
+		count += hdr2.count - hdr2.stale;
+		bytes -= count * sizeof(ents[0]);
+
+		/*
+		 * Fits with at least 25% to spare.
+		 */
+		if (bytes >= 0)
+			break;
+		xfs_trans_brelse(state->args->trans, bp);
+	}
+	/*
+	 * Didn't like either block, give up.
+	 */
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	memcpy(&state->altpath, &state->path, sizeof(state->path));
+	if (blkno < blk->blkno)
+		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+	else
+		error = xfs_da3_path_shift(state, &state->path, forward, 0,
+			&rval);
+	if (error) {
+		return error;
+	}
+	*action = rval ? 0 : 1;
+	return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+	xfs_da_state_t		*state,		/* cursor */
+	xfs_da_state_blk_t	*drop_blk,	/* dead block */
+	xfs_da_state_blk_t	*save_blk)	/* surviving block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dir2_leaf_t		*drop_leaf;	/* dead leaf structure */
+	xfs_dir2_leaf_t		*save_leaf;	/* surviving leaf structure */
+	struct xfs_dir3_icleaf_hdr savehdr;
+	struct xfs_dir3_icleaf_hdr drophdr;
+	struct xfs_dir2_leaf_entry *sents;
+	struct xfs_dir2_leaf_entry *dents;
+	struct xfs_inode	*dp = state->args->dp;
+
+	args = state->args;
+	ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	drop_leaf = drop_blk->bp->b_addr;
+	save_leaf = save_blk->bp->b_addr;
+
+	dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
+	dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
+	sents = dp->d_ops->leaf_ents_p(save_leaf);
+	dents = dp->d_ops->leaf_ents_p(drop_leaf);
+
+	/*
+	 * If there are any stale leaf entries, take this opportunity
+	 * to purge them.
+	 */
+	if (drophdr.stale)
+		xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+	if (savehdr.stale)
+		xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
+	/*
+	 * Move the entries from drop to the appropriate end of save.
+	 */
+	drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+	if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+					save_blk->bp, &savehdr, sents, 0,
+					drophdr.count);
+	else
+		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+					save_blk->bp, &savehdr, sents,
+					savehdr.count, drophdr.count);
+	save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+	/* log the changes made when moving the entries */
+	dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
+	dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+	xfs_dir3_leaf_log_header(args, save_blk->bp);
+	xfs_dir3_leaf_log_header(args, drop_blk->bp);
+
+	xfs_dir3_leaf_check(dp, save_blk->bp);
+	xfs_dir3_leaf_check(dp, drop_blk->bp);
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int						/* error */
+xfs_dir2_node_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block for insert */
+	int			error;		/* error return value */
+	int			rval;		/* sub-return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	trace_xfs_dir2_node_addname(args);
+
+	/*
+	 * Allocate and initialize the state (btree cursor).
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	/*
+	 * Look up the name.  We're not supposed to find it, but
+	 * this gives us the insertion point.
+	 */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	if (rval != ENOENT) {
+		goto done;
+	}
+	/*
+	 * Add the data entry to a data block.
+	 * Extravalid is set to a freeblock found by lookup.
+	 */
+	rval = xfs_dir2_node_addname_int(args,
+		state->extravalid ? &state->extrablk : NULL);
+	if (rval) {
+		goto done;
+	}
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Add the new leaf entry.
+	 */
+	rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+	if (rval == 0) {
+		/*
+		 * It worked, fix the hash values up the btree.
+		 */
+		if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+			xfs_da3_fixhashpath(state, &state->path);
+	} else {
+		/*
+		 * It didn't work, we need to split the leaf block.
+		 */
+		if (args->total == 0) {
+			ASSERT(rval == ENOSPC);
+			goto done;
+		}
+		/*
+		 * Split the leaf block and insert the new entry.
+		 */
+		rval = xfs_da3_split(state);
+	}
+done:
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int					/* error */
+xfs_dir2_node_addname_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_da_state_blk_t	*fblk)		/* optional freespace block */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_db_t		dbno;		/* data block number */
+	struct xfs_buf		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry pointer */
+	int			error;		/* error return value */
+	xfs_dir2_db_t		fbno;		/* freespace block number */
+	struct xfs_buf		*fbp;		/* freespace buffer */
+	int			findex;		/* freespace entry index */
+	xfs_dir2_free_t		*free=NULL;	/* freespace block structure */
+	xfs_dir2_db_t		ifbno;		/* initial freespace block no */
+	xfs_dir2_db_t		lastfbno=0;	/* highest freespace block no */
+	int			length;		/* length of the new entry */
+	int			logfree;	/* need to log free entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	__be16			*tagp;		/* data entry tag pointer */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	__be16			*bests;
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_dir2_data_free *bf;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	length = dp->d_ops->data_entsize(args->namelen);
+	/*
+	 * If we came in with a freespace block that means that lookup
+	 * found an entry with our hash value.  This is the freespace
+	 * block for that data entry.
+	 */
+	if (fblk) {
+		fbp = fblk->bp;
+		/*
+		 * Remember initial freespace block number.
+		 */
+		ifbno = fblk->blkno;
+		free = fbp->b_addr;
+		findex = fblk->index;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+		/*
+		 * This means the free entry showed that the data block had
+		 * space for our entry, so we remembered it.
+		 * Use that data block.
+		 */
+		if (findex >= 0) {
+			ASSERT(findex < freehdr.nvalid);
+			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+			ASSERT(be16_to_cpu(bests[findex]) >= length);
+			dbno = freehdr.firstdb + findex;
+		} else {
+			/*
+			 * The data block looked at didn't have enough room.
+			 * We'll start at the beginning of the freespace entries.
+			 */
+			dbno = -1;
+			findex = 0;
+		}
+	} else {
+		/*
+		 * Didn't come in with a freespace block, so no data block.
+		 */
+		ifbno = dbno = -1;
+		fbp = NULL;
+		findex = 0;
+	}
+
+	/*
+	 * If we don't have a data block yet, we're going to scan the
+	 * freespace blocks looking for one.  Figure out what the
+	 * highest freespace block number is.
+	 */
+	if (dbno == -1) {
+		xfs_fileoff_t	fo;		/* freespace block number */
+
+		if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
+			return error;
+		lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+		fbno = ifbno;
+	}
+	/*
+	 * While we haven't identified a data block, search the freeblock
+	 * data for a good data block.  If we find a null freeblock entry,
+	 * indicating a hole in the data blocks, remember that.
+	 */
+	while (dbno == -1) {
+		/*
+		 * If we don't have a freeblock in hand, get the next one.
+		 */
+		if (fbp == NULL) {
+			/*
+			 * Happens the first time through unless lookup gave
+			 * us a freespace block to start with.
+			 */
+			if (++fbno == 0)
+				fbno = xfs_dir2_byte_to_db(args->geo,
+							XFS_DIR2_FREE_OFFSET);
+			/*
+			 * If it's ifbno we already looked at it.
+			 */
+			if (fbno == ifbno)
+				fbno++;
+			/*
+			 * If it's off the end we're done.
+			 */
+			if (fbno >= lastfbno)
+				break;
+			/*
+			 * Read the block.  There can be holes in the
+			 * freespace blocks, so this might not succeed.
+			 * This should be really rare, so there's no reason
+			 * to avoid it.
+			 */
+			error = xfs_dir2_free_try_read(tp, dp,
+					xfs_dir2_db_to_da(args->geo, fbno),
+					&fbp);
+			if (error)
+				return error;
+			if (!fbp)
+				continue;
+			free = fbp->b_addr;
+			findex = 0;
+		}
+		/*
+		 * Look at the current free entry.  Is it good enough?
+		 *
+		 * The bests initialisation should be where the bufer is read in
+		 * the above branch. But gcc is too stupid to realise that bests
+		 * and the freehdr are actually initialised if they are placed
+		 * there, so we have to do it here to avoid warnings. Blech.
+		 */
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+		    be16_to_cpu(bests[findex]) >= length)
+			dbno = freehdr.firstdb + findex;
+		else {
+			/*
+			 * Are we done with the freeblock?
+			 */
+			if (++findex == freehdr.nvalid) {
+				/*
+				 * Drop the block.
+				 */
+				xfs_trans_brelse(tp, fbp);
+				fbp = NULL;
+				if (fblk && fblk->bp)
+					fblk->bp = NULL;
+			}
+		}
+	}
+	/*
+	 * If we don't have a data block, we need to allocate one and make
+	 * the freespace entries refer to it.
+	 */
+	if (unlikely(dbno == -1)) {
+		/*
+		 * Not allowed to allocate, return failure.
+		 */
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+			return ENOSPC;
+
+		/*
+		 * Allocate and initialize the new data block.
+		 */
+		if (unlikely((error = xfs_dir2_grow_inode(args,
+							 XFS_DIR2_DATA_SPACE,
+							 &dbno)) ||
+		    (error = xfs_dir3_data_init(args, dbno, &dbp))))
+			return error;
+
+		/*
+		 * If (somehow) we have a freespace block, get rid of it.
+		 */
+		if (fbp)
+			xfs_trans_brelse(tp, fbp);
+		if (fblk && fblk->bp)
+			fblk->bp = NULL;
+
+		/*
+		 * Get the freespace block corresponding to the data block
+		 * that was just allocated.
+		 */
+		fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
+		error = xfs_dir2_free_try_read(tp, dp,
+				       xfs_dir2_db_to_da(args->geo, fbno),
+				       &fbp);
+		if (error)
+			return error;
+
+		/*
+		 * If there wasn't a freespace block, the read will
+		 * return a NULL fbp.  Allocate and initialize a new one.
+		 */
+		if (!fbp) {
+			error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+						    &fbno);
+			if (error)
+				return error;
+
+			if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
+				xfs_alert(mp,
+			"%s: dir ino %llu needed freesp block %lld for\n"
+			"  data block %lld, got %lld ifbno %llu lastfbno %d",
+					__func__, (unsigned long long)dp->i_ino,
+					(long long)dp->d_ops->db_to_fdb(
+								args->geo, dbno),
+					(long long)dbno, (long long)fbno,
+					(unsigned long long)ifbno, lastfbno);
+				if (fblk) {
+					xfs_alert(mp,
+				" fblk 0x%p blkno %llu index %d magic 0x%x",
+						fblk,
+						(unsigned long long)fblk->blkno,
+						fblk->index,
+						fblk->magic);
+				} else {
+					xfs_alert(mp, " ... fblk is NULL");
+				}
+				XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
+						 XFS_ERRLEVEL_LOW, mp);
+				return EFSCORRUPTED;
+			}
+
+			/*
+			 * Get a buffer for the new block.
+			 */
+			error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+			if (error)
+				return error;
+			free = fbp->b_addr;
+			bests = dp->d_ops->free_bests_p(free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+			/*
+			 * Remember the first slot as our empty slot.
+			 */
+			freehdr.firstdb =
+				(fbno - xfs_dir2_byte_to_db(args->geo,
+							XFS_DIR2_FREE_OFFSET)) *
+					dp->d_ops->free_max_bests(args->geo);
+		} else {
+			free = fbp->b_addr;
+			bests = dp->d_ops->free_bests_p(free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
+		}
+
+		/*
+		 * Set the freespace block index from the data block number.
+		 */
+		findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
+		/*
+		 * If it's after the end of the current entries in the
+		 * freespace block, extend that table.
+		 */
+		if (findex >= freehdr.nvalid) {
+			ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
+			freehdr.nvalid = findex + 1;
+			/*
+			 * Tag new entry so nused will go up.
+			 */
+			bests[findex] = cpu_to_be16(NULLDATAOFF);
+		}
+		/*
+		 * If this entry was for an empty data block
+		 * (this should always be true) then update the header.
+		 */
+		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+			freehdr.nused++;
+			dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+			xfs_dir2_free_log_header(args, fbp);
+		}
+		/*
+		 * Update the real value in the table.
+		 * We haven't allocated the data entry yet so this will
+		 * change again.
+		 */
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		bests[findex] = bf[0].length;
+		logfree = 1;
+	}
+	/*
+	 * We had a data block so we don't have to make a new one.
+	 */
+	else {
+		/*
+		 * If just checking, we succeeded.
+		 */
+		if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+			return 0;
+
+		/*
+		 * Read the data block in.
+		 */
+		error = xfs_dir3_data_read(tp, dp,
+					   xfs_dir2_db_to_da(args->geo, dbno),
+					   -1, &dbp);
+		if (error)
+			return error;
+		hdr = dbp->b_addr;
+		bf = dp->d_ops->data_bestfree_p(hdr);
+		logfree = 0;
+	}
+	ASSERT(be16_to_cpu(bf[0].length) >= length);
+	/*
+	 * Point to the existing unused space.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
+	needscan = needlog = 0;
+	/*
+	 * Mark the first part of the unused space, inuse for us.
+	 */
+	xfs_dir2_data_use_free(args, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+		&needlog, &needscan);
+	/*
+	 * Fill in the new entry and log it.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, dep->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, dbp, dep);
+	/*
+	 * Rescan the block for bestfree if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	/*
+	 * Log the data block header if needed.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	/*
+	 * If the freespace entry is now wrong, update it.
+	 */
+	bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
+	if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+		bests[findex] = bf[0].length;
+		logfree = 1;
+	}
+	/*
+	 * Log the freespace entry if needed.
+	 */
+	if (logfree)
+		xfs_dir2_free_log_bests(args, fbp, findex, findex);
+	/*
+	 * Return the data block and offset in args, then drop the data block.
+	 */
+	args->blkno = (xfs_dablk_t)dbno;
+	args->index = be16_to_cpu(*tagp);
+	return 0;
+}
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da3_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int						/* error */
+xfs_dir2_node_lookup(
+	xfs_da_args_t	*args)			/* operation arguments */
+{
+	int		error;			/* error return value */
+	int		i;			/* btree level */
+	int		rval;			/* operation return value */
+	xfs_da_state_t	*state;			/* btree cursor */
+
+	trace_xfs_dir2_node_lookup(args);
+
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	/*
+	 * Fill in the path to the entry in the cursor.
+	 */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
+		/* If a CI match, dup the actual name and return EEXIST */
+		xfs_dir2_data_entry_t	*dep;
+
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)state->extrablk.bp->b_addr +
+						 state->extrablk.index);
+		rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	}
+	/*
+	 * Release the btree blocks and leaf block.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	/*
+	 * Release the data block if we have it.
+	 */
+	if (state->extravalid && state->extrablk.bp) {
+		xfs_trans_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_removename(
+	struct xfs_da_args	*args)		/* operation arguments */
+{
+	struct xfs_da_state_blk	*blk;		/* leaf block */
+	int			error;		/* error return value */
+	int			rval;		/* operation return value */
+	struct xfs_da_state	*state;		/* btree cursor */
+
+	trace_xfs_dir2_node_removename(args);
+
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+
+	/* Look up the entry we're deleting, set up the cursor. */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error)
+		goto out_free;
+
+	/* Didn't find it, upper layer screwed up. */
+	if (rval != EEXIST) {
+		error = rval;
+		goto out_free;
+	}
+
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(state->extravalid);
+	/*
+	 * Remove the leaf and data entries.
+	 * Extrablk refers to the data block.
+	 */
+	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+		&state->extrablk, &rval);
+	if (error)
+		goto out_free;
+	/*
+	 * Fix the hash values up the btree.
+	 */
+	xfs_da3_fixhashpath(state, &state->path);
+	/*
+	 * If we need to join leaf blocks, do it.
+	 */
+	if (rval && state->path.active > 1)
+		error = xfs_da3_join(state);
+	/*
+	 * If no errors so far, try conversion to leaf format.
+	 */
+	if (!error)
+		error = xfs_dir2_node_to_leaf(state);
+out_free:
+	xfs_da_state_free(state);
+	return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_entry_t	*dep;		/* data entry changed */
+	int			error;		/* error return value */
+	int			i;		/* btree level */
+	xfs_ino_t		inum;		/* new inode number */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry being changed */
+	int			rval;		/* internal return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	trace_xfs_dir2_node_replace(args);
+
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	inum = args->inumber;
+	/*
+	 * Lookup the entry to change in the btree.
+	 */
+	error = xfs_da3_node_lookup_int(state, &rval);
+	if (error) {
+		rval = error;
+	}
+	/*
+	 * It should be found, since the vnodeops layer has looked it up
+	 * and locked it.  But paranoia is good.
+	 */
+	if (rval == EEXIST) {
+		struct xfs_dir2_leaf_entry *ents;
+		/*
+		 * Find the leaf entry.
+		 */
+		blk = &state->path.blk[state->path.active - 1];
+		ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+		leaf = blk->bp->b_addr;
+		ents = args->dp->d_ops->leaf_ents_p(leaf);
+		lep = &ents[blk->index];
+		ASSERT(state->extravalid);
+		/*
+		 * Point to the data entry.
+		 */
+		hdr = state->extrablk.bp->b_addr;
+		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+		       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)hdr +
+		       xfs_dir2_dataptr_to_off(args->geo,
+					       be32_to_cpu(lep->address)));
+		ASSERT(inum != be64_to_cpu(dep->inumber));
+		/*
+		 * Fill in the new inode number and log the entry.
+		 */
+		dep->inumber = cpu_to_be64(inum);
+		args->dp->d_ops->data_put_ftype(dep, args->filetype);
+		xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
+		rval = 0;
+	}
+	/*
+	 * Didn't find it, and we're holding a data block.  Drop it.
+	 */
+	else if (state->extravalid) {
+		xfs_trans_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	/*
+	 * Release all the buffers in the cursor.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int						/* error */
+xfs_dir2_node_trim_free(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_fileoff_t		fo,		/* free block number */
+	int			*rvalp)		/* out: did something */
+{
+	struct xfs_buf		*bp;		/* freespace buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icfree_hdr freehdr;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Read the freespace block.
+	 */
+	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+	if (error)
+		return error;
+	/*
+	 * There can be holes in freespace.  If fo is a hole, there's
+	 * nothing to do.
+	 */
+	if (!bp)
+		return 0;
+	free = bp->b_addr;
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+	/*
+	 * If there are used entries, there's nothing to do.
+	 */
+	if (freehdr.nused > 0) {
+		xfs_trans_brelse(tp, bp);
+		*rvalp = 0;
+		return 0;
+	}
+	/*
+	 * Blow the block away.
+	 */
+	error = xfs_dir2_shrink_inode(args,
+			xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+	if (error) {
+		/*
+		 * Can't fail with ENOSPC since that only happens with no
+		 * space reservation, when breaking up an extent into two
+		 * pieces.  This is the last block of an extent.
+		 */
+		ASSERT(error != ENOSPC);
+		xfs_trans_brelse(tp, bp);
+		return error;
+	}
+	/*
+	 * Return that we succeeded.
+	 */
+	*rvalp = 1;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
new file mode 100644
index 000000000000..27ce0794d196
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_PRIV_H__
+#define __XFS_DIR2_PRIV_H__
+
+struct dir_context;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+	return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr.  It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+	return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+	return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+	return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+	return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+	return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+			xfs_dir2_data_aoff_t o)
+{
+	return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+	return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+	return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+			   xfs_dir2_data_aoff_t o)
+{
+	return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+	return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+	return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+	return ((struct xfs_dir2_block_tail *)
+		((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+	return (struct xfs_dir2_leaf_tail *)
+		((char *)lp + geo->blksize -
+		  sizeof(struct xfs_dir2_leaf_tail));
+}
+
+/* xfs_dir2.c */
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+				xfs_dir2_db_t *dbp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+				const unsigned char *name, int len);
+
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+					__uint8_t filetype);
+
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			       struct xfs_buf **bpp);
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+		struct xfs_buf *lbp, struct xfs_buf *dbp);
+
+/* xfs_dir2_data.c */
+#ifdef DEBUG
+#define	xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp);
+#else
+#define	xfs_dir3_data_check(dp,bp)
+#endif
+
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+		xfs_daddr_t mapped_bno);
+
+extern struct xfs_dir2_data_free *
+xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
+		struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+		int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+		struct xfs_buf **bpp);
+
+/* xfs_dir2_leaf.c */
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+		struct xfs_buf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+		struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+		struct xfs_dir2_leaf_entry *ents, int *indexp,
+		int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+		struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+		struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
+		struct xfs_buf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+		struct xfs_buf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+		struct xfs_buf *lbp, xfs_dir2_db_t db);
+extern struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+		struct xfs_dir2_leaf_entry *ents, int index, int compact,
+		int lowstale, int highstale, int *lfloglow, int *lfloghigh);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
+		struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+
+/* xfs_dir2_node.c */
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+		struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+		struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
+		struct xfs_da_args *args, int *indexp,
+		struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
+		struct xfs_buf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+	struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+		struct xfs_da_state_blk *drop_blk,
+		struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+		int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, struct xfs_buf **bpp);
+
+/* xfs_dir2_sf.c */
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+		struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
+		int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+		       size_t bufsize);
+
+#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
new file mode 100644
index 000000000000..ab3563b87995
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+				     xfs_dir2_sf_entry_t *sfep,
+				     xfs_dir2_data_aoff_t offset,
+				     int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+				     int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+				    xfs_dir2_sf_entry_t **sfepp,
+				    xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define	xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+#if XFS_BIG_INUMS
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+#endif /* XFS_BIG_INUMS */
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit it the
+ * space currently present in the inode.  If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int						/* size for sf form */
+xfs_dir2_block_sfsize(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dir2_data_hdr_t	*hdr,		/* block directory data */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* output: header for sf form */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_leaf_entry_t	*blp;		/* leaf area of the block */
+	xfs_dir2_block_tail_t	*btp;		/* tail area of the block */
+	int			count;		/* shortform entry count */
+	xfs_dir2_data_entry_t	*dep;		/* data entry in the block */
+	int			i;		/* block entry index */
+	int			i8count;	/* count of big-inode entries */
+	int			isdot;		/* entry is "." */
+	int			isdotdot;	/* entry is ".." */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+	int			namelen;	/* total name bytes */
+	xfs_ino_t		parent = 0;	/* parent inode number */
+	int			size=0;		/* total computed size */
+	int			has_ftype;
+	struct xfs_da_geometry	*geo;
+
+	mp = dp->i_mount;
+	geo = mp->m_dir_geo;
+
+	/*
+	 * if there is a filetype field, add the extra byte to the namelen
+	 * for each entry that we see.
+	 */
+	has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+
+	count = i8count = namelen = 0;
+	btp = xfs_dir2_block_tail_p(geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Iterate over the block's data entries by using the leaf pointers.
+	 */
+	for (i = 0; i < be32_to_cpu(btp->count); i++) {
+		if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Calculate the pointer to the entry at hand.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+				xfs_dir2_dataptr_to_off(geo, addr));
+		/*
+		 * Detect . and .., so we can special-case them.
+		 * . is not included in sf directories.
+		 * .. is included by just the parent inode number.
+		 */
+		isdot = dep->namelen == 1 && dep->name[0] == '.';
+		isdotdot =
+			dep->namelen == 2 &&
+			dep->name[0] == '.' && dep->name[1] == '.';
+#if XFS_BIG_INUMS
+		if (!isdot)
+			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+#endif
+		/* take into account the file type field */
+		if (!isdot && !isdotdot) {
+			count++;
+			namelen += dep->namelen + has_ftype;
+		} else if (isdotdot)
+			parent = be64_to_cpu(dep->inumber);
+		/*
+		 * Calculate the new size, see if we should give up yet.
+		 */
+		size = xfs_dir2_sf_hdr_size(i8count) +		/* header */
+		       count +					/* namelen */
+		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+		       namelen +				/* name */
+		       (i8count ?				/* inumber */
+				(uint)sizeof(xfs_dir2_ino8_t) * count :
+				(uint)sizeof(xfs_dir2_ino4_t) * count);
+		if (size > XFS_IFORK_DSIZE(dp))
+			return size;		/* size value is a failure */
+	}
+	/*
+	 * Create the output header, if it worked.
+	 */
+	sfhp->count = count;
+	sfhp->i8count = i8count;
+	dp->d_ops->sf_put_parent_ino(sfhp, parent);
+	return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int						/* error */
+xfs_dir2_block_to_sf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*bp,
+	int			size,		/* shortform directory size */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* shortform directory hdr */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data pointer */
+	char			*endptr;	/* end of data entries */
+	int			error;		/* error return value */
+	int			logflags;	/* inode logging flags */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*ptr;		/* current data pointer */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform directory header */
+	xfs_dir2_sf_hdr_t	*dst;		/* temporary data buffer */
+
+	trace_xfs_dir2_block_to_sf(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	/*
+	 * allocate a temporary destination buffer the size of the inode
+	 * to format the data into. Once we have formatted the data, we
+	 * can free the block and copy the formatted data into the inode literal
+	 * area.
+	 */
+	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+	hdr = bp->b_addr;
+
+	/*
+	 * Copy the header into the newly allocate local space.
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dst;
+	memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+
+	/*
+	 * Set up to loop over the block's entries.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	ptr = (char *)dp->d_ops->data_entry_p(hdr);
+	endptr = (char *)xfs_dir2_block_leaf_p(btp);
+	sfep = xfs_dir2_sf_firstentry(sfp);
+	/*
+	 * Loop over the active and unused entries.
+	 * Stop when we reach the leaf/tail portion of the block.
+	 */
+	while (ptr < endptr) {
+		/*
+		 * If it's unused, just skip over it.
+		 */
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += be16_to_cpu(dup->length);
+			continue;
+		}
+		dep = (xfs_dir2_data_entry_t *)ptr;
+		/*
+		 * Skip .
+		 */
+		if (dep->namelen == 1 && dep->name[0] == '.')
+			ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
+		/*
+		 * Skip .., but make sure the inode number is right.
+		 */
+		else if (dep->namelen == 2 &&
+			 dep->name[0] == '.' && dep->name[1] == '.')
+			ASSERT(be64_to_cpu(dep->inumber) ==
+			       dp->d_ops->sf_get_parent_ino(sfp));
+		/*
+		 * Normal entry, copy it into shortform.
+		 */
+		else {
+			sfep->namelen = dep->namelen;
+			xfs_dir2_sf_put_offset(sfep,
+				(xfs_dir2_data_aoff_t)
+				((char *)dep - (char *)hdr));
+			memcpy(sfep->name, dep->name, dep->namelen);
+			dp->d_ops->sf_put_ino(sfp, sfep,
+					      be64_to_cpu(dep->inumber));
+			dp->d_ops->sf_put_ftype(sfep,
+					dp->d_ops->data_get_ftype(dep));
+
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+		}
+		ptr += dp->d_ops->data_entsize(dep->namelen);
+	}
+	ASSERT((char *)sfep - (char *)sfp == size);
+
+	/* now we are done with the block, we can shrink the inode */
+	logflags = XFS_ILOG_CORE;
+	error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+	if (error) {
+		ASSERT(error != ENOSPC);
+		goto out;
+	}
+
+	/*
+	 * The buffer is now unconditionally gone, whether
+	 * xfs_dir2_shrink_inode worked or not.
+	 *
+	 * Convert the inode to local format and copy the data in.
+	 */
+	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+	dp->i_df.if_flags |= XFS_IFINLINE;
+	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+	logflags |= XFS_ILOG_DDATA;
+	memcpy(dp->i_df.if_u1.if_data, dst, size);
+	dp->i_d.di_size = size;
+	xfs_dir2_sf_check(args);
+out:
+	xfs_trans_log_inode(args->trans, dp, logflags);
+	kmem_free(dst);
+	return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard" which we decide on
+ * before changing anything.
+ * Convert to block form if necessary, if the new entry won't fit.
+ */
+int						/* error */
+xfs_dir2_sf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			incr_isize;	/* total change in size */
+	int			new_isize;	/* di_size after adding name */
+	int			objchange;	/* changing to 8-byte inodes */
+	xfs_dir2_data_aoff_t	offset = 0;	/* offset for new entry */
+	int			pick;		/* which algorithm to use */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	xfs_dir2_sf_entry_t	*sfep = NULL;	/* shortform entry */
+
+	trace_xfs_dir2_sf_addname(args);
+
+	ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Make sure the shortform value has some of its header.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	/*
+	 * Compute entry (and change in) size.
+	 */
+	incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
+	objchange = 0;
+#if XFS_BIG_INUMS
+	/*
+	 * Do we have to change to 8 byte inodes?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+		/*
+		 * Yes, adjust the inode size.  old count + (parent + new)
+		 */
+		incr_isize +=
+			(sfp->count + 2) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		objchange = 1;
+	}
+#endif
+	new_isize = (int)dp->i_d.di_size + incr_isize;
+	/*
+	 * Won't fit as shortform any more (due to size),
+	 * or the pick routine says it won't (due to offset values).
+	 */
+	if (new_isize > XFS_IFORK_DSIZE(dp) ||
+	    (pick =
+	     xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+		/*
+		 * Just checking or no space reservation, it doesn't fit.
+		 */
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+			return ENOSPC;
+		/*
+		 * Convert to block form then add the name.
+		 */
+		error = xfs_dir2_sf_to_block(args);
+		if (error)
+			return error;
+		return xfs_dir2_block_addname(args);
+	}
+	/*
+	 * Just checking, it fits.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+		return 0;
+	/*
+	 * Do it the easy way - just add it at the end.
+	 */
+	if (pick == 1)
+		xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+	/*
+	 * Do it the hard way - look for a place to insert the new entry.
+	 * Convert to 8 byte inode numbers first if necessary.
+	 */
+	else {
+		ASSERT(pick == 2);
+#if XFS_BIG_INUMS
+		if (objchange)
+			xfs_dir2_sf_toino8(args);
+#endif
+		xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+	}
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dir2_sf_entry_t	*sfep,		/* pointer to new entry */
+	xfs_dir2_data_aoff_t	offset,		/* offset to use for new ent */
+	int			new_isize)	/* new directory size */
+{
+	int			byteoff;	/* byte offset in sf dir */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	/*
+	 * Grow the in-inode space.
+	 */
+	xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
+			  XFS_DATA_FORK);
+	/*
+	 * Need to set up again due to realloc of the inode data.
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+	/*
+	 * Fill in the new entry.
+	 */
+	sfep->namelen = args->namelen;
+	xfs_dir2_sf_put_offset(sfep, offset);
+	memcpy(sfep->name, args->name, sfep->namelen);
+	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+	dp->d_ops->sf_put_ftype(sfep, args->filetype);
+
+	/*
+	 * Update the header and inode.
+	 */
+	sfp->count++;
+#if XFS_BIG_INUMS
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+		sfp->i8count++;
+#endif
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* changing inode number size */
+	int			new_isize)	/* new directory size */
+{
+	int			add_datasize;	/* data size need for new ent */
+	char			*buf;		/* buffer for old */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			eof;		/* reached end of old dir */
+	int			nbytes;		/* temp for byte copies */
+	xfs_dir2_data_aoff_t	new_offset;	/* next offset value */
+	xfs_dir2_data_aoff_t	offset;		/* current offset value */
+	int			old_isize;	/* previous di_size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* entry in original dir */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* original shortform dir */
+	xfs_dir2_sf_entry_t	*sfep;		/* entry in new dir */
+	xfs_dir2_sf_hdr_t	*sfp;		/* new shortform dir */
+	struct xfs_mount	*mp;
+
+	/*
+	 * Copy the old directory to the stack buffer.
+	 */
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	old_isize = (int)dp->i_d.di_size;
+	buf = kmem_alloc(old_isize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+	memcpy(oldsfp, sfp, old_isize);
+	/*
+	 * Loop over the old directory finding the place we're going
+	 * to insert the new entry.
+	 * If it's going to end up at the end then oldsfep will point there.
+	 */
+	for (offset = dp->d_ops->data_first_offset,
+	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+	      add_datasize = dp->d_ops->data_entsize(args->namelen),
+	      eof = (char *)oldsfep == &buf[old_isize];
+	     !eof;
+	     offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
+	      oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
+	      eof = (char *)oldsfep == &buf[old_isize]) {
+		new_offset = xfs_dir2_sf_get_offset(oldsfep);
+		if (offset + add_datasize <= new_offset)
+			break;
+	}
+	/*
+	 * Get rid of the old directory, then allocate space for
+	 * the new one.  We do this so xfs_idata_realloc won't copy
+	 * the data.
+	 */
+	xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+	/*
+	 * Reset the pointer since the buffer was reallocated.
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Copy the first part of the directory, including the header.
+	 */
+	nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+	memcpy(sfp, oldsfp, nbytes);
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+	/*
+	 * Fill in the new entry, and update the header counts.
+	 */
+	sfep->namelen = args->namelen;
+	xfs_dir2_sf_put_offset(sfep, offset);
+	memcpy(sfep->name, args->name, sfep->namelen);
+	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+	dp->d_ops->sf_put_ftype(sfep, args->filetype);
+	sfp->count++;
+#if XFS_BIG_INUMS
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+		sfp->i8count++;
+#endif
+	/*
+	 * If there's more left to copy, do that.
+	 */
+	if (!eof) {
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+		memcpy(sfep, oldsfep, old_isize - nbytes);
+	}
+	kmem_free(buf);
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/*ARGSUSED*/
+static int					/* pick result */
+xfs_dir2_sf_addname_pick(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* inode # size changes */
+	xfs_dir2_sf_entry_t	**sfepp,	/* out(1): new entry ptr */
+	xfs_dir2_data_aoff_t	*offsetp)	/* out(1): new offset */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			holefit;	/* found hole it will fit in */
+	int			i;		/* entry number */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_data_aoff_t	offset;		/* data block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	int			size;		/* entry's data size */
+	int			used;		/* data bytes used */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	size = dp->d_ops->data_entsize(args->namelen);
+	offset = dp->d_ops->data_first_offset;
+	sfep = xfs_dir2_sf_firstentry(sfp);
+	holefit = 0;
+	/*
+	 * Loop over sf entries.
+	 * Keep track of data offset and whether we've seen a place
+	 * to insert the new entry.
+	 */
+	for (i = 0; i < sfp->count; i++) {
+		if (!holefit)
+			holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+		offset = xfs_dir2_sf_get_offset(sfep) +
+			 dp->d_ops->data_entsize(sfep->namelen);
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+	}
+	/*
+	 * Calculate data bytes used excluding the new entry, if this
+	 * was a data block (block form directory).
+	 */
+	used = offset +
+	       (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t);
+	/*
+	 * If it won't fit in a block form then we can't insert it,
+	 * we'll go back, convert to block, then try the insert and convert
+	 * to leaf.
+	 */
+	if (used + (holefit ? 0 : size) > args->geo->blksize)
+		return 0;
+	/*
+	 * If changing the inode number size, do it the hard way.
+	 */
+#if XFS_BIG_INUMS
+	if (objchange) {
+		return 2;
+	}
+#else
+	ASSERT(objchange == 0);
+#endif
+	/*
+	 * If it won't fit at the end then do it the hard way (use the hole).
+	 */
+	if (used + size > args->geo->blksize)
+		return 2;
+	/*
+	 * Do it the easy way.
+	 */
+	*sfepp = sfep;
+	*offsetp = offset;
+	return 1;
+}
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry number */
+	int			i8count;	/* number of big inode#s */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			offset;		/* data offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform dir entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	struct xfs_mount	*mp;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	offset = dp->d_ops->data_first_offset;
+	ino = dp->d_ops->sf_get_parent_ino(sfp);
+	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+	     i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+		ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+		ino = dp->d_ops->sf_get_ino(sfp, sfep);
+		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+		offset =
+			xfs_dir2_sf_get_offset(sfep) +
+			dp->d_ops->data_entsize(sfep->namelen);
+		ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
+	}
+	ASSERT(i8count == sfp->i8count);
+	ASSERT(XFS_BIG_INUMS || i8count == 0);
+	ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
+	ASSERT(offset +
+	       (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
+}
+#endif	/* DEBUG */
+
+/*
+ * Create a new (shortform) directory.
+ */
+int					/* error, always 0 */
+xfs_dir2_sf_create(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_ino_t	pino)		/* parent inode number */
+{
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		i8count;	/* parent inode is an 8-byte number */
+	xfs_dir2_sf_hdr_t *sfp;		/* shortform structure */
+	int		size;		/* directory size */
+
+	trace_xfs_dir2_sf_create(args);
+
+	dp = args->dp;
+
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_d.di_size == 0);
+	/*
+	 * If it's currently a zero-length extent file,
+	 * convert it to local format.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		dp->i_df.if_flags |= XFS_IFINLINE;
+	}
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	ASSERT(dp->i_df.if_bytes == 0);
+	i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+	size = xfs_dir2_sf_hdr_size(i8count);
+	/*
+	 * Make a buffer for the data.
+	 */
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	/*
+	 * Fill in the header,
+	 */
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	sfp->i8count = i8count;
+	/*
+	 * Now can put in the inode number, since i8count is set.
+	 */
+	dp->d_ops->sf_put_parent_ino(sfp, pino);
+	sfp->count = 0;
+	dp->i_d.di_size = size;
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns EEXIST if found, ENOENT if not found.
+ */
+int						/* error */
+xfs_dir2_sf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	int			error;
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	enum xfs_dacmp		cmp;		/* comparison result */
+	xfs_dir2_sf_entry_t	*ci_sfep;	/* case-insens. entry */
+
+	trace_xfs_dir2_sf_lookup(args);
+
+	xfs_dir2_sf_check(args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	/*
+	 * Special case for .
+	 */
+	if (args->namelen == 1 && args->name[0] == '.') {
+		args->inumber = dp->i_ino;
+		args->cmpresult = XFS_CMP_EXACT;
+		args->filetype = XFS_DIR3_FT_DIR;
+		return EEXIST;
+	}
+	/*
+	 * Special case for ..
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
+		args->cmpresult = XFS_CMP_EXACT;
+		args->filetype = XFS_DIR3_FT_DIR;
+		return EEXIST;
+	}
+	/*
+	 * Loop over all the entries trying to match ours.
+	 */
+	ci_sfep = NULL;
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+		/*
+		 * Compare name and if it's an exact match, return the inode
+		 * number. If it's the first case-insensitive match, store the
+		 * inode number and continue looking for an exact match.
+		 */
+		cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+								sfep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+			args->filetype = dp->d_ops->sf_get_ftype(sfep);
+			if (cmp == XFS_CMP_EXACT)
+				return EEXIST;
+			ci_sfep = sfep;
+		}
+	}
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or replace).
+	 * If a case-insensitive match was not found, return ENOENT.
+	 */
+	if (!ci_sfep)
+		return ENOENT;
+	/* otherwise process the CI match as required by the caller */
+	error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+	return error;
+}
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_removename(
+	xfs_da_args_t		*args)
+{
+	int			byteoff;	/* offset of removed entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			entsize;	/* this entry's size */
+	int			i;		/* shortform entry index */
+	int			newsize;	/* new inode size */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+
+	trace_xfs_dir2_sf_removename(args);
+
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	oldsize = (int)dp->i_d.di_size;
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == oldsize);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	/*
+	 * Loop over the old directory entries.
+	 * Find the one we're deleting.
+	 */
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+		if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+								XFS_CMP_EXACT) {
+			ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
+			       args->inumber);
+			break;
+		}
+	}
+	/*
+	 * Didn't find it.
+	 */
+	if (i == sfp->count)
+		return ENOENT;
+	/*
+	 * Calculate sizes.
+	 */
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
+	newsize = oldsize - entsize;
+	/*
+	 * Copy the part if any after the removed entry, sliding it down.
+	 */
+	if (byteoff + entsize < oldsize)
+		memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
+			oldsize - (byteoff + entsize));
+	/*
+	 * Fix up the header and file size.
+	 */
+	sfp->count--;
+	dp->i_d.di_size = newsize;
+	/*
+	 * Reallocate, making it smaller.
+	 */
+	xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+#if XFS_BIG_INUMS
+	/*
+	 * Are we changing inode number size?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		if (sfp->i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->i8count--;
+	}
+#endif
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+#if XFS_BIG_INUMS || defined(DEBUG)
+	xfs_ino_t		ino=0;		/* entry old inode number */
+#endif
+#if XFS_BIG_INUMS
+	int			i8elevated;	/* sf_toino8 set i8count=1 */
+#endif
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+
+	trace_xfs_dir2_sf_replace(args);
+
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the shortform directory is way too small.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return EIO;
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+#if XFS_BIG_INUMS
+	/*
+	 * New inode number is large, and need to convert to 8-byte inodes.
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+		int	error;			/* error return value */
+		int	newsize;		/* new inode size */
+
+		newsize =
+			dp->i_df.if_bytes +
+			(sfp->count + 1) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		/*
+		 * Won't fit as shortform, convert to block then do replace.
+		 */
+		if (newsize > XFS_IFORK_DSIZE(dp)) {
+			error = xfs_dir2_sf_to_block(args);
+			if (error) {
+				return error;
+			}
+			return xfs_dir2_block_replace(args);
+		}
+		/*
+		 * Still fits, convert to 8-byte now.
+		 */
+		xfs_dir2_sf_toino8(args);
+		i8elevated = 1;
+		sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	} else
+		i8elevated = 0;
+#endif
+	ASSERT(args->namelen != 1 || args->name[0] != '.');
+	/*
+	 * Replace ..'s entry.
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+#if XFS_BIG_INUMS || defined(DEBUG)
+		ino = dp->d_ops->sf_get_parent_ino(sfp);
+		ASSERT(args->inumber != ino);
+#endif
+		dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
+	}
+	/*
+	 * Normal entry, look for the name.
+	 */
+	else {
+		for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+		     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+								XFS_CMP_EXACT) {
+#if XFS_BIG_INUMS || defined(DEBUG)
+				ino = dp->d_ops->sf_get_ino(sfp, sfep);
+				ASSERT(args->inumber != ino);
+#endif
+				dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+				dp->d_ops->sf_put_ftype(sfep, args->filetype);
+				break;
+			}
+		}
+		/*
+		 * Didn't find it.
+		 */
+		if (i == sfp->count) {
+			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+#if XFS_BIG_INUMS
+			if (i8elevated)
+				xfs_dir2_sf_toino4(args);
+#endif
+			return ENOENT;
+		}
+	}
+#if XFS_BIG_INUMS
+	/*
+	 * See if the old number was large, the new number is small.
+	 */
+	if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * And the old count was one, so need to convert to small.
+		 */
+		if (sfp->i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->i8count--;
+	}
+	/*
+	 * See if the old number was small, the new number is large.
+	 */
+	if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * add to the i8count unless we just converted to 8-byte
+		 * inodes (which does an implied i8count = 1)
+		 */
+		ASSERT(sfp->i8count != 0);
+		if (!i8elevated)
+			sfp->i8count++;
+	}
+#endif
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+	return 0;
+}
+
+#if XFS_BIG_INUMS
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
+	struct xfs_mount	*mp;
+
+	trace_xfs_dir2_sf_toino4(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->i8count == 1);
+	memcpy(buf, oldsfp, oldsize);
+	/*
+	 * Compute the new inode size.
+	 */
+	newsize =
+		oldsize -
+		(oldsfp->count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->count = oldsfp->count;
+	sfp->i8count = 0;
+	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+	     i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		memcpy(sfep->name, oldsfep->name, sfep->namelen);
+		dp->d_ops->sf_put_ino(sfp, sfep,
+				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+/*
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
+	struct xfs_mount	*mp;
+
+	trace_xfs_dir2_sf_toino8(args);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->i8count == 0);
+	memcpy(buf, oldsfp, oldsize);
+	/*
+	 * Compute the new inode size (nb: entry count + 1 for parent)
+	 */
+	newsize =
+		oldsize +
+		(oldsfp->count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->count = oldsfp->count;
+	sfp->i8count = 1;
+	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+	     i < sfp->count;
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		memcpy(sfep->name, oldsfep->name, sfep->namelen);
+		dp->d_ops->sf_put_ino(sfp, sfep,
+				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+#endif	/* XFS_BIG_INUMS */
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
new file mode 100644
index 000000000000..c2ac0c611ad8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+
+int
+xfs_calc_dquots_per_chunk(
+	unsigned int		nbblks)	/* basic block units */
+{
+	unsigned int	ndquots;
+
+	ASSERT(nbblks > 0);
+	ndquots = BBTOB(nbblks);
+	do_div(ndquots, sizeof(xfs_dqblk_t));
+
+	return ndquots;
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+int
+xfs_dqcheck(
+	struct xfs_mount *mp,
+	xfs_disk_dquot_t *ddq,
+	xfs_dqid_t	 id,
+	uint		 type,	  /* used only when IO_dorepair is true */
+	uint		 flags,
+	char		 *str)
+{
+	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
+	int		errs = 0;
+
+	/*
+	 * We can encounter an uninitialized dquot buffer for 2 reasons:
+	 * 1. If we crash while deleting the quotainode(s), and those blks got
+	 *    used for user data. This is because we take the path of regular
+	 *    file deletion; however, the size field of quotainodes is never
+	 *    updated, so all the tricks that we play in itruncate_finish
+	 *    don't quite matter.
+	 *
+	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
+	 *    But the allocation will be replayed so we'll end up with an
+	 *    uninitialized quota block.
+	 *
+	 * This is all fine; things are still consistent, and we haven't lost
+	 * any quota information. Just don't complain about bad dquot blks.
+	 */
+	if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+		errs++;
+	}
+	if (ddq->d_version != XFS_DQUOT_VERSION) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+			str, id, ddq->d_version, XFS_DQUOT_VERSION);
+		errs++;
+	}
+
+	if (ddq->d_flags != XFS_DQ_USER &&
+	    ddq->d_flags != XFS_DQ_PROJ &&
+	    ddq->d_flags != XFS_DQ_GROUP) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+			str, id, ddq->d_flags);
+		errs++;
+	}
+
+	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : ondisk-dquot 0x%p, ID mismatch: "
+			"0x%x expected, found id 0x%x",
+			str, ddq, id, be32_to_cpu(ddq->d_id));
+		errs++;
+	}
+
+	if (!errs && ddq->d_id) {
+		if (ddq->d_blk_softlimit &&
+		    be64_to_cpu(ddq->d_bcount) >
+				be64_to_cpu(ddq->d_blk_softlimit)) {
+			if (!ddq->d_btimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+		if (ddq->d_ino_softlimit &&
+		    be64_to_cpu(ddq->d_icount) >
+				be64_to_cpu(ddq->d_ino_softlimit)) {
+			if (!ddq->d_itimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+		if (ddq->d_rtb_softlimit &&
+		    be64_to_cpu(ddq->d_rtbcount) >
+				be64_to_cpu(ddq->d_rtb_softlimit)) {
+			if (!ddq->d_rtbtimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+	}
+
+	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+		return errs;
+
+	if (flags & XFS_QMOPT_DOWARN)
+		xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+	/*
+	 * Typically, a repair is only requested by quotacheck.
+	 */
+	ASSERT(id != -1);
+	ASSERT(flags & XFS_QMOPT_DQREPAIR);
+	memset(d, 0, sizeof(xfs_dqblk_t));
+
+	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+	d->dd_diskdq.d_flags = type;
+	d->dd_diskdq.d_id = cpu_to_be32(id);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
+
+	return errs;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	int			ndquots;
+	int			i;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return true;
+
+	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_calc_dquots_per_chunk(
+					XFS_BB_TO_FSB(mp, bp->b_length));
+
+	for (i = 0; i < ndquots; i++, d++) {
+		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF))
+			return false;
+		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+			return false;
+	}
+	return true;
+}
+
+STATIC bool
+xfs_dquot_buf_verify(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	xfs_dqid_t		id = 0;
+	int			ndquots;
+	int			i;
+
+	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+	/*
+	 * On the first read of the buffer, verify that each dquot is valid.
+	 * We don't know what the id of the dquot is supposed to be, just that
+	 * they should be increasing monotonically within the buffer. If the
+	 * first id is corrupt, then it will fail on the second dquot in the
+	 * buffer so corruptions could point to the wrong dquot in this case.
+	 */
+	for (i = 0; i < ndquots; i++) {
+		struct xfs_disk_dquot	*ddq;
+		int			error;
+
+		ddq = &d[i].dd_diskdq;
+
+		if (i == 0)
+			id = be32_to_cpu(ddq->d_id);
+
+		error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+				       "xfs_dquot_buf_verify");
+		if (error)
+			return false;
+	}
+	return true;
+}
+
+static void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify_crc(mp, bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_dquot_buf_verify(mp, bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify(mp, bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.verify_read = xfs_dquot_buf_read_verify,
+	.verify_write = xfs_dquot_buf_write_verify,
+};
+
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
new file mode 100644
index 000000000000..16fb63a9bc5e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -0,0 +1,2189 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_trace.h"
+
+
+/*
+ * Allocation group level functions.
+ */
+static inline int
+xfs_ialloc_cluster_alignment(
+	xfs_alloc_arg_t	*args)
+{
+	if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+	    args->mp->m_sb.sb_inoalignmt >=
+	     XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
+		return args->mp->m_sb.sb_inoalignmt;
+	return 1;
+}
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	xfs_lookup_t		dir,	/* <=, >=, == */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = 0;
+	cur->bc_rec.i.ir_free = 0;
+	return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int				/* error */
+xfs_inobt_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_inobt_rec_incore_t	*irec)	/* btree record */
+{
+	union xfs_btree_rec	rec;
+
+	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_inobt_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_inobt_rec_incore_t	*irec,	/* btree record */
+	int			*stat)	/* output: success/failure */
+{
+	union xfs_btree_rec	*rec;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (!error && *stat == 1) {
+		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+	}
+	return error;
+}
+
+/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+	struct xfs_btree_cur	*cur,
+	__int32_t		freecount,
+	xfs_inofree_t		free,
+	int			*stat)
+{
+	cur->bc_rec.i.ir_freecount = freecount;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agino_t		newino,
+	xfs_agino_t		newlen,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agino_t		thisino;
+	int			i;
+	int			error;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	for (thisino = newino;
+	     thisino < newino + newlen;
+	     thisino += XFS_INODES_PER_CHUNK) {
+		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+		if (error) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 0);
+
+		error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+					     XFS_INOBT_ALL_FREE, &i);
+		if (error) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 1);
+	}
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+	return 0;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+	struct xfs_btree_cur	*cur,
+	struct xfs_agi		*agi)
+{
+	if (cur->bc_nlevels == 1) {
+		xfs_inobt_rec_incore_t rec;
+		int		freecount = 0;
+		int		error;
+		int		i;
+
+		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+		if (error)
+			return error;
+
+		do {
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
+				return error;
+
+			if (i) {
+				freecount += rec.ir_freecount;
+				error = xfs_btree_increment(cur, 0, &i);
+				if (error)
+					return error;
+			}
+		} while (i == 1);
+
+		if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+			ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+	}
+	return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi)	0
+#endif
+
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ */
+int
+xfs_ialloc_inode_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct list_head	*buffer_list,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		length,
+	unsigned int		gen)
+{
+	struct xfs_buf		*fbuf;
+	struct xfs_dinode	*free;
+	int			nbufs, blks_per_cluster, inodes_per_cluster;
+	int			version;
+	int			i, j;
+	xfs_daddr_t		d;
+	xfs_ino_t		ino = 0;
+
+	/*
+	 * Loop over the new block(s), filling in the inodes.  For small block
+	 * sizes, manipulate the inodes in buffers  which are multiples of the
+	 * blocks size.
+	 */
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+	nbufs = length / blks_per_cluster;
+
+	/*
+	 * Figure out what version number to use in the inodes we create.  If
+	 * the superblock version has caught up to the one that supports the new
+	 * inode format, then use the new inode version.  Otherwise use the old
+	 * version so that old kernels will continue to be able to use the file
+	 * system.
+	 *
+	 * For v3 inodes, we also need to write the inode number into the inode,
+	 * so calculate the first inode number of the chunk here as
+	 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+	 * across multiple filesystem blocks (such as a cluster) and so cannot
+	 * be used in the cluster buffer loop below.
+	 *
+	 * Further, because we are writing the inode directly into the buffer
+	 * and calculating a CRC on the entire inode, we have ot log the entire
+	 * inode so that the entire range the CRC covers is present in the log.
+	 * That means for v3 inode we log the entire buffer rather than just the
+	 * inode cores.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		version = 3;
+		ino = XFS_AGINO_TO_INO(mp, agno,
+				       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+		/*
+		 * log the initialisation that is about to take place as an
+		 * logical operation. This means the transaction does not
+		 * need to log the physical changes to the inode buffers as log
+		 * recovery will know what initialisation is actually needed.
+		 * Hence we only need to log the buffers as "ordered" buffers so
+		 * they track in the AIL as if they were physically logged.
+		 */
+		if (tp)
+			xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+					mp->m_sb.sb_inodesize, length, gen);
+	} else
+		version = 2;
+
+	for (j = 0; j < nbufs; j++) {
+		/*
+		 * Get the block.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					 mp->m_bsize * blks_per_cluster,
+					 XBF_UNMAPPED);
+		if (!fbuf)
+			return ENOMEM;
+
+		/* Initialize the inode buffers and log them appropriately. */
+		fbuf->b_ops = &xfs_inode_buf_ops;
+		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+		for (i = 0; i < inodes_per_cluster; i++) {
+			int	ioffset = i << mp->m_sb.sb_inodelog;
+			uint	isize = xfs_dinode_size(version);
+
+			free = xfs_make_iptr(mp, fbuf, i);
+			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+			free->di_version = version;
+			free->di_gen = cpu_to_be32(gen);
+			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+			if (version == 3) {
+				free->di_ino = cpu_to_be64(ino);
+				ino++;
+				uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+				xfs_dinode_calc_crc(mp, free);
+			} else if (tp) {
+				/* just log the inode core */
+				xfs_trans_log_buf(tp, fbuf, ioffset,
+						  ioffset + isize - 1);
+			}
+		}
+
+		if (tp) {
+			/*
+			 * Mark the buffer as an inode allocation buffer so it
+			 * sticks in AIL at the point of this allocation
+			 * transaction. This ensures the they are on disk before
+			 * the tail of the log can be moved past this
+			 * transaction (i.e. by preventing relogging from moving
+			 * it forward in the log).
+			 */
+			xfs_trans_inode_alloc_buf(tp, fbuf);
+			if (version == 3) {
+				/*
+				 * Mark the buffer as ordered so that they are
+				 * not physically logged in the transaction but
+				 * still tracked in the AIL as part of the
+				 * transaction and pin the log appropriately.
+				 */
+				xfs_trans_ordered_buf(tp, fbuf);
+				xfs_trans_log_buf(tp, fbuf, 0,
+						  BBTOB(fbuf->b_length) - 1);
+			}
+		} else {
+			fbuf->b_flags |= XBF_DONE;
+			xfs_buf_delwri_queue(fbuf, buffer_list);
+			xfs_buf_relse(fbuf);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp.
+ * Return 0 for success, else error code.
+ */
+STATIC int				/* error code or 0 */
+xfs_ialloc_ag_alloc(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*agbp,		/* alloc group buffer */
+	int		*alloc)
+{
+	xfs_agi_t	*agi;		/* allocation group header */
+	xfs_alloc_arg_t	args;		/* allocation argument structure */
+	xfs_agnumber_t	agno;
+	int		error;
+	xfs_agino_t	newino;		/* new first inode's number */
+	xfs_agino_t	newlen;		/* new number of inodes */
+	int		isaligned = 0;	/* inode allocation at stripe unit */
+					/* boundary */
+	struct xfs_perag *pag;
+
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+
+	/*
+	 * Locking will ensure that we don't have two callers in here
+	 * at one time.
+	 */
+	newlen = args.mp->m_ialloc_inos;
+	if (args.mp->m_maxicount &&
+	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+		return ENOSPC;
+	args.minlen = args.maxlen = args.mp->m_ialloc_blks;
+	/*
+	 * First try to allocate inodes contiguous with the last-allocated
+	 * chunk of inodes.  If the filesystem is striped, this will fill
+	 * an entire stripe unit with inodes.
+	 */
+	agi = XFS_BUF_TO_AGI(agbp);
+	newino = be32_to_cpu(agi->agi_newino);
+	agno = be32_to_cpu(agi->agi_seqno);
+	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+		     args.mp->m_ialloc_blks;
+	if (likely(newino != NULLAGINO &&
+		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.type = XFS_ALLOCTYPE_THIS_BNO;
+		args.prod = 1;
+
+		/*
+		 * We need to take into account alignment here to ensure that
+		 * we don't modify the free list if we fail to have an exact
+		 * block. If we don't have an exact match, and every oher
+		 * attempt allocation attempt fails, we'll end up cancelling
+		 * a dirty transaction and shutting down.
+		 *
+		 * For an exact allocation, alignment must be 1,
+		 * however we need to take cluster alignment into account when
+		 * fixing up the freelist. Use the minalignslop field to
+		 * indicate that extra blocks might be required for alignment,
+		 * but not to use them in the actual exact allocation.
+		 */
+		args.alignment = 1;
+		args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+		/* Allow space for the inode btree to split. */
+		args.minleft = args.mp->m_in_maxlevels - 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+
+		/*
+		 * This request might have dirtied the transaction if the AG can
+		 * satisfy the request, but the exact block was not available.
+		 * If the allocation did fail, subsequent requests will relax
+		 * the exact agbno requirement and increase the alignment
+		 * instead. It is critical that the total size of the request
+		 * (len + alignment + slop) does not increase from this point
+		 * on, so reset minalignslop to ensure it is not included in
+		 * subsequent requests.
+		 */
+		args.minalignslop = 0;
+	} else
+		args.fsbno = NULLFSBLOCK;
+
+	if (unlikely(args.fsbno == NULLFSBLOCK)) {
+		/*
+		 * Set the alignment for the allocation.
+		 * If stripe alignment is turned on then align at stripe unit
+		 * boundary.
+		 * If the cluster size is smaller than a filesystem block
+		 * then we're doing I/O for inodes in filesystem block size
+		 * pieces, so don't need alignment anyway.
+		 */
+		isaligned = 0;
+		if (args.mp->m_sinoalign) {
+			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+			args.alignment = args.mp->m_dalign;
+			isaligned = 1;
+		} else
+			args.alignment = xfs_ialloc_cluster_alignment(&args);
+		/*
+		 * Need to figure out where to allocate the inode blocks.
+		 * Ideally they should be spaced out through the a.g.
+		 * For now, just allocate blocks up front.
+		 */
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		/*
+		 * Allocate a fixed-size extent of inodes.
+		 */
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.prod = 1;
+		/*
+		 * Allow space for the inode btree to split.
+		 */
+		args.minleft = args.mp->m_in_maxlevels - 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+
+	/*
+	 * If stripe alignment is turned on, then try again with cluster
+	 * alignment.
+	 */
+	if (isaligned && args.fsbno == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.alignment = xfs_ialloc_cluster_alignment(&args);
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+	}
+
+	if (args.fsbno == NULLFSBLOCK) {
+		*alloc = 0;
+		return 0;
+	}
+	ASSERT(args.len == args.minlen);
+
+	/*
+	 * Stamp and write the inode buffers.
+	 *
+	 * Seed the new inode cluster with a random generation number. This
+	 * prevents short-term reuse of generation numbers if a chunk is
+	 * freed and then immediately reallocated. We use random numbers
+	 * rather than a linear progression to prevent the next generation
+	 * number from being easily guessable.
+	 */
+	error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+			args.len, prandom_u32());
+
+	if (error)
+		return error;
+	/*
+	 * Convert the results.
+	 */
+	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+	be32_add_cpu(&agi->agi_count, newlen);
+	be32_add_cpu(&agi->agi_freecount, newlen);
+	pag = xfs_perag_get(args.mp, agno);
+	pag->pagi_freecount += newlen;
+	xfs_perag_put(pag);
+	agi->agi_newino = cpu_to_be32(newino);
+
+	/*
+	 * Insert records describing the new inode chunk into the btrees.
+	 */
+	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+				 XFS_BTNUM_INO);
+	if (error)
+		return error;
+
+	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+					 XFS_BTNUM_FINO);
+		if (error)
+			return error;
+	}
+	/*
+	 * Log allocation group header fields
+	 */
+	xfs_ialloc_log_agi(tp, agbp,
+		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+	/*
+	 * Modify/log superblock values for inode count and inode free count.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+	*alloc = 1;
+	return 0;
+}
+
+STATIC xfs_agnumber_t
+xfs_ialloc_next_ag(
+	xfs_mount_t	*mp)
+{
+	xfs_agnumber_t	agno;
+
+	spin_lock(&mp->m_agirotor_lock);
+	agno = mp->m_agirotor;
+	if (++mp->m_agirotor >= mp->m_maxagi)
+		mp->m_agirotor = 0;
+	spin_unlock(&mp->m_agirotor_lock);
+
+	return agno;
+}
+
+/*
+ * Select an allocation group to look for a free inode in, based on the parent
+ * inode and the mode.  Return the allocation group buffer.
+ */
+STATIC xfs_agnumber_t
+xfs_ialloc_ag_select(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent directory inode number */
+	umode_t		mode,		/* bits set to indicate file type */
+	int		okalloc)	/* ok to allocate more space */
+{
+	xfs_agnumber_t	agcount;	/* number of ag's in the filesystem */
+	xfs_agnumber_t	agno;		/* current ag number */
+	int		flags;		/* alloc buffer locking flags */
+	xfs_extlen_t	ineed;		/* blocks needed for inode allocation */
+	xfs_extlen_t	longest = 0;	/* longest extent available */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		needspace;	/* file mode implies space allocated */
+	xfs_perag_t	*pag;		/* per allocation group data */
+	xfs_agnumber_t	pagno;		/* parent (starting) ag number */
+	int		error;
+
+	/*
+	 * Files of these types need at least one block if length > 0
+	 * (and they won't fit in the inode, but that's hard to figure out).
+	 */
+	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+	mp = tp->t_mountp;
+	agcount = mp->m_maxagi;
+	if (S_ISDIR(mode))
+		pagno = xfs_ialloc_next_ag(mp);
+	else {
+		pagno = XFS_INO_TO_AGNO(mp, parent);
+		if (pagno >= agcount)
+			pagno = 0;
+	}
+
+	ASSERT(pagno < agcount);
+
+	/*
+	 * Loop through allocation groups, looking for one with a little
+	 * free space in it.  Note we don't look for free inodes, exactly.
+	 * Instead, we include whether there is a need to allocate inodes
+	 * to mean that blocks must be allocated for them,
+	 * if none are currently free.
+	 */
+	agno = pagno;
+	flags = XFS_ALLOC_FLAG_TRYLOCK;
+	for (;;) {
+		pag = xfs_perag_get(mp, agno);
+		if (!pag->pagi_inodeok) {
+			xfs_ialloc_next_ag(mp);
+			goto nextag;
+		}
+
+		if (!pag->pagi_init) {
+			error = xfs_ialloc_pagi_init(mp, tp, agno);
+			if (error)
+				goto nextag;
+		}
+
+		if (pag->pagi_freecount) {
+			xfs_perag_put(pag);
+			return agno;
+		}
+
+		if (!okalloc)
+			goto nextag;
+
+		if (!pag->pagf_init) {
+			error = xfs_alloc_pagf_init(mp, tp, agno, flags);
+			if (error)
+				goto nextag;
+		}
+
+		/*
+		 * Is there enough free space for the file plus a block of
+		 * inodes? (if we need to allocate some)?
+		 */
+		ineed = mp->m_ialloc_blks;
+		longest = pag->pagf_longest;
+		if (!longest)
+			longest = pag->pagf_flcount > 0;
+
+		if (pag->pagf_freeblks >= needspace + ineed &&
+		    longest >= ineed) {
+			xfs_perag_put(pag);
+			return agno;
+		}
+nextag:
+		xfs_perag_put(pag);
+		/*
+		 * No point in iterating over the rest, if we're shutting
+		 * down.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp))
+			return NULLAGNUMBER;
+		agno++;
+		if (agno >= agcount)
+			agno = 0;
+		if (agno == pagno) {
+			if (flags == 0)
+				return NULLAGNUMBER;
+			flags = 0;
+		}
+	}
+}
+
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done,
+	int			left)
+{
+	int                     error;
+	int			i;
+
+	if (left)
+		error = xfs_btree_decrement(cur, 0, &i);
+	else
+		error = xfs_btree_increment(cur, 0, &i);
+
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+
+	return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_agino_t		agino,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done)
+{
+	int                     error;
+	int			i;
+
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_ino_t		parent,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
+	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
+	struct xfs_perag	*pag;
+	struct xfs_btree_cur	*cur, *tcur;
+	struct xfs_inobt_rec_incore rec, trec;
+	xfs_ino_t		ino;
+	int			error;
+	int			offset;
+	int			i, j;
+
+	pag = xfs_perag_get(mp, agno);
+
+	ASSERT(pag->pagi_init);
+	ASSERT(pag->pagi_inodeok);
+	ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = be32_to_cpu(agi->agi_newino);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	/*
+	 * If in the same AG as the parent, try to get near the parent.
+	 */
+	if (pagno == agno) {
+		int		doneleft;	/* done, to the left */
+		int		doneright;	/* done, to the right */
+		int		searchdistance = 10;
+
+		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		error = xfs_inobt_get_rec(cur, &rec, &j);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
+
+		if (rec.ir_freecount > 0) {
+			/*
+			 * Found a free inode in the same chunk
+			 * as the parent, done.
+			 */
+			goto alloc_inode;
+		}
+
+
+		/*
+		 * In the same AG as parent, but parent's chunk is full.
+		 */
+
+		/* duplicate the cursor, search left & right simultaneously */
+		error = xfs_btree_dup_cursor(cur, &tcur);
+		if (error)
+			goto error0;
+
+		/*
+		 * Skip to last blocks looked up if same parent inode.
+		 */
+		if (pagino != NULLAGINO &&
+		    pag->pagl_pagino == pagino &&
+		    pag->pagl_leftrec != NULLAGINO &&
+		    pag->pagl_rightrec != NULLAGINO) {
+			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+						   &trec, &doneleft);
+			if (error)
+				goto error1;
+
+			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+						   &rec, &doneright);
+			if (error)
+				goto error1;
+		} else {
+			/* search left with tcur, back up 1 record */
+			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+			if (error)
+				goto error1;
+
+			/* search right with cur, go forward 1 record. */
+			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+			if (error)
+				goto error1;
+		}
+
+		/*
+		 * Loop until we find an inode chunk with a free inode.
+		 */
+		while (!doneleft || !doneright) {
+			int	useleft;  /* using left inode chunk this time */
+
+			if (!--searchdistance) {
+				/*
+				 * Not in range - save last search
+				 * location and allocate a new inode
+				 */
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto newino;
+			}
+
+			/* figure out the closer block if both are valid. */
+			if (!doneleft && !doneright) {
+				useleft = pagino -
+				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+				  rec.ir_startino - pagino;
+			} else {
+				useleft = !doneleft;
+			}
+
+			/* free inodes to the left? */
+			if (useleft && trec.ir_freecount) {
+				rec = trec;
+				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+				cur = tcur;
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto alloc_inode;
+			}
+
+			/* free inodes to the right? */
+			if (!useleft && rec.ir_freecount) {
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto alloc_inode;
+			}
+
+			/* get next record to check */
+			if (useleft) {
+				error = xfs_ialloc_next_rec(tcur, &trec,
+								 &doneleft, 1);
+			} else {
+				error = xfs_ialloc_next_rec(cur, &rec,
+								 &doneright, 0);
+			}
+			if (error)
+				goto error1;
+		}
+
+		/*
+		 * We've reached the end of the btree. because
+		 * we are only searching a small chunk of the
+		 * btree each search, there is obviously free
+		 * inodes closer to the parent inode than we
+		 * are now. restart the search again.
+		 */
+		pag->pagl_pagino = NULLAGINO;
+		pag->pagl_leftrec = NULLAGINO;
+		pag->pagl_rightrec = NULLAGINO;
+		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		goto restart_pagno;
+	}
+
+	/*
+	 * In a different AG from the parent.
+	 * See if the most recently allocated block has any free.
+	 */
+newino:
+	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
+		if (error)
+			goto error0;
+
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, &rec, &j);
+			if (error)
+				goto error0;
+
+			if (j == 1 && rec.ir_freecount > 0) {
+				/*
+				 * The last chunk allocated in the group
+				 * still has a free inode.
+				 */
+				goto alloc_inode;
+			}
+		}
+	}
+
+	/*
+	 * None left in the last group, search the whole AG
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+	for (;;) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if (rec.ir_freecount > 0)
+			break;
+		error = xfs_btree_increment(cur, 0, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+
+alloc_inode:
+	offset = xfs_lowbit64(rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+	error = xfs_inobt_update(cur, &rec);
+	if (error)
+		goto error0;
+	be32_add_cpu(&agi->agi_freecount, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	pag->pagi_freecount--;
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+	xfs_perag_put(pag);
+	*inop = ino;
+	return 0;
+error1:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+	xfs_agino_t			pagino,
+	struct xfs_btree_cur		**ocur,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */
+	struct xfs_btree_cur		*rcur;	/* right search cursor */
+	struct xfs_inobt_rec_incore	rrec;
+	int				error;
+	int				i, j;
+
+	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+	if (error)
+		return error;
+
+	if (i == 1) {
+		error = xfs_inobt_get_rec(lcur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+		/*
+		 * See if we've landed in the parent inode record. The finobt
+		 * only tracks chunks with at least one free inode, so record
+		 * existence is enough.
+		 */
+		if (pagino >= rec->ir_startino &&
+		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+			return 0;
+	}
+
+	error = xfs_btree_dup_cursor(lcur, &rcur);
+	if (error)
+		return error;
+
+	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+	if (error)
+		goto error_rcur;
+	if (j == 1) {
+		error = xfs_inobt_get_rec(rcur, &rrec, &j);
+		if (error)
+			goto error_rcur;
+		XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+	}
+
+	XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+	if (i == 1 && j == 1) {
+		/*
+		 * Both the left and right records are valid. Choose the closer
+		 * inode chunk to the target.
+		 */
+		if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+		    (rrec.ir_startino - pagino)) {
+			*rec = rrec;
+			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+			*ocur = rcur;
+		} else {
+			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+		}
+	} else if (j == 1) {
+		/* only the right record is valid */
+		*rec = rrec;
+		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+		*ocur = rcur;
+	} else if (i == 1) {
+		/* only the left record is valid */
+		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+	}
+
+	return 0;
+
+error_rcur:
+	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+	struct xfs_agi			*agi,
+	struct xfs_btree_cur		*cur,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int error;
+	int i;
+
+	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+		error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
+					 &i);
+		if (error)
+			return error;
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, rec, &i);
+			if (error)
+				return error;
+			XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+			return 0;
+		}
+	}
+
+	/*
+	 * Find the first inode available in the AG.
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+	error = xfs_inobt_get_rec(cur, rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+	return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+	struct xfs_btree_cur		*cur,	/* inobt cursor */
+	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
+	int				offset) /* inode offset */
+{
+	struct xfs_inobt_rec_incore	rec;
+	int				error;
+	int				i;
+
+	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+
+	XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+				  (rec.ir_freecount == frec->ir_freecount));
+
+	error = xfs_inobt_update(cur, &rec);
+	if (error)
+		return error;
+
+	return 0;
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+STATIC int
+xfs_dialloc_ag(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_ino_t		parent,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
+	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
+	struct xfs_perag		*pag;
+	struct xfs_btree_cur		*cur;	/* finobt cursor */
+	struct xfs_btree_cur		*icur;	/* inobt cursor */
+	struct xfs_inobt_rec_incore	rec;
+	xfs_ino_t			ino;
+	int				error;
+	int				offset;
+	int				i;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+	pag = xfs_perag_get(mp, agno);
+
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = be32_to_cpu(agi->agi_newino);
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error_cur;
+
+	/*
+	 * The search algorithm depends on whether we're in the same AG as the
+	 * parent. If so, find the closest available inode to the parent. If
+	 * not, consider the agi hint or find the first free inode in the AG.
+	 */
+	if (agno == pagno)
+		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+	else
+		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+	if (error)
+		goto error_cur;
+
+	offset = xfs_lowbit64(rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+	/*
+	 * Modify or remove the finobt record.
+	 */
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+	if (rec.ir_freecount)
+		error = xfs_inobt_update(cur, &rec);
+	else
+		error = xfs_btree_delete(cur, &i);
+	if (error)
+		goto error_cur;
+
+	/*
+	 * The finobt has now been updated appropriately. We haven't updated the
+	 * agi and superblock yet, so we can create an inobt cursor and validate
+	 * the original freecount. If all is well, make the equivalent update to
+	 * the inobt using the finobt record and offset information.
+	 */
+	icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+	error = xfs_check_agi_freecount(icur, agi);
+	if (error)
+		goto error_icur;
+
+	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+	if (error)
+		goto error_icur;
+
+	/*
+	 * Both trees have now been updated. We must update the perag and
+	 * superblock before we can check the freecount for each btree.
+	 */
+	be32_add_cpu(&agi->agi_freecount, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	pag->pagi_freecount--;
+
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+	error = xfs_check_agi_freecount(icur, agi);
+	if (error)
+		goto error_icur;
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error_icur;
+
+	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_perag_put(pag);
+	*inop = ino;
+	return 0;
+
+error_icur:
+	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to performn an allocation, an inode
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the on-disk
+ * data structures are updated.  The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+	struct xfs_trans	*tp,
+	xfs_ino_t		parent,
+	umode_t			mode,
+	int			okalloc,
+	struct xfs_buf		**IO_agbp,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*agbp;
+	xfs_agnumber_t		agno;
+	int			error;
+	int			ialloced;
+	int			noroom = 0;
+	xfs_agnumber_t		start_agno;
+	struct xfs_perag	*pag;
+
+	if (*IO_agbp) {
+		/*
+		 * If the caller passes in a pointer to the AGI buffer,
+		 * continue where we left off before.  In this case, we
+		 * know that the allocation group has free inodes.
+		 */
+		agbp = *IO_agbp;
+		goto out_alloc;
+	}
+
+	/*
+	 * We do not have an agbp, so select an initial allocation
+	 * group for inode allocation.
+	 */
+	start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+	if (start_agno == NULLAGNUMBER) {
+		*inop = NULLFSINO;
+		return 0;
+	}
+
+	/*
+	 * If we have already hit the ceiling of inode blocks then clear
+	 * okalloc so we scan all available agi structures for a free
+	 * inode.
+	 */
+	if (mp->m_maxicount &&
+	    mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
+		noroom = 1;
+		okalloc = 0;
+	}
+
+	/*
+	 * Loop until we find an allocation group that either has free inodes
+	 * or in which we can allocate some inodes.  Iterate through the
+	 * allocation groups upward, wrapping at the end.
+	 */
+	agno = start_agno;
+	for (;;) {
+		pag = xfs_perag_get(mp, agno);
+		if (!pag->pagi_inodeok) {
+			xfs_ialloc_next_ag(mp);
+			goto nextag;
+		}
+
+		if (!pag->pagi_init) {
+			error = xfs_ialloc_pagi_init(mp, tp, agno);
+			if (error)
+				goto out_error;
+		}
+
+		/*
+		 * Do a first racy fast path check if this AG is usable.
+		 */
+		if (!pag->pagi_freecount && !okalloc)
+			goto nextag;
+
+		/*
+		 * Then read in the AGI buffer and recheck with the AGI buffer
+		 * lock held.
+		 */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+		if (error)
+			goto out_error;
+
+		if (pag->pagi_freecount) {
+			xfs_perag_put(pag);
+			goto out_alloc;
+		}
+
+		if (!okalloc)
+			goto nextag_relse_buffer;
+
+
+		error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+		if (error) {
+			xfs_trans_brelse(tp, agbp);
+
+			if (error != ENOSPC)
+				goto out_error;
+
+			xfs_perag_put(pag);
+			*inop = NULLFSINO;
+			return 0;
+		}
+
+		if (ialloced) {
+			/*
+			 * We successfully allocated some inodes, return
+			 * the current context to the caller so that it
+			 * can commit the current transaction and call
+			 * us again where we left off.
+			 */
+			ASSERT(pag->pagi_freecount > 0);
+			xfs_perag_put(pag);
+
+			*IO_agbp = agbp;
+			*inop = NULLFSINO;
+			return 0;
+		}
+
+nextag_relse_buffer:
+		xfs_trans_brelse(tp, agbp);
+nextag:
+		xfs_perag_put(pag);
+		if (++agno == mp->m_sb.sb_agcount)
+			agno = 0;
+		if (agno == start_agno) {
+			*inop = NULLFSINO;
+			return noroom ? ENOSPC : 0;
+		}
+	}
+
+out_alloc:
+	*IO_agbp = NULL;
+	return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+	xfs_perag_put(pag);
+	return error;
+}
+
+STATIC int
+xfs_difree_inobt(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agino_t			agino,
+	struct xfs_bmap_free		*flist,
+	int				*deleted,
+	xfs_ino_t			*first_ino,
+	struct xfs_inobt_rec_incore	*orec)
+{
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	struct xfs_perag		*pag;
+	struct xfs_btree_cur		*cur;
+	struct xfs_inobt_rec_incore	rec;
+	int				ilen;
+	int				error;
+	int				i;
+	int				off;
+
+	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
+	/*
+	 * Initialize the cursor.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	/*
+	 * Look for the entry describing this inode.
+	 */
+	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+			__func__, error);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+			__func__, error);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	/*
+	 * Get the offset in the inode chunk.
+	 */
+	off = agino - rec.ir_startino;
+	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+	/*
+	 * Mark the inode free & increment the count.
+	 */
+	rec.ir_free |= XFS_INOBT_MASK(off);
+	rec.ir_freecount++;
+
+	/*
+	 * When an inode cluster is free, it becomes eligible for removal
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
+	    (rec.ir_freecount == mp->m_ialloc_inos)) {
+
+		*deleted = 1;
+		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+
+		/*
+		 * Remove the inode cluster from the AGI B+Tree, adjust the
+		 * AGI and Superblock inode counts, and mark the disk space
+		 * to be freed when the transaction is committed.
+		 */
+		ilen = mp->m_ialloc_inos;
+		be32_add_cpu(&agi->agi_count, -ilen);
+		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount -= ilen - 1;
+		xfs_perag_put(pag);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+
+		if ((error = xfs_btree_delete(cur, &i))) {
+			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+				__func__, error);
+			goto error0;
+		}
+
+		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+				  XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+				  mp->m_ialloc_blks, flist, mp);
+	} else {
+		*deleted = 0;
+
+		error = xfs_inobt_update(cur, &rec);
+		if (error) {
+			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+				__func__, error);
+			goto error0;
+		}
+
+		/* 
+		 * Change the inode free counts and log the ag/sb changes.
+		 */
+		be32_add_cpu(&agi->agi_freecount, 1);
+		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount++;
+		xfs_perag_put(pag);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+	}
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	*orec = rec;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agino_t			agino,
+	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */
+{
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	struct xfs_btree_cur		*cur;
+	struct xfs_inobt_rec_incore	rec;
+	int				offset = agino - ibtrec->ir_startino;
+	int				error;
+	int				i;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	if (i == 0) {
+		/*
+		 * If the record does not exist in the finobt, we must have just
+		 * freed an inode in a previously fully allocated chunk. If not,
+		 * something is out of sync.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+
+		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+					     ibtrec->ir_free, &i);
+		if (error)
+			goto error;
+		ASSERT(i == 1);
+
+		goto out;
+	}
+
+	/*
+	 * Read and update the existing record. We could just copy the ibtrec
+	 * across here, but that would defeat the purpose of having redundant
+	 * metadata. By making the modifications independently, we can catch
+	 * corruptions that we wouldn't see if we just copied from one record
+	 * to another.
+	 */
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		goto error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+	rec.ir_free |= XFS_INOBT_MASK(offset);
+	rec.ir_freecount++;
+
+	XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+				(rec.ir_freecount == ibtrec->ir_freecount),
+				error);
+
+	/*
+	 * The content of inobt records should always match between the inobt
+	 * and finobt. The lifecycle of records in the finobt is different from
+	 * the inobt in that the finobt only tracks records with at least one
+	 * free inode. Hence, if all of the inodes are free and we aren't
+	 * keeping inode chunks permanently on disk, remove the record.
+	 * Otherwise, update the record with the new information.
+	 */
+	if (rec.ir_freecount == mp->m_ialloc_inos &&
+	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+		error = xfs_btree_delete(cur, &i);
+		if (error)
+			goto error;
+		ASSERT(i == 1);
+	} else {
+		error = xfs_inobt_update(cur, &rec);
+		if (error)
+			goto error;
+	}
+
+out:
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error;
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_ino_t		inode,		/* inode to be freed */
+	struct xfs_bmap_free	*flist,		/* extents to free */
+	int			*deleted,/* set if inode cluster was deleted */
+	xfs_ino_t		*first_ino)/* first inode in deleted cluster */
+{
+	/* REFERENCED */
+	xfs_agblock_t		agbno;	/* block number containing inode */
+	struct xfs_buf		*agbp;	/* buffer for allocation group header */
+	xfs_agino_t		agino;	/* allocation group inode number */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	int			error;	/* error return value */
+	struct xfs_mount	*mp;	/* mount structure for filesystem */
+	struct xfs_inobt_rec_incore rec;/* btree record */
+
+	mp = tp->t_mountp;
+
+	/*
+	 * Break up inode number into its components.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, inode);
+	if (agno >= mp->m_sb.sb_agcount)  {
+		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+			__func__, agno, mp->m_sb.sb_agcount);
+		ASSERT(0);
+		return EINVAL;
+	}
+	agino = XFS_INO_TO_AGINO(mp, inode);
+	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
+		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+			__func__, (unsigned long long)inode,
+			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+		ASSERT(0);
+		return EINVAL;
+	}
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agbno >= mp->m_sb.sb_agblocks)  {
+		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+			__func__, agbno, mp->m_sb.sb_agblocks);
+		ASSERT(0);
+		return EINVAL;
+	}
+	/*
+	 * Get the allocation group header.
+	 */
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+			__func__, error);
+		return error;
+	}
+
+	/*
+	 * Fix up the inode allocation btree.
+	 */
+	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+				 &rec);
+	if (error)
+		goto error0;
+
+	/*
+	 * Fix up the free inode btree.
+	 */
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+		if (error)
+			goto error0;
+	}
+
+	return 0;
+
+error0:
+	return error;
+}
+
+STATIC int
+xfs_imap_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		*chunk_agbno,
+	xfs_agblock_t		*offset_agbno,
+	int			flags)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	int			error;
+	int			i;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_alert(mp,
+			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+			__func__, error, agno);
+		return error;
+	}
+
+	/*
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+	if (!error) {
+		if (i)
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (!error && i == 0)
+			error = EINVAL;
+	}
+
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + mp->m_ialloc_inos <= agino)
+		return EINVAL;
+
+	/* for untrusted inodes check it is allocated first */
+	if ((flags & XFS_IGET_UNTRUSTED) &&
+	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+		return EINVAL;
+
+	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+	*offset_agbno = agbno - *chunk_agbno;
+	return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+	xfs_mount_t	 *mp,	/* file system mount structure */
+	xfs_trans_t	 *tp,	/* transaction pointer */
+	xfs_ino_t	ino,	/* inode to locate */
+	struct xfs_imap	*imap,	/* location map structure */
+	uint		flags)	/* flags for inode btree lookup */
+{
+	xfs_agblock_t	agbno;	/* block number of inode in the alloc group */
+	xfs_agino_t	agino;	/* inode number within alloc group */
+	xfs_agnumber_t	agno;	/* allocation group number */
+	int		blks_per_cluster; /* num blocks per inode cluster */
+	xfs_agblock_t	chunk_agbno;	/* first block in inode chunk */
+	xfs_agblock_t	cluster_agbno;	/* first block in inode cluster */
+	int		error;	/* error code */
+	int		offset;	/* index of inode in its buffer */
+	xfs_agblock_t	offset_agbno;	/* blks from chunk start to inode */
+
+	ASSERT(ino != NULLFSINO);
+
+	/*
+	 * Split up the inode number into its parts.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+		/*
+		 * Don't output diagnostic information for untrusted inodes
+		 * as they can be invalid without implying corruption.
+		 */
+		if (flags & XFS_IGET_UNTRUSTED)
+			return EINVAL;
+		if (agno >= mp->m_sb.sb_agcount) {
+			xfs_alert(mp,
+				"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+				__func__, agno, mp->m_sb.sb_agcount);
+		}
+		if (agbno >= mp->m_sb.sb_agblocks) {
+			xfs_alert(mp,
+		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+				__func__, (unsigned long long)agbno,
+				(unsigned long)mp->m_sb.sb_agblocks);
+		}
+		if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+			xfs_alert(mp,
+		"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+				__func__, ino,
+				XFS_AGINO_TO_INO(mp, agno, agino));
+		}
+		xfs_stack_trace();
+#endif /* DEBUG */
+		return EINVAL;
+	}
+
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+
+	/*
+	 * For bulkstat and handle lookups, we have an untrusted inode number
+	 * that we have to verify is valid. We cannot do this just by reading
+	 * the inode buffer as it may have been unlinked and removed leaving
+	 * inodes in stale state on disk. Hence we have to do a btree lookup
+	 * in all cases where an untrusted inode number is passed.
+	 */
+	if (flags & XFS_IGET_UNTRUSTED) {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+		goto out_map;
+	}
+
+	/*
+	 * If the inode cluster size is the same as the blocksize or
+	 * smaller we get to the buffer by simple arithmetics.
+	 */
+	if (blks_per_cluster == 1) {
+		offset = XFS_INO_TO_OFFSET(mp, ino);
+		ASSERT(offset < mp->m_sb.sb_inopblock);
+
+		imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		imap->im_len = XFS_FSB_TO_BB(mp, 1);
+		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+		return 0;
+	}
+
+	/*
+	 * If the inode chunks are aligned then use simple maths to
+	 * find the location. Otherwise we have to do a btree
+	 * lookup to find the location.
+	 */
+	if (mp->m_inoalign_mask) {
+		offset_agbno = agbno & mp->m_inoalign_mask;
+		chunk_agbno = agbno - offset_agbno;
+	} else {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+	}
+
+out_map:
+	ASSERT(agbno >= chunk_agbno);
+	cluster_agbno = chunk_agbno +
+		((offset_agbno / blks_per_cluster) * blks_per_cluster);
+	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+		XFS_INO_TO_OFFSET(mp, ino);
+
+	imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+	imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+	imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+	/*
+	 * If the inode number maps to a block outside the bounds
+	 * of the file system then return NULL rather than calling
+	 * read_buf and panicing when we get an error from the
+	 * driver.
+	 */
+	if ((imap->im_blkno + imap->im_len) >
+	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+		xfs_alert(mp,
+	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+			__func__, (unsigned long long) imap->im_blkno,
+			(unsigned long long) imap->im_len,
+			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+		return EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+		XFS_INODES_PER_CHUNK_LOG;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_in_maxlevels = level;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
+ */
+void
+xfs_ialloc_log_agi(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*bp,		/* allocation group header buffer */
+	int		fields)		/* bitmask of fields to log */
+{
+	int			first;		/* first byte number */
+	int			last;		/* last byte number */
+	static const short	offsets[] = {	/* field starting offsets */
+					/* keep in sync with bit definitions */
+		offsetof(xfs_agi_t, agi_magicnum),
+		offsetof(xfs_agi_t, agi_versionnum),
+		offsetof(xfs_agi_t, agi_seqno),
+		offsetof(xfs_agi_t, agi_length),
+		offsetof(xfs_agi_t, agi_count),
+		offsetof(xfs_agi_t, agi_root),
+		offsetof(xfs_agi_t, agi_level),
+		offsetof(xfs_agi_t, agi_freecount),
+		offsetof(xfs_agi_t, agi_newino),
+		offsetof(xfs_agi_t, agi_dirino),
+		offsetof(xfs_agi_t, agi_unlinked),
+		offsetof(xfs_agi_t, agi_free_root),
+		offsetof(xfs_agi_t, agi_free_level),
+		sizeof(xfs_agi_t)
+	};
+#ifdef DEBUG
+	xfs_agi_t		*agi;	/* allocation group header */
+
+	agi = XFS_BUF_TO_AGI(bp);
+	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+#endif
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
+	/*
+	 * Compute byte offsets for the first and last fields in the first
+	 * region and log the agi buffer. This only logs up through
+	 * agi_unlinked.
+	 */
+	if (fields & XFS_AGI_ALL_BITS_R1) {
+		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+				  &first, &last);
+		xfs_trans_log_buf(tp, bp, first, last);
+	}
+
+	/*
+	 * Mask off the bits in the first region and calculate the first and
+	 * last field offsets for any bits in the second region.
+	 */
+	fields &= ~XFS_AGI_ALL_BITS_R1;
+	if (fields) {
+		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+				  &first, &last);
+		xfs_trans_log_buf(tp, bp, first, last);
+	}
+}
+
+#ifdef DEBUG
+STATIC void
+xfs_check_agi_unlinked(
+	struct xfs_agi		*agi)
+{
+	int			i;
+
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+		ASSERT(agi->agi_unlinked[i]);
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+
+static bool
+xfs_agi_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+			return false;
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+		return false;
+	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+		return false;
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	xfs_check_agi_unlinked(agi);
+	return true;
+}
+
+static void
+xfs_agi_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+				XFS_ERRTAG_IALLOC_READ_AGI,
+				XFS_RANDOM_IALLOC_READ_AGI))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agi_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_agi_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+	.verify_read = xfs_agi_read_verify,
+	.verify_write = xfs_agi_write_verify,
+};
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_read_agi(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	struct xfs_buf		**bpp)	/* allocation group hdr buf */
+{
+	int			error;
+
+	trace_xfs_read_agi(mp, agno);
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+	if (error)
+		return error;
+
+	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+	return 0;
+}
+
+int
+xfs_ialloc_read_agi(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	struct xfs_buf		**bpp)	/* allocation group hdr buf */
+{
+	struct xfs_agi		*agi;	/* allocation group header */
+	struct xfs_perag	*pag;	/* per allocation group data */
+	int			error;
+
+	trace_xfs_ialloc_read_agi(mp, agno);
+
+	error = xfs_read_agi(mp, tp, agno, bpp);
+	if (error)
+		return error;
+
+	agi = XFS_BUF_TO_AGI(*bpp);
+	pag = xfs_perag_get(mp, agno);
+	if (!pag->pagi_init) {
+		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+		pag->pagi_count = be32_to_cpu(agi->agi_count);
+		pag->pagi_init = 1;
+	}
+
+	/*
+	 * It's possible for these to be out of sync if
+	 * we are in the middle of a forced shutdown.
+	 */
+	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+		XFS_FORCED_SHUTDOWN(mp));
+	xfs_perag_put(pag);
+	return 0;
+}
+
+/*
+ * Read in the agi to initialise the per-ag data in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno)		/* allocation group number */
+{
+	xfs_buf_t	*bp = NULL;
+	int		error;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+	if (error)
+		return error;
+	if (bp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000000..726f83a681a5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+
+
+STATIC int
+xfs_inobt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_inobt_mnr[level != 0];
+}
+
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_btnum);
+}
+
+STATIC void
+xfs_inobt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*nptr,
+	int			inc)	/* level change */
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+
+	agi->agi_root = nptr->s;
+	be32_add_cpu(&agi->agi_level, inc);
+	xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+}
+
+STATIC void
+xfs_finobt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*nptr,
+	int			inc)	/* level change */
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+
+	agi->agi_free_root = nptr->s;
+	be32_add_cpu(&agi->agi_free_level, inc);
+	xfs_ialloc_log_agi(cur->bc_tp, agbp,
+			   XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
+}
+
+STATIC int
+xfs_inobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	xfs_alloc_arg_t		args;		/* block allocation args */
+	int			error;		/* error return value */
+	xfs_agblock_t		sbno = be32_to_cpu(start->s);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	memset(&args, 0, sizeof(args));
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+	args.minlen = 1;
+	args.maxlen = 1;
+	args.prod = 1;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+	error = xfs_alloc_vextent(&args);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+	new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
+	*stat = 1;
+	return 0;
+}
+
+STATIC int
+xfs_inobt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	xfs_fsblock_t		fsbno;
+	int			error;
+
+	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+	error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+	if (error)
+		return error;
+
+	xfs_trans_binval(cur->bc_tp, bp);
+	return error;
+}
+
+STATIC int
+xfs_inobt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_inobt_mxr[level != 0];
+}
+
+STATIC void
+xfs_inobt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	key->inobt.ir_startino = rec->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_key(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	rec->inobt.ir_startino = key->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+	rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+	rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+}
+
+/*
+ * initial value of ptr for lookup
+ */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+
+	ptr->s = agi->agi_root;
+}
+
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+	ptr->s = agi->agi_free_root;
+}
+
+STATIC __int64_t
+xfs_inobt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+			  cur->bc_rec.i.ir_startino;
+}
+
+static int
+xfs_inobt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+
+	/*
+	 * During growfs operations, we can't verify the exact owner as the
+	 * perag is not fully initialised and hence not attached to the buffer.
+	 *
+	 * Similarly, during log recovery we will have a perag structure
+	 * attached, but the agi information will not yet have been initialised
+	 * from the on disk AGI. We don't currently use any of this information,
+	 * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+	 * ever do.
+	 */
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+	case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_IBT_MAGIC):
+	case cpu_to_be32(XFS_FIBT_MAGIC):
+		break;
+	default:
+		return 0;
+	}
+
+	/* numrecs and level verification */
+	level = be16_to_cpu(block->bb_level);
+	if (level >= mp->m_in_maxlevels)
+		return false;
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
+}
+
+static void
+xfs_inobt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_inobt_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
+	}
+}
+
+static void
+xfs_inobt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_inobt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+	xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+	.verify_read = xfs_inobt_read_verify,
+	.verify_write = xfs_inobt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_inobt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return be32_to_cpu(k1->inobt.ir_startino) <
+		be32_to_cpu(k2->inobt.ir_startino);
+}
+
+STATIC int
+xfs_inobt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+		be32_to_cpu(r2->inobt.ir_startino);
+}
+#endif	/* DEBUG */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+	.rec_len		= sizeof(xfs_inobt_rec_t),
+	.key_len		= sizeof(xfs_inobt_key_t),
+
+	.dup_cursor		= xfs_inobt_dup_cursor,
+	.set_root		= xfs_inobt_set_root,
+	.alloc_block		= xfs_inobt_alloc_block,
+	.free_block		= xfs_inobt_free_block,
+	.get_minrecs		= xfs_inobt_get_minrecs,
+	.get_maxrecs		= xfs_inobt_get_maxrecs,
+	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
+	.init_rec_from_key	= xfs_inobt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
+	.key_diff		= xfs_inobt_key_diff,
+	.buf_ops		= &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_inobt_keys_inorder,
+	.recs_inorder		= xfs_inobt_recs_inorder,
+#endif
+};
+
+static const struct xfs_btree_ops xfs_finobt_ops = {
+	.rec_len		= sizeof(xfs_inobt_rec_t),
+	.key_len		= sizeof(xfs_inobt_key_t),
+
+	.dup_cursor		= xfs_inobt_dup_cursor,
+	.set_root		= xfs_finobt_set_root,
+	.alloc_block		= xfs_inobt_alloc_block,
+	.free_block		= xfs_inobt_free_block,
+	.get_minrecs		= xfs_inobt_get_minrecs,
+	.get_maxrecs		= xfs_inobt_get_maxrecs,
+	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
+	.init_rec_from_key	= xfs_inobt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_finobt_init_ptr_from_cur,
+	.key_diff		= xfs_inobt_key_diff,
+	.buf_ops		= &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_inobt_keys_inorder,
+	.recs_inorder		= xfs_inobt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new inode btree cursor.
+ */
+struct xfs_btree_cur *				/* new inode btree cursor */
+xfs_inobt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_buf		*agbp,		/* buffer for agi structure */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_btnum_t		btnum)		/* ialloc or free ino btree */
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	struct xfs_btree_cur	*cur;
+
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_btnum = btnum;
+	if (btnum == XFS_BTNUM_INO) {
+		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+		cur->bc_ops = &xfs_inobt_ops;
+	} else {
+		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+		cur->bc_ops = &xfs_finobt_ops;
+	}
+
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_private.a.agno = agno;
+
+	return cur;
+}
+
+/*
+ * Calculate number of records in an inobt btree block.
+ */
+int
+xfs_inobt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	int			leaf)
+{
+	blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+
+	if (leaf)
+		return blocklen / sizeof(xfs_inobt_rec_t);
+	return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
new file mode 100644
index 000000000000..1e5366d7745e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_dinode.h"
+
+/*
+ * Check that none of the inode's in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp)
+{
+	int		i;
+	int		j;
+	xfs_dinode_t	*dip;
+
+	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+	for (i = 0; i < j; i++) {
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					i * mp->m_sb.sb_inodesize);
+		if (!dip->di_next_unlinked)  {
+			xfs_alert(mp,
+	"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
+				i, (long long)bp->b_bn);
+		}
+	}
+}
+#endif
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup read
+ * will re-read it from disk. This will ensure that we don't get an unnecessary
+ * warnings during log recovery and we don't get unnecssary panics on debug
+ * kernels.
+ */
+static void
+xfs_inode_buf_verify(
+	struct xfs_buf	*bp,
+	bool		readahead)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	int		i;
+	int		ni;
+
+	/*
+	 * Validate the magic number and version of every inode in the buffer
+	 */
+	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+			    XFS_DINODE_GOOD_VERSION(dip->di_version);
+		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+						XFS_ERRTAG_ITOBP_INOTOBP,
+						XFS_RANDOM_ITOBP_INOTOBP))) {
+			if (readahead) {
+				bp->b_flags &= ~XBF_DONE;
+				return;
+			}
+
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			xfs_verifier_error(bp);
+#ifdef DEBUG
+			xfs_alert(mp,
+				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
+				(unsigned long long)bp->b_bn, i,
+				be16_to_cpu(dip->di_magic));
+#endif
+		}
+	}
+	xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+	.verify_read = xfs_inode_buf_read_verify,
+	.verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+	.verify_read = xfs_inode_buf_readahead_verify,
+	.verify_write = xfs_inode_buf_write_verify,
+};
+
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
+ */
+int
+xfs_imap_to_bp(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_imap		*imap,
+	struct xfs_dinode       **dipp,
+	struct xfs_buf		**bpp,
+	uint			buf_flags,
+	uint			iget_flags)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	buf_flags |= XBF_UNMAPPED;
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+				   (int)imap->im_len, buf_flags, &bp,
+				   &xfs_inode_buf_ops);
+	if (error) {
+		if (error == EAGAIN) {
+			ASSERT(buf_flags & XBF_TRYLOCK);
+			return error;
+		}
+
+		if (error == EFSCORRUPTED &&
+		    (iget_flags & XFS_IGET_UNTRUSTED))
+			return EINVAL;
+
+		xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+			__func__, error);
+		return error;
+	}
+
+	*bpp = bp;
+	*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+	return 0;
+}
+
+void
+xfs_dinode_from_disk(
+	xfs_icdinode_t		*to,
+	xfs_dinode_t		*from)
+{
+	to->di_magic = be16_to_cpu(from->di_magic);
+	to->di_mode = be16_to_cpu(from->di_mode);
+	to->di_version = from ->di_version;
+	to->di_format = from->di_format;
+	to->di_onlink = be16_to_cpu(from->di_onlink);
+	to->di_uid = be32_to_cpu(from->di_uid);
+	to->di_gid = be32_to_cpu(from->di_gid);
+	to->di_nlink = be32_to_cpu(from->di_nlink);
+	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+	to->di_flushiter = be16_to_cpu(from->di_flushiter);
+	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+	to->di_size = be64_to_cpu(from->di_size);
+	to->di_nblocks = be64_to_cpu(from->di_nblocks);
+	to->di_extsize = be32_to_cpu(from->di_extsize);
+	to->di_nextents = be32_to_cpu(from->di_nextents);
+	to->di_anextents = be16_to_cpu(from->di_anextents);
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat	= from->di_aformat;
+	to->di_dmevmask	= be32_to_cpu(from->di_dmevmask);
+	to->di_dmstate	= be16_to_cpu(from->di_dmstate);
+	to->di_flags	= be16_to_cpu(from->di_flags);
+	to->di_gen	= be32_to_cpu(from->di_gen);
+
+	if (to->di_version == 3) {
+		to->di_changecount = be64_to_cpu(from->di_changecount);
+		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+		to->di_flags2 = be64_to_cpu(from->di_flags2);
+		to->di_ino = be64_to_cpu(from->di_ino);
+		to->di_lsn = be64_to_cpu(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+	}
+}
+
+void
+xfs_dinode_to_disk(
+	xfs_dinode_t		*to,
+	xfs_icdinode_t		*from)
+{
+	to->di_magic = cpu_to_be16(from->di_magic);
+	to->di_mode = cpu_to_be16(from->di_mode);
+	to->di_version = from ->di_version;
+	to->di_format = from->di_format;
+	to->di_onlink = cpu_to_be16(from->di_onlink);
+	to->di_uid = cpu_to_be32(from->di_uid);
+	to->di_gid = cpu_to_be32(from->di_gid);
+	to->di_nlink = cpu_to_be32(from->di_nlink);
+	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+	to->di_size = cpu_to_be64(from->di_size);
+	to->di_nblocks = cpu_to_be64(from->di_nblocks);
+	to->di_extsize = cpu_to_be32(from->di_extsize);
+	to->di_nextents = cpu_to_be32(from->di_nextents);
+	to->di_anextents = cpu_to_be16(from->di_anextents);
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat = from->di_aformat;
+	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+	to->di_dmstate = cpu_to_be16(from->di_dmstate);
+	to->di_flags = cpu_to_be16(from->di_flags);
+	to->di_gen = cpu_to_be32(from->di_gen);
+
+	if (from->di_version == 3) {
+		to->di_changecount = cpu_to_be64(from->di_changecount);
+		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+		to->di_flags2 = cpu_to_be64(from->di_flags2);
+		to->di_ino = cpu_to_be64(from->di_ino);
+		to->di_lsn = cpu_to_be64(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+		to->di_flushiter = 0;
+	} else {
+		to->di_flushiter = cpu_to_be16(from->di_flushiter);
+	}
+}
+
+static bool
+xfs_dinode_verify(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+		return false;
+
+	/* only version 3 or greater inodes are extensively verified here */
+	if (dip->di_version < 3)
+		return true;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      XFS_DINODE_CRC_OFF))
+		return false;
+	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+		return false;
+	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	return true;
+}
+
+void
+xfs_dinode_calc_crc(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip)
+{
+	__uint32_t		crc;
+
+	if (dip->di_version < 3)
+		return;
+
+	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      XFS_DINODE_CRC_OFF);
+	dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ */
+int
+xfs_iread(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		iget_flags)
+{
+	xfs_buf_t	*bp;
+	xfs_dinode_t	*dip;
+	int		error;
+
+	/*
+	 * Fill in the location information in the in-core inode.
+	 */
+	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+	if (error)
+		return error;
+
+	/* shortcut IO on inode allocation if possible */
+	if ((iget_flags & XFS_IGET_CREATE) &&
+	    xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+		/* initialise the on-disk inode core */
+		memset(&ip->i_d, 0, sizeof(ip->i_d));
+		ip->i_d.di_magic = XFS_DINODE_MAGIC;
+		ip->i_d.di_gen = prandom_u32();
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			ip->i_d.di_version = 3;
+			ip->i_d.di_ino = ip->i_ino;
+			uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+		} else
+			ip->i_d.di_version = 2;
+		return 0;
+	}
+
+	/*
+	 * Get pointers to the on-disk inode and the buffer containing it.
+	 */
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+	if (error)
+		return error;
+
+	/* even unallocated inodes are verified */
+	if (!xfs_dinode_verify(mp, ip, dip)) {
+		xfs_alert(mp, "%s: validation failed for inode %lld failed",
+				__func__, ip->i_ino);
+
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+		error = EFSCORRUPTED;
+		goto out_brelse;
+	}
+
+	/*
+	 * If the on-disk inode is already linked to a directory
+	 * entry, copy all of the inode into the in-core inode.
+	 * xfs_iformat_fork() handles copying in the inode format
+	 * specific information.
+	 * Otherwise, just get the truly permanent information.
+	 */
+	if (dip->di_mode) {
+		xfs_dinode_from_disk(&ip->i_d, dip);
+		error = xfs_iformat_fork(ip, dip);
+		if (error)  {
+#ifdef DEBUG
+			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+				__func__, error);
+#endif /* DEBUG */
+			goto out_brelse;
+		}
+	} else {
+		/*
+		 * Partial initialisation of the in-core inode. Just the bits
+		 * that xfs_ialloc won't overwrite or relies on being correct.
+		 */
+		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+		ip->i_d.di_version = dip->di_version;
+		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+		if (dip->di_version == 3) {
+			ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+			uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+		}
+
+		/*
+		 * Make sure to pull in the mode here as well in
+		 * case the inode is released without being used.
+		 * This ensures that xfs_inactive() will see that
+		 * the inode is already free and not try to mess
+		 * with the uninitialized part of it.
+		 */
+		ip->i_d.di_mode = 0;
+	}
+
+	/*
+	 * Automatically convert version 1 inode formats in memory to version 2
+	 * inode format. If the inode is modified, it will get logged and
+	 * rewritten as a version 2 inode. We can do this because we set the
+	 * superblock feature bit for v2 inodes unconditionally during mount
+	 * and it means the reast of the code can assume the inode version is 2
+	 * or higher.
+	 */
+	if (ip->i_d.di_version == 1) {
+		ip->i_d.di_version = 2;
+		memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+		ip->i_d.di_nlink = ip->i_d.di_onlink;
+		ip->i_d.di_onlink = 0;
+		xfs_set_projid(ip, 0);
+	}
+
+	ip->i_delayed_blks = 0;
+
+	/*
+	 * Mark the buffer containing the inode as something to keep
+	 * around for a while.  This helps to keep recently accessed
+	 * meta-data in-core longer.
+	 */
+	xfs_buf_set_ref(bp, XFS_INO_REF);
+
+	/*
+	 * Use xfs_trans_brelse() to release the buffer containing the on-disk
+	 * inode, because it was acquired with xfs_trans_read_buf() in
+	 * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
+	 * will only release the buffer if it is not dirty within the
+	 * transaction.  It will be OK to release the buffer in this case,
+	 * because inodes on disk are never destroyed and we will be locking the
+	 * new in-core inode before putting it in the cache where other
+	 * processes can find it.  Thus we don't have to worry about the inode
+	 * being changed just because we released the buffer.
+	 */
+ out_brelse:
+	xfs_trans_brelse(tp, bp);
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
new file mode 100644
index 000000000000..2a124e97f082
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -0,0 +1,1906 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+
+kmem_zone_t *xfs_ifork_zone;
+
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+void
+xfs_validate_extents(
+	xfs_ifork_t		*ifp,
+	int			nrecs,
+	xfs_exntfmt_t		fmt)
+{
+	xfs_bmbt_irec_t		irec;
+	xfs_bmbt_rec_host_t	rec;
+	int			i;
+
+	for (i = 0; i < nrecs; i++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+		rec.l0 = get_unaligned(&ep->l0);
+		rec.l1 = get_unaligned(&ep->l1);
+		xfs_bmbt_get_all(&rec, &irec);
+		if (fmt == XFS_EXTFMT_NOSTATE)
+			ASSERT(irec.br_state == XFS_EXT_NORM);
+	}
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.  For fifos, devs, and sockets
+ * this means set if_rdev to the proper value.  For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers.  For a file in B-tree format, only the root is immediately
+ * brought in-core.  The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ */
+int
+xfs_iformat_fork(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip)
+{
+	xfs_attr_shortform_t	*atp;
+	int			size;
+	int			error = 0;
+	xfs_fsize_t             di_size;
+
+	if (unlikely(be32_to_cpu(dip->di_nextents) +
+		     be16_to_cpu(dip->di_anextents) >
+		     be64_to_cpu(dip->di_nblocks))) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+			(unsigned long long)ip->i_ino,
+			(int)(be32_to_cpu(dip->di_nextents) +
+			      be16_to_cpu(dip->di_anextents)),
+			(unsigned long long)
+				be64_to_cpu(dip->di_nblocks));
+		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return EFSCORRUPTED;
+	}
+
+	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+		xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+			(unsigned long long)ip->i_ino,
+			dip->di_forkoff);
+		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return EFSCORRUPTED;
+	}
+
+	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		     !ip->i_mount->m_rtdev_targp)) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %Lu, has realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return EFSCORRUPTED;
+	}
+
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+					      ip->i_mount, dip);
+			return EFSCORRUPTED;
+		}
+		ip->i_d.di_size = 0;
+		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+		break;
+
+	case S_IFREG:
+	case S_IFLNK:
+	case S_IFDIR:
+		switch (dip->di_format) {
+		case XFS_DINODE_FMT_LOCAL:
+			/*
+			 * no local regular files yet
+			 */
+			if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
+				xfs_warn(ip->i_mount,
+			"corrupt inode %Lu (local format for regular file).",
+					(unsigned long long) ip->i_ino);
+				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+						     XFS_ERRLEVEL_LOW,
+						     ip->i_mount, dip);
+				return EFSCORRUPTED;
+			}
+
+			di_size = be64_to_cpu(dip->di_size);
+			if (unlikely(di_size < 0 ||
+				     di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+				xfs_warn(ip->i_mount,
+			"corrupt inode %Lu (bad size %Ld for local inode).",
+					(unsigned long long) ip->i_ino,
+					(long long) di_size);
+				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+						     XFS_ERRLEVEL_LOW,
+						     ip->i_mount, dip);
+				return EFSCORRUPTED;
+			}
+
+			size = (int)di_size;
+			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+			break;
+		case XFS_DINODE_FMT_EXTENTS:
+			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+			break;
+		case XFS_DINODE_FMT_BTREE:
+			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+			break;
+		default:
+			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+					 ip->i_mount);
+			return EFSCORRUPTED;
+		}
+		break;
+
+	default:
+		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+		return EFSCORRUPTED;
+	}
+	if (error) {
+		return error;
+	}
+	if (!XFS_DFORK_Q(dip))
+		return 0;
+
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+
+	switch (dip->di_aformat) {
+	case XFS_DINODE_FMT_LOCAL:
+		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+		size = be16_to_cpu(atp->hdr.totsize);
+
+		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+			xfs_warn(ip->i_mount,
+				"corrupt inode %Lu (bad attr fork size %Ld).",
+				(unsigned long long) ip->i_ino,
+				(long long) size);
+			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+					     XFS_ERRLEVEL_LOW,
+					     ip->i_mount, dip);
+			return EFSCORRUPTED;
+		}
+
+		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+		break;
+	default:
+		error = EFSCORRUPTED;
+		break;
+	}
+	if (error) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+	}
+	return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there.  Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork,
+	int		size)
+{
+	xfs_ifork_t	*ifp;
+	int		real_size;
+
+	/*
+	 * If the size is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or memcpy() below.
+	 */
+	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+		xfs_warn(ip->i_mount,
+	"corrupt inode %Lu (bad size %d for local fork, size = %d).",
+			(unsigned long long) ip->i_ino, size,
+			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return EFSCORRUPTED;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	real_size = 0;
+	if (size == 0)
+		ifp->if_u1.if_data = NULL;
+	else if (size <= sizeof(ifp->if_u2.if_inline_data))
+		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+	else {
+		real_size = roundup(size, 4);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+	}
+	ifp->if_bytes = size;
+	ifp->if_real_bytes = real_size;
+	if (size)
+		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFINLINE;
+	return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it.  Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork)
+{
+	xfs_bmbt_rec_t	*dp;
+	xfs_ifork_t	*ifp;
+	int		nex;
+	int		size;
+	int		i;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+	size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+	/*
+	 * If the number of extents is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or memcpy() below.
+	 */
+	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+			(unsigned long long) ip->i_ino, nex);
+		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+				     ip->i_mount, dip);
+		return EFSCORRUPTED;
+	}
+
+	ifp->if_real_bytes = 0;
+	if (nex == 0)
+		ifp->if_u1.if_extents = NULL;
+	else if (nex <= XFS_INLINE_EXTS)
+		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+	else
+		xfs_iext_add(ifp, 0, nex);
+
+	ifp->if_bytes = size;
+	if (size) {
+		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
+		for (i = 0; i < nex; i++, dp++) {
+			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+			ep->l0 = get_unaligned_be64(&dp->l0);
+			ep->l1 = get_unaligned_be64(&dp->l1);
+		}
+		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+		if (whichfork != XFS_DATA_FORK ||
+			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
+				if (unlikely(xfs_check_nostate_extents(
+				    ifp, 0, nex))) {
+					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+							 XFS_ERRLEVEL_LOW,
+							 ip->i_mount);
+					return EFSCORRUPTED;
+				}
+	}
+	ifp->if_flags |= XFS_IFEXTENTS;
+	return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it.  The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	int			whichfork)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_bmdr_block_t	*dfp;
+	xfs_ifork_t		*ifp;
+	/* REFERENCED */
+	int			nrecs;
+	int			size;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+	nrecs = be16_to_cpu(dfp->bb_numrecs);
+
+	/*
+	 * blow out if -- fork has less extents than can fit in
+	 * fork (fork shouldn't be a btree format), root btree
+	 * block has more records than can fit into the fork,
+	 * or the number of extents is greater than the number of
+	 * blocks.
+	 */
+	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+					XFS_IFORK_MAXEXT(ip, whichfork) ||
+		     XFS_BMDR_SPACE_CALC(nrecs) >
+					XFS_DFORK_SIZE(dip, mp, whichfork) ||
+		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+		xfs_warn(mp, "corrupt inode %Lu (btree).",
+					(unsigned long long) ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+					 mp, dip);
+		return EFSCORRUPTED;
+	}
+
+	ifp->if_broot_bytes = size;
+	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+	ASSERT(ifp->if_broot != NULL);
+	/*
+	 * Copy and convert from the on-disk structure
+	 * to the in-memory structure.
+	 */
+	xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+			 ifp->if_broot, size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFBROOT;
+
+	return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	int		error;
+	xfs_ifork_t	*ifp;
+	xfs_extnum_t	nextents;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+				 ip->i_mount);
+		return EFSCORRUPTED;
+	}
+	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	/*
+	 * We know that the size is valid (it's checked in iformat_btree)
+	 */
+	ifp->if_bytes = ifp->if_real_bytes = 0;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	xfs_iext_add(ifp, 0, nextents);
+	error = xfs_bmap_read_extents(tp, ip, whichfork);
+	if (error) {
+		xfs_iext_destroy(ifp);
+		ifp->if_flags &= ~XFS_IFEXTENTS;
+		return error;
+	}
+	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+	return 0;
+}
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.  When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated.  The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ *	 requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+	xfs_inode_t		*ip,
+	int			rec_diff,
+	int			whichfork)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			cur_max;
+	xfs_ifork_t		*ifp;
+	struct xfs_btree_block	*new_broot;
+	int			new_max;
+	size_t			new_size;
+	char			*np;
+	char			*op;
+
+	/*
+	 * Handle the degenerate case quietly.
+	 */
+	if (rec_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (rec_diff > 0) {
+		/*
+		 * If there wasn't any memory allocated before, just
+		 * allocate it now and get out.
+		 */
+		if (ifp->if_broot_bytes == 0) {
+			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+			ifp->if_broot_bytes = (int)new_size;
+			return;
+		}
+
+		/*
+		 * If there is already an existing if_broot, then we need
+		 * to realloc() it and shift the pointers to their new
+		 * location.  The records don't change location because
+		 * they are kept butted up against the btree block header.
+		 */
+		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+		new_max = cur_max + rec_diff;
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+				XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+				KM_SLEEP | KM_NOFS);
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+						     (int)new_size);
+		ifp->if_broot_bytes = (int)new_size;
+		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			XFS_IFORK_SIZE(ip, whichfork));
+		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+		return;
+	}
+
+	/*
+	 * rec_diff is less than 0.  In this case, we are shrinking the
+	 * if_broot buffer.  It must already exist.  If we go to zero
+	 * records, just get rid of the root and clear the status bit.
+	 */
+	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+	new_max = cur_max + rec_diff;
+	ASSERT(new_max >= 0);
+	if (new_max > 0)
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+	else
+		new_size = 0;
+	if (new_size > 0) {
+		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+		/*
+		 * First copy over the btree block header.
+		 */
+		memcpy(new_broot, ifp->if_broot,
+			XFS_BMBT_BLOCK_LEN(ip->i_mount));
+	} else {
+		new_broot = NULL;
+		ifp->if_flags &= ~XFS_IFBROOT;
+	}
+
+	/*
+	 * Only copy the records and pointers if there are any.
+	 */
+	if (new_max > 0) {
+		/*
+		 * First copy the records.
+		 */
+		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+		/*
+		 * Then copy the pointers.
+		 */
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+						     (int)new_size);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+	}
+	kmem_free(ifp->if_broot);
+	ifp->if_broot = new_broot;
+	ifp->if_broot_bytes = (int)new_size;
+	if (ifp->if_broot)
+		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			XFS_IFORK_SIZE(ip, whichfork));
+	return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased.  The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *	 requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+	xfs_inode_t	*ip,
+	int		byte_diff,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+	int		new_size;
+	int		real_size;
+
+	if (byte_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	new_size = (int)ifp->if_bytes + byte_diff;
+	ASSERT(new_size >= 0);
+
+	if (new_size == 0) {
+		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			kmem_free(ifp->if_u1.if_data);
+		}
+		ifp->if_u1.if_data = NULL;
+		real_size = 0;
+	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+		/*
+		 * If the valid extents/data can fit in if_inline_ext/data,
+		 * copy them from the malloc'd vector and free it.
+		 */
+		if (ifp->if_u1.if_data == NULL) {
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			ASSERT(ifp->if_real_bytes != 0);
+			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+			      new_size);
+			kmem_free(ifp->if_u1.if_data);
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		}
+		real_size = 0;
+	} else {
+		/*
+		 * Stuck with malloc/realloc.
+		 * For inline data, the underlying buffer must be
+		 * a multiple of 4 bytes in size so that it can be
+		 * logged and stay on word boundaries.  We enforce
+		 * that here.
+		 */
+		real_size = roundup(new_size, 4);
+		if (ifp->if_u1.if_data == NULL) {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			/*
+			 * Only do the realloc if the underlying size
+			 * is really changing.
+			 */
+			if (ifp->if_real_bytes != real_size) {
+				ifp->if_u1.if_data =
+					kmem_realloc(ifp->if_u1.if_data,
+							real_size,
+							ifp->if_real_bytes,
+							KM_SLEEP | KM_NOFS);
+			}
+		} else {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
+			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+				ifp->if_bytes);
+		}
+	}
+	ifp->if_real_bytes = real_size;
+	ifp->if_bytes = new_size;
+	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+void
+xfs_idestroy_fork(
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (ifp->if_broot != NULL) {
+		kmem_free(ifp->if_broot);
+		ifp->if_broot = NULL;
+	}
+
+	/*
+	 * If the format is local, then we can't have an extents
+	 * array so just look for an inline data array.  If we're
+	 * not local then we may or may not have an extents list,
+	 * so check and free it up if we do.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+		    (ifp->if_u1.if_data != NULL)) {
+			ASSERT(ifp->if_real_bytes != 0);
+			kmem_free(ifp->if_u1.if_data);
+			ifp->if_u1.if_data = NULL;
+			ifp->if_real_bytes = 0;
+		}
+	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+		   ((ifp->if_flags & XFS_IFEXTIREC) ||
+		    ((ifp->if_u1.if_extents != NULL) &&
+		     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+		ASSERT(ifp->if_real_bytes != 0);
+		xfs_iext_destroy(ifp);
+	}
+	ASSERT(ifp->if_u1.if_extents == NULL ||
+	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+	ASSERT(ifp->if_real_bytes == 0);
+	if (whichfork == XFS_ATTR_FORK) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+	}
+}
+
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode.
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine.  We will return the size actually
+ * used.
+ */
+int
+xfs_iextents_copy(
+	xfs_inode_t		*ip,
+	xfs_bmbt_rec_t		*dp,
+	int			whichfork)
+{
+	int			copied;
+	int			i;
+	xfs_ifork_t		*ifp;
+	int			nrecs;
+	xfs_fsblock_t		start_block;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	ASSERT(ifp->if_bytes > 0);
+
+	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+	ASSERT(nrecs > 0);
+
+	/*
+	 * There are some delayed allocation extents in the
+	 * inode, so copy the extents one at a time and skip
+	 * the delayed ones.  There must be at least one
+	 * non-delayed extent.
+	 */
+	copied = 0;
+	for (i = 0; i < nrecs; i++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+		start_block = xfs_bmbt_get_startblock(ep);
+		if (isnullstartblock(start_block)) {
+			/*
+			 * It's a delayed allocation extent, so skip it.
+			 */
+			continue;
+		}
+
+		/* Translate to on disk format */
+		put_unaligned_be64(ep->l0, &dp->l0);
+		put_unaligned_be64(ep->l1, &dp->l1);
+		dp++;
+		copied++;
+	}
+	ASSERT(copied != 0);
+	xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	xfs_inode_log_item_t	*iip,
+	int			whichfork)
+{
+	char			*cp;
+	xfs_ifork_t		*ifp;
+	xfs_mount_t		*mp;
+	static const short	brootflag[2] =
+		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+	static const short	dataflag[2] =
+		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+	static const short	extflag[2] =
+		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+	if (!iip)
+		return;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/*
+	 * This can happen if we gave up in iformat in an error path,
+	 * for the attribute fork.
+	 */
+	if (!ifp) {
+		ASSERT(whichfork == XFS_ATTR_FORK);
+		return;
+	}
+	cp = XFS_DFORK_PTR(dip, whichfork);
+	mp = ip->i_mount;
+	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+	case XFS_DINODE_FMT_LOCAL:
+		if ((iip->ili_fields & dataflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(ifp->if_u1.if_data != NULL);
+			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+		}
+		break;
+
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+		       !(iip->ili_fields & extflag[whichfork]));
+		if ((iip->ili_fields & extflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(xfs_iext_get_ext(ifp, 0));
+			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+				whichfork);
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		if ((iip->ili_fields & brootflag[whichfork]) &&
+		    (ifp->if_broot_bytes > 0)) {
+			ASSERT(ifp->if_broot != NULL);
+			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			        XFS_IFORK_SIZE(ip, whichfork));
+			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+				(xfs_bmdr_block_t *)cp,
+				XFS_DFORK_SIZE(dip, mp, whichfork));
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		if (iip->ili_fields & XFS_ILOG_DEV) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+		}
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		if (iip->ili_fields & XFS_ILOG_UUID) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			memcpy(XFS_DFORK_DPTR(dip),
+			       &ip->i_df.if_u2.if_uuid,
+			       sizeof(uuid_t));
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * Return a pointer to the extent record at file index idx.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx)		/* index of target extent */
+{
+	ASSERT(idx >= 0);
+	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
+		return ifp->if_u1.if_ext_irec->er_extbuf;
+	} else if (ifp->if_flags & XFS_IFEXTIREC) {
+		xfs_ext_irec_t	*erp;		/* irec pointer */
+		int		erp_idx = 0;	/* irec index */
+		xfs_extnum_t	page_idx = idx;	/* ext index in target list */
+
+		erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+		return &erp->er_extbuf[page_idx];
+	} else if (ifp->if_bytes) {
+		return &ifp->if_u1.if_extents[idx];
+	} else {
+		return NULL;
+	}
+}
+
+/*
+ * Insert new item(s) into the extent records for incore inode
+ * fork 'ifp'.  'count' new items are inserted at index 'idx'.
+ */
+void
+xfs_iext_insert(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting index of new items */
+	xfs_extnum_t	count,		/* number of inserted items */
+	xfs_bmbt_irec_t	*new,		/* items to insert */
+	int		state)		/* type of extent conversion */
+{
+	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_extnum_t	i;		/* extent record index */
+
+	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	xfs_iext_add(ifp, idx, count);
+	for (i = idx; i < idx + count; i++, new++)
+		xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ */
+void
+xfs_iext_add(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin adding exts */
+	int		ext_diff)	/* number of extents to add */
+{
+	int		byte_diff;	/* new bytes being added */
+	int		new_size;	/* size of extents after adding */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT((idx >= 0) && (idx <= nextents));
+	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+	new_size = ifp->if_bytes + byte_diff;
+	/*
+	 * If the new number of extents (nextents + ext_diff)
+	 * fits inside the inode, then continue to use the inline
+	 * extent buffer.
+	 */
+	if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+		if (idx < nextents) {
+			memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+				&ifp->if_u2.if_inline_ext[idx],
+				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
+			memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+		}
+		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+		ifp->if_real_bytes = 0;
+	}
+	/*
+	 * Otherwise use a linear (direct) extent list.
+	 * If the extents are currently inside the inode,
+	 * xfs_iext_realloc_direct will switch us from
+	 * inline to direct extent allocation mode.
+	 */
+	else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+		xfs_iext_realloc_direct(ifp, new_size);
+		if (idx < nextents) {
+			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+				&ifp->if_u1.if_extents[idx],
+				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
+			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+		}
+	}
+	/* Indirection array */
+	else {
+		xfs_ext_irec_t	*erp;
+		int		erp_idx = 0;
+		int		page_idx = idx;
+
+		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+		if (ifp->if_flags & XFS_IFEXTIREC) {
+			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+		} else {
+			xfs_iext_irec_init(ifp);
+			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+			erp = ifp->if_u1.if_ext_irec;
+		}
+		/* Extents fit in target extent page */
+		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+			if (page_idx < erp->er_extcount) {
+				memmove(&erp->er_extbuf[page_idx + ext_diff],
+					&erp->er_extbuf[page_idx],
+					(erp->er_extcount - page_idx) *
+					sizeof(xfs_bmbt_rec_t));
+				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+			}
+			erp->er_extcount += ext_diff;
+			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		}
+		/* Insert a new extent page */
+		else if (erp) {
+			xfs_iext_add_indirect_multi(ifp,
+				erp_idx, page_idx, ext_diff);
+		}
+		/*
+		 * If extent(s) are being appended to the last page in
+		 * the indirection array and the new extent(s) don't fit
+		 * in the page, then erp is NULL and erp_idx is set to
+		 * the next index needed in the indirection array.
+		 */
+		else {
+			uint	count = ext_diff;
+
+			while (count) {
+				erp = xfs_iext_irec_new(ifp, erp_idx);
+				erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+				count -= erp->er_extcount;
+				if (count)
+					erp_idx++;
+			}
+		}
+	}
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ *    |-------|   |-------|
+ *    |       |   |       |    idx - number of extents before idx
+ *    |  idx  |   | count |
+ *    |       |   |       |    count - number of extents being inserted at idx
+ *    |-------|   |-------|
+ *    | count |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+	xfs_ifork_t	*ifp,			/* inode fork pointer */
+	int		erp_idx,		/* target extent irec index */
+	xfs_extnum_t	idx,			/* index within target list */
+	int		count)			/* new extents being added */
+{
+	int		byte_diff;		/* new bytes being added */
+	xfs_ext_irec_t	*erp;			/* pointer to irec entry */
+	xfs_extnum_t	ext_diff;		/* number of extents to add */
+	xfs_extnum_t	ext_cnt;		/* new extents still needed */
+	xfs_extnum_t	nex2;			/* extents after idx + count */
+	xfs_bmbt_rec_t	*nex2_ep = NULL;	/* temp list for nex2 extents */
+	int		nlists;			/* number of irec's (lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	erp = &ifp->if_u1.if_ext_irec[erp_idx];
+	nex2 = erp->er_extcount - idx;
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+	/*
+	 * Save second part of target extent list
+	 * (all extents past */
+	if (nex2) {
+		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+		erp->er_extcount -= nex2;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+		memset(&erp->er_extbuf[idx], 0, byte_diff);
+	}
+
+	/*
+	 * Add the new extents to the end of the target
+	 * list, then allocate new irec record(s) and
+	 * extent buffer(s) as needed to store the rest
+	 * of the new extents.
+	 */
+	ext_cnt = count;
+	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+	if (ext_diff) {
+		erp->er_extcount += ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		ext_cnt -= ext_diff;
+	}
+	while (ext_cnt) {
+		erp_idx++;
+		erp = xfs_iext_irec_new(ifp, erp_idx);
+		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+		erp->er_extcount = ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		ext_cnt -= ext_diff;
+	}
+
+	/* Add nex2 extents back to indirection array */
+	if (nex2) {
+		xfs_extnum_t	ext_avail;
+		int		i;
+
+		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+		i = 0;
+		/*
+		 * If nex2 extents fit in the current page, append
+		 * nex2_ep after the new extents.
+		 */
+		if (nex2 <= ext_avail) {
+			i = erp->er_extcount;
+		}
+		/*
+		 * Otherwise, check if space is available in the
+		 * next page.
+		 */
+		else if ((erp_idx < nlists - 1) &&
+			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+			erp_idx++;
+			erp++;
+			/* Create a hole for nex2 extents */
+			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+		}
+		/*
+		 * Final choice, create a new extent page for
+		 * nex2 extents.
+		 */
+		else {
+			erp_idx++;
+			erp = xfs_iext_irec_new(ifp, erp_idx);
+		}
+		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+		kmem_free(nex2_ep);
+		erp->er_extcount += nex2;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+	}
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array.  Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ */
+void
+xfs_iext_remove(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index to begin removing exts */
+	int		ext_diff,	/* number of extents to remove */
+	int		state)		/* type of extent conversion */
+{
+	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		new_size;	/* size of extents after removal */
+
+	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+	ASSERT(ext_diff > 0);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+	} else if (ifp->if_flags & XFS_IFEXTIREC) {
+		xfs_iext_remove_indirect(ifp, idx, ext_diff);
+	} else if (ifp->if_real_bytes) {
+		xfs_iext_remove_direct(ifp, idx, ext_diff);
+	} else {
+		xfs_iext_remove_inline(ifp, idx, ext_diff);
+	}
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ */
+void
+xfs_iext_remove_inline(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin removing exts */
+	int		ext_diff)	/* number of extents to remove */
+{
+	int		nextents;	/* number of extents in file */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	ASSERT(idx < XFS_INLINE_EXTS);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(((nextents - ext_diff) > 0) &&
+		(nextents - ext_diff) < XFS_INLINE_EXTS);
+
+	if (idx + ext_diff < nextents) {
+		memmove(&ifp->if_u2.if_inline_ext[idx],
+			&ifp->if_u2.if_inline_ext[idx + ext_diff],
+			(nextents - (idx + ext_diff)) *
+			 sizeof(xfs_bmbt_rec_t));
+		memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+			0, ext_diff * sizeof(xfs_bmbt_rec_t));
+	} else {
+		memset(&ifp->if_u2.if_inline_ext[idx], 0,
+			ext_diff * sizeof(xfs_bmbt_rec_t));
+	}
+}
+
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx. If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ */
+void
+xfs_iext_remove_direct(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin removing exts */
+	int		ext_diff)	/* number of extents to remove */
+{
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		new_size;	/* size of extents after removal */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	new_size = ifp->if_bytes -
+		(ext_diff * sizeof(xfs_bmbt_rec_t));
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+		return;
+	}
+	/* Move extents up in the list (if needed) */
+	if (idx + ext_diff < nextents) {
+		memmove(&ifp->if_u1.if_extents[idx],
+			&ifp->if_u1.if_extents[idx + ext_diff],
+			(nextents - (idx + ext_diff)) *
+			 sizeof(xfs_bmbt_rec_t));
+	}
+	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+		0, ext_diff * sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Reallocate the direct extent list. If the extents
+	 * will fit inside the inode then xfs_iext_realloc_direct
+	 * will switch from direct to inline extent allocation
+	 * mode for us.
+	 */
+	xfs_iext_realloc_direct(ifp, new_size);
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ *    |-------|   |-------|
+ *    | nex1  |   |       |    nex1 - number of extents before idx
+ *    |-------|   | count |
+ *    |       |   |       |    count - number of extents being removed at idx
+ *    | count |   |-------|
+ *    |       |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_remove_indirect(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	idx,		/* index to begin removing extents */
+	int		count)		/* number of extents to remove */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		erp_idx = 0;	/* indirection array index */
+	xfs_extnum_t	ext_cnt;	/* extents left to remove */
+	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
+	xfs_extnum_t	nex1;		/* number of extents before idx */
+	xfs_extnum_t	nex2;		/* extents after idx + count */
+	int		page_idx = idx;	/* index in target extent list */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
+	ASSERT(erp != NULL);
+	nex1 = page_idx;
+	ext_cnt = count;
+	while (ext_cnt) {
+		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+		/*
+		 * Check for deletion of entire list;
+		 * xfs_iext_irec_remove() updates extent offsets.
+		 */
+		if (ext_diff == erp->er_extcount) {
+			xfs_iext_irec_remove(ifp, erp_idx);
+			ext_cnt -= ext_diff;
+			nex1 = 0;
+			if (ext_cnt) {
+				ASSERT(erp_idx < ifp->if_real_bytes /
+					XFS_IEXT_BUFSZ);
+				erp = &ifp->if_u1.if_ext_irec[erp_idx];
+				nex1 = 0;
+				continue;
+			} else {
+				break;
+			}
+		}
+		/* Move extents up (if needed) */
+		if (nex2) {
+			memmove(&erp->er_extbuf[nex1],
+				&erp->er_extbuf[nex1 + ext_diff],
+				nex2 * sizeof(xfs_bmbt_rec_t));
+		}
+		/* Zero out rest of page */
+		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+		/* Update remaining counters */
+		erp->er_extcount -= ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+		ext_cnt -= ext_diff;
+		nex1 = 0;
+		erp_idx++;
+		erp++;
+	}
+	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+	xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ */
+void
+xfs_iext_realloc_direct(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		new_size)	/* new size of extents after adding */
+{
+	int		rnew_size;	/* real new size of extents */
+
+	rnew_size = new_size;
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+		((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+		 (new_size != ifp->if_real_bytes)));
+
+	/* Free extent records */
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+	}
+	/* Resize direct extent list and zero any new bytes */
+	else if (ifp->if_real_bytes) {
+		/* Check if extents will fit inside the inode */
+		if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+			xfs_iext_direct_to_inline(ifp, new_size /
+				(uint)sizeof(xfs_bmbt_rec_t));
+			ifp->if_bytes = new_size;
+			return;
+		}
+		if (!is_power_of_2(new_size)){
+			rnew_size = roundup_pow_of_two(new_size);
+		}
+		if (rnew_size != ifp->if_real_bytes) {
+			ifp->if_u1.if_extents =
+				kmem_realloc(ifp->if_u1.if_extents,
+						rnew_size,
+						ifp->if_real_bytes, KM_NOFS);
+		}
+		if (rnew_size > ifp->if_real_bytes) {
+			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t)], 0,
+				rnew_size - ifp->if_real_bytes);
+		}
+	}
+	/* Switch from the inline extent buffer to a direct extent list */
+	else {
+		if (!is_power_of_2(new_size)) {
+			rnew_size = roundup_pow_of_two(new_size);
+		}
+		xfs_iext_inline_to_direct(ifp, rnew_size);
+	}
+	ifp->if_real_bytes = rnew_size;
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ */
+void
+xfs_iext_direct_to_inline(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	nextents)	/* number of extents in file */
+{
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(nextents <= XFS_INLINE_EXTS);
+	/*
+	 * The inline buffer was zeroed when we switched
+	 * from inline to direct extent allocation mode,
+	 * so we don't need to clear it here.
+	 */
+	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+		nextents * sizeof(xfs_bmbt_rec_t));
+	kmem_free(ifp->if_u1.if_extents);
+	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+	ifp->if_real_bytes = 0;
+}
+
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ */
+void
+xfs_iext_inline_to_direct(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		new_size)	/* number of extents in file */
+{
+	ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+	memset(ifp->if_u1.if_extents, 0, new_size);
+	if (ifp->if_bytes) {
+		memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+			ifp->if_bytes);
+		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+			sizeof(xfs_bmbt_rec_t));
+	}
+	ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Resize an extent indirection array to new_size bytes.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		new_size)	/* new indirection array size */
+{
+	int		nlists;		/* number of irec's (ex lists) */
+	int		size;		/* current indirection array size */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	size = nlists * sizeof(xfs_ext_irec_t);
+	ASSERT(ifp->if_real_bytes);
+	ASSERT((new_size >= 0) && (new_size != size));
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+	} else {
+		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+			kmem_realloc(ifp->if_u1.if_ext_irec,
+				new_size, size, KM_NOFS);
+	}
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+	 xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		size;		/* size of file extents */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(nextents <= XFS_LINEAR_EXTS);
+	size = nextents * sizeof(xfs_bmbt_rec_t);
+
+	xfs_iext_irec_compact_pages(ifp);
+	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+	ep = ifp->if_u1.if_ext_irec->er_extbuf;
+	kmem_free(ifp->if_u1.if_ext_irec);
+	ifp->if_flags &= ~XFS_IFEXTIREC;
+	ifp->if_u1.if_extents = ep;
+	ifp->if_bytes = size;
+	if (nextents < XFS_LINEAR_EXTS) {
+		xfs_iext_realloc_direct(ifp, size);
+	}
+}
+
+/*
+ * Free incore file extents.
+ */
+void
+xfs_iext_destroy(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		int	erp_idx;
+		int	nlists;
+
+		nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+		for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+			xfs_iext_irec_remove(ifp, erp_idx);
+		}
+		ifp->if_flags &= ~XFS_IFEXTIREC;
+	} else if (ifp->if_real_bytes) {
+		kmem_free(ifp->if_u1.if_extents);
+	} else if (ifp->if_bytes) {
+		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+			sizeof(xfs_bmbt_rec_t));
+	}
+	ifp->if_u1.if_extents = NULL;
+	ifp->if_real_bytes = 0;
+	ifp->if_bytes = 0;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ */
+xfs_bmbt_rec_host_t *			/* pointer to found extent record */
+xfs_iext_bno_to_ext(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fileoff_t	bno,		/* block number to search for */
+	xfs_extnum_t	*idxp)		/* index of target extent */
+{
+	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
+	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
+	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
+	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
+	int		high;		/* upper boundary in search */
+	xfs_extnum_t	idx = 0;	/* index of target extent */
+	int		low;		/* lower boundary in search */
+	xfs_extnum_t	nextents;	/* number of file extents */
+	xfs_fileoff_t	startoff = 0;	/* start offset of extent */
+
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*idxp = 0;
+		return NULL;
+	}
+	low = 0;
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		/* Find target extent list */
+		int	erp_idx = 0;
+		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+		base = erp->er_extbuf;
+		high = erp->er_extcount - 1;
+	} else {
+		base = ifp->if_u1.if_extents;
+		high = nextents - 1;
+	}
+	/* Binary search extent records */
+	while (low <= high) {
+		idx = (low + high) >> 1;
+		ep = base + idx;
+		startoff = xfs_bmbt_get_startoff(ep);
+		blockcount = xfs_bmbt_get_blockcount(ep);
+		if (bno < startoff) {
+			high = idx - 1;
+		} else if (bno >= startoff + blockcount) {
+			low = idx + 1;
+		} else {
+			/* Convert back to file-based extent index */
+			if (ifp->if_flags & XFS_IFEXTIREC) {
+				idx += erp->er_extoff;
+			}
+			*idxp = idx;
+			return ep;
+		}
+	}
+	/* Convert back to file-based extent index */
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		idx += erp->er_extoff;
+	}
+	if (bno >= startoff + blockcount) {
+		if (++idx == nextents) {
+			ep = NULL;
+		} else {
+			ep = xfs_iext_get_ext(ifp, idx);
+		}
+	}
+	*idxp = idx;
+	return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ */
+xfs_ext_irec_t *			/* pointer to found extent record */
+xfs_iext_bno_to_irec(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fileoff_t	bno,		/* block number to search for */
+	int		*erp_idxp)	/* irec index of target ext list */
+{
+	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
+	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
+	int		erp_idx;	/* indirection array index */
+	int		nlists;		/* number of extent irec's (lists) */
+	int		high;		/* binary search upper limit */
+	int		low;		/* binary search lower limit */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp_idx = 0;
+	low = 0;
+	high = nlists - 1;
+	while (low <= high) {
+		erp_idx = (low + high) >> 1;
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+			high = erp_idx - 1;
+		} else if (erp_next && bno >=
+			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+			low = erp_idx + 1;
+		} else {
+			break;
+		}
+	}
+	*erp_idxp = erp_idx;
+	return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
+	int		*erp_idxp,	/* pointer to target irec */
+	int		realloc)	/* new bytes were just added */
+{
+	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
+	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
+	int		erp_idx;	/* indirection array index */
+	int		nlists;		/* number of irec's (ex lists) */
+	int		high;		/* binary search upper limit */
+	int		low;		/* binary search lower limit */
+	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	ASSERT(page_idx >= 0);
+	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp_idx = 0;
+	low = 0;
+	high = nlists - 1;
+
+	/* Binary search extent irec's */
+	while (low <= high) {
+		erp_idx = (low + high) >> 1;
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		prev = erp_idx > 0 ? erp - 1 : NULL;
+		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+		     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+			high = erp_idx - 1;
+		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
+			   (page_idx == erp->er_extoff + erp->er_extcount &&
+			    !realloc)) {
+			low = erp_idx + 1;
+		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
+			   erp->er_extcount == XFS_LINEAR_EXTS) {
+			ASSERT(realloc);
+			page_idx = 0;
+			erp_idx++;
+			erp = erp_idx < nlists ? erp + 1 : NULL;
+			break;
+		} else {
+			page_idx -= erp->er_extoff;
+			break;
+		}
+	}
+	*idxp = page_idx;
+	*erp_idxp = erp_idx;
+	return erp;
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ */
+void
+xfs_iext_irec_init(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+	if (nextents == 0) {
+		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+	} else if (!ifp->if_real_bytes) {
+		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+	}
+	erp->er_extbuf = ifp->if_u1.if_extents;
+	erp->er_extcount = nextents;
+	erp->er_extoff = 0;
+
+	ifp->if_flags |= XFS_IFEXTIREC;
+	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+	ifp->if_u1.if_ext_irec = erp;
+
+	return;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx)	/* index for new irec */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+	/* Resize indirection array */
+	xfs_iext_realloc_indirect(ifp, ++nlists *
+				  sizeof(xfs_ext_irec_t));
+	/*
+	 * Move records down in the array so the
+	 * new page can use erp_idx.
+	 */
+	erp = ifp->if_u1.if_ext_irec;
+	for (i = nlists - 1; i > erp_idx; i--) {
+		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+	}
+	ASSERT(i == erp_idx);
+
+	/* Initialize new extent record */
+	erp = ifp->if_u1.if_ext_irec;
+	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+	erp[erp_idx].er_extcount = 0;
+	erp[erp_idx].er_extoff = erp_idx > 0 ?
+		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+	return (&erp[erp_idx]);
+}
+
+/*
+ * Remove a record from the indirection array.
+ */
+void
+xfs_iext_irec_remove(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx)	/* irec index to remove */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp = &ifp->if_u1.if_ext_irec[erp_idx];
+	if (erp->er_extbuf) {
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+			-erp->er_extcount);
+		kmem_free(erp->er_extbuf);
+	}
+	/* Compact extent records */
+	erp = ifp->if_u1.if_ext_irec;
+	for (i = erp_idx; i < nlists - 1; i++) {
+		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+	}
+	/*
+	 * Manually free the last extent record from the indirection
+	 * array.  A call to xfs_iext_realloc_indirect() with a size
+	 * of zero would result in a call to xfs_iext_destroy() which
+	 * would in turn call this function again, creating a nasty
+	 * infinite loop.
+	 */
+	if (--nlists) {
+		xfs_iext_realloc_indirect(ifp,
+			nlists * sizeof(xfs_ext_irec_t));
+	} else {
+		kmem_free(ifp->if_u1.if_ext_irec);
+	}
+	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array.  Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible.  The
+ * compaction policy is as follows:
+ *
+ *    Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ *      No Compaction: Extents occupy at least 50% of allocated space
+ */
+void
+xfs_iext_irec_compact(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+	if (nextents == 0) {
+		xfs_iext_destroy(ifp);
+	} else if (nextents <= XFS_INLINE_EXTS) {
+		xfs_iext_indirect_to_direct(ifp);
+		xfs_iext_direct_to_inline(ifp, nextents);
+	} else if (nextents <= XFS_LINEAR_EXTS) {
+		xfs_iext_indirect_to_direct(ifp);
+	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
+		xfs_iext_irec_compact_pages(ifp);
+	}
+}
+
+/*
+ * Combine extents from neighboring extent pages.
+ */
+void
+xfs_iext_irec_compact_pages(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
+	int		erp_idx = 0;	/* indirection array index */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	while (erp_idx < nlists - 1) {
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		erp_next = erp + 1;
+		if (erp_next->er_extcount <=
+		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
+			memcpy(&erp->er_extbuf[erp->er_extcount],
+				erp_next->er_extbuf, erp_next->er_extcount *
+				sizeof(xfs_bmbt_rec_t));
+			erp->er_extcount += erp_next->er_extcount;
+			/*
+			 * Free page before removing extent record
+			 * so er_extoffs don't get modified in
+			 * xfs_iext_irec_remove.
+			 */
+			kmem_free(erp_next->er_extbuf);
+			erp_next->er_extbuf = NULL;
+			xfs_iext_irec_remove(ifp, erp_idx + 1);
+			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+		} else {
+			erp_idx++;
+		}
+	}
+}
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ */
+void
+xfs_iext_irec_update_extoffs(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx,	/* irec index to update */
+	int		ext_diff)	/* number of new extents */
+{
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	for (i = erp_idx; i < nlists; i++) {
+		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+	}
+}
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
new file mode 100644
index 000000000000..ee7e0e80246b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_space.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_bmap_btree.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value as large attributes out of line are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+	struct xfs_mount	*mp)
+{
+	int			size;
+	int			nblks;
+
+	size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
+	       MAXNAMELEN - 1;
+	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+	nblks += XFS_B_TO_FSB(mp, size);
+	nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+	return  M_RES(mp)->tr_attrsetm.tr_logres +
+		M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Iterate over the log space reservation table to figure out and return
+ * the maximum one in terms of the pre-calculated values which were done
+ * at mount time.
+ */
+STATIC void
+xfs_log_get_max_trans_res(
+	struct xfs_mount	*mp,
+	struct xfs_trans_res	*max_resp)
+{
+	struct xfs_trans_res	*resp;
+	struct xfs_trans_res	*end_resp;
+	int			log_space = 0;
+	int			attr_space;
+
+	attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+	resp = (struct xfs_trans_res *)M_RES(mp);
+	end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
+	for (; resp < end_resp; resp++) {
+		int		tmp = resp->tr_logcount > 1 ?
+				      resp->tr_logres * resp->tr_logcount :
+				      resp->tr_logres;
+		if (log_space < tmp) {
+			log_space = tmp;
+			*max_resp = *resp;		/* struct copy */
+		}
+	}
+
+	if (attr_space > log_space) {
+		*max_resp = M_RES(mp)->tr_attrsetm;	/* struct copy */
+		max_resp->tr_logres = attr_space;
+	}
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+	struct xfs_mount	*mp)
+{
+	struct xfs_trans_res	tres = {0};
+	int			max_logres;
+	int			min_logblks = 0;
+	int			lsunit = 0;
+
+	xfs_log_get_max_trans_res(mp, &tres);
+
+	max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+	if (tres.tr_logcount > 1)
+		max_logres *= tres.tr_logcount;
+
+	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+		lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+	/*
+	 * Two factors should be taken into account for calculating the minimum
+	 * log space.
+	 * 1) The fundamental limitation is that no single transaction can be
+	 *    larger than half size of the log.
+	 *
+	 *    From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+	 *    define, which is set to 3. That means we can definitely fit
+	 *    maximally sized 2 transactions in the log. We'll use this same
+	 *    value here.
+	 *
+	 * 2) If the lsunit option is specified, a transaction requires 2 LSU
+	 *    for the reservation because there are two log writes that can
+	 *    require padding - the transaction data and the commit record which
+	 *    are written separately and both can require padding to the LSU.
+	 *    Consider that we can have an active CIL reservation holding 2*LSU,
+	 *    but the CIL is not over a push threshold, in this case, if we
+	 *    don't have enough log space for at one new transaction, which
+	 *    includes another 2*LSU in the reservation, we will run into dead
+	 *    loop situation in log space grant procedure. i.e.
+	 *    xlog_grant_head_wait().
+	 *
+	 *    Hence the log size needs to be able to contain two maximally sized
+	 *    and padded transactions, which is (2 * (2 * LSU + maxlres)).
+	 *
+	 * Also, the log size should be a multiple of the log stripe unit, round
+	 * it up to lsunit boundary if lsunit is specified.
+	 */
+	if (lsunit) {
+		min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+			      2 * lsunit;
+	} else
+		min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+	min_logblks *= XFS_MIN_LOG_FACTOR;
+
+	return XFS_BB_TO_FSB(mp, min_logblks);
+}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
new file mode 100644
index 000000000000..f4dd697cac08
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
+
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	block,		/* block number in bitmap or summary */
+	int		issum,		/* is summary not bitmap */
+	xfs_buf_t	**bpp)		/* output: buffer for the block */
+{
+	xfs_buf_t	*bp;		/* block buffer, result */
+	xfs_inode_t	*ip;		/* bitmap or summary inode */
+	xfs_bmbt_irec_t	map;
+	int		nmap = 1;
+	int		error;		/* error value */
+
+	ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+	error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	ASSERT(map.br_startblock != NULLFSBLOCK);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
+				   mp->m_bsize, 0, &bp, NULL);
+	if (error)
+		return error;
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	firstbit;	/* first useful bit in the word */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = start - limit + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit < XFS_NBWORD - 1) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+		mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+			firstbit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = bit - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i = bit - firstbit + 1;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = bp->b_addr;
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the previous one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = bp->b_addr;
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if (len - i) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_NBWORD - (len - i);
+		mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start - i + 1;
+	return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in the word */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = limit - start + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit) {
+		/*
+		 * Calculate last (rightmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Calculate mask for all the relevant bits in this word.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start + i - 1;
+	return 0;
+}
+
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+int
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_buf_t	*bp;		/* buffer for the summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
+
+	/*
+	 * Compute entry number in the summary file.
+	 */
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (rbpp && *rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
+		/*
+		 * If there was an old one, get rid of it first.
+		 */
+		if (rbpp && *rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
+		}
+		/*
+		 * Remember this buffer and block for the next call.
+		 */
+		if (rbpp) {
+			*rbpp = bp;
+			*rsb = sb;
+		}
+	}
+	/*
+	 * Point to the summary information, modify and log it.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	*sp += delta;
+	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+	return 0;
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to modify */
+	xfs_extlen_t	len,		/* length of extent to modify */
+	int		val)		/* 1 for free, 0 for allocated */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtword_t	*first;		/* first used word in the buffer */
+	int		i;		/* current bit number rel. to start */
+	int		lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask o frelevant bits for value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block, and point to its data.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	first = b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not changed and mask of relevant bits.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		i = lastbit - bit;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Set the word value correctly.
+		 */
+		*b = val;
+		i += XFS_NBWORD;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Compute a mask of relevant bits.
+		 */
+		bit = 0;
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		b++;
+	}
+	/*
+	 * Log any remaining changed bytes.
+	 */
+	if (b > first)
+		xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+			(uint)((char *)b - (char *)bufp - 1));
+	return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to free */
+	xfs_extlen_t	len,		/* length to free */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the freed extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock;	/* first block freed > end */
+	xfs_rtblock_t	preblock;	/* first block freed < start */
+
+	end = start + len - 1;
+	/*
+	 * Modify the bitmap to mark this extent freed.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 1);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Assume we're freeing out of the middle of an allocated extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of allocated extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error)
+		return error;
+	/*
+	 * If there are blocks not being freed at the front of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being freed at the end of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Increment the summary information corresponding to the entire
+	 * (new) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+	return error;
+}
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		val,		/* 1 for free, 0 for allocated */
+	xfs_rtblock_t	*new,		/* out: first block not matching */
+	int		*stat)		/* out: 1 for matches, 0 for not */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zero's; 1 (free) => all one's.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not examined.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ val)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * Successful, return.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*new = start + i;
+	*stat = 1;
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int				/* error */
+xfs_rtcheck_alloc_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number of extent */
+	xfs_extlen_t	len)		/* length of extent */
+{
+	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
+	int		stat;
+	int		error;
+
+	error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+	if (error)
+		return error;
+	ASSERT(stat);
+	return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l)	(0)
+#endif
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to free */
+	xfs_extlen_t	len)		/* length of extent freed */
+{
+	int		error;		/* error value */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp = NULL;	/* summary file block buffer */
+
+	mp = tp->t_mountp;
+
+	ASSERT(mp->m_rbmip->i_itemp != NULL);
+	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+	error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+	if (error)
+		return error;
+
+	/*
+	 * Free the range of realtime blocks.
+	 */
+	error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Mark more blocks free in the superblock.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+	/*
+	 * If we've now freed all the blocks, reset the file sequence
+	 * number to 0.
+	 */
+	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+	    mp->m_sb.sb_rextents) {
+		if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+			mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+	}
+	return 0;
+}
+
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
new file mode 100644
index 000000000000..23c2f2577c8d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+	struct xfs_mount *mp,
+	int		pathlen)
+{
+	int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+	return (pathlen + buflen - 1) / buflen;
+}
+
+int
+xfs_symlink_hdr_set(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+	dsl->sl_offset = cpu_to_be32(offset);
+	dsl->sl_bytes = cpu_to_be32(size);
+	uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+	dsl->sl_owner = cpu_to_be64(ino);
+	dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. the verifier does
+ * CRC, location and bounds checking, the unpacking function checks the path
+ * parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+	if (offset != be32_to_cpu(dsl->sl_offset))
+		return false;
+	if (size != be32_to_cpu(dsl->sl_bytes))
+		return false;
+	if (ino != be64_to_cpu(dsl->sl_owner))
+		return false;
+
+	/* ok */
+	return true;
+}
+
+static bool
+xfs_symlink_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+		return false;
+	if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+		return false;
+	if (be32_to_cpu(dsl->sl_offset) +
+				be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+		return false;
+	if (dsl->sl_owner == 0)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_symlink_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_symlink_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_symlink_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_symlink_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (bip) {
+		struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+		dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	}
+	xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+	.verify_read = xfs_symlink_read_verify,
+	.verify_write = xfs_symlink_write_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*ip,
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	char			*buf;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+		bp->b_ops = NULL;
+		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+		return;
+	}
+
+	/*
+	 * As this symlink fits in an inode literal area, it must also fit in
+	 * the smallest buffer the filesystem supports.
+	 */
+	ASSERT(BBTOB(bp->b_length) >=
+			ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	buf = bp->b_addr;
+	buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+	memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
new file mode 100644
index 000000000000..f2bda7c76b8a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -0,0 +1,894 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer.  Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+	return round_up(sizeof(struct xlog_op_header) +
+			sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate out transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction.  size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+	uint		nbufs,
+	uint		size)
+{
+	return nbufs * (size + xfs_buf_log_overhead());
+}
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, then
+ * amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - 4 log op headers for object
+ *	- for the ilf, the inode core and 2 forks
+ * - inode log format object
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ *	- the btree data contained by both forks will fit into the inode size,
+ *	  hence when combined with the inode core above, we have a total of the
+ *	  actual inode size.
+ *	- the BMBT headers need to be accounted separately, as they are
+ *	  additional to the records and pointers that fit inside the inode
+ *	  forks.
+ */
+STATIC uint
+xfs_calc_inode_res(
+	struct xfs_mount	*mp,
+	uint			ninodes)
+{
+	return ninodes *
+		(4 * sizeof(struct xlog_op_header) +
+		 sizeof(struct xfs_inode_log_format) +
+		 mp->m_sb.sb_inodesize +
+		 2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * The free inode btree is a conditional feature and the log reservation
+ * requirements differ slightly from that of the traditional inode allocation
+ * btree. The finobt tracks records for inode chunks with at least one free
+ * inode. A record can be removed from the tree for an inode allocation
+ * or free and thus the finobt reservation is unconditional across:
+ *
+ * 	- inode allocation
+ * 	- inode free
+ * 	- inode chunk allocation
+ *
+ * The 'modify' param indicates to include the record modification scenario. The
+ * 'alloc' param indicates to include the reservation for free space btree
+ * modifications on behalf of finobt modifications. This is required only for
+ * transactions that do not already account for free space btree modifications.
+ *
+ * the free inode btree: max depth * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the free inode btree entry: block size
+ */
+STATIC uint
+xfs_calc_finobt_res(
+	struct xfs_mount 	*mp,
+	int			alloc,
+	int			modify)
+{
+	uint res;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return 0;
+
+	res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
+	if (alloc)
+		res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 
+					XFS_FSB_TO_B(mp, 1));
+	if (modify)
+		res += (uint)XFS_FSB_TO_B(mp, 1);
+
+	return res;
+}
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				      XFS_FSB_TO_B(mp, 1)) +
+		     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				      XFS_FSB_TO_B(mp, 1)) +
+		    xfs_calc_buf_res(5, 0) +
+		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				     XFS_FSB_TO_B(mp, 1)) +
+		    xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+				     mp->m_in_maxlevels, 0)));
+}
+
+/*
+ * In renaming a files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *	of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 4) +
+		     xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing an inode from unlinked list at first, we can modify:
+ *    the agi hash list and counters: sector size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+	struct xfs_mount        *mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+	       max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+}
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_iunlink_remove_reservation(mp) +
+		MAX((xfs_calc_inode_res(mp, 2) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For adding an inode to unlinked list we can modify:
+ *    the agi hash list: sector size
+ *    the unlinked inode: inode size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_iunlink_add_reservation(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the finobt (record modification and allocation btrees)
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 2) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		(uint)XFS_FSB_TO_B(mp, 1) +
+		xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_finobt_res(mp, 1, 1);
+}
+
+/*
+ * For create we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX(xfs_calc_create_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ *    the finobt (record insertion)
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_finobt_res(mp, 0, 0);
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX(xfs_calc_icreate_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return xfs_calc_icreate_reservation(mp);
+	return __xfs_calc_create_reservation(mp);
+
+}
+
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+	struct xfs_mount        *mp)
+{
+	uint	res = XFS_DQUOT_LOGRES(mp);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		res += xfs_calc_icreate_resv_alloc(mp);
+	else
+		res += xfs_calc_create_resv_alloc(mp);
+
+	return res + xfs_calc_iunlink_add_reservation(mp);
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_create_reservation(mp);
+}
+
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_create_reservation(mp) +
+	       xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ *    the finobt (record insertion, removal or modification)
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_iunlink_remove_reservation(mp) +
+		xfs_calc_buf_res(1, 0) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+				 mp->m_in_maxlevels, 0) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_finobt_res(mp, 0, 1);
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ *	superblock
+ *	agi and agf
+ *	allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *	superblock: sector size
+ *	agf of the ag from which the extent is allocated: sector size
+ *	bmap btree for bitmap/summary inode: max depth * blocksize
+ *	bitmap/summary inode: inode size
+ *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *	one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *	superblock: sector size
+ *	bitmap inode: inode size
+ *	summary inode: inode size
+ *	one bitmap block: blocksize
+ *	summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_inode_res(mp, 2) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+		xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *	inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *	inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *	the inode being converted: inode size
+ *	agf block and superblock (for block allocation)
+ *	the new block (directory sized)
+ *	bmap blocks for the new directory block
+ *	allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
+		xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+	struct xfs_mount	*mp)
+{
+	return MAX((xfs_calc_inode_res(mp, 1) +
+		    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				     XFS_FSB_TO_B(mp, 1))),
+		   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ *	the inode getting the attribute
+ *	the superblock for allocations
+ *	the agfs extents are allocated from
+ *	the attribute btree * max depth
+ *	the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime(see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ * 	the superblock for allocations: sector size
+ *	the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attibute extent length in blocks by:
+ *	ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+				      XFS_FSB_TO_B(mp, 1)) +
+		     (uint)XFS_FSB_TO_B(mp,
+					XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Clearing the quotaflags in the superblock.
+ *	the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ *	the write transaction log space for quota file extent allocation
+ *	the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_write_reservation(mp) +
+		xfs_calc_buf_res(1,
+			XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ *    the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+	struct xfs_mount	*mp)
+{
+	return sizeof(struct xfs_qoff_logitem) * 2 +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+	struct xfs_mount	*mp)
+{
+	return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ *     the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+void
+xfs_trans_resv_calc(
+	struct xfs_mount	*mp,
+	struct xfs_trans_resv	*resp)
+{
+	/*
+	 * The following transactions are logged in physical format and
+	 * require a permanent reservation on space.
+	 */
+	resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
+	resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+	resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
+	resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+	resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+	resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+	resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+	resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+	resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+	resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+	resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+	resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+	resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
+	resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+	resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_create_tmpfile.tr_logres =
+			xfs_calc_create_tmpfile_reservation(mp);
+	resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+	resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+	resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+	resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+	resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+	resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+	resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+	resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+	resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+	resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+	resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+	resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+	resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+	resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+	resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+	resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
+	resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+	resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+	/*
+	 * The following transactions are logged in logical format with
+	 * a default log count.
+	 */
+	resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
+	resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+	resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
+	resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_equotaoff.tr_logres =
+		xfs_calc_qm_quotaoff_end_reservation(mp);
+	resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+	resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	/* The following transaction are logged in logical format */
+	resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+	resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+	resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+	resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+	resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+	resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+	resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
deleted file mode 100644
index d43813267a80..000000000000
--- a/fs/xfs/xfs_alloc.c
+++ /dev/null
@@ -1,2630 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_shared.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_extent_busy.h"
-#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_trace.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_log.h"
-
-struct workqueue_struct *xfs_alloc_wq;
-
-#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
-
-#define	XFSA_FIXUP_BNO_OK	1
-#define	XFSA_FIXUP_CNT_OK	2
-
-STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
-		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-STATIC int				/* error */
-xfs_alloc_lookup_eq(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len,	/* length of extent */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-int				/* error */
-xfs_alloc_lookup_ge(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len,	/* length of extent */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-int					/* error */
-xfs_alloc_lookup_le(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len,	/* length of extent */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-STATIC int				/* error */
-xfs_alloc_update(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len)	/* length of extent */
-{
-	union xfs_btree_rec	rec;
-
-	rec.alloc.ar_startblock = cpu_to_be32(bno);
-	rec.alloc.ar_blockcount = cpu_to_be32(len);
-	return xfs_btree_update(cur, &rec);
-}
-
-/*
- * Get the data from the pointed-to record.
- */
-int					/* error */
-xfs_alloc_get_rec(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		*bno,	/* output: starting block of extent */
-	xfs_extlen_t		*len,	/* output: length of extent */
-	int			*stat)	/* output: success/failure */
-{
-	union xfs_btree_rec	*rec;
-	int			error;
-
-	error = xfs_btree_get_rec(cur, &rec, stat);
-	if (!error && *stat == 1) {
-		*bno = be32_to_cpu(rec->alloc.ar_startblock);
-		*len = be32_to_cpu(rec->alloc.ar_blockcount);
-	}
-	return error;
-}
-
-/*
- * Compute aligned version of the found extent.
- * Takes alignment and min length into account.
- */
-STATIC void
-xfs_alloc_compute_aligned(
-	xfs_alloc_arg_t	*args,		/* allocation argument structure */
-	xfs_agblock_t	foundbno,	/* starting block in found extent */
-	xfs_extlen_t	foundlen,	/* length in found extent */
-	xfs_agblock_t	*resbno,	/* result block number */
-	xfs_extlen_t	*reslen)	/* result length */
-{
-	xfs_agblock_t	bno;
-	xfs_extlen_t	len;
-
-	/* Trim busy sections out of found extent */
-	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
-
-	if (args->alignment > 1 && len >= args->minlen) {
-		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
-		xfs_extlen_t	diff = aligned_bno - bno;
-
-		*resbno = aligned_bno;
-		*reslen = diff >= len ? 0 : len - diff;
-	} else {
-		*resbno = bno;
-		*reslen = len;
-	}
-}
-
-/*
- * Compute best start block and diff for "near" allocations.
- * freelen >= wantlen already checked by caller.
- */
-STATIC xfs_extlen_t			/* difference value (absolute) */
-xfs_alloc_compute_diff(
-	xfs_agblock_t	wantbno,	/* target starting block */
-	xfs_extlen_t	wantlen,	/* target length */
-	xfs_extlen_t	alignment,	/* target alignment */
-	char		userdata,	/* are we allocating data? */
-	xfs_agblock_t	freebno,	/* freespace's starting block */
-	xfs_extlen_t	freelen,	/* freespace's length */
-	xfs_agblock_t	*newbnop)	/* result: best start block from free */
-{
-	xfs_agblock_t	freeend;	/* end of freespace extent */
-	xfs_agblock_t	newbno1;	/* return block number */
-	xfs_agblock_t	newbno2;	/* other new block number */
-	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
-	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
-	xfs_agblock_t	wantend;	/* end of target extent */
-
-	ASSERT(freelen >= wantlen);
-	freeend = freebno + freelen;
-	wantend = wantbno + wantlen;
-	/*
-	 * We want to allocate from the start of a free extent if it is past
-	 * the desired block or if we are allocating user data and the free
-	 * extent is before desired block. The second case is there to allow
-	 * for contiguous allocation from the remaining free space if the file
-	 * grows in the short term.
-	 */
-	if (freebno >= wantbno || (userdata && freeend < wantend)) {
-		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
-			newbno1 = NULLAGBLOCK;
-	} else if (freeend >= wantend && alignment > 1) {
-		newbno1 = roundup(wantbno, alignment);
-		newbno2 = newbno1 - alignment;
-		if (newbno1 >= freeend)
-			newbno1 = NULLAGBLOCK;
-		else
-			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
-		if (newbno2 < freebno)
-			newbno2 = NULLAGBLOCK;
-		else
-			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
-		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
-			if (newlen1 < newlen2 ||
-			    (newlen1 == newlen2 &&
-			     XFS_ABSDIFF(newbno1, wantbno) >
-			     XFS_ABSDIFF(newbno2, wantbno)))
-				newbno1 = newbno2;
-		} else if (newbno2 != NULLAGBLOCK)
-			newbno1 = newbno2;
-	} else if (freeend >= wantend) {
-		newbno1 = wantbno;
-	} else if (alignment > 1) {
-		newbno1 = roundup(freeend - wantlen, alignment);
-		if (newbno1 > freeend - wantlen &&
-		    newbno1 - alignment >= freebno)
-			newbno1 -= alignment;
-		else if (newbno1 >= freeend)
-			newbno1 = NULLAGBLOCK;
-	} else
-		newbno1 = freeend - wantlen;
-	*newbnop = newbno1;
-	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
-}
-
-/*
- * Fix up the length, based on mod and prod.
- * len should be k * prod + mod for some k.
- * If len is too small it is returned unchanged.
- * If len hits maxlen it is left alone.
- */
-STATIC void
-xfs_alloc_fix_len(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
-{
-	xfs_extlen_t	k;
-	xfs_extlen_t	rlen;
-
-	ASSERT(args->mod < args->prod);
-	rlen = args->len;
-	ASSERT(rlen >= args->minlen);
-	ASSERT(rlen <= args->maxlen);
-	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
-	    (args->mod == 0 && rlen < args->prod))
-		return;
-	k = rlen % args->prod;
-	if (k == args->mod)
-		return;
-	if (k > args->mod)
-		rlen = rlen - (k - args->mod);
-	else
-		rlen = rlen - args->prod + (args->mod - k);
-	if ((int)rlen < (int)args->minlen)
-		return;
-	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
-	ASSERT(rlen % args->prod == args->mod);
-	args->len = rlen;
-}
-
-/*
- * Fix up length if there is too little space left in the a.g.
- * Return 1 if ok, 0 if too little, should give up.
- */
-STATIC int
-xfs_alloc_fix_minleft(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
-{
-	xfs_agf_t	*agf;		/* a.g. freelist header */
-	int		diff;		/* free space difference */
-
-	if (args->minleft == 0)
-		return 1;
-	agf = XFS_BUF_TO_AGF(args->agbp);
-	diff = be32_to_cpu(agf->agf_freeblks)
-		- args->len - args->minleft;
-	if (diff >= 0)
-		return 1;
-	args->len += diff;		/* shrink the allocated space */
-	if (args->len >= args->minlen)
-		return 1;
-	args->agbno = NULLAGBLOCK;
-	return 0;
-}
-
-/*
- * Update the two btrees, logically removing from freespace the extent
- * starting at rbno, rlen blocks.  The extent is contained within the
- * actual (current) free extent fbno for flen blocks.
- * Flags are passed in indicating whether the cursors are set to the
- * relevant records.
- */
-STATIC int				/* error code */
-xfs_alloc_fixup_trees(
-	xfs_btree_cur_t	*cnt_cur,	/* cursor for by-size btree */
-	xfs_btree_cur_t	*bno_cur,	/* cursor for by-block btree */
-	xfs_agblock_t	fbno,		/* starting block of free extent */
-	xfs_extlen_t	flen,		/* length of free extent */
-	xfs_agblock_t	rbno,		/* starting block of returned extent */
-	xfs_extlen_t	rlen,		/* length of returned extent */
-	int		flags)		/* flags, XFSA_FIXUP_... */
-{
-	int		error;		/* error code */
-	int		i;		/* operation results */
-	xfs_agblock_t	nfbno1;		/* first new free startblock */
-	xfs_agblock_t	nfbno2;		/* second new free startblock */
-	xfs_extlen_t	nflen1=0;	/* first new free length */
-	xfs_extlen_t	nflen2=0;	/* second new free length */
-
-	/*
-	 * Look up the record in the by-size tree if necessary.
-	 */
-	if (flags & XFSA_FIXUP_CNT_OK) {
-#ifdef DEBUG
-		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(
-			i == 1 && nfbno1 == fbno && nflen1 == flen);
-#endif
-	} else {
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	}
-	/*
-	 * Look up the record in the by-block tree if necessary.
-	 */
-	if (flags & XFSA_FIXUP_BNO_OK) {
-#ifdef DEBUG
-		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(
-			i == 1 && nfbno1 == fbno && nflen1 == flen);
-#endif
-	} else {
-		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	}
-
-#ifdef DEBUG
-	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
-		struct xfs_btree_block	*bnoblock;
-		struct xfs_btree_block	*cntblock;
-
-		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
-		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
-
-		XFS_WANT_CORRUPTED_RETURN(
-			bnoblock->bb_numrecs == cntblock->bb_numrecs);
-	}
-#endif
-
-	/*
-	 * Deal with all four cases: the allocated record is contained
-	 * within the freespace record, so we can have new freespace
-	 * at either (or both) end, or no freespace remaining.
-	 */
-	if (rbno == fbno && rlen == flen)
-		nfbno1 = nfbno2 = NULLAGBLOCK;
-	else if (rbno == fbno) {
-		nfbno1 = rbno + rlen;
-		nflen1 = flen - rlen;
-		nfbno2 = NULLAGBLOCK;
-	} else if (rbno + rlen == fbno + flen) {
-		nfbno1 = fbno;
-		nflen1 = flen - rlen;
-		nfbno2 = NULLAGBLOCK;
-	} else {
-		nfbno1 = fbno;
-		nflen1 = rbno - fbno;
-		nfbno2 = rbno + rlen;
-		nflen2 = (fbno + flen) - nfbno2;
-	}
-	/*
-	 * Delete the entry from the by-size btree.
-	 */
-	if ((error = xfs_btree_delete(cnt_cur, &i)))
-		return error;
-	XFS_WANT_CORRUPTED_RETURN(i == 1);
-	/*
-	 * Add new by-size btree entry(s).
-	 */
-	if (nfbno1 != NULLAGBLOCK) {
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 0);
-		if ((error = xfs_btree_insert(cnt_cur, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	}
-	if (nfbno2 != NULLAGBLOCK) {
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 0);
-		if ((error = xfs_btree_insert(cnt_cur, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	}
-	/*
-	 * Fix up the by-block btree entry(s).
-	 */
-	if (nfbno1 == NULLAGBLOCK) {
-		/*
-		 * No remaining freespace, just delete the by-block tree entry.
-		 */
-		if ((error = xfs_btree_delete(bno_cur, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	} else {
-		/*
-		 * Update the by-block entry to start later|be shorter.
-		 */
-		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
-			return error;
-	}
-	if (nfbno2 != NULLAGBLOCK) {
-		/*
-		 * 2 resulting free entries, need to add one.
-		 */
-		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 0);
-		if ((error = xfs_btree_insert(bno_cur, &i)))
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	}
-	return 0;
-}
-
-static bool
-xfs_agfl_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
-	int		i;
-
-	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
-		return false;
-	if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
-		return false;
-	/*
-	 * during growfs operations, the perag is not fully initialised,
-	 * so we can't use it for any useful checking. growfs ensures we can't
-	 * use it by using uncached buffers that don't have the perag attached
-	 * so we can detect and avoid this problem.
-	 */
-	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
-		return false;
-
-	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
-		if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
-		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
-			return false;
-	}
-	return true;
-}
-
-static void
-xfs_agfl_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-
-	/*
-	 * There is no verification of non-crc AGFLs because mkfs does not
-	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
-	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
-	 * can't verify just those entries are valid.
-	 */
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_agfl_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-xfs_agfl_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-
-	/* no verification of non-crc AGFLs */
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (!xfs_agfl_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (bip)
-		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_agfl_buf_ops = {
-	.verify_read = xfs_agfl_read_verify,
-	.verify_write = xfs_agfl_write_verify,
-};
-
-/*
- * Read in the allocation group free block array.
- */
-STATIC int				/* error */
-xfs_alloc_read_agfl(
-	xfs_mount_t	*mp,		/* mount point structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
-{
-	xfs_buf_t	*bp;		/* return value */
-	int		error;
-
-	ASSERT(agno != NULLAGNUMBER);
-	error = xfs_trans_read_buf(
-			mp, tp, mp->m_ddev_targp,
-			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
-	if (error)
-		return error;
-	xfs_buf_set_ref(bp, XFS_AGFL_REF);
-	*bpp = bp;
-	return 0;
-}
-
-STATIC int
-xfs_alloc_update_counters(
-	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
-	struct xfs_buf		*agbp,
-	long			len)
-{
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
-
-	pag->pagf_freeblks += len;
-	be32_add_cpu(&agf->agf_freeblks, len);
-
-	xfs_trans_agblocks_delta(tp, len);
-	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
-		     be32_to_cpu(agf->agf_length)))
-		return EFSCORRUPTED;
-
-	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
-	return 0;
-}
-
-/*
- * Allocation group level functions.
- */
-
-/*
- * Allocate a variable extent in the allocation group agno.
- * Type and bno are used to determine where in the allocation group the
- * extent will start.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int			/* error */
-xfs_alloc_ag_vextent(
-	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
-{
-	int		error=0;
-
-	ASSERT(args->minlen > 0);
-	ASSERT(args->maxlen > 0);
-	ASSERT(args->minlen <= args->maxlen);
-	ASSERT(args->mod < args->prod);
-	ASSERT(args->alignment > 0);
-	/*
-	 * Branch to correct routine based on the type.
-	 */
-	args->wasfromfl = 0;
-	switch (args->type) {
-	case XFS_ALLOCTYPE_THIS_AG:
-		error = xfs_alloc_ag_vextent_size(args);
-		break;
-	case XFS_ALLOCTYPE_NEAR_BNO:
-		error = xfs_alloc_ag_vextent_near(args);
-		break;
-	case XFS_ALLOCTYPE_THIS_BNO:
-		error = xfs_alloc_ag_vextent_exact(args);
-		break;
-	default:
-		ASSERT(0);
-		/* NOTREACHED */
-	}
-
-	if (error || args->agbno == NULLAGBLOCK)
-		return error;
-
-	ASSERT(args->len >= args->minlen);
-	ASSERT(args->len <= args->maxlen);
-	ASSERT(!args->wasfromfl || !args->isfl);
-	ASSERT(args->agbno % args->alignment == 0);
-
-	if (!args->wasfromfl) {
-		error = xfs_alloc_update_counters(args->tp, args->pag,
-						  args->agbp,
-						  -((long)(args->len)));
-		if (error)
-			return error;
-
-		ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
-					      args->agbno, args->len));
-	}
-
-	if (!args->isfl) {
-		xfs_trans_mod_sb(args->tp, args->wasdel ?
-				 XFS_TRANS_SB_RES_FDBLOCKS :
-				 XFS_TRANS_SB_FDBLOCKS,
-				 -((long)(args->len)));
-	}
-
-	XFS_STATS_INC(xs_allocx);
-	XFS_STATS_ADD(xs_allocb, args->len);
-	return error;
-}
-
-/*
- * Allocate a variable extent at exactly agno/bno.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
- */
-STATIC int			/* error */
-xfs_alloc_ag_vextent_exact(
-	xfs_alloc_arg_t	*args)	/* allocation argument structure */
-{
-	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
-	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
-	int		error;
-	xfs_agblock_t	fbno;	/* start block of found extent */
-	xfs_extlen_t	flen;	/* length of found extent */
-	xfs_agblock_t	tbno;	/* start block of trimmed extent */
-	xfs_extlen_t	tlen;	/* length of trimmed extent */
-	xfs_agblock_t	tend;	/* end block of trimmed extent */
-	int		i;	/* success/failure of operation */
-
-	ASSERT(args->alignment == 1);
-
-	/*
-	 * Allocate/initialize a cursor for the by-number freespace btree.
-	 */
-	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-					  args->agno, XFS_BTNUM_BNO);
-
-	/*
-	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
-	 * Look for the closest free block <= bno, it must contain bno
-	 * if any free block does.
-	 */
-	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
-	if (error)
-		goto error0;
-	if (!i)
-		goto not_found;
-
-	/*
-	 * Grab the freespace record.
-	 */
-	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
-	if (error)
-		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	ASSERT(fbno <= args->agbno);
-
-	/*
-	 * Check for overlapping busy extents.
-	 */
-	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
-
-	/*
-	 * Give up if the start of the extent is busy, or the freespace isn't
-	 * long enough for the minimum request.
-	 */
-	if (tbno > args->agbno)
-		goto not_found;
-	if (tlen < args->minlen)
-		goto not_found;
-	tend = tbno + tlen;
-	if (tend < args->agbno + args->minlen)
-		goto not_found;
-
-	/*
-	 * End of extent will be smaller of the freespace end and the
-	 * maximal requested end.
-	 *
-	 * Fix the length according to mod and prod if given.
-	 */
-	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
-						- args->agbno;
-	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args))
-		goto not_found;
-
-	ASSERT(args->agbno + args->len <= tend);
-
-	/*
-	 * We are allocating agbno for args->len
-	 * Allocate/initialize a cursor for the by-size btree.
-	 */
-	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_CNT);
-	ASSERT(args->agbno + args->len <=
-		be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
-				      args->len, XFSA_FIXUP_BNO_OK);
-	if (error) {
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-		goto error0;
-	}
-
-	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
-	args->wasfromfl = 0;
-	trace_xfs_alloc_exact_done(args);
-	return 0;
-
-not_found:
-	/* Didn't find it, return null. */
-	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-	args->agbno = NULLAGBLOCK;
-	trace_xfs_alloc_exact_notfound(args);
-	return 0;
-
-error0:
-	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-	trace_xfs_alloc_exact_error(args);
-	return error;
-}
-
-/*
- * Search the btree in a given direction via the search cursor and compare
- * the records found against the good extent we've already found.
- */
-STATIC int
-xfs_alloc_find_best_extent(
-	struct xfs_alloc_arg	*args,	/* allocation argument structure */
-	struct xfs_btree_cur	**gcur,	/* good cursor */
-	struct xfs_btree_cur	**scur,	/* searching cursor */
-	xfs_agblock_t		gdiff,	/* difference for search comparison */
-	xfs_agblock_t		*sbno,	/* extent found by search */
-	xfs_extlen_t		*slen,	/* extent length */
-	xfs_agblock_t		*sbnoa,	/* aligned extent found by search */
-	xfs_extlen_t		*slena,	/* aligned extent length */
-	int			dir)	/* 0 = search right, 1 = search left */
-{
-	xfs_agblock_t		new;
-	xfs_agblock_t		sdiff;
-	int			error;
-	int			i;
-
-	/* The good extent is perfect, no need to  search. */
-	if (!gdiff)
-		goto out_use_good;
-
-	/*
-	 * Look until we find a better one, run out of space or run off the end.
-	 */
-	do {
-		error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
-		if (error)
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
-
-		/*
-		 * The good extent is closer than this one.
-		 */
-		if (!dir) {
-			if (*sbnoa >= args->agbno + gdiff)
-				goto out_use_good;
-		} else {
-			if (*sbnoa <= args->agbno - gdiff)
-				goto out_use_good;
-		}
-
-		/*
-		 * Same distance, compare length and pick the best.
-		 */
-		if (*slena >= args->minlen) {
-			args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
-			xfs_alloc_fix_len(args);
-
-			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-						       args->alignment,
-						       args->userdata, *sbnoa,
-						       *slena, &new);
-
-			/*
-			 * Choose closer size and invalidate other cursor.
-			 */
-			if (sdiff < gdiff)
-				goto out_use_search;
-			goto out_use_good;
-		}
-
-		if (!dir)
-			error = xfs_btree_increment(*scur, 0, &i);
-		else
-			error = xfs_btree_decrement(*scur, 0, &i);
-		if (error)
-			goto error0;
-	} while (i);
-
-out_use_good:
-	xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
-	*scur = NULL;
-	return 0;
-
-out_use_search:
-	xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
-	*gcur = NULL;
-	return 0;
-
-error0:
-	/* caller invalidates cursors */
-	return error;
-}
-
-/*
- * Allocate a variable extent near bno in the allocation group agno.
- * Extent's length (returned in len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int				/* error */
-xfs_alloc_ag_vextent_near(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
-{
-	xfs_btree_cur_t	*bno_cur_gt;	/* cursor for bno btree, right side */
-	xfs_btree_cur_t	*bno_cur_lt;	/* cursor for bno btree, left side */
-	xfs_btree_cur_t	*cnt_cur;	/* cursor for count btree */
-	xfs_agblock_t	gtbno;		/* start bno of right side entry */
-	xfs_agblock_t	gtbnoa;		/* aligned ... */
-	xfs_extlen_t	gtdiff;		/* difference to right side entry */
-	xfs_extlen_t	gtlen;		/* length of right side entry */
-	xfs_extlen_t	gtlena;		/* aligned ... */
-	xfs_agblock_t	gtnew;		/* useful start bno of right side */
-	int		error;		/* error code */
-	int		i;		/* result code, temporary */
-	int		j;		/* result code, temporary */
-	xfs_agblock_t	ltbno;		/* start bno of left side entry */
-	xfs_agblock_t	ltbnoa;		/* aligned ... */
-	xfs_extlen_t	ltdiff;		/* difference to left side entry */
-	xfs_extlen_t	ltlen;		/* length of left side entry */
-	xfs_extlen_t	ltlena;		/* aligned ... */
-	xfs_agblock_t	ltnew;		/* useful start bno of left side */
-	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
-#ifdef DEBUG
-	/*
-	 * Randomly don't execute the first algorithm.
-	 */
-	int		dofirst;	/* set to do first algorithm */
-
-	dofirst = prandom_u32() & 1;
-#endif
-
-restart:
-	bno_cur_lt = NULL;
-	bno_cur_gt = NULL;
-	ltlen = 0;
-	gtlena = 0;
-	ltlena = 0;
-
-	/*
-	 * Get a cursor for the by-size btree.
-	 */
-	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_CNT);
-
-	/*
-	 * See if there are any free extents as big as maxlen.
-	 */
-	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
-		goto error0;
-	/*
-	 * If none, then pick up the last entry in the tree unless the
-	 * tree is empty.
-	 */
-	if (!i) {
-		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
-				&ltlen, &i)))
-			goto error0;
-		if (i == 0 || ltlen == 0) {
-			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			trace_xfs_alloc_near_noentry(args);
-			return 0;
-		}
-		ASSERT(i == 1);
-	}
-	args->wasfromfl = 0;
-
-	/*
-	 * First algorithm.
-	 * If the requested extent is large wrt the freespaces available
-	 * in this a.g., then the cursor will be pointing to a btree entry
-	 * near the right edge of the tree.  If it's in the last btree leaf
-	 * block, then we just examine all the entries in that block
-	 * that are big enough, and pick the best one.
-	 * This is written as a while loop so we can break out of it,
-	 * but we never loop back to the top.
-	 */
-	while (xfs_btree_islastblock(cnt_cur, 0)) {
-		xfs_extlen_t	bdiff;
-		int		besti=0;
-		xfs_extlen_t	blen=0;
-		xfs_agblock_t	bnew=0;
-
-#ifdef DEBUG
-		if (dofirst)
-			break;
-#endif
-		/*
-		 * Start from the entry that lookup found, sequence through
-		 * all larger free blocks.  If we're actually pointing at a
-		 * record smaller than maxlen, go to the start of this block,
-		 * and skip all those smaller than minlen.
-		 */
-		if (ltlen || args->alignment > 1) {
-			cnt_cur->bc_ptrs[0] = 1;
-			do {
-				if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
-						&ltlen, &i)))
-					goto error0;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-				if (ltlen >= args->minlen)
-					break;
-				if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
-					goto error0;
-			} while (i);
-			ASSERT(ltlen >= args->minlen);
-			if (!i)
-				break;
-		}
-		i = cnt_cur->bc_ptrs[0];
-		for (j = 1, blen = 0, bdiff = 0;
-		     !error && j && (blen < args->maxlen || bdiff > 0);
-		     error = xfs_btree_increment(cnt_cur, 0, &j)) {
-			/*
-			 * For each entry, decide if it's better than
-			 * the previous best entry.
-			 */
-			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(args, ltbno, ltlen,
-						  &ltbnoa, &ltlena);
-			if (ltlena < args->minlen)
-				continue;
-			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-			xfs_alloc_fix_len(args);
-			ASSERT(args->len >= args->minlen);
-			if (args->len < blen)
-				continue;
-			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->userdata, ltbnoa,
-				ltlena, &ltnew);
-			if (ltnew != NULLAGBLOCK &&
-			    (args->len > blen || ltdiff < bdiff)) {
-				bdiff = ltdiff;
-				bnew = ltnew;
-				blen = args->len;
-				besti = cnt_cur->bc_ptrs[0];
-			}
-		}
-		/*
-		 * It didn't work.  We COULD be in a case where
-		 * there's a good record somewhere, so try again.
-		 */
-		if (blen == 0)
-			break;
-		/*
-		 * Point at the best entry, and retrieve it again.
-		 */
-		cnt_cur->bc_ptrs[0] = besti;
-		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-		args->len = blen;
-		if (!xfs_alloc_fix_minleft(args)) {
-			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			trace_xfs_alloc_near_nominleft(args);
-			return 0;
-		}
-		blen = args->len;
-		/*
-		 * We are allocating starting at bnew for blen blocks.
-		 */
-		args->agbno = bnew;
-		ASSERT(bnew >= ltbno);
-		ASSERT(bnew + blen <= ltbno + ltlen);
-		/*
-		 * Set up a cursor for the by-bno tree.
-		 */
-		bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
-			args->agbp, args->agno, XFS_BTNUM_BNO);
-		/*
-		 * Fix up the btree entries.
-		 */
-		if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
-				ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
-			goto error0;
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-
-		trace_xfs_alloc_near_first(args);
-		return 0;
-	}
-	/*
-	 * Second algorithm.
-	 * Search in the by-bno tree to the left and to the right
-	 * simultaneously, until in each case we find a space big enough,
-	 * or run into the edge of the tree.  When we run into the edge,
-	 * we deallocate that cursor.
-	 * If both searches succeed, we compare the two spaces and pick
-	 * the better one.
-	 * With alignment, it's possible for both to fail; the upper
-	 * level algorithm that picks allocation groups for allocations
-	 * is not supposed to do this.
-	 */
-	/*
-	 * Allocate and initialize the cursor for the leftward search.
-	 */
-	bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_BNO);
-	/*
-	 * Lookup <= bno to find the leftward search's starting point.
-	 */
-	if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
-		goto error0;
-	if (!i) {
-		/*
-		 * Didn't find anything; use this cursor for the rightward
-		 * search.
-		 */
-		bno_cur_gt = bno_cur_lt;
-		bno_cur_lt = NULL;
-	}
-	/*
-	 * Found something.  Duplicate the cursor for the rightward search.
-	 */
-	else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
-		goto error0;
-	/*
-	 * Increment the cursor, so we will point at the entry just right
-	 * of the leftward entry if any, or to the leftmost entry.
-	 */
-	if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
-		goto error0;
-	if (!i) {
-		/*
-		 * It failed, there are no rightward entries.
-		 */
-		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
-		bno_cur_gt = NULL;
-	}
-	/*
-	 * Loop going left with the leftward cursor, right with the
-	 * rightward cursor, until either both directions give up or
-	 * we find an entry at least as big as minlen.
-	 */
-	do {
-		if (bno_cur_lt) {
-			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(args, ltbno, ltlen,
-						  &ltbnoa, &ltlena);
-			if (ltlena >= args->minlen)
-				break;
-			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
-				goto error0;
-			if (!i) {
-				xfs_btree_del_cursor(bno_cur_lt,
-						     XFS_BTREE_NOERROR);
-				bno_cur_lt = NULL;
-			}
-		}
-		if (bno_cur_gt) {
-			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(args, gtbno, gtlen,
-						  &gtbnoa, &gtlena);
-			if (gtlena >= args->minlen)
-				break;
-			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
-				goto error0;
-			if (!i) {
-				xfs_btree_del_cursor(bno_cur_gt,
-						     XFS_BTREE_NOERROR);
-				bno_cur_gt = NULL;
-			}
-		}
-	} while (bno_cur_lt || bno_cur_gt);
-
-	/*
-	 * Got both cursors still active, need to find better entry.
-	 */
-	if (bno_cur_lt && bno_cur_gt) {
-		if (ltlena >= args->minlen) {
-			/*
-			 * Left side is good, look for a right side entry.
-			 */
-			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-			xfs_alloc_fix_len(args);
-			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->userdata, ltbnoa,
-				ltlena, &ltnew);
-
-			error = xfs_alloc_find_best_extent(args,
-						&bno_cur_lt, &bno_cur_gt,
-						ltdiff, &gtbno, &gtlen,
-						&gtbnoa, &gtlena,
-						0 /* search right */);
-		} else {
-			ASSERT(gtlena >= args->minlen);
-
-			/*
-			 * Right side is good, look for a left side entry.
-			 */
-			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
-			xfs_alloc_fix_len(args);
-			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->userdata, gtbnoa,
-				gtlena, &gtnew);
-
-			error = xfs_alloc_find_best_extent(args,
-						&bno_cur_gt, &bno_cur_lt,
-						gtdiff, &ltbno, &ltlen,
-						&ltbnoa, &ltlena,
-						1 /* search left */);
-		}
-
-		if (error)
-			goto error0;
-	}
-
-	/*
-	 * If we couldn't get anything, give up.
-	 */
-	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
-		if (!forced++) {
-			trace_xfs_alloc_near_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
-			goto restart;
-		}
-		trace_xfs_alloc_size_neither(args);
-		args->agbno = NULLAGBLOCK;
-		return 0;
-	}
-
-	/*
-	 * At this point we have selected a freespace entry, either to the
-	 * left or to the right.  If it's on the right, copy all the
-	 * useful variables to the "left" set so we only have one
-	 * copy of this code.
-	 */
-	if (bno_cur_gt) {
-		bno_cur_lt = bno_cur_gt;
-		bno_cur_gt = NULL;
-		ltbno = gtbno;
-		ltbnoa = gtbnoa;
-		ltlen = gtlen;
-		ltlena = gtlena;
-		j = 1;
-	} else
-		j = 0;
-
-	/*
-	 * Fix up the length and compute the useful address.
-	 */
-	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args)) {
-		trace_xfs_alloc_near_nominleft(args);
-		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		return 0;
-	}
-	rlen = args->len;
-	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-				     args->userdata, ltbnoa, ltlena, &ltnew);
-	ASSERT(ltnew >= ltbno);
-	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
-	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-	args->agbno = ltnew;
-
-	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
-			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
-		goto error0;
-
-	if (j)
-		trace_xfs_alloc_near_greater(args);
-	else
-		trace_xfs_alloc_near_lesser(args);
-
-	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-	return 0;
-
- error0:
-	trace_xfs_alloc_near_error(args);
-	if (cnt_cur != NULL)
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-	if (bno_cur_lt != NULL)
-		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
-	if (bno_cur_gt != NULL)
-		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Allocate a variable extent anywhere in the allocation group agno.
- * Extent's length (returned in len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int				/* error */
-xfs_alloc_ag_vextent_size(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
-{
-	xfs_btree_cur_t	*bno_cur;	/* cursor for bno btree */
-	xfs_btree_cur_t	*cnt_cur;	/* cursor for cnt btree */
-	int		error;		/* error result */
-	xfs_agblock_t	fbno;		/* start of found freespace */
-	xfs_extlen_t	flen;		/* length of found freespace */
-	int		i;		/* temp status variable */
-	xfs_agblock_t	rbno;		/* returned block number */
-	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
-
-restart:
-	/*
-	 * Allocate and initialize a cursor for the by-size btree.
-	 */
-	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_CNT);
-	bno_cur = NULL;
-
-	/*
-	 * Look for an entry >= maxlen+alignment-1 blocks.
-	 */
-	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
-			args->maxlen + args->alignment - 1, &i)))
-		goto error0;
-
-	/*
-	 * If none or we have busy extents that we cannot allocate from, then
-	 * we have to settle for a smaller extent. In the case that there are
-	 * no large extents, this will return the last entry in the tree unless
-	 * the tree is empty. In the case that there are only busy large
-	 * extents, this will return the largest small extent unless there
-	 * are no smaller extents available.
-	 */
-	if (!i || forced > 1) {
-		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
-						   &fbno, &flen, &i);
-		if (error)
-			goto error0;
-		if (i == 0 || flen == 0) {
-			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			trace_xfs_alloc_size_noentry(args);
-			return 0;
-		}
-		ASSERT(i == 1);
-		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
-	} else {
-		/*
-		 * Search for a non-busy extent that is large enough.
-		 * If we are at low space, don't check, or if we fall of
-		 * the end of the btree, turn off the busy check and
-		 * restart.
-		 */
-		for (;;) {
-			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-			xfs_alloc_compute_aligned(args, fbno, flen,
-						  &rbno, &rlen);
-
-			if (rlen >= args->maxlen)
-				break;
-
-			error = xfs_btree_increment(cnt_cur, 0, &i);
-			if (error)
-				goto error0;
-			if (i == 0) {
-				/*
-				 * Our only valid extents must have been busy.
-				 * Make it unbusy by forcing the log out and
-				 * retrying. If we've been here before, forcing
-				 * the log isn't making the extents available,
-				 * which means they have probably been freed in
-				 * this transaction.  In that case, we have to
-				 * give up on them and we'll attempt a minlen
-				 * allocation the next time around.
-				 */
-				xfs_btree_del_cursor(cnt_cur,
-						     XFS_BTREE_NOERROR);
-				trace_xfs_alloc_size_busy(args);
-				if (!forced++)
-					xfs_log_force(args->mp, XFS_LOG_SYNC);
-				goto restart;
-			}
-		}
-	}
-
-	/*
-	 * In the first case above, we got the last entry in the
-	 * by-size btree.  Now we check to see if the space hits maxlen
-	 * once aligned; if not, we search left for something better.
-	 * This can't happen in the second case above.
-	 */
-	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
-			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
-	if (rlen < args->maxlen) {
-		xfs_agblock_t	bestfbno;
-		xfs_extlen_t	bestflen;
-		xfs_agblock_t	bestrbno;
-		xfs_extlen_t	bestrlen;
-
-		bestrlen = rlen;
-		bestrbno = rbno;
-		bestflen = flen;
-		bestfbno = fbno;
-		for (;;) {
-			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
-				goto error0;
-			if (i == 0)
-				break;
-			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
-					&i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (flen < bestrlen)
-				break;
-			xfs_alloc_compute_aligned(args, fbno, flen,
-						  &rbno, &rlen);
-			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
-				(rlen <= flen && rbno + rlen <= fbno + flen),
-				error0);
-			if (rlen > bestrlen) {
-				bestrlen = rlen;
-				bestrbno = rbno;
-				bestflen = flen;
-				bestfbno = fbno;
-				if (rlen == args->maxlen)
-					break;
-			}
-		}
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
-				&i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		rlen = bestrlen;
-		rbno = bestrbno;
-		flen = bestflen;
-		fbno = bestfbno;
-	}
-	args->wasfromfl = 0;
-	/*
-	 * Fix up the length.
-	 */
-	args->len = rlen;
-	if (rlen < args->minlen) {
-		if (!forced++) {
-			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			trace_xfs_alloc_size_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
-			goto restart;
-		}
-		goto out_nominleft;
-	}
-	xfs_alloc_fix_len(args);
-
-	if (!xfs_alloc_fix_minleft(args))
-		goto out_nominleft;
-	rlen = args->len;
-	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
-	/*
-	 * Allocate and initialize a cursor for the by-block tree.
-	 */
-	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_BNO);
-	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
-			rbno, rlen, XFSA_FIXUP_CNT_OK)))
-		goto error0;
-	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-	cnt_cur = bno_cur = NULL;
-	args->len = rlen;
-	args->agbno = rbno;
-	XFS_WANT_CORRUPTED_GOTO(
-		args->agbno + args->len <=
-			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
-		error0);
-	trace_xfs_alloc_size_done(args);
-	return 0;
-
-error0:
-	trace_xfs_alloc_size_error(args);
-	if (cnt_cur)
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-	if (bno_cur)
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-	return error;
-
-out_nominleft:
-	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	trace_xfs_alloc_size_nominleft(args);
-	args->agbno = NULLAGBLOCK;
-	return 0;
-}
-
-/*
- * Deal with the case where only small freespaces remain.
- * Either return the contents of the last freespace record,
- * or allocate space from the freelist if there is nothing in the tree.
- */
-STATIC int			/* error */
-xfs_alloc_ag_vextent_small(
-	xfs_alloc_arg_t	*args,	/* allocation argument structure */
-	xfs_btree_cur_t	*ccur,	/* by-size cursor */
-	xfs_agblock_t	*fbnop,	/* result block number */
-	xfs_extlen_t	*flenp,	/* result length */
-	int		*stat)	/* status: 0-freelist, 1-normal/none */
-{
-	int		error;
-	xfs_agblock_t	fbno;
-	xfs_extlen_t	flen;
-	int		i;
-
-	if ((error = xfs_btree_decrement(ccur, 0, &i)))
-		goto error0;
-	if (i) {
-		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	}
-	/*
-	 * Nothing in the btree, try the freelist.  Make sure
-	 * to respect minleft even when pulling from the
-	 * freelist.
-	 */
-	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
-		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
-		  > args->minleft)) {
-		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
-		if (error)
-			goto error0;
-		if (fbno != NULLAGBLOCK) {
-			xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
-					     args->userdata);
-
-			if (args->userdata) {
-				xfs_buf_t	*bp;
-
-				bp = xfs_btree_get_bufs(args->mp, args->tp,
-					args->agno, fbno, 0);
-				xfs_trans_binval(args->tp, bp);
-			}
-			args->len = 1;
-			args->agbno = fbno;
-			XFS_WANT_CORRUPTED_GOTO(
-				args->agbno + args->len <=
-				be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
-				error0);
-			args->wasfromfl = 1;
-			trace_xfs_alloc_small_freelist(args);
-			*stat = 0;
-			return 0;
-		}
-		/*
-		 * Nothing in the freelist.
-		 */
-		else
-			flen = 0;
-	}
-	/*
-	 * Can't allocate from the freelist for some reason.
-	 */
-	else {
-		fbno = NULLAGBLOCK;
-		flen = 0;
-	}
-	/*
-	 * Can't do the allocation, give up.
-	 */
-	if (flen < args->minlen) {
-		args->agbno = NULLAGBLOCK;
-		trace_xfs_alloc_small_notenough(args);
-		flen = 0;
-	}
-	*fbnop = fbno;
-	*flenp = flen;
-	*stat = 1;
-	trace_xfs_alloc_small_done(args);
-	return 0;
-
-error0:
-	trace_xfs_alloc_small_error(args);
-	return error;
-}
-
-/*
- * Free the extent starting at agno/bno for length.
- */
-STATIC int			/* error */
-xfs_free_ag_extent(
-	xfs_trans_t	*tp,	/* transaction pointer */
-	xfs_buf_t	*agbp,	/* buffer for a.g. freelist header */
-	xfs_agnumber_t	agno,	/* allocation group number */
-	xfs_agblock_t	bno,	/* starting block number */
-	xfs_extlen_t	len,	/* length of extent */
-	int		isfl)	/* set if is freelist blocks - no sb acctg */
-{
-	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */
-	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */
-	int		error;		/* error return value */
-	xfs_agblock_t	gtbno;		/* start of right neighbor block */
-	xfs_extlen_t	gtlen;		/* length of right neighbor block */
-	int		haveleft;	/* have a left neighbor block */
-	int		haveright;	/* have a right neighbor block */
-	int		i;		/* temp, result code */
-	xfs_agblock_t	ltbno;		/* start of left neighbor block */
-	xfs_extlen_t	ltlen;		/* length of left neighbor block */
-	xfs_mount_t	*mp;		/* mount point struct for filesystem */
-	xfs_agblock_t	nbno;		/* new starting block of freespace */
-	xfs_extlen_t	nlen;		/* new length of freespace */
-	xfs_perag_t	*pag;		/* per allocation group data */
-
-	mp = tp->t_mountp;
-	/*
-	 * Allocate and initialize a cursor for the by-block btree.
-	 */
-	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
-	cnt_cur = NULL;
-	/*
-	 * Look for a neighboring block on the left (lower block numbers)
-	 * that is contiguous with this space.
-	 */
-	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
-		goto error0;
-	if (haveleft) {
-		/*
-		 * There is a block to our left.
-		 */
-		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * It's not contiguous, though.
-		 */
-		if (ltbno + ltlen < bno)
-			haveleft = 0;
-		else {
-			/*
-			 * If this failure happens the request to free this
-			 * space was invalid, it's (partly) already free.
-			 * Very bad.
-			 */
-			XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
-		}
-	}
-	/*
-	 * Look for a neighboring block on the right (higher block numbers)
-	 * that is contiguous with this space.
-	 */
-	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
-		goto error0;
-	if (haveright) {
-		/*
-		 * There is a block to our right.
-		 */
-		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * It's not contiguous, though.
-		 */
-		if (bno + len < gtbno)
-			haveright = 0;
-		else {
-			/*
-			 * If this failure happens the request to free this
-			 * space was invalid, it's (partly) already free.
-			 * Very bad.
-			 */
-			XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
-		}
-	}
-	/*
-	 * Now allocate and initialize a cursor for the by-size tree.
-	 */
-	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
-	/*
-	 * Have both left and right contiguous neighbors.
-	 * Merge all three into a single free block.
-	 */
-	if (haveleft && haveright) {
-		/*
-		 * Delete the old by-size entry on the left.
-		 */
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_btree_delete(cnt_cur, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * Delete the old by-size entry on the right.
-		 */
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_btree_delete(cnt_cur, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * Delete the old by-block entry for the right block.
-		 */
-		if ((error = xfs_btree_delete(bno_cur, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * Move the by-block cursor back to the left neighbor.
-		 */
-		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-#ifdef DEBUG
-		/*
-		 * Check that this is the right record: delete didn't
-		 * mangle the cursor.
-		 */
-		{
-			xfs_agblock_t	xxbno;
-			xfs_extlen_t	xxlen;
-
-			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
-					&i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(
-				i == 1 && xxbno == ltbno && xxlen == ltlen,
-				error0);
-		}
-#endif
-		/*
-		 * Update remaining by-block entry to the new, joined block.
-		 */
-		nbno = ltbno;
-		nlen = len + ltlen + gtlen;
-		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
-			goto error0;
-	}
-	/*
-	 * Have only a left contiguous neighbor.
-	 * Merge it together with the new freespace.
-	 */
-	else if (haveleft) {
-		/*
-		 * Delete the old by-size entry on the left.
-		 */
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_btree_delete(cnt_cur, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * Back up the by-block cursor to the left neighbor, and
-		 * update its length.
-		 */
-		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		nbno = ltbno;
-		nlen = len + ltlen;
-		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
-			goto error0;
-	}
-	/*
-	 * Have only a right contiguous neighbor.
-	 * Merge it together with the new freespace.
-	 */
-	else if (haveright) {
-		/*
-		 * Delete the old by-size entry on the right.
-		 */
-		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_btree_delete(cnt_cur, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * Update the starting block and length of the right
-		 * neighbor in the by-block tree.
-		 */
-		nbno = bno;
-		nlen = len + gtlen;
-		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
-			goto error0;
-	}
-	/*
-	 * No contiguous neighbors.
-	 * Insert the new freespace into the by-block tree.
-	 */
-	else {
-		nbno = bno;
-		nlen = len;
-		if ((error = xfs_btree_insert(bno_cur, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	}
-	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-	bno_cur = NULL;
-	/*
-	 * In all cases we need to insert the new freespace in the by-size tree.
-	 */
-	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
-		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
-	if ((error = xfs_btree_insert(cnt_cur, &i)))
-		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	cnt_cur = NULL;
-
-	/*
-	 * Update the freespace totals in the ag and superblock.
-	 */
-	pag = xfs_perag_get(mp, agno);
-	error = xfs_alloc_update_counters(tp, pag, agbp, len);
-	xfs_perag_put(pag);
-	if (error)
-		goto error0;
-
-	if (!isfl)
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
-	XFS_STATS_INC(xs_freex);
-	XFS_STATS_ADD(xs_freeb, len);
-
-	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
-
-	return 0;
-
- error0:
-	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
-	if (bno_cur)
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-	if (cnt_cur)
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Visible (exported) allocation/free functions.
- * Some of these are used just by xfs_alloc_btree.c and this file.
- */
-
-/*
- * Compute and fill in value of m_ag_maxlevels.
- */
-void
-xfs_alloc_compute_maxlevels(
-	xfs_mount_t	*mp)	/* file system mount structure */
-{
-	int		level;
-	uint		maxblocks;
-	uint		maxleafents;
-	int		minleafrecs;
-	int		minnoderecs;
-
-	maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
-	minleafrecs = mp->m_alloc_mnr[0];
-	minnoderecs = mp->m_alloc_mnr[1];
-	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-	for (level = 1; maxblocks > 1; level++)
-		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-	mp->m_ag_maxlevels = level;
-}
-
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag)
-{
-	xfs_extlen_t		need, delta = 0;
-
-	need = XFS_MIN_FREELIST_PAG(pag, mp);
-	if (need > pag->pagf_flcount)
-		delta = need - pag->pagf_flcount;
-
-	if (pag->pagf_longest > delta)
-		return pag->pagf_longest - delta;
-	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
-}
-
-/*
- * Decide whether to use this allocation group for this allocation.
- * If so, fix up the btree freelist's size.
- */
-STATIC int			/* error */
-xfs_alloc_fix_freelist(
-	xfs_alloc_arg_t	*args,	/* allocation argument structure */
-	int		flags)	/* XFS_ALLOC_FLAG_... */
-{
-	xfs_buf_t	*agbp;	/* agf buffer pointer */
-	xfs_agf_t	*agf;	/* a.g. freespace structure pointer */
-	xfs_buf_t	*agflbp;/* agfl buffer pointer */
-	xfs_agblock_t	bno;	/* freelist block */
-	xfs_extlen_t	delta;	/* new blocks needed in freelist */
-	int		error;	/* error result code */
-	xfs_extlen_t	longest;/* longest extent in allocation group */
-	xfs_mount_t	*mp;	/* file system mount point structure */
-	xfs_extlen_t	need;	/* total blocks needed in freelist */
-	xfs_perag_t	*pag;	/* per-ag information structure */
-	xfs_alloc_arg_t	targs;	/* local allocation arguments */
-	xfs_trans_t	*tp;	/* transaction pointer */
-
-	mp = args->mp;
-
-	pag = args->pag;
-	tp = args->tp;
-	if (!pag->pagf_init) {
-		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-				&agbp)))
-			return error;
-		if (!pag->pagf_init) {
-			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
-			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-			args->agbp = NULL;
-			return 0;
-		}
-	} else
-		agbp = NULL;
-
-	/*
-	 * If this is a metadata preferred pag and we are user data
-	 * then try somewhere else if we are not being asked to
-	 * try harder at this point
-	 */
-	if (pag->pagf_metadata && args->userdata &&
-	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
-		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-		args->agbp = NULL;
-		return 0;
-	}
-
-	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-		/*
-		 * If it looks like there isn't a long enough extent, or enough
-		 * total blocks, reject it.
-		 */
-		need = XFS_MIN_FREELIST_PAG(pag, mp);
-		longest = xfs_alloc_longest_free_extent(mp, pag);
-		if ((args->minlen + args->alignment + args->minalignslop - 1) >
-				longest ||
-		    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
-			   need - args->total) < (int)args->minleft)) {
-			if (agbp)
-				xfs_trans_brelse(tp, agbp);
-			args->agbp = NULL;
-			return 0;
-		}
-	}
-
-	/*
-	 * Get the a.g. freespace buffer.
-	 * Can fail if we're not blocking on locks, and it's held.
-	 */
-	if (agbp == NULL) {
-		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-				&agbp)))
-			return error;
-		if (agbp == NULL) {
-			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
-			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-			args->agbp = NULL;
-			return 0;
-		}
-	}
-	/*
-	 * Figure out how many blocks we should have in the freelist.
-	 */
-	agf = XFS_BUF_TO_AGF(agbp);
-	need = XFS_MIN_FREELIST(agf, mp);
-	/*
-	 * If there isn't enough total or single-extent, reject it.
-	 */
-	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-		delta = need > be32_to_cpu(agf->agf_flcount) ?
-			(need - be32_to_cpu(agf->agf_flcount)) : 0;
-		longest = be32_to_cpu(agf->agf_longest);
-		longest = (longest > delta) ? (longest - delta) :
-			(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
-		if ((args->minlen + args->alignment + args->minalignslop - 1) >
-				longest ||
-		    ((int)(be32_to_cpu(agf->agf_freeblks) +
-		     be32_to_cpu(agf->agf_flcount) - need - args->total) <
-				(int)args->minleft)) {
-			xfs_trans_brelse(tp, agbp);
-			args->agbp = NULL;
-			return 0;
-		}
-	}
-	/*
-	 * Make the freelist shorter if it's too long.
-	 */
-	while (be32_to_cpu(agf->agf_flcount) > need) {
-		xfs_buf_t	*bp;
-
-		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
-		if (error)
-			return error;
-		if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
-			return error;
-		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
-		xfs_trans_binval(tp, bp);
-	}
-	/*
-	 * Initialize the args structure.
-	 */
-	memset(&targs, 0, sizeof(targs));
-	targs.tp = tp;
-	targs.mp = mp;
-	targs.agbp = agbp;
-	targs.agno = args->agno;
-	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
-	targs.type = XFS_ALLOCTYPE_THIS_AG;
-	targs.pag = pag;
-	if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
-		return error;
-	/*
-	 * Make the freelist longer if it's too short.
-	 */
-	while (be32_to_cpu(agf->agf_flcount) < need) {
-		targs.agbno = 0;
-		targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
-		/*
-		 * Allocate as many blocks as possible at once.
-		 */
-		if ((error = xfs_alloc_ag_vextent(&targs))) {
-			xfs_trans_brelse(tp, agflbp);
-			return error;
-		}
-		/*
-		 * Stop if we run out.  Won't happen if callers are obeying
-		 * the restrictions correctly.  Can happen for free calls
-		 * on a completely full ag.
-		 */
-		if (targs.agbno == NULLAGBLOCK) {
-			if (flags & XFS_ALLOC_FLAG_FREEING)
-				break;
-			xfs_trans_brelse(tp, agflbp);
-			args->agbp = NULL;
-			return 0;
-		}
-		/*
-		 * Put each allocated block on the list.
-		 */
-		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
-			error = xfs_alloc_put_freelist(tp, agbp,
-							agflbp, bno, 0);
-			if (error)
-				return error;
-		}
-	}
-	xfs_trans_brelse(tp, agflbp);
-	args->agbp = agbp;
-	return 0;
-}
-
-/*
- * Get a block from the freelist.
- * Returns with the buffer for the block gotten.
- */
-int				/* error */
-xfs_alloc_get_freelist(
-	xfs_trans_t	*tp,	/* transaction pointer */
-	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
-	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
-	int		btreeblk) /* destination is a AGF btree */
-{
-	xfs_agf_t	*agf;	/* a.g. freespace structure */
-	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
-	xfs_agblock_t	bno;	/* block number returned */
-	__be32		*agfl_bno;
-	int		error;
-	int		logflags;
-	xfs_mount_t	*mp = tp->t_mountp;
-	xfs_perag_t	*pag;	/* per allocation group data */
-
-	/*
-	 * Freelist is empty, give up.
-	 */
-	agf = XFS_BUF_TO_AGF(agbp);
-	if (!agf->agf_flcount) {
-		*bnop = NULLAGBLOCK;
-		return 0;
-	}
-	/*
-	 * Read the array of free blocks.
-	 */
-	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
-				    &agflbp);
-	if (error)
-		return error;
-
-
-	/*
-	 * Get the block number and update the data structures.
-	 */
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
-	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
-	be32_add_cpu(&agf->agf_flfirst, 1);
-	xfs_trans_brelse(tp, agflbp);
-	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
-		agf->agf_flfirst = 0;
-
-	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
-	be32_add_cpu(&agf->agf_flcount, -1);
-	xfs_trans_agflist_delta(tp, -1);
-	pag->pagf_flcount--;
-	xfs_perag_put(pag);
-
-	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
-	if (btreeblk) {
-		be32_add_cpu(&agf->agf_btreeblks, 1);
-		pag->pagf_btreeblks++;
-		logflags |= XFS_AGF_BTREEBLKS;
-	}
-
-	xfs_alloc_log_agf(tp, agbp, logflags);
-	*bnop = bno;
-
-	return 0;
-}
-
-/*
- * Log the given fields from the agf structure.
- */
-void
-xfs_alloc_log_agf(
-	xfs_trans_t	*tp,	/* transaction pointer */
-	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
-	int		fields)	/* mask of fields to be logged (XFS_AGF_...) */
-{
-	int	first;		/* first byte offset */
-	int	last;		/* last byte offset */
-	static const short	offsets[] = {
-		offsetof(xfs_agf_t, agf_magicnum),
-		offsetof(xfs_agf_t, agf_versionnum),
-		offsetof(xfs_agf_t, agf_seqno),
-		offsetof(xfs_agf_t, agf_length),
-		offsetof(xfs_agf_t, agf_roots[0]),
-		offsetof(xfs_agf_t, agf_levels[0]),
-		offsetof(xfs_agf_t, agf_flfirst),
-		offsetof(xfs_agf_t, agf_fllast),
-		offsetof(xfs_agf_t, agf_flcount),
-		offsetof(xfs_agf_t, agf_freeblks),
-		offsetof(xfs_agf_t, agf_longest),
-		offsetof(xfs_agf_t, agf_btreeblks),
-		offsetof(xfs_agf_t, agf_uuid),
-		sizeof(xfs_agf_t)
-	};
-
-	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
-
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
-
-	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
-	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
-}
-
-/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int					/* error */
-xfs_alloc_pagf_init(
-	xfs_mount_t		*mp,	/* file system mount structure */
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	int			flags)	/* XFS_ALLOC_FLAGS_... */
-{
-	xfs_buf_t		*bp;
-	int			error;
-
-	if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
-		return error;
-	if (bp)
-		xfs_trans_brelse(tp, bp);
-	return 0;
-}
-
-/*
- * Put the block on the freelist for the allocation group.
- */
-int					/* error */
-xfs_alloc_put_freelist(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
-	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
-	xfs_agblock_t		bno,	/* block being freed */
-	int			btreeblk) /* block came from a AGF btree */
-{
-	xfs_agf_t		*agf;	/* a.g. freespace structure */
-	__be32			*blockp;/* pointer to array entry */
-	int			error;
-	int			logflags;
-	xfs_mount_t		*mp;	/* mount structure */
-	xfs_perag_t		*pag;	/* per allocation group data */
-	__be32			*agfl_bno;
-	int			startoff;
-
-	agf = XFS_BUF_TO_AGF(agbp);
-	mp = tp->t_mountp;
-
-	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
-			be32_to_cpu(agf->agf_seqno), &agflbp)))
-		return error;
-	be32_add_cpu(&agf->agf_fllast, 1);
-	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
-		agf->agf_fllast = 0;
-
-	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
-	be32_add_cpu(&agf->agf_flcount, 1);
-	xfs_trans_agflist_delta(tp, 1);
-	pag->pagf_flcount++;
-
-	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
-	if (btreeblk) {
-		be32_add_cpu(&agf->agf_btreeblks, -1);
-		pag->pagf_btreeblks--;
-		logflags |= XFS_AGF_BTREEBLKS;
-	}
-	xfs_perag_put(pag);
-
-	xfs_alloc_log_agf(tp, agbp, logflags);
-
-	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
-
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
-	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
-	*blockp = cpu_to_be32(bno);
-	startoff = (char *)blockp - (char *)agflbp->b_addr;
-
-	xfs_alloc_log_agf(tp, agbp, logflags);
-
-	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
-	xfs_trans_log_buf(tp, agflbp, startoff,
-			  startoff + sizeof(xfs_agblock_t) - 1);
-	return 0;
-}
-
-static bool
-xfs_agf_verify(
-	struct xfs_mount *mp,
-	struct xfs_buf	*bp)
- {
-	struct xfs_agf	*agf = XFS_BUF_TO_AGF(bp);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
-			return false;
-
-	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-	      be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
-	      be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
-		return false;
-
-	/*
-	 * during growfs operations, the perag is not fully initialised,
-	 * so we can't use it for any useful checking. growfs ensures we can't
-	 * use it by using uncached buffers that don't have the perag attached
-	 * so we can detect and avoid this problem.
-	 */
-	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
-		return false;
-
-	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
-	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
-		return false;
-
-	return true;;
-
-}
-
-static void
-xfs_agf_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
-				XFS_ERRTAG_ALLOC_READ_AGF,
-				XFS_RANDOM_ALLOC_READ_AGF))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-xfs_agf_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-
-	if (!xfs_agf_verify(mp, bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_agf_buf_ops = {
-	.verify_read = xfs_agf_read_verify,
-	.verify_write = xfs_agf_write_verify,
-};
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int					/* error */
-xfs_read_agf(
-	struct xfs_mount	*mp,	/* mount point structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	int			flags,	/* XFS_BUF_ */
-	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
-{
-	int		error;
-
-	trace_xfs_read_agf(mp, agno);
-
-	ASSERT(agno != NULLAGNUMBER);
-	error = xfs_trans_read_buf(
-			mp, tp, mp->m_ddev_targp,
-			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
-	if (error)
-		return error;
-	if (!*bpp)
-		return 0;
-
-	ASSERT(!(*bpp)->b_error);
-	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
-	return 0;
-}
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int					/* error */
-xfs_alloc_read_agf(
-	struct xfs_mount	*mp,	/* mount point structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	int			flags,	/* XFS_ALLOC_FLAG_... */
-	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
-{
-	struct xfs_agf		*agf;		/* ag freelist header */
-	struct xfs_perag	*pag;		/* per allocation group data */
-	int			error;
-
-	trace_xfs_alloc_read_agf(mp, agno);
-
-	ASSERT(agno != NULLAGNUMBER);
-	error = xfs_read_agf(mp, tp, agno,
-			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
-			bpp);
-	if (error)
-		return error;
-	if (!*bpp)
-		return 0;
-	ASSERT(!(*bpp)->b_error);
-
-	agf = XFS_BUF_TO_AGF(*bpp);
-	pag = xfs_perag_get(mp, agno);
-	if (!pag->pagf_init) {
-		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
-		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
-		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
-		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
-		pag->pagf_levels[XFS_BTNUM_BNOi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
-		pag->pagf_levels[XFS_BTNUM_CNTi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
-		spin_lock_init(&pag->pagb_lock);
-		pag->pagb_count = 0;
-		pag->pagb_tree = RB_ROOT;
-		pag->pagf_init = 1;
-	}
-#ifdef DEBUG
-	else if (!XFS_FORCED_SHUTDOWN(mp)) {
-		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
-		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
-		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
-		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
-		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
-		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
-		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
-		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
-	}
-#endif
-	xfs_perag_put(pag);
-	return 0;
-}
-
-/*
- * Allocate an extent (variable-size).
- * Depending on the allocation type, we either look in a single allocation
- * group or loop over the allocation groups to find the result.
- */
-int				/* error */
-xfs_alloc_vextent(
-	xfs_alloc_arg_t	*args)	/* allocation argument structure */
-{
-	xfs_agblock_t	agsize;	/* allocation group size */
-	int		error;
-	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
-	xfs_extlen_t	minleft;/* minimum left value, temp copy */
-	xfs_mount_t	*mp;	/* mount structure pointer */
-	xfs_agnumber_t	sagno;	/* starting allocation group number */
-	xfs_alloctype_t	type;	/* input allocation type */
-	int		bump_rotor = 0;
-	int		no_min = 0;
-	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */
-
-	mp = args->mp;
-	type = args->otype = args->type;
-	args->agbno = NULLAGBLOCK;
-	/*
-	 * Just fix this up, for the case where the last a.g. is shorter
-	 * (or there's only one a.g.) and the caller couldn't easily figure
-	 * that out (xfs_bmap_alloc).
-	 */
-	agsize = mp->m_sb.sb_agblocks;
-	if (args->maxlen > agsize)
-		args->maxlen = agsize;
-	if (args->alignment == 0)
-		args->alignment = 1;
-	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
-	ASSERT(args->minlen <= args->maxlen);
-	ASSERT(args->minlen <= agsize);
-	ASSERT(args->mod < args->prod);
-	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
-	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
-	    args->minlen > args->maxlen || args->minlen > agsize ||
-	    args->mod >= args->prod) {
-		args->fsbno = NULLFSBLOCK;
-		trace_xfs_alloc_vextent_badargs(args);
-		return 0;
-	}
-	minleft = args->minleft;
-
-	switch (type) {
-	case XFS_ALLOCTYPE_THIS_AG:
-	case XFS_ALLOCTYPE_NEAR_BNO:
-	case XFS_ALLOCTYPE_THIS_BNO:
-		/*
-		 * These three force us into a single a.g.
-		 */
-		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		args->pag = xfs_perag_get(mp, args->agno);
-		args->minleft = 0;
-		error = xfs_alloc_fix_freelist(args, 0);
-		args->minleft = minleft;
-		if (error) {
-			trace_xfs_alloc_vextent_nofix(args);
-			goto error0;
-		}
-		if (!args->agbp) {
-			trace_xfs_alloc_vextent_noagbp(args);
-			break;
-		}
-		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-		if ((error = xfs_alloc_ag_vextent(args)))
-			goto error0;
-		break;
-	case XFS_ALLOCTYPE_START_BNO:
-		/*
-		 * Try near allocation first, then anywhere-in-ag after
-		 * the first a.g. fails.
-		 */
-		if ((args->userdata  == XFS_ALLOC_INITIAL_USER_DATA) &&
-		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
-			args->fsbno = XFS_AGB_TO_FSB(mp,
-					((mp->m_agfrotor / rotorstep) %
-					mp->m_sb.sb_agcount), 0);
-			bump_rotor = 1;
-		}
-		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-		args->type = XFS_ALLOCTYPE_NEAR_BNO;
-		/* FALLTHROUGH */
-	case XFS_ALLOCTYPE_ANY_AG:
-	case XFS_ALLOCTYPE_START_AG:
-	case XFS_ALLOCTYPE_FIRST_AG:
-		/*
-		 * Rotate through the allocation groups looking for a winner.
-		 */
-		if (type == XFS_ALLOCTYPE_ANY_AG) {
-			/*
-			 * Start with the last place we left off.
-			 */
-			args->agno = sagno = (mp->m_agfrotor / rotorstep) %
-					mp->m_sb.sb_agcount;
-			args->type = XFS_ALLOCTYPE_THIS_AG;
-			flags = XFS_ALLOC_FLAG_TRYLOCK;
-		} else if (type == XFS_ALLOCTYPE_FIRST_AG) {
-			/*
-			 * Start with allocation group given by bno.
-			 */
-			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-			args->type = XFS_ALLOCTYPE_THIS_AG;
-			sagno = 0;
-			flags = 0;
-		} else {
-			if (type == XFS_ALLOCTYPE_START_AG)
-				args->type = XFS_ALLOCTYPE_THIS_AG;
-			/*
-			 * Start with the given allocation group.
-			 */
-			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-			flags = XFS_ALLOC_FLAG_TRYLOCK;
-		}
-		/*
-		 * Loop over allocation groups twice; first time with
-		 * trylock set, second time without.
-		 */
-		for (;;) {
-			args->pag = xfs_perag_get(mp, args->agno);
-			if (no_min) args->minleft = 0;
-			error = xfs_alloc_fix_freelist(args, flags);
-			args->minleft = minleft;
-			if (error) {
-				trace_xfs_alloc_vextent_nofix(args);
-				goto error0;
-			}
-			/*
-			 * If we get a buffer back then the allocation will fly.
-			 */
-			if (args->agbp) {
-				if ((error = xfs_alloc_ag_vextent(args)))
-					goto error0;
-				break;
-			}
-
-			trace_xfs_alloc_vextent_loopfailed(args);
-
-			/*
-			 * Didn't work, figure out the next iteration.
-			 */
-			if (args->agno == sagno &&
-			    type == XFS_ALLOCTYPE_START_BNO)
-				args->type = XFS_ALLOCTYPE_THIS_AG;
-			/*
-			* For the first allocation, we can try any AG to get
-			* space.  However, if we already have allocated a
-			* block, we don't want to try AGs whose number is below
-			* sagno. Otherwise, we may end up with out-of-order
-			* locking of AGF, which might cause deadlock.
-			*/
-			if (++(args->agno) == mp->m_sb.sb_agcount) {
-				if (args->firstblock != NULLFSBLOCK)
-					args->agno = sagno;
-				else
-					args->agno = 0;
-			}
-			/*
-			 * Reached the starting a.g., must either be done
-			 * or switch to non-trylock mode.
-			 */
-			if (args->agno == sagno) {
-				if (no_min == 1) {
-					args->agbno = NULLAGBLOCK;
-					trace_xfs_alloc_vextent_allfailed(args);
-					break;
-				}
-				if (flags == 0) {
-					no_min = 1;
-				} else {
-					flags = 0;
-					if (type == XFS_ALLOCTYPE_START_BNO) {
-						args->agbno = XFS_FSB_TO_AGBNO(mp,
-							args->fsbno);
-						args->type = XFS_ALLOCTYPE_NEAR_BNO;
-					}
-				}
-			}
-			xfs_perag_put(args->pag);
-		}
-		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
-			if (args->agno == sagno)
-				mp->m_agfrotor = (mp->m_agfrotor + 1) %
-					(mp->m_sb.sb_agcount * rotorstep);
-			else
-				mp->m_agfrotor = (args->agno * rotorstep + 1) %
-					(mp->m_sb.sb_agcount * rotorstep);
-		}
-		break;
-	default:
-		ASSERT(0);
-		/* NOTREACHED */
-	}
-	if (args->agbno == NULLAGBLOCK)
-		args->fsbno = NULLFSBLOCK;
-	else {
-		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
-#ifdef DEBUG
-		ASSERT(args->len >= args->minlen);
-		ASSERT(args->len <= args->maxlen);
-		ASSERT(args->agbno % args->alignment == 0);
-		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
-			args->len);
-#endif
-	}
-	xfs_perag_put(args->pag);
-	return 0;
-error0:
-	xfs_perag_put(args->pag);
-	return error;
-}
-
-/*
- * Free an extent.
- * Just break up the extent address and hand off to xfs_free_ag_extent
- * after fixing up the freelist.
- */
-int				/* error */
-xfs_free_extent(
-	xfs_trans_t	*tp,	/* transaction pointer */
-	xfs_fsblock_t	bno,	/* starting block number of extent */
-	xfs_extlen_t	len)	/* length of extent */
-{
-	xfs_alloc_arg_t	args;
-	int		error;
-
-	ASSERT(len != 0);
-	memset(&args, 0, sizeof(xfs_alloc_arg_t));
-	args.tp = tp;
-	args.mp = tp->t_mountp;
-
-	/*
-	 * validate that the block number is legal - the enables us to detect
-	 * and handle a silent filesystem corruption rather than crashing.
-	 */
-	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
-	if (args.agno >= args.mp->m_sb.sb_agcount)
-		return EFSCORRUPTED;
-
-	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	if (args.agbno >= args.mp->m_sb.sb_agblocks)
-		return EFSCORRUPTED;
-
-	args.pag = xfs_perag_get(args.mp, args.agno);
-	ASSERT(args.pag);
-
-	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
-	if (error)
-		goto error0;
-
-	/* validate the extent size is legal now we have the agf locked */
-	if (args.agbno + len >
-			be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
-		error = EFSCORRUPTED;
-		goto error0;
-	}
-
-	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
-	if (!error)
-		xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
-error0:
-	xfs_perag_put(args.pag);
-	return error;
-}
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
deleted file mode 100644
index 8358f1ded94d..000000000000
--- a/fs/xfs/xfs_alloc_btree.c
+++ /dev/null
@@ -1,504 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_extent_busy.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-
-
-STATIC struct xfs_btree_cur *
-xfs_allocbt_dup_cursor(
-	struct xfs_btree_cur	*cur)
-{
-	return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.a.agbp, cur->bc_private.a.agno,
-			cur->bc_btnum);
-}
-
-STATIC void
-xfs_allocbt_set_root(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr,
-	int			inc)
-{
-	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
-	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
-	int			btnum = cur->bc_btnum;
-	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
-
-	ASSERT(ptr->s != 0);
-
-	agf->agf_roots[btnum] = ptr->s;
-	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	pag->pagf_levels[btnum] += inc;
-	xfs_perag_put(pag);
-
-	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-}
-
-STATIC int
-xfs_allocbt_alloc_block(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*start,
-	union xfs_btree_ptr	*new,
-	int			*stat)
-{
-	int			error;
-	xfs_agblock_t		bno;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-	/* Allocate the new block from the freelist. If we can't, give up.  */
-	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
-				       &bno, 1);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-
-	if (bno == NULLAGBLOCK) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		*stat = 0;
-		return 0;
-	}
-
-	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
-
-	xfs_trans_agbtree_delta(cur->bc_tp, 1);
-	new->s = cpu_to_be32(bno);
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-}
-
-STATIC int
-xfs_allocbt_free_block(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp)
-{
-	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
-	xfs_agblock_t		bno;
-	int			error;
-
-	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
-	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
-	if (error)
-		return error;
-
-	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
-			      XFS_EXTENT_BUSY_SKIP_DISCARD);
-	xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
-	xfs_trans_binval(cur->bc_tp, bp);
-	return 0;
-}
-
-/*
- * Update the longest extent in the AGF
- */
-STATIC void
-xfs_allocbt_update_lastrec(
-	struct xfs_btree_cur	*cur,
-	struct xfs_btree_block	*block,
-	union xfs_btree_rec	*rec,
-	int			ptr,
-	int			reason)
-{
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
-	struct xfs_perag	*pag;
-	__be32			len;
-	int			numrecs;
-
-	ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
-
-	switch (reason) {
-	case LASTREC_UPDATE:
-		/*
-		 * If this is the last leaf block and it's the last record,
-		 * then update the size of the longest extent in the AG.
-		 */
-		if (ptr != xfs_btree_get_numrecs(block))
-			return;
-		len = rec->alloc.ar_blockcount;
-		break;
-	case LASTREC_INSREC:
-		if (be32_to_cpu(rec->alloc.ar_blockcount) <=
-		    be32_to_cpu(agf->agf_longest))
-			return;
-		len = rec->alloc.ar_blockcount;
-		break;
-	case LASTREC_DELREC:
-		numrecs = xfs_btree_get_numrecs(block);
-		if (ptr <= numrecs)
-			return;
-		ASSERT(ptr == numrecs + 1);
-
-		if (numrecs) {
-			xfs_alloc_rec_t *rrp;
-
-			rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
-			len = rrp->ar_blockcount;
-		} else {
-			len = 0;
-		}
-
-		break;
-	default:
-		ASSERT(0);
-		return;
-	}
-
-	agf->agf_longest = len;
-	pag = xfs_perag_get(cur->bc_mp, seqno);
-	pag->pagf_longest = be32_to_cpu(len);
-	xfs_perag_put(pag);
-	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
-}
-
-STATIC int
-xfs_allocbt_get_minrecs(
-	struct xfs_btree_cur	*cur,
-	int			level)
-{
-	return cur->bc_mp->m_alloc_mnr[level != 0];
-}
-
-STATIC int
-xfs_allocbt_get_maxrecs(
-	struct xfs_btree_cur	*cur,
-	int			level)
-{
-	return cur->bc_mp->m_alloc_mxr[level != 0];
-}
-
-STATIC void
-xfs_allocbt_init_key_from_rec(
-	union xfs_btree_key	*key,
-	union xfs_btree_rec	*rec)
-{
-	ASSERT(rec->alloc.ar_startblock != 0);
-
-	key->alloc.ar_startblock = rec->alloc.ar_startblock;
-	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
-}
-
-STATIC void
-xfs_allocbt_init_rec_from_key(
-	union xfs_btree_key	*key,
-	union xfs_btree_rec	*rec)
-{
-	ASSERT(key->alloc.ar_startblock != 0);
-
-	rec->alloc.ar_startblock = key->alloc.ar_startblock;
-	rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
-}
-
-STATIC void
-xfs_allocbt_init_rec_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*rec)
-{
-	ASSERT(cur->bc_rec.a.ar_startblock != 0);
-
-	rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
-	rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
-}
-
-STATIC void
-xfs_allocbt_init_ptr_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-
-	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
-	ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
-
-	ptr->s = agf->agf_roots[cur->bc_btnum];
-}
-
-STATIC __int64_t
-xfs_allocbt_key_diff(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*key)
-{
-	xfs_alloc_rec_incore_t	*rec = &cur->bc_rec.a;
-	xfs_alloc_key_t		*kp = &key->alloc;
-	__int64_t		diff;
-
-	if (cur->bc_btnum == XFS_BTNUM_BNO) {
-		return (__int64_t)be32_to_cpu(kp->ar_startblock) -
-				rec->ar_startblock;
-	}
-
-	diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
-	if (diff)
-		return diff;
-
-	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
-}
-
-static bool
-xfs_allocbt_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_perag	*pag = bp->b_pag;
-	unsigned int		level;
-
-	/*
-	 * magic number and level verification
-	 *
-	 * During growfs operations, we can't verify the exact level or owner as
-	 * the perag is not fully initialised and hence not attached to the
-	 * buffer.  In this case, check against the maximum tree depth.
-	 *
-	 * Similarly, during log recovery we will have a perag structure
-	 * attached, but the agf information will not yet have been initialised
-	 * from the on disk AGF. Again, we can only check against maximum limits
-	 * in this case.
-	 */
-	level = be16_to_cpu(block->bb_level);
-	switch (block->bb_magic) {
-	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			return false;
-		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-			return false;
-		if (pag &&
-		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
-			return false;
-		/* fall through */
-	case cpu_to_be32(XFS_ABTB_MAGIC):
-		if (pag && pag->pagf_init) {
-			if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
-				return false;
-		} else if (level >= mp->m_ag_maxlevels)
-			return false;
-		break;
-	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			return false;
-		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-			return false;
-		if (pag &&
-		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
-			return false;
-		/* fall through */
-	case cpu_to_be32(XFS_ABTC_MAGIC):
-		if (pag && pag->pagf_init) {
-			if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
-				return false;
-		} else if (level >= mp->m_ag_maxlevels)
-			return false;
-		break;
-	default:
-		return false;
-	}
-
-	/* numrecs verification */
-	if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
-		return false;
-
-	/* sibling pointer verification */
-	if (!block->bb_u.s.bb_leftsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
-	if (!block->bb_u.s.bb_rightsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
-
-	return true;
-}
-
-static void
-xfs_allocbt_read_verify(
-	struct xfs_buf	*bp)
-{
-	if (!xfs_btree_sblock_verify_crc(bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_allocbt_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_verifier_error(bp);
-	}
-}
-
-static void
-xfs_allocbt_write_verify(
-	struct xfs_buf	*bp)
-{
-	if (!xfs_allocbt_verify(bp)) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-	xfs_btree_sblock_calc_crc(bp);
-
-}
-
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
-	.verify_read = xfs_allocbt_read_verify,
-	.verify_write = xfs_allocbt_write_verify,
-};
-
-
-#if defined(DEBUG) || defined(XFS_WARN)
-STATIC int
-xfs_allocbt_keys_inorder(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*k1,
-	union xfs_btree_key	*k2)
-{
-	if (cur->bc_btnum == XFS_BTNUM_BNO) {
-		return be32_to_cpu(k1->alloc.ar_startblock) <
-		       be32_to_cpu(k2->alloc.ar_startblock);
-	} else {
-		return be32_to_cpu(k1->alloc.ar_blockcount) <
-			be32_to_cpu(k2->alloc.ar_blockcount) ||
-			(k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
-			 be32_to_cpu(k1->alloc.ar_startblock) <
-			 be32_to_cpu(k2->alloc.ar_startblock));
-	}
-}
-
-STATIC int
-xfs_allocbt_recs_inorder(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*r1,
-	union xfs_btree_rec	*r2)
-{
-	if (cur->bc_btnum == XFS_BTNUM_BNO) {
-		return be32_to_cpu(r1->alloc.ar_startblock) +
-			be32_to_cpu(r1->alloc.ar_blockcount) <=
-			be32_to_cpu(r2->alloc.ar_startblock);
-	} else {
-		return be32_to_cpu(r1->alloc.ar_blockcount) <
-			be32_to_cpu(r2->alloc.ar_blockcount) ||
-			(r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
-			 be32_to_cpu(r1->alloc.ar_startblock) <
-			 be32_to_cpu(r2->alloc.ar_startblock));
-	}
-}
-#endif	/* DEBUG */
-
-static const struct xfs_btree_ops xfs_allocbt_ops = {
-	.rec_len		= sizeof(xfs_alloc_rec_t),
-	.key_len		= sizeof(xfs_alloc_key_t),
-
-	.dup_cursor		= xfs_allocbt_dup_cursor,
-	.set_root		= xfs_allocbt_set_root,
-	.alloc_block		= xfs_allocbt_alloc_block,
-	.free_block		= xfs_allocbt_free_block,
-	.update_lastrec		= xfs_allocbt_update_lastrec,
-	.get_minrecs		= xfs_allocbt_get_minrecs,
-	.get_maxrecs		= xfs_allocbt_get_maxrecs,
-	.init_key_from_rec	= xfs_allocbt_init_key_from_rec,
-	.init_rec_from_key	= xfs_allocbt_init_rec_from_key,
-	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
-	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
-	.key_diff		= xfs_allocbt_key_diff,
-	.buf_ops		= &xfs_allocbt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-	.keys_inorder		= xfs_allocbt_keys_inorder,
-	.recs_inorder		= xfs_allocbt_recs_inorder,
-#endif
-};
-
-/*
- * Allocate a new allocation btree cursor.
- */
-struct xfs_btree_cur *			/* new alloc btree cursor */
-xfs_allocbt_init_cursor(
-	struct xfs_mount	*mp,		/* file system mount point */
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_buf		*agbp,		/* buffer for agf structure */
-	xfs_agnumber_t		agno,		/* allocation group number */
-	xfs_btnum_t		btnum)		/* btree identifier */
-{
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
-	struct xfs_btree_cur	*cur;
-
-	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-
-	cur->bc_tp = tp;
-	cur->bc_mp = mp;
-	cur->bc_btnum = btnum;
-	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-	cur->bc_ops = &xfs_allocbt_ops;
-
-	if (btnum == XFS_BTNUM_CNT) {
-		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
-		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
-	} else {
-		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
-	}
-
-	cur->bc_private.a.agbp = agbp;
-	cur->bc_private.a.agno = agno;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
-	return cur;
-}
-
-/*
- * Calculate number of records in an alloc btree block.
- */
-int
-xfs_allocbt_maxrecs(
-	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
-{
-	blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
-
-	if (leaf)
-		return blocklen / sizeof(xfs_alloc_rec_t);
-	return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
-}
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
deleted file mode 100644
index 7d95b16f0919..000000000000
--- a/fs/xfs/xfs_attr.c
+++ /dev/null
@@ -1,1459 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_dinode.h"
-
-/*
- * xfs_attr.c
- *
- * Provide the external interfaces to manage attribute lists.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Internal routines when attribute list fits inside the inode.
- */
-STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
-
-/*
- * Internal routines when attribute list is one block.
- */
-STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
-
-/*
- * Internal routines when attribute list is more than one block.
- */
-STATIC int xfs_attr_node_get(xfs_da_args_t *args);
-STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
-STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-
-
-STATIC int
-xfs_attr_args_init(
-	struct xfs_da_args	*args,
-	struct xfs_inode	*dp,
-	const unsigned char	*name,
-	int			flags)
-{
-
-	if (!name)
-		return EINVAL;
-
-	memset(args, 0, sizeof(*args));
-	args->geo = dp->i_mount->m_attr_geo;
-	args->whichfork = XFS_ATTR_FORK;
-	args->dp = dp;
-	args->flags = flags;
-	args->name = name;
-	args->namelen = strlen((const char *)name);
-	if (args->namelen >= MAXNAMELEN)
-		return EFAULT;		/* match IRIX behaviour */
-
-	args->hashval = xfs_da_hashname(args->name, args->namelen);
-	return 0;
-}
-
-int
-xfs_inode_hasattr(
-	struct xfs_inode	*ip)
-{
-	if (!XFS_IFORK_Q(ip) ||
-	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     ip->i_d.di_anextents == 0))
-		return 0;
-	return 1;
-}
-
-/*========================================================================
- * Overall external interface routines.
- *========================================================================*/
-
-int
-xfs_attr_get(
-	struct xfs_inode	*ip,
-	const unsigned char	*name,
-	unsigned char		*value,
-	int			*valuelenp,
-	int			flags)
-{
-	struct xfs_da_args	args;
-	uint			lock_mode;
-	int			error;
-
-	XFS_STATS_INC(xs_attr_get);
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return EIO;
-
-	if (!xfs_inode_hasattr(ip))
-		return ENOATTR;
-
-	error = xfs_attr_args_init(&args, ip, name, flags);
-	if (error)
-		return error;
-
-	args.value = value;
-	args.valuelen = *valuelenp;
-
-	lock_mode = xfs_ilock_attr_map_shared(ip);
-	if (!xfs_inode_hasattr(ip))
-		error = ENOATTR;
-	else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
-		error = xfs_attr_shortform_getvalue(&args);
-	else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
-		error = xfs_attr_leaf_get(&args);
-	else
-		error = xfs_attr_node_get(&args);
-	xfs_iunlock(ip, lock_mode);
-
-	*valuelenp = args.valuelen;
-	return error == EEXIST ? 0 : error;
-}
-
-/*
- * Calculate how many blocks we need for the new attribute,
- */
-STATIC int
-xfs_attr_calc_size(
-	struct xfs_da_args	*args,
-	int			*local)
-{
-	struct xfs_mount	*mp = args->dp->i_mount;
-	int			size;
-	int			nblks;
-
-	/*
-	 * Determine space new attribute will use, and if it would be
-	 * "local" or "remote" (note: local != inline).
-	 */
-	size = xfs_attr_leaf_newentsize(args, local);
-	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
-	if (*local) {
-		if (size > (args->geo->blksize / 2)) {
-			/* Double split possible */
-			nblks *= 2;
-		}
-	} else {
-		/*
-		 * Out of line attribute, cannot double split, but
-		 * make room for the attribute value itself.
-		 */
-		uint	dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
-		nblks += dblocks;
-		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
-	}
-
-	return nblks;
-}
-
-int
-xfs_attr_set(
-	struct xfs_inode	*dp,
-	const unsigned char	*name,
-	unsigned char		*value,
-	int			valuelen,
-	int			flags)
-{
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_da_args	args;
-	struct xfs_bmap_free	flist;
-	struct xfs_trans_res	tres;
-	xfs_fsblock_t		firstblock;
-	int			rsvd = (flags & ATTR_ROOT) != 0;
-	int			error, err2, committed, local;
-
-	XFS_STATS_INC(xs_attr_set);
-
-	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return EIO;
-
-	error = xfs_attr_args_init(&args, dp, name, flags);
-	if (error)
-		return error;
-
-	args.value = value;
-	args.valuelen = valuelen;
-	args.firstblock = &firstblock;
-	args.flist = &flist;
-	args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-	args.total = xfs_attr_calc_size(&args, &local);
-
-	error = xfs_qm_dqattach(dp, 0);
-	if (error)
-		return error;
-
-	/*
-	 * If the inode doesn't have an attribute fork, add one.
-	 * (inode must not be locked when we call this routine)
-	 */
-	if (XFS_IFORK_Q(dp) == 0) {
-		int sf_size = sizeof(xfs_attr_sf_hdr_t) +
-			XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
-
-		error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * Start our first transaction of the day.
-	 *
-	 * All future transactions during this code must be "chained" off
-	 * this one via the trans_dup() call.  All transactions will contain
-	 * the inode, and the inode will always be marked with trans_ihold().
-	 * Since the inode will be locked in all transactions, we must log
-	 * the inode in every transaction to let it float upward through
-	 * the log.
-	 */
-	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
-
-	/*
-	 * Root fork attributes can use reserved data blocks for this
-	 * operation if necessary
-	 */
-
-	if (rsvd)
-		args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-	tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-			 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-	tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-	error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
-	if (error) {
-		xfs_trans_cancel(args.trans, 0);
-		return error;
-	}
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
-
-	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
-				rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-				       XFS_QMOPT_RES_REGBLKS);
-	if (error) {
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-		xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
-		return error;
-	}
-
-	xfs_trans_ijoin(args.trans, dp, 0);
-
-	/*
-	 * If the attribute list is non-existent or a shortform list,
-	 * upgrade it to a single-leaf-block attribute list.
-	 */
-	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
-
-		/*
-		 * Build initial attribute list (if required).
-		 */
-		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
-			xfs_attr_shortform_create(&args);
-
-		/*
-		 * Try to add the attr to the attribute list in
-		 * the inode.
-		 */
-		error = xfs_attr_shortform_addname(&args);
-		if (error != ENOSPC) {
-			/*
-			 * Commit the shortform mods, and we're done.
-			 * NOTE: this is also the error path (EEXIST, etc).
-			 */
-			ASSERT(args.trans != NULL);
-
-			/*
-			 * If this is a synchronous mount, make sure that
-			 * the transaction goes to disk before returning
-			 * to the user.
-			 */
-			if (mp->m_flags & XFS_MOUNT_WSYNC)
-				xfs_trans_set_sync(args.trans);
-
-			if (!error && (flags & ATTR_KERNOTIME) == 0) {
-				xfs_trans_ichgtime(args.trans, dp,
-							XFS_ICHGTIME_CHG);
-			}
-			err2 = xfs_trans_commit(args.trans,
-						 XFS_TRANS_RELEASE_LOG_RES);
-			xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-			return error ? error : err2;
-		}
-
-		/*
-		 * It won't fit in the shortform, transform to a leaf block.
-		 * GROT: another possible req'mt for a double-split btree op.
-		 */
-		xfs_bmap_init(args.flist, args.firstblock);
-		error = xfs_attr_shortform_to_leaf(&args);
-		if (!error) {
-			error = xfs_bmap_finish(&args.trans, args.flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args.trans = NULL;
-			xfs_bmap_cancel(&flist);
-			goto out;
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args.trans, dp, 0);
-
-		/*
-		 * Commit the leaf transformation.  We'll need another (linked)
-		 * transaction to add the new attribute to the leaf.
-		 */
-
-		error = xfs_trans_roll(&args.trans, dp);
-		if (error)
-			goto out;
-
-	}
-
-	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
-		error = xfs_attr_leaf_addname(&args);
-	else
-		error = xfs_attr_node_addname(&args);
-	if (error)
-		goto out;
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * transaction goes to disk before returning to the user.
-	 */
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
-		xfs_trans_set_sync(args.trans);
-
-	if ((flags & ATTR_KERNOTIME) == 0)
-		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
-
-	/*
-	 * Commit the last in the sequence of transactions.
-	 */
-	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
-	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	return error;
-
-out:
-	if (args.trans) {
-		xfs_trans_cancel(args.trans,
-			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	}
-	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return error;
-}
-
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-int
-xfs_attr_remove(
-	struct xfs_inode	*dp,
-	const unsigned char	*name,
-	int			flags)
-{
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_da_args	args;
-	struct xfs_bmap_free	flist;
-	xfs_fsblock_t		firstblock;
-	int			error;
-
-	XFS_STATS_INC(xs_attr_remove);
-
-	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return EIO;
-
-	if (!xfs_inode_hasattr(dp))
-		return ENOATTR;
-
-	error = xfs_attr_args_init(&args, dp, name, flags);
-	if (error)
-		return error;
-
-	args.firstblock = &firstblock;
-	args.flist = &flist;
-
-	/*
-	 * we have no control over the attribute names that userspace passes us
-	 * to remove, so we have to allow the name lookup prior to attribute
-	 * removal to fail.
-	 */
-	args.op_flags = XFS_DA_OP_OKNOENT;
-
-	error = xfs_qm_dqattach(dp, 0);
-	if (error)
-		return error;
-
-	/*
-	 * Start our first transaction of the day.
-	 *
-	 * All future transactions during this code must be "chained" off
-	 * this one via the trans_dup() call.  All transactions will contain
-	 * the inode, and the inode will always be marked with trans_ihold().
-	 * Since the inode will be locked in all transactions, we must log
-	 * the inode in every transaction to let it float upward through
-	 * the log.
-	 */
-	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-
-	/*
-	 * Root fork attributes can use reserved data blocks for this
-	 * operation if necessary
-	 */
-
-	if (flags & ATTR_ROOT)
-		args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-	error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
-				  XFS_ATTRRM_SPACE_RES(mp), 0);
-	if (error) {
-		xfs_trans_cancel(args.trans, 0);
-		return error;
-	}
-
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
-	/*
-	 * No need to make quota reservations here. We expect to release some
-	 * blocks not allocate in the common case.
-	 */
-	xfs_trans_ijoin(args.trans, dp, 0);
-
-	if (!xfs_inode_hasattr(dp)) {
-		error = ENOATTR;
-	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
-		error = xfs_attr_shortform_remove(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-		error = xfs_attr_leaf_removename(&args);
-	} else {
-		error = xfs_attr_node_removename(&args);
-	}
-
-	if (error)
-		goto out;
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * transaction goes to disk before returning to the user.
-	 */
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
-		xfs_trans_set_sync(args.trans);
-
-	if ((flags & ATTR_KERNOTIME) == 0)
-		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
-
-	/*
-	 * Commit the last in the sequence of transactions.
-	 */
-	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
-	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	return error;
-
-out:
-	if (args.trans) {
-		xfs_trans_cancel(args.trans,
-			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	}
-	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return error;
-}
-
-/*========================================================================
- * External routines when attribute list is inside the inode
- *========================================================================*/
-
-/*
- * Add a name to the shortform attribute list structure
- * This is the external routine.
- */
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
-{
-	int newsize, forkoff, retval;
-
-	trace_xfs_attr_sf_addname(args);
-
-	retval = xfs_attr_shortform_lookup(args);
-	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
-		return retval;
-	} else if (retval == EEXIST) {
-		if (args->flags & ATTR_CREATE)
-			return retval;
-		retval = xfs_attr_shortform_remove(args);
-		ASSERT(retval == 0);
-	}
-
-	if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
-	    args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
-		return ENOSPC;
-
-	newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
-	newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
-
-	forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
-	if (!forkoff)
-		return ENOSPC;
-
-	xfs_attr_shortform_add(args, forkoff);
-	return 0;
-}
-
-
-/*========================================================================
- * External routines when attribute list is one block
- *========================================================================*/
-
-/*
- * Add a name to the leaf attribute list structure
- *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
- */
-STATIC int
-xfs_attr_leaf_addname(xfs_da_args_t *args)
-{
-	xfs_inode_t *dp;
-	struct xfs_buf *bp;
-	int retval, error, committed, forkoff;
-
-	trace_xfs_attr_leaf_addname(args);
-
-	/*
-	 * Read the (only) block in the attribute list in.
-	 */
-	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-	if (error)
-		return error;
-
-	/*
-	 * Look up the given attribute in the leaf block.  Figure out if
-	 * the given flags produce an error or call for an atomic rename.
-	 */
-	retval = xfs_attr3_leaf_lookup_int(bp, args);
-	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
-		xfs_trans_brelse(args->trans, bp);
-		return retval;
-	} else if (retval == EEXIST) {
-		if (args->flags & ATTR_CREATE) {	/* pure create op */
-			xfs_trans_brelse(args->trans, bp);
-			return retval;
-		}
-
-		trace_xfs_attr_leaf_replace(args);
-
-		/* save the attribute state for later removal*/
-		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
-		args->blkno2 = args->blkno;		/* set 2nd entry info*/
-		args->index2 = args->index;
-		args->rmtblkno2 = args->rmtblkno;
-		args->rmtblkcnt2 = args->rmtblkcnt;
-		args->rmtvaluelen2 = args->rmtvaluelen;
-
-		/*
-		 * clear the remote attr state now that it is saved so that the
-		 * values reflect the state of the attribute we are about to
-		 * add, not the attribute we just found and will remove later.
-		 */
-		args->rmtblkno = 0;
-		args->rmtblkcnt = 0;
-		args->rmtvaluelen = 0;
-	}
-
-	/*
-	 * Add the attribute to the leaf block, transitioning to a Btree
-	 * if required.
-	 */
-	retval = xfs_attr3_leaf_add(bp, args);
-	if (retval == ENOSPC) {
-		/*
-		 * Promote the attribute list to the Btree format, then
-		 * Commit that transaction so that the node_addname() call
-		 * can manage its own transactions.
-		 */
-		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_attr3_leaf_to_node(args);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			return error;
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-
-		/*
-		 * Commit the current trans (including the inode) and start
-		 * a new one.
-		 */
-		error = xfs_trans_roll(&args->trans, dp);
-		if (error)
-			return error;
-
-		/*
-		 * Fob the whole rest of the problem off on the Btree code.
-		 */
-		error = xfs_attr_node_addname(args);
-		return error;
-	}
-
-	/*
-	 * Commit the transaction that added the attr name so that
-	 * later routines can manage their own transactions.
-	 */
-	error = xfs_trans_roll(&args->trans, dp);
-	if (error)
-		return error;
-
-	/*
-	 * If there was an out-of-line value, allocate the blocks we
-	 * identified for its storage and copy the value.  This is done
-	 * after we create the attribute so that we don't overflow the
-	 * maximum size of a transaction and/or hit a deadlock.
-	 */
-	if (args->rmtblkno > 0) {
-		error = xfs_attr_rmtval_set(args);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * If this is an atomic rename operation, we must "flip" the
-	 * incomplete flags on the "new" and "old" attribute/value pairs
-	 * so that one disappears and one appears atomically.  Then we
-	 * must remove the "old" attribute/value pair.
-	 */
-	if (args->op_flags & XFS_DA_OP_RENAME) {
-		/*
-		 * In a separate transaction, set the incomplete flag on the
-		 * "old" attr and clear the incomplete flag on the "new" attr.
-		 */
-		error = xfs_attr3_leaf_flipflags(args);
-		if (error)
-			return error;
-
-		/*
-		 * Dismantle the "old" attribute/value pair by removing
-		 * a "remote" value (if it exists).
-		 */
-		args->index = args->index2;
-		args->blkno = args->blkno2;
-		args->rmtblkno = args->rmtblkno2;
-		args->rmtblkcnt = args->rmtblkcnt2;
-		args->rmtvaluelen = args->rmtvaluelen2;
-		if (args->rmtblkno) {
-			error = xfs_attr_rmtval_remove(args);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Read in the block containing the "old" attr, then
-		 * remove the "old" attr from that block (neat, huh!)
-		 */
-		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-					   -1, &bp);
-		if (error)
-			return error;
-
-		xfs_attr3_leaf_remove(bp, args);
-
-		/*
-		 * If the result is small enough, shrink it all into the inode.
-		 */
-		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-			/* bp is gone due to xfs_da_shrink_inode */
-			if (!error) {
-				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
-			if (error) {
-				ASSERT(committed);
-				args->trans = NULL;
-				xfs_bmap_cancel(args->flist);
-				return error;
-			}
-
-			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
-		}
-
-		/*
-		 * Commit the remove and start the next trans in series.
-		 */
-		error = xfs_trans_roll(&args->trans, dp);
-
-	} else if (args->rmtblkno > 0) {
-		/*
-		 * Added a "remote" value, just clear the incomplete flag.
-		 */
-		error = xfs_attr3_leaf_clearflag(args);
-	}
-	return error;
-}
-
-/*
- * Remove a name from the leaf attribute list structure
- *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
- */
-STATIC int
-xfs_attr_leaf_removename(xfs_da_args_t *args)
-{
-	xfs_inode_t *dp;
-	struct xfs_buf *bp;
-	int error, committed, forkoff;
-
-	trace_xfs_attr_leaf_removename(args);
-
-	/*
-	 * Remove the attribute.
-	 */
-	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-	if (error)
-		return error;
-
-	error = xfs_attr3_leaf_lookup_int(bp, args);
-	if (error == ENOATTR) {
-		xfs_trans_brelse(args->trans, bp);
-		return error;
-	}
-
-	xfs_attr3_leaf_remove(bp, args);
-
-	/*
-	 * If the result is small enough, shrink it all into the inode.
-	 */
-	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-		/* bp is gone due to xfs_da_shrink_inode */
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			return error;
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-	}
-	return 0;
-}
-
-/*
- * Look up a name in a leaf attribute list structure.
- *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
- */
-STATIC int
-xfs_attr_leaf_get(xfs_da_args_t *args)
-{
-	struct xfs_buf *bp;
-	int error;
-
-	trace_xfs_attr_leaf_get(args);
-
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-	if (error)
-		return error;
-
-	error = xfs_attr3_leaf_lookup_int(bp, args);
-	if (error != EEXIST)  {
-		xfs_trans_brelse(args->trans, bp);
-		return error;
-	}
-	error = xfs_attr3_leaf_getvalue(bp, args);
-	xfs_trans_brelse(args->trans, bp);
-	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
-		error = xfs_attr_rmtval_get(args);
-	}
-	return error;
-}
-
-/*========================================================================
- * External routines when attribute list size > geo->blksize
- *========================================================================*/
-
-/*
- * Add a name to a Btree-format attribute list.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- */
-STATIC int
-xfs_attr_node_addname(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	xfs_inode_t *dp;
-	xfs_mount_t *mp;
-	int committed, retval, error;
-
-	trace_xfs_attr_node_addname(args);
-
-	/*
-	 * Fill in bucket of arguments/results/context to carry around.
-	 */
-	dp = args->dp;
-	mp = dp->i_mount;
-restart:
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = mp;
-
-	/*
-	 * Search to see if name already exists, and get back a pointer
-	 * to where it should go.
-	 */
-	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error)
-		goto out;
-	blk = &state->path.blk[ state->path.active-1 ];
-	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
-		goto out;
-	} else if (retval == EEXIST) {
-		if (args->flags & ATTR_CREATE)
-			goto out;
-
-		trace_xfs_attr_node_replace(args);
-
-		/* save the attribute state for later removal*/
-		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
-		args->blkno2 = args->blkno;		/* set 2nd entry info*/
-		args->index2 = args->index;
-		args->rmtblkno2 = args->rmtblkno;
-		args->rmtblkcnt2 = args->rmtblkcnt;
-		args->rmtvaluelen2 = args->rmtvaluelen;
-
-		/*
-		 * clear the remote attr state now that it is saved so that the
-		 * values reflect the state of the attribute we are about to
-		 * add, not the attribute we just found and will remove later.
-		 */
-		args->rmtblkno = 0;
-		args->rmtblkcnt = 0;
-		args->rmtvaluelen = 0;
-	}
-
-	retval = xfs_attr3_leaf_add(blk->bp, state->args);
-	if (retval == ENOSPC) {
-		if (state->path.active == 1) {
-			/*
-			 * Its really a single leaf node, but it had
-			 * out-of-line values so it looked like it *might*
-			 * have been a b-tree.
-			 */
-			xfs_da_state_free(state);
-			state = NULL;
-			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_attr3_leaf_to_node(args);
-			if (!error) {
-				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
-			if (error) {
-				ASSERT(committed);
-				args->trans = NULL;
-				xfs_bmap_cancel(args->flist);
-				goto out;
-			}
-
-			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
-
-			/*
-			 * Commit the node conversion and start the next
-			 * trans in the chain.
-			 */
-			error = xfs_trans_roll(&args->trans, dp);
-			if (error)
-				goto out;
-
-			goto restart;
-		}
-
-		/*
-		 * Split as many Btree elements as required.
-		 * This code tracks the new and old attr's location
-		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
-		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
-		 */
-		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_da3_split(state);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			goto out;
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-	} else {
-		/*
-		 * Addition succeeded, update Btree hashvals.
-		 */
-		xfs_da3_fixhashpath(state, &state->path);
-	}
-
-	/*
-	 * Kill the state structure, we're done with it and need to
-	 * allow the buffers to come back later.
-	 */
-	xfs_da_state_free(state);
-	state = NULL;
-
-	/*
-	 * Commit the leaf addition or btree split and start the next
-	 * trans in the chain.
-	 */
-	error = xfs_trans_roll(&args->trans, dp);
-	if (error)
-		goto out;
-
-	/*
-	 * If there was an out-of-line value, allocate the blocks we
-	 * identified for its storage and copy the value.  This is done
-	 * after we create the attribute so that we don't overflow the
-	 * maximum size of a transaction and/or hit a deadlock.
-	 */
-	if (args->rmtblkno > 0) {
-		error = xfs_attr_rmtval_set(args);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * If this is an atomic rename operation, we must "flip" the
-	 * incomplete flags on the "new" and "old" attribute/value pairs
-	 * so that one disappears and one appears atomically.  Then we
-	 * must remove the "old" attribute/value pair.
-	 */
-	if (args->op_flags & XFS_DA_OP_RENAME) {
-		/*
-		 * In a separate transaction, set the incomplete flag on the
-		 * "old" attr and clear the incomplete flag on the "new" attr.
-		 */
-		error = xfs_attr3_leaf_flipflags(args);
-		if (error)
-			goto out;
-
-		/*
-		 * Dismantle the "old" attribute/value pair by removing
-		 * a "remote" value (if it exists).
-		 */
-		args->index = args->index2;
-		args->blkno = args->blkno2;
-		args->rmtblkno = args->rmtblkno2;
-		args->rmtblkcnt = args->rmtblkcnt2;
-		args->rmtvaluelen = args->rmtvaluelen2;
-		if (args->rmtblkno) {
-			error = xfs_attr_rmtval_remove(args);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Re-find the "old" attribute entry after any split ops.
-		 * The INCOMPLETE flag means that we will find the "old"
-		 * attr, not the "new" one.
-		 */
-		args->flags |= XFS_ATTR_INCOMPLETE;
-		state = xfs_da_state_alloc();
-		state->args = args;
-		state->mp = mp;
-		state->inleaf = 0;
-		error = xfs_da3_node_lookup_int(state, &retval);
-		if (error)
-			goto out;
-
-		/*
-		 * Remove the name and update the hashvals in the tree.
-		 */
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-		error = xfs_attr3_leaf_remove(blk->bp, args);
-		xfs_da3_fixhashpath(state, &state->path);
-
-		/*
-		 * Check to see if the tree needs to be collapsed.
-		 */
-		if (retval && (state->path.active > 1)) {
-			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_da3_join(state);
-			if (!error) {
-				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
-			if (error) {
-				ASSERT(committed);
-				args->trans = NULL;
-				xfs_bmap_cancel(args->flist);
-				goto out;
-			}
-
-			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
-		}
-
-		/*
-		 * Commit and start the next trans in the chain.
-		 */
-		error = xfs_trans_roll(&args->trans, dp);
-		if (error)
-			goto out;
-
-	} else if (args->rmtblkno > 0) {
-		/*
-		 * Added a "remote" value, just clear the incomplete flag.
-		 */
-		error = xfs_attr3_leaf_clearflag(args);
-		if (error)
-			goto out;
-	}
-	retval = error = 0;
-
-out:
-	if (state)
-		xfs_da_state_free(state);
-	if (error)
-		return error;
-	return retval;
-}
-
-/*
- * Remove a name from a B-tree attribute list.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_attr_node_removename(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	xfs_inode_t *dp;
-	struct xfs_buf *bp;
-	int retval, error, committed, forkoff;
-
-	trace_xfs_attr_node_removename(args);
-
-	/*
-	 * Tie a string around our finger to remind us where we are.
-	 */
-	dp = args->dp;
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = dp->i_mount;
-
-	/*
-	 * Search to see if name exists, and get back a pointer to it.
-	 */
-	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error || (retval != EEXIST)) {
-		if (error == 0)
-			error = retval;
-		goto out;
-	}
-
-	/*
-	 * If there is an out-of-line value, de-allocate the blocks.
-	 * This is done before we remove the attribute so that we don't
-	 * overflow the maximum size of a transaction and/or hit a deadlock.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	ASSERT(blk->bp != NULL);
-	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-	if (args->rmtblkno > 0) {
-		/*
-		 * Fill in disk block numbers in the state structure
-		 * so that we can get the buffers back after we commit
-		 * several transactions in the following calls.
-		 */
-		error = xfs_attr_fillstate(state);
-		if (error)
-			goto out;
-
-		/*
-		 * Mark the attribute as INCOMPLETE, then bunmapi() the
-		 * remote value.
-		 */
-		error = xfs_attr3_leaf_setflag(args);
-		if (error)
-			goto out;
-		error = xfs_attr_rmtval_remove(args);
-		if (error)
-			goto out;
-
-		/*
-		 * Refill the state structure with buffers, the prior calls
-		 * released our buffers.
-		 */
-		error = xfs_attr_refillstate(state);
-		if (error)
-			goto out;
-	}
-
-	/*
-	 * Remove the name and update the hashvals in the tree.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-	retval = xfs_attr3_leaf_remove(blk->bp, args);
-	xfs_da3_fixhashpath(state, &state->path);
-
-	/*
-	 * Check to see if the tree needs to be collapsed.
-	 */
-	if (retval && (state->path.active > 1)) {
-		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_da3_join(state);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			goto out;
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-
-		/*
-		 * Commit the Btree join operation and start a new trans.
-		 */
-		error = xfs_trans_roll(&args->trans, dp);
-		if (error)
-			goto out;
-	}
-
-	/*
-	 * If the result is small enough, push it all into the inode.
-	 */
-	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-		/*
-		 * Have to get rid of the copy of this dabuf in the state.
-		 */
-		ASSERT(state->path.active == 1);
-		ASSERT(state->path.blk[0].bp);
-		state->path.blk[0].bp = NULL;
-
-		error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
-		if (error)
-			goto out;
-
-		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-			/* bp is gone due to xfs_da_shrink_inode */
-			if (!error) {
-				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
-			if (error) {
-				ASSERT(committed);
-				args->trans = NULL;
-				xfs_bmap_cancel(args->flist);
-				goto out;
-			}
-
-			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
-		} else
-			xfs_trans_brelse(args->trans, bp);
-	}
-	error = 0;
-
-out:
-	xfs_da_state_free(state);
-	return error;
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
-{
-	xfs_da_state_path_t *path;
-	xfs_da_state_blk_t *blk;
-	int level;
-
-	trace_xfs_attr_fillstate(state->args);
-
-	/*
-	 * Roll down the "path" in the state structure, storing the on-disk
-	 * block number for those buffers in the "path".
-	 */
-	path = &state->path;
-	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-		if (blk->bp) {
-			blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
-			blk->bp = NULL;
-		} else {
-			blk->disk_blkno = 0;
-		}
-	}
-
-	/*
-	 * Roll down the "altpath" in the state structure, storing the on-disk
-	 * block number for those buffers in the "altpath".
-	 */
-	path = &state->altpath;
-	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-		if (blk->bp) {
-			blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
-			blk->bp = NULL;
-		} else {
-			blk->disk_blkno = 0;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
-	xfs_da_state_path_t *path;
-	xfs_da_state_blk_t *blk;
-	int level, error;
-
-	trace_xfs_attr_refillstate(state->args);
-
-	/*
-	 * Roll down the "path" in the state structure, storing the on-disk
-	 * block number for those buffers in the "path".
-	 */
-	path = &state->path;
-	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-		if (blk->disk_blkno) {
-			error = xfs_da3_node_read(state->args->trans,
-						state->args->dp,
-						blk->blkno, blk->disk_blkno,
-						&blk->bp, XFS_ATTR_FORK);
-			if (error)
-				return error;
-		} else {
-			blk->bp = NULL;
-		}
-	}
-
-	/*
-	 * Roll down the "altpath" in the state structure, storing the on-disk
-	 * block number for those buffers in the "altpath".
-	 */
-	path = &state->altpath;
-	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
-		if (blk->disk_blkno) {
-			error = xfs_da3_node_read(state->args->trans,
-						state->args->dp,
-						blk->blkno, blk->disk_blkno,
-						&blk->bp, XFS_ATTR_FORK);
-			if (error)
-				return error;
-		} else {
-			blk->bp = NULL;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Look up a filename in a node attribute list.
- *
- * This routine gets called for any attribute fork that has more than one
- * block, ie: both true Btree attr lists and for single-leaf-blocks with
- * "remote" values taking up more blocks.
- */
-STATIC int
-xfs_attr_node_get(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	int error, retval;
-	int i;
-
-	trace_xfs_attr_node_get(args);
-
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-
-	/*
-	 * Search to see if name exists, and get back a pointer to it.
-	 */
-	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error) {
-		retval = error;
-	} else if (retval == EEXIST) {
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->bp != NULL);
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-		/*
-		 * Get the value, local or "remote"
-		 */
-		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
-		if (!retval && (args->rmtblkno > 0)
-		    && !(args->flags & ATTR_KERNOVAL)) {
-			retval = xfs_attr_rmtval_get(args);
-		}
-	}
-
-	/*
-	 * If not in a transaction, we have to release all the buffers.
-	 */
-	for (i = 0; i < state->path.active; i++) {
-		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-
-	xfs_da_state_free(state);
-	return retval;
-}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
deleted file mode 100644
index 127d96aba845..000000000000
--- a/fs/xfs/xfs_attr_leaf.c
+++ /dev/null
@@ -1,2697 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-#include "xfs_dinode.h"
-#include "xfs_dir2.h"
-
-
-/*
- * xfs_attr_leaf.c
- *
- * Routines to implement leaf blocks of attributes as Btrees of hashed names.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
-				 xfs_dablk_t which_block, struct xfs_buf **bpp);
-STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
-				   struct xfs_attr3_icleaf_hdr *ichdr,
-				   struct xfs_da_args *args, int freemap_index);
-STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
-				   struct xfs_attr3_icleaf_hdr *ichdr,
-				   struct xfs_buf *leaf_buffer);
-STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
-						   xfs_da_state_blk_t *blk1,
-						   xfs_da_state_blk_t *blk2);
-STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
-			xfs_da_state_blk_t *leaf_blk_1,
-			struct xfs_attr3_icleaf_hdr *ichdr1,
-			xfs_da_state_blk_t *leaf_blk_2,
-			struct xfs_attr3_icleaf_hdr *ichdr2,
-			int *number_entries_in_blk1,
-			int *number_usedbytes_in_blk1);
-
-/*
- * Utility routines.
- */
-STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
-			struct xfs_attr_leafblock *src_leaf,
-			struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
-			struct xfs_attr_leafblock *dst_leaf,
-			struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
-			int move_count);
-STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
-
-void
-xfs_attr3_leaf_hdr_from_disk(
-	struct xfs_attr3_icleaf_hdr	*to,
-	struct xfs_attr_leafblock	*from)
-{
-	int	i;
-
-	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
-	       from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
-
-	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
-		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
-
-		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-		to->back = be32_to_cpu(hdr3->info.hdr.back);
-		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-		to->count = be16_to_cpu(hdr3->count);
-		to->usedbytes = be16_to_cpu(hdr3->usedbytes);
-		to->firstused = be16_to_cpu(hdr3->firstused);
-		to->holes = hdr3->holes;
-
-		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-			to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
-			to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
-		}
-		return;
-	}
-	to->forw = be32_to_cpu(from->hdr.info.forw);
-	to->back = be32_to_cpu(from->hdr.info.back);
-	to->magic = be16_to_cpu(from->hdr.info.magic);
-	to->count = be16_to_cpu(from->hdr.count);
-	to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
-	to->firstused = be16_to_cpu(from->hdr.firstused);
-	to->holes = from->hdr.holes;
-
-	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-		to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
-		to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
-	}
-}
-
-void
-xfs_attr3_leaf_hdr_to_disk(
-	struct xfs_attr_leafblock	*to,
-	struct xfs_attr3_icleaf_hdr	*from)
-{
-	int	i;
-
-	ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
-	       from->magic == XFS_ATTR3_LEAF_MAGIC);
-
-	if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
-		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
-
-		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-		hdr3->info.hdr.back = cpu_to_be32(from->back);
-		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-		hdr3->count = cpu_to_be16(from->count);
-		hdr3->usedbytes = cpu_to_be16(from->usedbytes);
-		hdr3->firstused = cpu_to_be16(from->firstused);
-		hdr3->holes = from->holes;
-		hdr3->pad1 = 0;
-
-		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-			hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
-			hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
-		}
-		return;
-	}
-	to->hdr.info.forw = cpu_to_be32(from->forw);
-	to->hdr.info.back = cpu_to_be32(from->back);
-	to->hdr.info.magic = cpu_to_be16(from->magic);
-	to->hdr.count = cpu_to_be16(from->count);
-	to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
-	to->hdr.firstused = cpu_to_be16(from->firstused);
-	to->hdr.holes = from->holes;
-	to->hdr.pad1 = 0;
-
-	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-		to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
-		to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
-	}
-}
-
-static bool
-xfs_attr3_leaf_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_attr_leafblock *leaf = bp->b_addr;
-	struct xfs_attr3_icleaf_hdr ichdr;
-
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-		if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
-			return false;
-
-		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-			return false;
-	} else {
-		if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
-			return false;
-	}
-	if (ichdr.count == 0)
-		return false;
-
-	/* XXX: need to range check rest of attr header values */
-	/* XXX: hash order check? */
-
-	return true;
-}
-
-static void
-xfs_attr3_leaf_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
-
-	if (!xfs_attr3_leaf_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
-}
-
-/*
- * leaf/node format detection on trees is sketchy, so a node read can be done on
- * leaf level blocks when detection identifies the tree as a node format tree
- * incorrectly. In this case, we need to swap the verifier to match the correct
- * format of the block being read.
- */
-static void
-xfs_attr3_leaf_read_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	     !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_attr3_leaf_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
-	.verify_read = xfs_attr3_leaf_read_verify,
-	.verify_write = xfs_attr3_leaf_write_verify,
-};
-
-int
-xfs_attr3_leaf_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mappedbno,
-	struct xfs_buf		**bpp)
-{
-	int			err;
-
-	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-				XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
-	if (!err && tp)
-		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
-	return err;
-}
-
-/*========================================================================
- * Namespace helper routines
- *========================================================================*/
-
-/*
- * If namespace bits don't match return 0.
- * If all match then return 1.
- */
-STATIC int
-xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
-{
-	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
-}
-
-
-/*========================================================================
- * External routines when attribute fork size < XFS_LITINO(mp).
- *========================================================================*/
-
-/*
- * Query whether the requested number of additional bytes of extended
- * attribute space will be able to fit inline.
- *
- * Returns zero if not, else the di_forkoff fork offset to be used in the
- * literal area for attribute data once the new bytes have been added.
- *
- * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
- * special case for dev/uuid inodes, they have fixed size data forks.
- */
-int
-xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
-{
-	int offset;
-	int minforkoff;	/* lower limit on valid forkoff locations */
-	int maxforkoff;	/* upper limit on valid forkoff locations */
-	int dsize;
-	xfs_mount_t *mp = dp->i_mount;
-
-	/* rounded down */
-	offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
-
-	switch (dp->i_d.di_format) {
-	case XFS_DINODE_FMT_DEV:
-		minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
-		return (offset >= minforkoff) ? minforkoff : 0;
-	case XFS_DINODE_FMT_UUID:
-		minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
-		return (offset >= minforkoff) ? minforkoff : 0;
-	}
-
-	/*
-	 * If the requested numbers of bytes is smaller or equal to the
-	 * current attribute fork size we can always proceed.
-	 *
-	 * Note that if_bytes in the data fork might actually be larger than
-	 * the current data fork size is due to delalloc extents. In that
-	 * case either the extent count will go down when they are converted
-	 * to real extents, or the delalloc conversion will take care of the
-	 * literal area rebalancing.
-	 */
-	if (bytes <= XFS_IFORK_ASIZE(dp))
-		return dp->i_d.di_forkoff;
-
-	/*
-	 * For attr2 we can try to move the forkoff if there is space in the
-	 * literal area, but for the old format we are done if there is no
-	 * space in the fixed attribute fork.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_ATTR2))
-		return 0;
-
-	dsize = dp->i_df.if_bytes;
-
-	switch (dp->i_d.di_format) {
-	case XFS_DINODE_FMT_EXTENTS:
-		/*
-		 * If there is no attr fork and the data fork is extents, 
-		 * determine if creating the default attr fork will result
-		 * in the extents form migrating to btree. If so, the
-		 * minimum offset only needs to be the space required for
-		 * the btree root.
-		 */
-		if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
-		    xfs_default_attroffset(dp))
-			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
-		break;
-	case XFS_DINODE_FMT_BTREE:
-		/*
-		 * If we have a data btree then keep forkoff if we have one,
-		 * otherwise we are adding a new attr, so then we set
-		 * minforkoff to where the btree root can finish so we have
-		 * plenty of room for attrs
-		 */
-		if (dp->i_d.di_forkoff) {
-			if (offset < dp->i_d.di_forkoff)
-				return 0;
-			return dp->i_d.di_forkoff;
-		}
-		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
-		break;
-	}
-
-	/*
-	 * A data fork btree root must have space for at least
-	 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
-	 */
-	minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
-	minforkoff = roundup(minforkoff, 8) >> 3;
-
-	/* attr fork btree root can have at least this many key/ptr pairs */
-	maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
-			XFS_BMDR_SPACE_CALC(MINABTPTRS);
-	maxforkoff = maxforkoff >> 3;	/* rounded down */
-
-	if (offset >= maxforkoff)
-		return maxforkoff;
-	if (offset >= minforkoff)
-		return offset;
-	return 0;
-}
-
-/*
- * Switch on the ATTR2 superblock bit (implies also FEATURES2)
- */
-STATIC void
-xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
-{
-	if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
-	    !(xfs_sb_version_hasattr2(&mp->m_sb))) {
-		spin_lock(&mp->m_sb_lock);
-		if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
-			xfs_sb_version_addattr2(&mp->m_sb);
-			spin_unlock(&mp->m_sb_lock);
-			xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
-		} else
-			spin_unlock(&mp->m_sb_lock);
-	}
-}
-
-/*
- * Create the initial contents of a shortform attribute list.
- */
-void
-xfs_attr_shortform_create(xfs_da_args_t *args)
-{
-	xfs_attr_sf_hdr_t *hdr;
-	xfs_inode_t *dp;
-	xfs_ifork_t *ifp;
-
-	trace_xfs_attr_sf_create(args);
-
-	dp = args->dp;
-	ASSERT(dp != NULL);
-	ifp = dp->i_afp;
-	ASSERT(ifp != NULL);
-	ASSERT(ifp->if_bytes == 0);
-	if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
-		ifp->if_flags &= ~XFS_IFEXTENTS;	/* just in case */
-		dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
-		ifp->if_flags |= XFS_IFINLINE;
-	} else {
-		ASSERT(ifp->if_flags & XFS_IFINLINE);
-	}
-	xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
-	hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
-	hdr->count = 0;
-	hdr->totsize = cpu_to_be16(sizeof(*hdr));
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
-}
-
-/*
- * Add a name/value pair to the shortform attribute list.
- * Overflow from the inode has already been checked for.
- */
-void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
-{
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i, offset, size;
-	xfs_mount_t *mp;
-	xfs_inode_t *dp;
-	xfs_ifork_t *ifp;
-
-	trace_xfs_attr_sf_add(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	dp->i_d.di_forkoff = forkoff;
-
-	ifp = dp->i_afp;
-	ASSERT(ifp->if_flags & XFS_IFINLINE);
-	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-#ifdef DEBUG
-		if (sfe->namelen != args->namelen)
-			continue;
-		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
-			continue;
-		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-			continue;
-		ASSERT(0);
-#endif
-	}
-
-	offset = (char *)sfe - (char *)sf;
-	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
-	xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
-	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-	sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
-
-	sfe->namelen = args->namelen;
-	sfe->valuelen = args->valuelen;
-	sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
-	memcpy(sfe->nameval, args->name, args->namelen);
-	memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
-	sf->hdr.count++;
-	be16_add_cpu(&sf->hdr.totsize, size);
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
-
-	xfs_sbversion_add_attr2(mp, args->trans);
-}
-
-/*
- * After the last attribute is removed revert to original inode format,
- * making all literal area available to the data fork once more.
- */
-STATIC void
-xfs_attr_fork_reset(
-	struct xfs_inode	*ip,
-	struct xfs_trans	*tp)
-{
-	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-	ip->i_d.di_forkoff = 0;
-	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-
-	ASSERT(ip->i_d.di_anextents == 0);
-	ASSERT(ip->i_afp == NULL);
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
-/*
- * Remove an attribute from the shortform attribute list structure.
- */
-int
-xfs_attr_shortform_remove(xfs_da_args_t *args)
-{
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int base, size=0, end, totsize, i;
-	xfs_mount_t *mp;
-	xfs_inode_t *dp;
-
-	trace_xfs_attr_sf_remove(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	base = sizeof(xfs_attr_sf_hdr_t);
-	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
-	sfe = &sf->list[0];
-	end = sf->hdr.count;
-	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
-					base += size, i++) {
-		size = XFS_ATTR_SF_ENTSIZE(sfe);
-		if (sfe->namelen != args->namelen)
-			continue;
-		if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
-			continue;
-		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-			continue;
-		break;
-	}
-	if (i == end)
-		return ENOATTR;
-
-	/*
-	 * Fix up the attribute fork data, covering the hole
-	 */
-	end = base + size;
-	totsize = be16_to_cpu(sf->hdr.totsize);
-	if (end != totsize)
-		memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
-	sf->hdr.count--;
-	be16_add_cpu(&sf->hdr.totsize, -size);
-
-	/*
-	 * Fix up the start offset of the attribute fork
-	 */
-	totsize -= size;
-	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
-	    (mp->m_flags & XFS_MOUNT_ATTR2) &&
-	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
-	    !(args->op_flags & XFS_DA_OP_ADDNAME)) {
-		xfs_attr_fork_reset(dp, args->trans);
-	} else {
-		xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
-		dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
-		ASSERT(dp->i_d.di_forkoff);
-		ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
-				(args->op_flags & XFS_DA_OP_ADDNAME) ||
-				!(mp->m_flags & XFS_MOUNT_ATTR2) ||
-				dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
-		xfs_trans_log_inode(args->trans, dp,
-					XFS_ILOG_CORE | XFS_ILOG_ADATA);
-	}
-
-	xfs_sbversion_add_attr2(mp, args->trans);
-
-	return 0;
-}
-
-/*
- * Look up a name in a shortform attribute list structure.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_lookup(xfs_da_args_t *args)
-{
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i;
-	xfs_ifork_t *ifp;
-
-	trace_xfs_attr_sf_lookup(args);
-
-	ifp = args->dp->i_afp;
-	ASSERT(ifp->if_flags & XFS_IFINLINE);
-	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count;
-				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-		if (sfe->namelen != args->namelen)
-			continue;
-		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
-			continue;
-		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-			continue;
-		return EEXIST;
-	}
-	return ENOATTR;
-}
-
-/*
- * Look up a name in a shortform attribute list structure.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
-{
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i;
-
-	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
-	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count;
-				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-		if (sfe->namelen != args->namelen)
-			continue;
-		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
-			continue;
-		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
-			continue;
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = sfe->valuelen;
-			return EEXIST;
-		}
-		if (args->valuelen < sfe->valuelen) {
-			args->valuelen = sfe->valuelen;
-			return ERANGE;
-		}
-		args->valuelen = sfe->valuelen;
-		memcpy(args->value, &sfe->nameval[args->namelen],
-						    args->valuelen);
-		return EEXIST;
-	}
-	return ENOATTR;
-}
-
-/*
- * Convert from using the shortform to the leaf.
- */
-int
-xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
-{
-	xfs_inode_t *dp;
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	xfs_da_args_t nargs;
-	char *tmpbuffer;
-	int error, i, size;
-	xfs_dablk_t blkno;
-	struct xfs_buf *bp;
-	xfs_ifork_t *ifp;
-
-	trace_xfs_attr_sf_to_leaf(args);
-
-	dp = args->dp;
-	ifp = dp->i_afp;
-	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-	size = be16_to_cpu(sf->hdr.totsize);
-	tmpbuffer = kmem_alloc(size, KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
-	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
-	sf = (xfs_attr_shortform_t *)tmpbuffer;
-
-	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
-	xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
-
-	bp = NULL;
-	error = xfs_da_grow_inode(args, &blkno);
-	if (error) {
-		/*
-		 * If we hit an IO error middle of the transaction inside
-		 * grow_inode(), we may have inconsistent data. Bail out.
-		 */
-		if (error == EIO)
-			goto out;
-		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
-		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
-		goto out;
-	}
-
-	ASSERT(blkno == 0);
-	error = xfs_attr3_leaf_create(args, blkno, &bp);
-	if (error) {
-		error = xfs_da_shrink_inode(args, 0, bp);
-		bp = NULL;
-		if (error)
-			goto out;
-		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
-		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
-		goto out;
-	}
-
-	memset((char *)&nargs, 0, sizeof(nargs));
-	nargs.dp = dp;
-	nargs.geo = args->geo;
-	nargs.firstblock = args->firstblock;
-	nargs.flist = args->flist;
-	nargs.total = args->total;
-	nargs.whichfork = XFS_ATTR_FORK;
-	nargs.trans = args->trans;
-	nargs.op_flags = XFS_DA_OP_OKNOENT;
-
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; i++) {
-		nargs.name = sfe->nameval;
-		nargs.namelen = sfe->namelen;
-		nargs.value = &sfe->nameval[nargs.namelen];
-		nargs.valuelen = sfe->valuelen;
-		nargs.hashval = xfs_da_hashname(sfe->nameval,
-						sfe->namelen);
-		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
-		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
-		ASSERT(error == ENOATTR);
-		error = xfs_attr3_leaf_add(bp, &nargs);
-		ASSERT(error != ENOSPC);
-		if (error)
-			goto out;
-		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-	}
-	error = 0;
-
-out:
-	kmem_free(tmpbuffer);
-	return error;
-}
-
-/*
- * Check a leaf attribute block to see if all the entries would fit into
- * a shortform attribute list.
- */
-int
-xfs_attr_shortform_allfit(
-	struct xfs_buf		*bp,
-	struct xfs_inode	*dp)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr_leaf_entry *entry;
-	xfs_attr_leaf_name_local_t *name_loc;
-	struct xfs_attr3_icleaf_hdr leafhdr;
-	int			bytes;
-	int			i;
-
-	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
-	entry = xfs_attr3_leaf_entryp(leaf);
-
-	bytes = sizeof(struct xfs_attr_sf_hdr);
-	for (i = 0; i < leafhdr.count; entry++, i++) {
-		if (entry->flags & XFS_ATTR_INCOMPLETE)
-			continue;		/* don't copy partial entries */
-		if (!(entry->flags & XFS_ATTR_LOCAL))
-			return 0;
-		name_loc = xfs_attr3_leaf_name_local(leaf, i);
-		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
-			return 0;
-		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
-			return 0;
-		bytes += sizeof(struct xfs_attr_sf_entry) - 1
-				+ name_loc->namelen
-				+ be16_to_cpu(name_loc->valuelen);
-	}
-	if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
-	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
-	    (bytes == sizeof(struct xfs_attr_sf_hdr)))
-		return -1;
-	return xfs_attr_shortform_bytesfit(dp, bytes);
-}
-
-/*
- * Convert a leaf attribute list to shortform attribute list
- */
-int
-xfs_attr3_leaf_to_shortform(
-	struct xfs_buf		*bp,
-	struct xfs_da_args	*args,
-	int			forkoff)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr3_icleaf_hdr ichdr;
-	struct xfs_attr_leaf_entry *entry;
-	struct xfs_attr_leaf_name_local *name_loc;
-	struct xfs_da_args	nargs;
-	struct xfs_inode	*dp = args->dp;
-	char			*tmpbuffer;
-	int			error;
-	int			i;
-
-	trace_xfs_attr_leaf_to_sf(args);
-
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
-	if (!tmpbuffer)
-		return ENOMEM;
-
-	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
-
-	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-	entry = xfs_attr3_leaf_entryp(leaf);
-
-	/* XXX (dgc): buffer is about to be marked stale - why zero it? */
-	memset(bp->b_addr, 0, args->geo->blksize);
-
-	/*
-	 * Clean out the prior contents of the attribute list.
-	 */
-	error = xfs_da_shrink_inode(args, 0, bp);
-	if (error)
-		goto out;
-
-	if (forkoff == -1) {
-		ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
-		ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
-		xfs_attr_fork_reset(dp, args->trans);
-		goto out;
-	}
-
-	xfs_attr_shortform_create(args);
-
-	/*
-	 * Copy the attributes
-	 */
-	memset((char *)&nargs, 0, sizeof(nargs));
-	nargs.geo = args->geo;
-	nargs.dp = dp;
-	nargs.firstblock = args->firstblock;
-	nargs.flist = args->flist;
-	nargs.total = args->total;
-	nargs.whichfork = XFS_ATTR_FORK;
-	nargs.trans = args->trans;
-	nargs.op_flags = XFS_DA_OP_OKNOENT;
-
-	for (i = 0; i < ichdr.count; entry++, i++) {
-		if (entry->flags & XFS_ATTR_INCOMPLETE)
-			continue;	/* don't copy partial entries */
-		if (!entry->nameidx)
-			continue;
-		ASSERT(entry->flags & XFS_ATTR_LOCAL);
-		name_loc = xfs_attr3_leaf_name_local(leaf, i);
-		nargs.name = name_loc->nameval;
-		nargs.namelen = name_loc->namelen;
-		nargs.value = &name_loc->nameval[nargs.namelen];
-		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
-		nargs.hashval = be32_to_cpu(entry->hashval);
-		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
-		xfs_attr_shortform_add(&nargs, forkoff);
-	}
-	error = 0;
-
-out:
-	kmem_free(tmpbuffer);
-	return error;
-}
-
-/*
- * Convert from using a single leaf to a root node and a leaf.
- */
-int
-xfs_attr3_leaf_to_node(
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr3_icleaf_hdr icleafhdr;
-	struct xfs_attr_leaf_entry *entries;
-	struct xfs_da_node_entry *btree;
-	struct xfs_da3_icnode_hdr icnodehdr;
-	struct xfs_da_intnode	*node;
-	struct xfs_inode	*dp = args->dp;
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_buf		*bp1 = NULL;
-	struct xfs_buf		*bp2 = NULL;
-	xfs_dablk_t		blkno;
-	int			error;
-
-	trace_xfs_attr_leaf_to_node(args);
-
-	error = xfs_da_grow_inode(args, &blkno);
-	if (error)
-		goto out;
-	error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
-	if (error)
-		goto out;
-
-	error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
-	if (error)
-		goto out;
-
-	/* copy leaf to new buffer, update identifiers */
-	xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
-	bp2->b_ops = bp1->b_ops;
-	memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
-		hdr3->blkno = cpu_to_be64(bp2->b_bn);
-	}
-	xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
-
-	/*
-	 * Set up the new root node.
-	 */
-	error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
-	if (error)
-		goto out;
-	node = bp1->b_addr;
-	dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
-	btree = dp->d_ops->node_tree_p(node);
-
-	leaf = bp2->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
-	entries = xfs_attr3_leaf_entryp(leaf);
-
-	/* both on-disk, don't endian-flip twice */
-	btree[0].hashval = entries[icleafhdr.count - 1].hashval;
-	btree[0].before = cpu_to_be32(blkno);
-	icnodehdr.count = 1;
-	dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
-	xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
-	error = 0;
-out:
-	return error;
-}
-
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-
-/*
- * Create the initial contents of a leaf attribute list
- * or a leaf in a node attribute list.
- */
-STATIC int
-xfs_attr3_leaf_create(
-	struct xfs_da_args	*args,
-	xfs_dablk_t		blkno,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr3_icleaf_hdr ichdr;
-	struct xfs_inode	*dp = args->dp;
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_buf		*bp;
-	int			error;
-
-	trace_xfs_attr_leaf_create(args);
-
-	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
-					    XFS_ATTR_FORK);
-	if (error)
-		return error;
-	bp->b_ops = &xfs_attr3_leaf_buf_ops;
-	xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
-	leaf = bp->b_addr;
-	memset(leaf, 0, args->geo->blksize);
-
-	memset(&ichdr, 0, sizeof(ichdr));
-	ichdr.firstused = args->geo->blksize;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
-
-		ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
-
-		hdr3->blkno = cpu_to_be64(bp->b_bn);
-		hdr3->owner = cpu_to_be64(dp->i_ino);
-		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
-
-		ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
-	} else {
-		ichdr.magic = XFS_ATTR_LEAF_MAGIC;
-		ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
-	}
-	ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
-
-	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
-	xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
-
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Split the leaf node, rebalance, then add the new entry.
- */
-int
-xfs_attr3_leaf_split(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*oldblk,
-	struct xfs_da_state_blk	*newblk)
-{
-	xfs_dablk_t blkno;
-	int error;
-
-	trace_xfs_attr_leaf_split(state->args);
-
-	/*
-	 * Allocate space for a new leaf node.
-	 */
-	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
-	error = xfs_da_grow_inode(state->args, &blkno);
-	if (error)
-		return error;
-	error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
-	if (error)
-		return error;
-	newblk->blkno = blkno;
-	newblk->magic = XFS_ATTR_LEAF_MAGIC;
-
-	/*
-	 * Rebalance the entries across the two leaves.
-	 * NOTE: rebalance() currently depends on the 2nd block being empty.
-	 */
-	xfs_attr3_leaf_rebalance(state, oldblk, newblk);
-	error = xfs_da3_blk_link(state, oldblk, newblk);
-	if (error)
-		return error;
-
-	/*
-	 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
-	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
-	 * "new" attrs info.  Will need the "old" info to remove it later.
-	 *
-	 * Insert the "new" entry in the correct block.
-	 */
-	if (state->inleaf) {
-		trace_xfs_attr_leaf_add_old(state->args);
-		error = xfs_attr3_leaf_add(oldblk->bp, state->args);
-	} else {
-		trace_xfs_attr_leaf_add_new(state->args);
-		error = xfs_attr3_leaf_add(newblk->bp, state->args);
-	}
-
-	/*
-	 * Update last hashval in each block since we added the name.
-	 */
-	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
-	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
-	return error;
-}
-
-/*
- * Add a name to the leaf attribute list structure.
- */
-int
-xfs_attr3_leaf_add(
-	struct xfs_buf		*bp,
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr3_icleaf_hdr ichdr;
-	int			tablesize;
-	int			entsize;
-	int			sum;
-	int			tmp;
-	int			i;
-
-	trace_xfs_attr_leaf_add(args);
-
-	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-	ASSERT(args->index >= 0 && args->index <= ichdr.count);
-	entsize = xfs_attr_leaf_newentsize(args, NULL);
-
-	/*
-	 * Search through freemap for first-fit on new name length.
-	 * (may need to figure in size of entry struct too)
-	 */
-	tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
-					+ xfs_attr3_leaf_hdr_size(leaf);
-	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
-		if (tablesize > ichdr.firstused) {
-			sum += ichdr.freemap[i].size;
-			continue;
-		}
-		if (!ichdr.freemap[i].size)
-			continue;	/* no space in this map */
-		tmp = entsize;
-		if (ichdr.freemap[i].base < ichdr.firstused)
-			tmp += sizeof(xfs_attr_leaf_entry_t);
-		if (ichdr.freemap[i].size >= tmp) {
-			tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
-			goto out_log_hdr;
-		}
-		sum += ichdr.freemap[i].size;
-	}
-
-	/*
-	 * If there are no holes in the address space of the block,
-	 * and we don't have enough freespace, then compaction will do us
-	 * no good and we should just give up.
-	 */
-	if (!ichdr.holes && sum < entsize)
-		return ENOSPC;
-
-	/*
-	 * Compact the entries to coalesce free space.
-	 * This may change the hdr->count via dropping INCOMPLETE entries.
-	 */
-	xfs_attr3_leaf_compact(args, &ichdr, bp);
-
-	/*
-	 * After compaction, the block is guaranteed to have only one
-	 * free region, in freemap[0].  If it is not big enough, give up.
-	 */
-	if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
-		tmp = ENOSPC;
-		goto out_log_hdr;
-	}
-
-	tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
-
-out_log_hdr:
-	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
-	xfs_trans_log_buf(args->trans, bp,
-		XFS_DA_LOGRANGE(leaf, &leaf->hdr,
-				xfs_attr3_leaf_hdr_size(leaf)));
-	return tmp;
-}
-
-/*
- * Add a name to a leaf attribute list structure.
- */
-STATIC int
-xfs_attr3_leaf_add_work(
-	struct xfs_buf		*bp,
-	struct xfs_attr3_icleaf_hdr *ichdr,
-	struct xfs_da_args	*args,
-	int			mapindex)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr_leaf_entry *entry;
-	struct xfs_attr_leaf_name_local *name_loc;
-	struct xfs_attr_leaf_name_remote *name_rmt;
-	struct xfs_mount	*mp;
-	int			tmp;
-	int			i;
-
-	trace_xfs_attr_leaf_add_work(args);
-
-	leaf = bp->b_addr;
-	ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
-	ASSERT(args->index >= 0 && args->index <= ichdr->count);
-
-	/*
-	 * Force open some space in the entry array and fill it in.
-	 */
-	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-	if (args->index < ichdr->count) {
-		tmp  = ichdr->count - args->index;
-		tmp *= sizeof(xfs_attr_leaf_entry_t);
-		memmove(entry + 1, entry, tmp);
-		xfs_trans_log_buf(args->trans, bp,
-		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
-	}
-	ichdr->count++;
-
-	/*
-	 * Allocate space for the new string (at the end of the run).
-	 */
-	mp = args->trans->t_mountp;
-	ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
-	ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
-	ASSERT(ichdr->freemap[mapindex].size >=
-		xfs_attr_leaf_newentsize(args, NULL));
-	ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
-	ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
-
-	ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
-
-	entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
-				     ichdr->freemap[mapindex].size);
-	entry->hashval = cpu_to_be32(args->hashval);
-	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
-	entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
-	if (args->op_flags & XFS_DA_OP_RENAME) {
-		entry->flags |= XFS_ATTR_INCOMPLETE;
-		if ((args->blkno2 == args->blkno) &&
-		    (args->index2 <= args->index)) {
-			args->index2++;
-		}
-	}
-	xfs_trans_log_buf(args->trans, bp,
-			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-	ASSERT((args->index == 0) ||
-	       (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
-	ASSERT((args->index == ichdr->count - 1) ||
-	       (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
-
-	/*
-	 * For "remote" attribute values, simply note that we need to
-	 * allocate space for the "remote" value.  We can't actually
-	 * allocate the extents in this transaction, and we can't decide
-	 * which blocks they should be as we might allocate more blocks
-	 * as part of this transaction (a split operation for example).
-	 */
-	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
-		name_loc->namelen = args->namelen;
-		name_loc->valuelen = cpu_to_be16(args->valuelen);
-		memcpy((char *)name_loc->nameval, args->name, args->namelen);
-		memcpy((char *)&name_loc->nameval[args->namelen], args->value,
-				   be16_to_cpu(name_loc->valuelen));
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		name_rmt->namelen = args->namelen;
-		memcpy((char *)name_rmt->name, args->name, args->namelen);
-		entry->flags |= XFS_ATTR_INCOMPLETE;
-		/* just in case */
-		name_rmt->valuelen = 0;
-		name_rmt->valueblk = 0;
-		args->rmtblkno = 1;
-		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
-		args->rmtvaluelen = args->valuelen;
-	}
-	xfs_trans_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
-				   xfs_attr_leaf_entsize(leaf, args->index)));
-
-	/*
-	 * Update the control info for this leaf node
-	 */
-	if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
-		ichdr->firstused = be16_to_cpu(entry->nameidx);
-
-	ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
-					+ xfs_attr3_leaf_hdr_size(leaf));
-	tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
-					+ xfs_attr3_leaf_hdr_size(leaf);
-
-	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-		if (ichdr->freemap[i].base == tmp) {
-			ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
-			ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
-		}
-	}
-	ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
-	return 0;
-}
-
-/*
- * Garbage collect a leaf attribute list block by copying it to a new buffer.
- */
-STATIC void
-xfs_attr3_leaf_compact(
-	struct xfs_da_args	*args,
-	struct xfs_attr3_icleaf_hdr *ichdr_dst,
-	struct xfs_buf		*bp)
-{
-	struct xfs_attr_leafblock *leaf_src;
-	struct xfs_attr_leafblock *leaf_dst;
-	struct xfs_attr3_icleaf_hdr ichdr_src;
-	struct xfs_trans	*trans = args->trans;
-	char			*tmpbuffer;
-
-	trace_xfs_attr_leaf_compact(args);
-
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
-	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
-	memset(bp->b_addr, 0, args->geo->blksize);
-	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
-	leaf_dst = bp->b_addr;
-
-	/*
-	 * Copy the on-disk header back into the destination buffer to ensure
-	 * all the information in the header that is not part of the incore
-	 * header structure is preserved.
-	 */
-	memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
-
-	/* Initialise the incore headers */
-	ichdr_src = *ichdr_dst;	/* struct copy */
-	ichdr_dst->firstused = args->geo->blksize;
-	ichdr_dst->usedbytes = 0;
-	ichdr_dst->count = 0;
-	ichdr_dst->holes = 0;
-	ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
-	ichdr_dst->freemap[0].size = ichdr_dst->firstused -
-						ichdr_dst->freemap[0].base;
-
-	/* write the header back to initialise the underlying buffer */
-	xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
-
-	/*
-	 * Copy all entry's in the same (sorted) order,
-	 * but allocate name/value pairs packed and in sequence.
-	 */
-	xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
-				leaf_dst, ichdr_dst, 0, ichdr_src.count);
-	/*
-	 * this logs the entire buffer, but the caller must write the header
-	 * back to the buffer when it is finished modifying it.
-	 */
-	xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
-
-	kmem_free(tmpbuffer);
-}
-
-/*
- * Compare two leaf blocks "order".
- * Return 0 unless leaf2 should go before leaf1.
- */
-static int
-xfs_attr3_leaf_order(
-	struct xfs_buf	*leaf1_bp,
-	struct xfs_attr3_icleaf_hdr *leaf1hdr,
-	struct xfs_buf	*leaf2_bp,
-	struct xfs_attr3_icleaf_hdr *leaf2hdr)
-{
-	struct xfs_attr_leaf_entry *entries1;
-	struct xfs_attr_leaf_entry *entries2;
-
-	entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
-	entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
-	if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
-	    ((be32_to_cpu(entries2[0].hashval) <
-	      be32_to_cpu(entries1[0].hashval)) ||
-	     (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
-	      be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
-		return 1;
-	}
-	return 0;
-}
-
-int
-xfs_attr_leaf_order(
-	struct xfs_buf	*leaf1_bp,
-	struct xfs_buf	*leaf2_bp)
-{
-	struct xfs_attr3_icleaf_hdr ichdr1;
-	struct xfs_attr3_icleaf_hdr ichdr2;
-
-	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
-	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
-	return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
-}
-
-/*
- * Redistribute the attribute list entries between two leaf nodes,
- * taking into account the size of the new entry.
- *
- * NOTE: if new block is empty, then it will get the upper half of the
- * old block.  At present, all (one) callers pass in an empty second block.
- *
- * This code adjusts the args->index/blkno and args->index2/blkno2 fields
- * to match what it is doing in splitting the attribute leaf block.  Those
- * values are used in "atomic rename" operations on attributes.  Note that
- * the "new" and "old" values can end up in different blocks.
- */
-STATIC void
-xfs_attr3_leaf_rebalance(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*blk1,
-	struct xfs_da_state_blk	*blk2)
-{
-	struct xfs_da_args	*args;
-	struct xfs_attr_leafblock *leaf1;
-	struct xfs_attr_leafblock *leaf2;
-	struct xfs_attr3_icleaf_hdr ichdr1;
-	struct xfs_attr3_icleaf_hdr ichdr2;
-	struct xfs_attr_leaf_entry *entries1;
-	struct xfs_attr_leaf_entry *entries2;
-	int			count;
-	int			totallen;
-	int			max;
-	int			space;
-	int			swap;
-
-	/*
-	 * Set up environment.
-	 */
-	ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
-	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
-	leaf1 = blk1->bp->b_addr;
-	leaf2 = blk2->bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
-	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
-	ASSERT(ichdr2.count == 0);
-	args = state->args;
-
-	trace_xfs_attr_leaf_rebalance(args);
-
-	/*
-	 * Check ordering of blocks, reverse if it makes things simpler.
-	 *
-	 * NOTE: Given that all (current) callers pass in an empty
-	 * second block, this code should never set "swap".
-	 */
-	swap = 0;
-	if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
-		struct xfs_da_state_blk	*tmp_blk;
-		struct xfs_attr3_icleaf_hdr tmp_ichdr;
-
-		tmp_blk = blk1;
-		blk1 = blk2;
-		blk2 = tmp_blk;
-
-		/* struct copies to swap them rather than reconverting */
-		tmp_ichdr = ichdr1;
-		ichdr1 = ichdr2;
-		ichdr2 = tmp_ichdr;
-
-		leaf1 = blk1->bp->b_addr;
-		leaf2 = blk2->bp->b_addr;
-		swap = 1;
-	}
-
-	/*
-	 * Examine entries until we reduce the absolute difference in
-	 * byte usage between the two blocks to a minimum.  Then get
-	 * the direction to copy and the number of elements to move.
-	 *
-	 * "inleaf" is true if the new entry should be inserted into blk1.
-	 * If "swap" is also true, then reverse the sense of "inleaf".
-	 */
-	state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
-						      blk2, &ichdr2,
-						      &count, &totallen);
-	if (swap)
-		state->inleaf = !state->inleaf;
-
-	/*
-	 * Move any entries required from leaf to leaf:
-	 */
-	if (count < ichdr1.count) {
-		/*
-		 * Figure the total bytes to be added to the destination leaf.
-		 */
-		/* number entries being moved */
-		count = ichdr1.count - count;
-		space  = ichdr1.usedbytes - totallen;
-		space += count * sizeof(xfs_attr_leaf_entry_t);
-
-		/*
-		 * leaf2 is the destination, compact it if it looks tight.
-		 */
-		max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
-		max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
-		if (space > max)
-			xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
-
-		/*
-		 * Move high entries from leaf1 to low end of leaf2.
-		 */
-		xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
-				ichdr1.count - count, leaf2, &ichdr2, 0, count);
-
-	} else if (count > ichdr1.count) {
-		/*
-		 * I assert that since all callers pass in an empty
-		 * second buffer, this code should never execute.
-		 */
-		ASSERT(0);
-
-		/*
-		 * Figure the total bytes to be added to the destination leaf.
-		 */
-		/* number entries being moved */
-		count -= ichdr1.count;
-		space  = totallen - ichdr1.usedbytes;
-		space += count * sizeof(xfs_attr_leaf_entry_t);
-
-		/*
-		 * leaf1 is the destination, compact it if it looks tight.
-		 */
-		max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
-		max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
-		if (space > max)
-			xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
-
-		/*
-		 * Move low entries from leaf2 to high end of leaf1.
-		 */
-		xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
-					ichdr1.count, count);
-	}
-
-	xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
-	xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
-	xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
-	xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
-
-	/*
-	 * Copy out last hashval in each block for B-tree code.
-	 */
-	entries1 = xfs_attr3_leaf_entryp(leaf1);
-	entries2 = xfs_attr3_leaf_entryp(leaf2);
-	blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
-	blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
-
-	/*
-	 * Adjust the expected index for insertion.
-	 * NOTE: this code depends on the (current) situation that the
-	 * second block was originally empty.
-	 *
-	 * If the insertion point moved to the 2nd block, we must adjust
-	 * the index.  We must also track the entry just following the
-	 * new entry for use in an "atomic rename" operation, that entry
-	 * is always the "old" entry and the "new" entry is what we are
-	 * inserting.  The index/blkno fields refer to the "old" entry,
-	 * while the index2/blkno2 fields refer to the "new" entry.
-	 */
-	if (blk1->index > ichdr1.count) {
-		ASSERT(state->inleaf == 0);
-		blk2->index = blk1->index - ichdr1.count;
-		args->index = args->index2 = blk2->index;
-		args->blkno = args->blkno2 = blk2->blkno;
-	} else if (blk1->index == ichdr1.count) {
-		if (state->inleaf) {
-			args->index = blk1->index;
-			args->blkno = blk1->blkno;
-			args->index2 = 0;
-			args->blkno2 = blk2->blkno;
-		} else {
-			/*
-			 * On a double leaf split, the original attr location
-			 * is already stored in blkno2/index2, so don't
-			 * overwrite it overwise we corrupt the tree.
-			 */
-			blk2->index = blk1->index - ichdr1.count;
-			args->index = blk2->index;
-			args->blkno = blk2->blkno;
-			if (!state->extravalid) {
-				/*
-				 * set the new attr location to match the old
-				 * one and let the higher level split code
-				 * decide where in the leaf to place it.
-				 */
-				args->index2 = blk2->index;
-				args->blkno2 = blk2->blkno;
-			}
-		}
-	} else {
-		ASSERT(state->inleaf == 1);
-		args->index = args->index2 = blk1->index;
-		args->blkno = args->blkno2 = blk1->blkno;
-	}
-}
-
-/*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum.
- * GROT: Is this really necessary?  With other than a 512 byte blocksize,
- * GROT: there will always be enough room in either block for a new entry.
- * GROT: Do a double-split for this case?
- */
-STATIC int
-xfs_attr3_leaf_figure_balance(
-	struct xfs_da_state		*state,
-	struct xfs_da_state_blk		*blk1,
-	struct xfs_attr3_icleaf_hdr	*ichdr1,
-	struct xfs_da_state_blk		*blk2,
-	struct xfs_attr3_icleaf_hdr	*ichdr2,
-	int				*countarg,
-	int				*usedbytesarg)
-{
-	struct xfs_attr_leafblock	*leaf1 = blk1->bp->b_addr;
-	struct xfs_attr_leafblock	*leaf2 = blk2->bp->b_addr;
-	struct xfs_attr_leaf_entry	*entry;
-	int				count;
-	int				max;
-	int				index;
-	int				totallen = 0;
-	int				half;
-	int				lastdelta;
-	int				foundit = 0;
-	int				tmp;
-
-	/*
-	 * Examine entries until we reduce the absolute difference in
-	 * byte usage between the two blocks to a minimum.
-	 */
-	max = ichdr1->count + ichdr2->count;
-	half = (max + 1) * sizeof(*entry);
-	half += ichdr1->usedbytes + ichdr2->usedbytes +
-			xfs_attr_leaf_newentsize(state->args, NULL);
-	half /= 2;
-	lastdelta = state->args->geo->blksize;
-	entry = xfs_attr3_leaf_entryp(leaf1);
-	for (count = index = 0; count < max; entry++, index++, count++) {
-
-#define XFS_ATTR_ABS(A)	(((A) < 0) ? -(A) : (A))
-		/*
-		 * The new entry is in the first block, account for it.
-		 */
-		if (count == blk1->index) {
-			tmp = totallen + sizeof(*entry) +
-				xfs_attr_leaf_newentsize(state->args, NULL);
-			if (XFS_ATTR_ABS(half - tmp) > lastdelta)
-				break;
-			lastdelta = XFS_ATTR_ABS(half - tmp);
-			totallen = tmp;
-			foundit = 1;
-		}
-
-		/*
-		 * Wrap around into the second block if necessary.
-		 */
-		if (count == ichdr1->count) {
-			leaf1 = leaf2;
-			entry = xfs_attr3_leaf_entryp(leaf1);
-			index = 0;
-		}
-
-		/*
-		 * Figure out if next leaf entry would be too much.
-		 */
-		tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
-									index);
-		if (XFS_ATTR_ABS(half - tmp) > lastdelta)
-			break;
-		lastdelta = XFS_ATTR_ABS(half - tmp);
-		totallen = tmp;
-#undef XFS_ATTR_ABS
-	}
-
-	/*
-	 * Calculate the number of usedbytes that will end up in lower block.
-	 * If new entry not in lower block, fix up the count.
-	 */
-	totallen -= count * sizeof(*entry);
-	if (foundit) {
-		totallen -= sizeof(*entry) +
-				xfs_attr_leaf_newentsize(state->args, NULL);
-	}
-
-	*countarg = count;
-	*usedbytesarg = totallen;
-	return foundit;
-}
-
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- *
- * GROT: allow for INCOMPLETE entries in calculation.
- */
-int
-xfs_attr3_leaf_toosmall(
-	struct xfs_da_state	*state,
-	int			*action)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_da_state_blk	*blk;
-	struct xfs_attr3_icleaf_hdr ichdr;
-	struct xfs_buf		*bp;
-	xfs_dablk_t		blkno;
-	int			bytes;
-	int			forward;
-	int			error;
-	int			retval;
-	int			i;
-
-	trace_xfs_attr_leaf_toosmall(state->args);
-
-	/*
-	 * Check for the degenerate case of the block being over 50% full.
-	 * If so, it's not worth even looking to see if we might be able
-	 * to coalesce with a sibling.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	leaf = blk->bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-	bytes = xfs_attr3_leaf_hdr_size(leaf) +
-		ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
-		ichdr.usedbytes;
-	if (bytes > (state->args->geo->blksize >> 1)) {
-		*action = 0;	/* blk over 50%, don't try to join */
-		return 0;
-	}
-
-	/*
-	 * Check for the degenerate case of the block being empty.
-	 * If the block is empty, we'll simply delete it, no need to
-	 * coalesce it with a sibling block.  We choose (arbitrarily)
-	 * to merge with the forward block unless it is NULL.
-	 */
-	if (ichdr.count == 0) {
-		/*
-		 * Make altpath point to the block we want to keep and
-		 * path point to the block we want to drop (this one).
-		 */
-		forward = (ichdr.forw != 0);
-		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da3_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-		if (error)
-			return error;
-		if (retval) {
-			*action = 0;
-		} else {
-			*action = 2;
-		}
-		return 0;
-	}
-
-	/*
-	 * Examine each sibling block to see if we can coalesce with
-	 * at least 25% free space to spare.  We need to figure out
-	 * whether to merge with the forward or the backward block.
-	 * We prefer coalescing with the lower numbered sibling so as
-	 * to shrink an attribute list over time.
-	 */
-	/* start with smaller blk num */
-	forward = ichdr.forw < ichdr.back;
-	for (i = 0; i < 2; forward = !forward, i++) {
-		struct xfs_attr3_icleaf_hdr ichdr2;
-		if (forward)
-			blkno = ichdr.forw;
-		else
-			blkno = ichdr.back;
-		if (blkno == 0)
-			continue;
-		error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
-					blkno, -1, &bp);
-		if (error)
-			return error;
-
-		xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
-
-		bytes = state->args->geo->blksize -
-			(state->args->geo->blksize >> 2) -
-			ichdr.usedbytes - ichdr2.usedbytes -
-			((ichdr.count + ichdr2.count) *
-					sizeof(xfs_attr_leaf_entry_t)) -
-			xfs_attr3_leaf_hdr_size(leaf);
-
-		xfs_trans_brelse(state->args->trans, bp);
-		if (bytes >= 0)
-			break;	/* fits with at least 25% to spare */
-	}
-	if (i >= 2) {
-		*action = 0;
-		return 0;
-	}
-
-	/*
-	 * Make altpath point to the block we want to keep (the lower
-	 * numbered block) and path point to the block we want to drop.
-	 */
-	memcpy(&state->altpath, &state->path, sizeof(state->path));
-	if (blkno < blk->blkno) {
-		error = xfs_da3_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-	} else {
-		error = xfs_da3_path_shift(state, &state->path, forward,
-						 0, &retval);
-	}
-	if (error)
-		return error;
-	if (retval) {
-		*action = 0;
-	} else {
-		*action = 1;
-	}
-	return 0;
-}
-
-/*
- * Remove a name from the leaf attribute list structure.
- *
- * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
- * If two leaves are 37% full, when combined they will leave 25% free.
- */
-int
-xfs_attr3_leaf_remove(
-	struct xfs_buf		*bp,
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr3_icleaf_hdr ichdr;
-	struct xfs_attr_leaf_entry *entry;
-	int			before;
-	int			after;
-	int			smallest;
-	int			entsize;
-	int			tablesize;
-	int			tmp;
-	int			i;
-
-	trace_xfs_attr_leaf_remove(args);
-
-	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-
-	ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
-	ASSERT(args->index >= 0 && args->index < ichdr.count);
-	ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
-					xfs_attr3_leaf_hdr_size(leaf));
-
-	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-
-	ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
-	ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
-
-	/*
-	 * Scan through free region table:
-	 *    check for adjacency of free'd entry with an existing one,
-	 *    find smallest free region in case we need to replace it,
-	 *    adjust any map that borders the entry table,
-	 */
-	tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
-					+ xfs_attr3_leaf_hdr_size(leaf);
-	tmp = ichdr.freemap[0].size;
-	before = after = -1;
-	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
-	entsize = xfs_attr_leaf_entsize(leaf, args->index);
-	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
-		ASSERT(ichdr.freemap[i].base < args->geo->blksize);
-		ASSERT(ichdr.freemap[i].size < args->geo->blksize);
-		if (ichdr.freemap[i].base == tablesize) {
-			ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
-			ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
-		}
-
-		if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
-				be16_to_cpu(entry->nameidx)) {
-			before = i;
-		} else if (ichdr.freemap[i].base ==
-				(be16_to_cpu(entry->nameidx) + entsize)) {
-			after = i;
-		} else if (ichdr.freemap[i].size < tmp) {
-			tmp = ichdr.freemap[i].size;
-			smallest = i;
-		}
-	}
-
-	/*
-	 * Coalesce adjacent freemap regions,
-	 * or replace the smallest region.
-	 */
-	if ((before >= 0) || (after >= 0)) {
-		if ((before >= 0) && (after >= 0)) {
-			ichdr.freemap[before].size += entsize;
-			ichdr.freemap[before].size += ichdr.freemap[after].size;
-			ichdr.freemap[after].base = 0;
-			ichdr.freemap[after].size = 0;
-		} else if (before >= 0) {
-			ichdr.freemap[before].size += entsize;
-		} else {
-			ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
-			ichdr.freemap[after].size += entsize;
-		}
-	} else {
-		/*
-		 * Replace smallest region (if it is smaller than free'd entry)
-		 */
-		if (ichdr.freemap[smallest].size < entsize) {
-			ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
-			ichdr.freemap[smallest].size = entsize;
-		}
-	}
-
-	/*
-	 * Did we remove the first entry?
-	 */
-	if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
-		smallest = 1;
-	else
-		smallest = 0;
-
-	/*
-	 * Compress the remaining entries and zero out the removed stuff.
-	 */
-	memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
-	ichdr.usedbytes -= entsize;
-	xfs_trans_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
-				   entsize));
-
-	tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
-	memmove(entry, entry + 1, tmp);
-	ichdr.count--;
-	xfs_trans_log_buf(args->trans, bp,
-	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
-
-	entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
-	memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
-
-	/*
-	 * If we removed the first entry, re-find the first used byte
-	 * in the name area.  Note that if the entry was the "firstused",
-	 * then we don't have a "hole" in our block resulting from
-	 * removing the name.
-	 */
-	if (smallest) {
-		tmp = args->geo->blksize;
-		entry = xfs_attr3_leaf_entryp(leaf);
-		for (i = ichdr.count - 1; i >= 0; entry++, i--) {
-			ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
-			ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
-
-			if (be16_to_cpu(entry->nameidx) < tmp)
-				tmp = be16_to_cpu(entry->nameidx);
-		}
-		ichdr.firstused = tmp;
-		if (!ichdr.firstused)
-			ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
-	} else {
-		ichdr.holes = 1;	/* mark as needing compaction */
-	}
-	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
-	xfs_trans_log_buf(args->trans, bp,
-			  XFS_DA_LOGRANGE(leaf, &leaf->hdr,
-					  xfs_attr3_leaf_hdr_size(leaf)));
-
-	/*
-	 * Check if leaf is less than 50% full, caller may want to
-	 * "join" the leaf with a sibling if so.
-	 */
-	tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
-	      ichdr.count * sizeof(xfs_attr_leaf_entry_t);
-
-	return tmp < args->geo->magicpct; /* leaf is < 37% full */
-}
-
-/*
- * Move all the attribute list entries from drop_leaf into save_leaf.
- */
-void
-xfs_attr3_leaf_unbalance(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*drop_blk,
-	struct xfs_da_state_blk	*save_blk)
-{
-	struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
-	struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
-	struct xfs_attr3_icleaf_hdr drophdr;
-	struct xfs_attr3_icleaf_hdr savehdr;
-	struct xfs_attr_leaf_entry *entry;
-
-	trace_xfs_attr_leaf_unbalance(state->args);
-
-	drop_leaf = drop_blk->bp->b_addr;
-	save_leaf = save_blk->bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
-	xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
-	entry = xfs_attr3_leaf_entryp(drop_leaf);
-
-	/*
-	 * Save last hashval from dying block for later Btree fixup.
-	 */
-	drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
-
-	/*
-	 * Check if we need a temp buffer, or can we do it in place.
-	 * Note that we don't check "leaf" for holes because we will
-	 * always be dropping it, toosmall() decided that for us already.
-	 */
-	if (savehdr.holes == 0) {
-		/*
-		 * dest leaf has no holes, so we add there.  May need
-		 * to make some room in the entry array.
-		 */
-		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
-					 drop_blk->bp, &drophdr)) {
-			xfs_attr3_leaf_moveents(state->args,
-						drop_leaf, &drophdr, 0,
-						save_leaf, &savehdr, 0,
-						drophdr.count);
-		} else {
-			xfs_attr3_leaf_moveents(state->args,
-						drop_leaf, &drophdr, 0,
-						save_leaf, &savehdr,
-						savehdr.count, drophdr.count);
-		}
-	} else {
-		/*
-		 * Destination has holes, so we make a temporary copy
-		 * of the leaf and add them both to that.
-		 */
-		struct xfs_attr_leafblock *tmp_leaf;
-		struct xfs_attr3_icleaf_hdr tmphdr;
-
-		tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
-
-		/*
-		 * Copy the header into the temp leaf so that all the stuff
-		 * not in the incore header is present and gets copied back in
-		 * once we've moved all the entries.
-		 */
-		memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
-
-		memset(&tmphdr, 0, sizeof(tmphdr));
-		tmphdr.magic = savehdr.magic;
-		tmphdr.forw = savehdr.forw;
-		tmphdr.back = savehdr.back;
-		tmphdr.firstused = state->args->geo->blksize;
-
-		/* write the header to the temp buffer to initialise it */
-		xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
-
-		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
-					 drop_blk->bp, &drophdr)) {
-			xfs_attr3_leaf_moveents(state->args,
-						drop_leaf, &drophdr, 0,
-						tmp_leaf, &tmphdr, 0,
-						drophdr.count);
-			xfs_attr3_leaf_moveents(state->args,
-						save_leaf, &savehdr, 0,
-						tmp_leaf, &tmphdr, tmphdr.count,
-						savehdr.count);
-		} else {
-			xfs_attr3_leaf_moveents(state->args,
-						save_leaf, &savehdr, 0,
-						tmp_leaf, &tmphdr, 0,
-						savehdr.count);
-			xfs_attr3_leaf_moveents(state->args,
-						drop_leaf, &drophdr, 0,
-						tmp_leaf, &tmphdr, tmphdr.count,
-						drophdr.count);
-		}
-		memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
-		savehdr = tmphdr; /* struct copy */
-		kmem_free(tmp_leaf);
-	}
-
-	xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
-	xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
-					   state->args->geo->blksize - 1);
-
-	/*
-	 * Copy out last hashval in each block for B-tree code.
-	 */
-	entry = xfs_attr3_leaf_entryp(save_leaf);
-	save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
-}
-
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-
-/*
- * Look up a name in a leaf attribute list structure.
- * This is the internal routine, it uses the caller's buffer.
- *
- * Note that duplicate keys are allowed, but only check within the
- * current leaf node.  The Btree code must check in adjacent leaf nodes.
- *
- * Return in args->index the index into the entry[] array of either
- * the found entry, or where the entry should have been (insert before
- * that entry).
- *
- * Don't change the args->value unless we find the attribute.
- */
-int
-xfs_attr3_leaf_lookup_int(
-	struct xfs_buf		*bp,
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr3_icleaf_hdr ichdr;
-	struct xfs_attr_leaf_entry *entry;
-	struct xfs_attr_leaf_entry *entries;
-	struct xfs_attr_leaf_name_local *name_loc;
-	struct xfs_attr_leaf_name_remote *name_rmt;
-	xfs_dahash_t		hashval;
-	int			probe;
-	int			span;
-
-	trace_xfs_attr_leaf_lookup(args);
-
-	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-	entries = xfs_attr3_leaf_entryp(leaf);
-	ASSERT(ichdr.count < args->geo->blksize / 8);
-
-	/*
-	 * Binary search.  (note: small blocks will skip this loop)
-	 */
-	hashval = args->hashval;
-	probe = span = ichdr.count / 2;
-	for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
-		span /= 2;
-		if (be32_to_cpu(entry->hashval) < hashval)
-			probe += span;
-		else if (be32_to_cpu(entry->hashval) > hashval)
-			probe -= span;
-		else
-			break;
-	}
-	ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
-	ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
-
-	/*
-	 * Since we may have duplicate hashval's, find the first matching
-	 * hashval in the leaf.
-	 */
-	while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
-		entry--;
-		probe--;
-	}
-	while (probe < ichdr.count &&
-	       be32_to_cpu(entry->hashval) < hashval) {
-		entry++;
-		probe++;
-	}
-	if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
-		args->index = probe;
-		return ENOATTR;
-	}
-
-	/*
-	 * Duplicate keys may be present, so search all of them for a match.
-	 */
-	for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
-			entry++, probe++) {
-/*
- * GROT: Add code to remove incomplete entries.
- */
-		/*
-		 * If we are looking for INCOMPLETE entries, show only those.
-		 * If we are looking for complete entries, show only those.
-		 */
-		if ((args->flags & XFS_ATTR_INCOMPLETE) !=
-		    (entry->flags & XFS_ATTR_INCOMPLETE)) {
-			continue;
-		}
-		if (entry->flags & XFS_ATTR_LOCAL) {
-			name_loc = xfs_attr3_leaf_name_local(leaf, probe);
-			if (name_loc->namelen != args->namelen)
-				continue;
-			if (memcmp(args->name, name_loc->nameval,
-							args->namelen) != 0)
-				continue;
-			if (!xfs_attr_namesp_match(args->flags, entry->flags))
-				continue;
-			args->index = probe;
-			return EEXIST;
-		} else {
-			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
-			if (name_rmt->namelen != args->namelen)
-				continue;
-			if (memcmp(args->name, name_rmt->name,
-							args->namelen) != 0)
-				continue;
-			if (!xfs_attr_namesp_match(args->flags, entry->flags))
-				continue;
-			args->index = probe;
-			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-			args->rmtblkcnt = xfs_attr3_rmt_blocks(
-							args->dp->i_mount,
-							args->rmtvaluelen);
-			return EEXIST;
-		}
-	}
-	args->index = probe;
-	return ENOATTR;
-}
-
-/*
- * Get the value associated with an attribute name from a leaf attribute
- * list structure.
- */
-int
-xfs_attr3_leaf_getvalue(
-	struct xfs_buf		*bp,
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr3_icleaf_hdr ichdr;
-	struct xfs_attr_leaf_entry *entry;
-	struct xfs_attr_leaf_name_local *name_loc;
-	struct xfs_attr_leaf_name_remote *name_rmt;
-	int			valuelen;
-
-	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-	ASSERT(ichdr.count < args->geo->blksize / 8);
-	ASSERT(args->index < ichdr.count);
-
-	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
-		ASSERT(name_loc->namelen == args->namelen);
-		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
-		valuelen = be16_to_cpu(name_loc->valuelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
-			return 0;
-		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
-			return ERANGE;
-		}
-		args->valuelen = valuelen;
-		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		ASSERT(name_rmt->namelen == args->namelen);
-		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       args->rmtvaluelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = args->rmtvaluelen;
-			return 0;
-		}
-		if (args->valuelen < args->rmtvaluelen) {
-			args->valuelen = args->rmtvaluelen;
-			return ERANGE;
-		}
-		args->valuelen = args->rmtvaluelen;
-	}
-	return 0;
-}
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Move the indicated entries from one leaf to another.
- * NOTE: this routine modifies both source and destination leaves.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_attr3_leaf_moveents(
-	struct xfs_da_args		*args,
-	struct xfs_attr_leafblock	*leaf_s,
-	struct xfs_attr3_icleaf_hdr	*ichdr_s,
-	int				start_s,
-	struct xfs_attr_leafblock	*leaf_d,
-	struct xfs_attr3_icleaf_hdr	*ichdr_d,
-	int				start_d,
-	int				count)
-{
-	struct xfs_attr_leaf_entry	*entry_s;
-	struct xfs_attr_leaf_entry	*entry_d;
-	int				desti;
-	int				tmp;
-	int				i;
-
-	/*
-	 * Check for nothing to do.
-	 */
-	if (count == 0)
-		return;
-
-	/*
-	 * Set up environment.
-	 */
-	ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
-	       ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
-	ASSERT(ichdr_s->magic == ichdr_d->magic);
-	ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
-	ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
-					+ xfs_attr3_leaf_hdr_size(leaf_s));
-	ASSERT(ichdr_d->count < args->geo->blksize / 8);
-	ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
-					+ xfs_attr3_leaf_hdr_size(leaf_d));
-
-	ASSERT(start_s < ichdr_s->count);
-	ASSERT(start_d <= ichdr_d->count);
-	ASSERT(count <= ichdr_s->count);
-
-
-	/*
-	 * Move the entries in the destination leaf up to make a hole?
-	 */
-	if (start_d < ichdr_d->count) {
-		tmp  = ichdr_d->count - start_d;
-		tmp *= sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
-		entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
-		memmove(entry_d, entry_s, tmp);
-	}
-
-	/*
-	 * Copy all entry's in the same (sorted) order,
-	 * but allocate attribute info packed and in sequence.
-	 */
-	entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
-	entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
-	desti = start_d;
-	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
-		ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
-		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
-#ifdef GROT
-		/*
-		 * Code to drop INCOMPLETE entries.  Difficult to use as we
-		 * may also need to change the insertion index.  Code turned
-		 * off for 6.2, should be revisited later.
-		 */
-		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
-			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
-			ichdr_s->usedbytes -= tmp;
-			ichdr_s->count -= 1;
-			entry_d--;	/* to compensate for ++ in loop hdr */
-			desti--;
-			if ((start_s + i) < offset)
-				result++;	/* insertion index adjustment */
-		} else {
-#endif /* GROT */
-			ichdr_d->firstused -= tmp;
-			/* both on-disk, don't endian flip twice */
-			entry_d->hashval = entry_s->hashval;
-			entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
-			entry_d->flags = entry_s->flags;
-			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
-							<= args->geo->blksize);
-			memmove(xfs_attr3_leaf_name(leaf_d, desti),
-				xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
-			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
-							<= args->geo->blksize);
-			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
-			ichdr_s->usedbytes -= tmp;
-			ichdr_d->usedbytes += tmp;
-			ichdr_s->count -= 1;
-			ichdr_d->count += 1;
-			tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
-					+ xfs_attr3_leaf_hdr_size(leaf_d);
-			ASSERT(ichdr_d->firstused >= tmp);
-#ifdef GROT
-		}
-#endif /* GROT */
-	}
-
-	/*
-	 * Zero out the entries we just copied.
-	 */
-	if (start_s == ichdr_s->count) {
-		tmp = count * sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
-		ASSERT(((char *)entry_s + tmp) <=
-		       ((char *)leaf_s + args->geo->blksize));
-		memset(entry_s, 0, tmp);
-	} else {
-		/*
-		 * Move the remaining entries down to fill the hole,
-		 * then zero the entries at the top.
-		 */
-		tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
-		entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
-		memmove(entry_d, entry_s, tmp);
-
-		tmp = count * sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
-		ASSERT(((char *)entry_s + tmp) <=
-		       ((char *)leaf_s + args->geo->blksize));
-		memset(entry_s, 0, tmp);
-	}
-
-	/*
-	 * Fill in the freemap information
-	 */
-	ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
-	ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
-	ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
-	ichdr_d->freemap[1].base = 0;
-	ichdr_d->freemap[2].base = 0;
-	ichdr_d->freemap[1].size = 0;
-	ichdr_d->freemap[2].size = 0;
-	ichdr_s->holes = 1;	/* leaf may not be compact */
-}
-
-/*
- * Pick up the last hashvalue from a leaf block.
- */
-xfs_dahash_t
-xfs_attr_leaf_lasthash(
-	struct xfs_buf	*bp,
-	int		*count)
-{
-	struct xfs_attr3_icleaf_hdr ichdr;
-	struct xfs_attr_leaf_entry *entries;
-
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
-	entries = xfs_attr3_leaf_entryp(bp->b_addr);
-	if (count)
-		*count = ichdr.count;
-	if (!ichdr.count)
-		return 0;
-	return be32_to_cpu(entries[ichdr.count - 1].hashval);
-}
-
-/*
- * Calculate the number of bytes used to store the indicated attribute
- * (whether local or remote only calculate bytes in this block).
- */
-STATIC int
-xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
-{
-	struct xfs_attr_leaf_entry *entries;
-	xfs_attr_leaf_name_local_t *name_loc;
-	xfs_attr_leaf_name_remote_t *name_rmt;
-	int size;
-
-	entries = xfs_attr3_leaf_entryp(leaf);
-	if (entries[index].flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr3_leaf_name_local(leaf, index);
-		size = xfs_attr_leaf_entsize_local(name_loc->namelen,
-						   be16_to_cpu(name_loc->valuelen));
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
-		size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
-	}
-	return size;
-}
-
-/*
- * Calculate the number of bytes that would be required to store the new
- * attribute (whether local or remote only calculate bytes in this block).
- * This routine decides as a side effect whether the attribute will be
- * a "local" or a "remote" attribute.
- */
-int
-xfs_attr_leaf_newentsize(
-	struct xfs_da_args	*args,
-	int			*local)
-{
-	int			size;
-
-	size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
-	if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
-		if (local)
-			*local = 1;
-		return size;
-	}
-	if (local)
-		*local = 0;
-	return xfs_attr_leaf_entsize_remote(args->namelen);
-}
-
-
-/*========================================================================
- * Manage the INCOMPLETE flag in a leaf entry
- *========================================================================*/
-
-/*
- * Clear the INCOMPLETE flag on an entry in a leaf block.
- */
-int
-xfs_attr3_leaf_clearflag(
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr_leaf_entry *entry;
-	struct xfs_attr_leaf_name_remote *name_rmt;
-	struct xfs_buf		*bp;
-	int			error;
-#ifdef DEBUG
-	struct xfs_attr3_icleaf_hdr ichdr;
-	xfs_attr_leaf_name_local_t *name_loc;
-	int namelen;
-	char *name;
-#endif /* DEBUG */
-
-	trace_xfs_attr_leaf_clearflag(args);
-	/*
-	 * Set up the operation.
-	 */
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-	if (error)
-		return error;
-
-	leaf = bp->b_addr;
-	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
-
-#ifdef DEBUG
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-	ASSERT(args->index < ichdr.count);
-	ASSERT(args->index >= 0);
-
-	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
-		namelen = name_loc->namelen;
-		name = (char *)name_loc->nameval;
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		namelen = name_rmt->namelen;
-		name = (char *)name_rmt->name;
-	}
-	ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
-	ASSERT(namelen == args->namelen);
-	ASSERT(memcmp(name, args->name, namelen) == 0);
-#endif /* DEBUG */
-
-	entry->flags &= ~XFS_ATTR_INCOMPLETE;
-	xfs_trans_log_buf(args->trans, bp,
-			 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-
-	if (args->rmtblkno) {
-		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
-		xfs_trans_log_buf(args->trans, bp,
-			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
-	}
-
-	/*
-	 * Commit the flag value change and start the next trans in series.
-	 */
-	return xfs_trans_roll(&args->trans, args->dp);
-}
-
-/*
- * Set the INCOMPLETE flag on an entry in a leaf block.
- */
-int
-xfs_attr3_leaf_setflag(
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf;
-	struct xfs_attr_leaf_entry *entry;
-	struct xfs_attr_leaf_name_remote *name_rmt;
-	struct xfs_buf		*bp;
-	int error;
-#ifdef DEBUG
-	struct xfs_attr3_icleaf_hdr ichdr;
-#endif
-
-	trace_xfs_attr_leaf_setflag(args);
-
-	/*
-	 * Set up the operation.
-	 */
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
-	if (error)
-		return error;
-
-	leaf = bp->b_addr;
-#ifdef DEBUG
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-	ASSERT(args->index < ichdr.count);
-	ASSERT(args->index >= 0);
-#endif
-	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
-
-	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
-	entry->flags |= XFS_ATTR_INCOMPLETE;
-	xfs_trans_log_buf(args->trans, bp,
-			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		name_rmt->valueblk = 0;
-		name_rmt->valuelen = 0;
-		xfs_trans_log_buf(args->trans, bp,
-			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
-	}
-
-	/*
-	 * Commit the flag value change and start the next trans in series.
-	 */
-	return xfs_trans_roll(&args->trans, args->dp);
-}
-
-/*
- * In a single transaction, clear the INCOMPLETE flag on the leaf entry
- * given by args->blkno/index and set the INCOMPLETE flag on the leaf
- * entry given by args->blkno2/index2.
- *
- * Note that they could be in different blocks, or in the same block.
- */
-int
-xfs_attr3_leaf_flipflags(
-	struct xfs_da_args	*args)
-{
-	struct xfs_attr_leafblock *leaf1;
-	struct xfs_attr_leafblock *leaf2;
-	struct xfs_attr_leaf_entry *entry1;
-	struct xfs_attr_leaf_entry *entry2;
-	struct xfs_attr_leaf_name_remote *name_rmt;
-	struct xfs_buf		*bp1;
-	struct xfs_buf		*bp2;
-	int error;
-#ifdef DEBUG
-	struct xfs_attr3_icleaf_hdr ichdr1;
-	struct xfs_attr3_icleaf_hdr ichdr2;
-	xfs_attr_leaf_name_local_t *name_loc;
-	int namelen1, namelen2;
-	char *name1, *name2;
-#endif /* DEBUG */
-
-	trace_xfs_attr_leaf_flipflags(args);
-
-	/*
-	 * Read the block containing the "old" attr
-	 */
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
-	if (error)
-		return error;
-
-	/*
-	 * Read the block containing the "new" attr, if it is different
-	 */
-	if (args->blkno2 != args->blkno) {
-		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
-					   -1, &bp2);
-		if (error)
-			return error;
-	} else {
-		bp2 = bp1;
-	}
-
-	leaf1 = bp1->b_addr;
-	entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
-
-	leaf2 = bp2->b_addr;
-	entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
-
-#ifdef DEBUG
-	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
-	ASSERT(args->index < ichdr1.count);
-	ASSERT(args->index >= 0);
-
-	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
-	ASSERT(args->index2 < ichdr2.count);
-	ASSERT(args->index2 >= 0);
-
-	if (entry1->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
-		namelen1 = name_loc->namelen;
-		name1 = (char *)name_loc->nameval;
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
-		namelen1 = name_rmt->namelen;
-		name1 = (char *)name_rmt->name;
-	}
-	if (entry2->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
-		namelen2 = name_loc->namelen;
-		name2 = (char *)name_loc->nameval;
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
-		namelen2 = name_rmt->namelen;
-		name2 = (char *)name_rmt->name;
-	}
-	ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
-	ASSERT(namelen1 == namelen2);
-	ASSERT(memcmp(name1, name2, namelen1) == 0);
-#endif /* DEBUG */
-
-	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
-	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
-
-	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
-	xfs_trans_log_buf(args->trans, bp1,
-			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
-	if (args->rmtblkno) {
-		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
-		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
-		xfs_trans_log_buf(args->trans, bp1,
-			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
-	}
-
-	entry2->flags |= XFS_ATTR_INCOMPLETE;
-	xfs_trans_log_buf(args->trans, bp2,
-			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
-	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
-		name_rmt->valueblk = 0;
-		name_rmt->valuelen = 0;
-		xfs_trans_log_buf(args->trans, bp2,
-			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
-	}
-
-	/*
-	 * Commit the flag value change and start the next trans in series.
-	 */
-	error = xfs_trans_roll(&args->trans, args->dp);
-
-	return error;
-}
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
deleted file mode 100644
index a8bbc562ff35..000000000000
--- a/fs/xfs/xfs_attr_remote.c
+++ /dev/null
@@ -1,628 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
-#include "xfs_error.h"
-
-#define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
-
-/*
- * Each contiguous block has a header, so it is not just a simple attribute
- * length to FSB conversion.
- */
-int
-xfs_attr3_rmt_blocks(
-	struct xfs_mount *mp,
-	int		attrlen)
-{
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-		return (attrlen + buflen - 1) / buflen;
-	}
-	return XFS_B_TO_FSB(mp, attrlen);
-}
-
-/*
- * Checking of the remote attribute header is split into two parts. The verifier
- * does CRC, location and bounds checking, the unpacking function checks the
- * attribute parameters and owner.
- */
-static bool
-xfs_attr3_rmt_hdr_ok(
-	void			*ptr,
-	xfs_ino_t		ino,
-	uint32_t		offset,
-	uint32_t		size,
-	xfs_daddr_t		bno)
-{
-	struct xfs_attr3_rmt_hdr *rmt = ptr;
-
-	if (bno != be64_to_cpu(rmt->rm_blkno))
-		return false;
-	if (offset != be32_to_cpu(rmt->rm_offset))
-		return false;
-	if (size != be32_to_cpu(rmt->rm_bytes))
-		return false;
-	if (ino != be64_to_cpu(rmt->rm_owner))
-		return false;
-
-	/* ok */
-	return true;
-}
-
-static bool
-xfs_attr3_rmt_verify(
-	struct xfs_mount	*mp,
-	void			*ptr,
-	int			fsbsize,
-	xfs_daddr_t		bno)
-{
-	struct xfs_attr3_rmt_hdr *rmt = ptr;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return false;
-	if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
-		return false;
-	if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
-		return false;
-	if (be64_to_cpu(rmt->rm_blkno) != bno)
-		return false;
-	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
-		return false;
-	if (be32_to_cpu(rmt->rm_offset) +
-				be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
-		return false;
-	if (rmt->rm_owner == 0)
-		return false;
-
-	return true;
-}
-
-static void
-xfs_attr3_rmt_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	char		*ptr;
-	int		len;
-	xfs_daddr_t	bno;
-	int		blksize = mp->m_attr_geo->blksize;
-
-	/* no verification of non-crc buffers */
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	ptr = bp->b_addr;
-	bno = bp->b_bn;
-	len = BBTOB(bp->b_length);
-	ASSERT(len >= blksize);
-
-	while (len > 0) {
-		if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
-			xfs_buf_ioerror(bp, EFSBADCRC);
-			break;
-		}
-		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
-			break;
-		}
-		len -= blksize;
-		ptr += blksize;
-		bno += BTOBB(blksize);
-	}
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-	else
-		ASSERT(len == 0);
-}
-
-static void
-xfs_attr3_rmt_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-	char		*ptr;
-	int		len;
-	xfs_daddr_t	bno;
-	int		blksize = mp->m_attr_geo->blksize;
-
-	/* no verification of non-crc buffers */
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	ptr = bp->b_addr;
-	bno = bp->b_bn;
-	len = BBTOB(bp->b_length);
-	ASSERT(len >= blksize);
-
-	while (len > 0) {
-		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
-			xfs_verifier_error(bp);
-			return;
-		}
-		if (bip) {
-			struct xfs_attr3_rmt_hdr *rmt;
-
-			rmt = (struct xfs_attr3_rmt_hdr *)ptr;
-			rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-		}
-		xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
-
-		len -= blksize;
-		ptr += blksize;
-		bno += BTOBB(blksize);
-	}
-	ASSERT(len == 0);
-}
-
-const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
-	.verify_read = xfs_attr3_rmt_read_verify,
-	.verify_write = xfs_attr3_rmt_write_verify,
-};
-
-STATIC int
-xfs_attr3_rmt_hdr_set(
-	struct xfs_mount	*mp,
-	void			*ptr,
-	xfs_ino_t		ino,
-	uint32_t		offset,
-	uint32_t		size,
-	xfs_daddr_t		bno)
-{
-	struct xfs_attr3_rmt_hdr *rmt = ptr;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return 0;
-
-	rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
-	rmt->rm_offset = cpu_to_be32(offset);
-	rmt->rm_bytes = cpu_to_be32(size);
-	uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
-	rmt->rm_owner = cpu_to_be64(ino);
-	rmt->rm_blkno = cpu_to_be64(bno);
-
-	return sizeof(struct xfs_attr3_rmt_hdr);
-}
-
-/*
- * Helper functions to copy attribute data in and out of the one disk extents
- */
-STATIC int
-xfs_attr_rmtval_copyout(
-	struct xfs_mount *mp,
-	struct xfs_buf	*bp,
-	xfs_ino_t	ino,
-	int		*offset,
-	int		*valuelen,
-	__uint8_t	**dst)
-{
-	char		*src = bp->b_addr;
-	xfs_daddr_t	bno = bp->b_bn;
-	int		len = BBTOB(bp->b_length);
-	int		blksize = mp->m_attr_geo->blksize;
-
-	ASSERT(len >= blksize);
-
-	while (len > 0 && *valuelen > 0) {
-		int hdr_size = 0;
-		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
-
-		byte_cnt = min(*valuelen, byte_cnt);
-
-		if (xfs_sb_version_hascrc(&mp->m_sb)) {
-			if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
-						  byte_cnt, bno)) {
-				xfs_alert(mp,
-"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
-					bno, *offset, byte_cnt, ino);
-				return EFSCORRUPTED;
-			}
-			hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
-		}
-
-		memcpy(*dst, src + hdr_size, byte_cnt);
-
-		/* roll buffer forwards */
-		len -= blksize;
-		src += blksize;
-		bno += BTOBB(blksize);
-
-		/* roll attribute data forwards */
-		*valuelen -= byte_cnt;
-		*dst += byte_cnt;
-		*offset += byte_cnt;
-	}
-	return 0;
-}
-
-STATIC void
-xfs_attr_rmtval_copyin(
-	struct xfs_mount *mp,
-	struct xfs_buf	*bp,
-	xfs_ino_t	ino,
-	int		*offset,
-	int		*valuelen,
-	__uint8_t	**src)
-{
-	char		*dst = bp->b_addr;
-	xfs_daddr_t	bno = bp->b_bn;
-	int		len = BBTOB(bp->b_length);
-	int		blksize = mp->m_attr_geo->blksize;
-
-	ASSERT(len >= blksize);
-
-	while (len > 0 && *valuelen > 0) {
-		int hdr_size;
-		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
-
-		byte_cnt = min(*valuelen, byte_cnt);
-		hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
-						 byte_cnt, bno);
-
-		memcpy(dst + hdr_size, *src, byte_cnt);
-
-		/*
-		 * If this is the last block, zero the remainder of it.
-		 * Check that we are actually the last block, too.
-		 */
-		if (byte_cnt + hdr_size < blksize) {
-			ASSERT(*valuelen - byte_cnt == 0);
-			ASSERT(len == blksize);
-			memset(dst + hdr_size + byte_cnt, 0,
-					blksize - hdr_size - byte_cnt);
-		}
-
-		/* roll buffer forwards */
-		len -= blksize;
-		dst += blksize;
-		bno += BTOBB(blksize);
-
-		/* roll attribute data forwards */
-		*valuelen -= byte_cnt;
-		*src += byte_cnt;
-		*offset += byte_cnt;
-	}
-}
-
-/*
- * Read the value associated with an attribute from the out-of-line buffer
- * that we stored it in.
- */
-int
-xfs_attr_rmtval_get(
-	struct xfs_da_args	*args)
-{
-	struct xfs_bmbt_irec	map[ATTR_RMTVALUE_MAPSIZE];
-	struct xfs_mount	*mp = args->dp->i_mount;
-	struct xfs_buf		*bp;
-	xfs_dablk_t		lblkno = args->rmtblkno;
-	__uint8_t		*dst = args->value;
-	int			valuelen;
-	int			nmap;
-	int			error;
-	int			blkcnt = args->rmtblkcnt;
-	int			i;
-	int			offset = 0;
-
-	trace_xfs_attr_rmtval_get(args);
-
-	ASSERT(!(args->flags & ATTR_KERNOVAL));
-	ASSERT(args->rmtvaluelen == args->valuelen);
-
-	valuelen = args->rmtvaluelen;
-	while (valuelen > 0) {
-		nmap = ATTR_RMTVALUE_MAPSIZE;
-		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-				       blkcnt, map, &nmap,
-				       XFS_BMAPI_ATTRFORK);
-		if (error)
-			return error;
-		ASSERT(nmap >= 1);
-
-		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
-			xfs_daddr_t	dblkno;
-			int		dblkcnt;
-
-			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
-			       (map[i].br_startblock != HOLESTARTBLOCK));
-			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
-			dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-						   dblkno, dblkcnt, 0, &bp,
-						   &xfs_attr3_rmt_buf_ops);
-			if (error)
-				return error;
-
-			error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
-							&offset, &valuelen,
-							&dst);
-			xfs_buf_relse(bp);
-			if (error)
-				return error;
-
-			/* roll attribute extent map forwards */
-			lblkno += map[i].br_blockcount;
-			blkcnt -= map[i].br_blockcount;
-		}
-	}
-	ASSERT(valuelen == 0);
-	return 0;
-}
-
-/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
- */
-int
-xfs_attr_rmtval_set(
-	struct xfs_da_args	*args)
-{
-	struct xfs_inode	*dp = args->dp;
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_bmbt_irec	map;
-	xfs_dablk_t		lblkno;
-	xfs_fileoff_t		lfileoff = 0;
-	__uint8_t		*src = args->value;
-	int			blkcnt;
-	int			valuelen;
-	int			nmap;
-	int			error;
-	int			offset = 0;
-
-	trace_xfs_attr_rmtval_set(args);
-
-	/*
-	 * Find a "hole" in the attribute address space large enough for
-	 * us to drop the new attribute's value into. Because CRC enable
-	 * attributes have headers, we can't just do a straight byte to FSB
-	 * conversion and have to take the header space into account.
-	 */
-	blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
-	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
-						   XFS_ATTR_FORK);
-	if (error)
-		return error;
-
-	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
-	args->rmtblkcnt = blkcnt;
-
-	/*
-	 * Roll through the "value", allocating blocks on disk as required.
-	 */
-	while (blkcnt > 0) {
-		int	committed;
-
-		/*
-		 * Allocate a single extent, up to the size of the value.
-		 */
-		xfs_bmap_init(args->flist, args->firstblock);
-		nmap = 1;
-		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
-				  blkcnt,
-				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  args->firstblock, args->total, &map, &nmap,
-				  args->flist);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			return error;
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-
-		ASSERT(nmap == 1);
-		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-		       (map.br_startblock != HOLESTARTBLOCK));
-		lblkno += map.br_blockcount;
-		blkcnt -= map.br_blockcount;
-
-		/*
-		 * Start the next trans in the chain.
-		 */
-		error = xfs_trans_roll(&args->trans, dp);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * Roll through the "value", copying the attribute value to the
-	 * already-allocated blocks.  Blocks are written synchronously
-	 * so that we can know they are all on disk before we turn off
-	 * the INCOMPLETE flag.
-	 */
-	lblkno = args->rmtblkno;
-	blkcnt = args->rmtblkcnt;
-	valuelen = args->rmtvaluelen;
-	while (valuelen > 0) {
-		struct xfs_buf	*bp;
-		xfs_daddr_t	dblkno;
-		int		dblkcnt;
-
-		ASSERT(blkcnt > 0);
-
-		xfs_bmap_init(args->flist, args->firstblock);
-		nmap = 1;
-		error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
-				       blkcnt, &map, &nmap,
-				       XFS_BMAPI_ATTRFORK);
-		if (error)
-			return error;
-		ASSERT(nmap == 1);
-		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-		       (map.br_startblock != HOLESTARTBLOCK));
-
-		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
-		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
-		if (!bp)
-			return ENOMEM;
-		bp->b_ops = &xfs_attr3_rmt_buf_ops;
-
-		xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
-				       &valuelen, &src);
-
-		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
-		xfs_buf_relse(bp);
-		if (error)
-			return error;
-
-
-		/* roll attribute extent map forwards */
-		lblkno += map.br_blockcount;
-		blkcnt -= map.br_blockcount;
-	}
-	ASSERT(valuelen == 0);
-	return 0;
-}
-
-/*
- * Remove the value associated with an attribute by deleting the
- * out-of-line buffer that it is stored on.
- */
-int
-xfs_attr_rmtval_remove(
-	struct xfs_da_args	*args)
-{
-	struct xfs_mount	*mp = args->dp->i_mount;
-	xfs_dablk_t		lblkno;
-	int			blkcnt;
-	int			error;
-	int			done;
-
-	trace_xfs_attr_rmtval_remove(args);
-
-	/*
-	 * Roll through the "value", invalidating the attribute value's blocks.
-	 */
-	lblkno = args->rmtblkno;
-	blkcnt = args->rmtblkcnt;
-	while (blkcnt > 0) {
-		struct xfs_bmbt_irec	map;
-		struct xfs_buf		*bp;
-		xfs_daddr_t		dblkno;
-		int			dblkcnt;
-		int			nmap;
-
-		/*
-		 * Try to remember where we decided to put the value.
-		 */
-		nmap = 1;
-		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-				       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
-		if (error)
-			return error;
-		ASSERT(nmap == 1);
-		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-		       (map.br_startblock != HOLESTARTBLOCK));
-
-		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
-		/*
-		 * If the "remote" value is in the cache, remove it.
-		 */
-		bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
-		if (bp) {
-			xfs_buf_stale(bp);
-			xfs_buf_relse(bp);
-			bp = NULL;
-		}
-
-		lblkno += map.br_blockcount;
-		blkcnt -= map.br_blockcount;
-	}
-
-	/*
-	 * Keep de-allocating extents until the remote-value region is gone.
-	 */
-	lblkno = args->rmtblkno;
-	blkcnt = args->rmtblkcnt;
-	done = 0;
-	while (!done) {
-		int committed;
-
-		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
-				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				    1, args->firstblock, args->flist,
-				    &done);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			return error;
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, args->dp, 0);
-
-		/*
-		 * Close out trans and start the next one in the chain.
-		 */
-		error = xfs_trans_roll(&args->trans, args->dp);
-		if (error)
-			return error;
-	}
-	return 0;
-}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
deleted file mode 100644
index b44d63189dab..000000000000
--- a/fs/xfs/xfs_bmap.c
+++ /dev/null
@@ -1,5609 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_extfree_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_trans_space.h"
-#include "xfs_buf_item.h"
-#include "xfs_trace.h"
-#include "xfs_symlink.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_dinode.h"
-#include "xfs_filestream.h"
-
-
-kmem_zone_t		*xfs_bmap_free_item_zone;
-
-/*
- * Miscellaneous helper functions
- */
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem.  Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	int		whichfork)	/* data or attr fork */
-{
-	int		level;		/* btree level */
-	uint		maxblocks;	/* max blocks at this level */
-	uint		maxleafents;	/* max leaf entries possible */
-	int		maxrootrecs;	/* max records in root block */
-	int		minleafrecs;	/* min records in leaf block */
-	int		minnoderecs;	/* min records in node block */
-	int		sz;		/* root block size */
-
-	/*
-	 * The maximum number of extents in a file, hence the maximum
-	 * number of leaf entries, is controlled by the type of di_nextents
-	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
-	 * (a signed 16-bit number, xfs_aextnum_t).
-	 *
-	 * Note that we can no longer assume that if we are in ATTR1 that
-	 * the fork offset of all the inodes will be
-	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
-	 * with ATTR2 and then mounted back with ATTR1, keeping the
-	 * di_forkoff's fixed but probably at various positions. Therefore,
-	 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
-	 * of a minimum size available.
-	 */
-	if (whichfork == XFS_DATA_FORK) {
-		maxleafents = MAXEXTNUM;
-		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
-	} else {
-		maxleafents = MAXAEXTNUM;
-		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
-	}
-	maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
-	minleafrecs = mp->m_bmap_dmnr[0];
-	minnoderecs = mp->m_bmap_dmnr[1];
-	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-	for (level = 1; maxblocks > 1; level++) {
-		if (maxblocks <= maxrootrecs)
-			maxblocks = 1;
-		else
-			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-	}
-	mp->m_bm_maxlevels[whichfork] = level;
-}
-
-STATIC int				/* error */
-xfs_bmbt_lookup_eq(
-	struct xfs_btree_cur	*cur,
-	xfs_fileoff_t		off,
-	xfs_fsblock_t		bno,
-	xfs_filblks_t		len,
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.b.br_startoff = off;
-	cur->bc_rec.b.br_startblock = bno;
-	cur->bc_rec.b.br_blockcount = len;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-STATIC int				/* error */
-xfs_bmbt_lookup_ge(
-	struct xfs_btree_cur	*cur,
-	xfs_fileoff_t		off,
-	xfs_fsblock_t		bno,
-	xfs_filblks_t		len,
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.b.br_startoff = off;
-	cur->bc_rec.b.br_startblock = bno;
-	cur->bc_rec.b.br_blockcount = len;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Check if the inode needs to be converted to btree format.
- */
-static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
-{
-	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-		XFS_IFORK_NEXTENTS(ip, whichfork) >
-			XFS_IFORK_MAXEXT(ip, whichfork);
-}
-
-/*
- * Check if the inode should be converted to extent format.
- */
-static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
-{
-	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-		XFS_IFORK_NEXTENTS(ip, whichfork) <=
-			XFS_IFORK_MAXEXT(ip, whichfork);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [off, bno, len, state].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-STATIC int
-xfs_bmbt_update(
-	struct xfs_btree_cur	*cur,
-	xfs_fileoff_t		off,
-	xfs_fsblock_t		bno,
-	xfs_filblks_t		len,
-	xfs_exntst_t		state)
-{
-	union xfs_btree_rec	rec;
-
-	xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
-	return xfs_btree_update(cur, &rec);
-}
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_filblks_t	len)		/* delayed extent length */
-{
-	int		level;		/* btree level number */
-	int		maxrecs;	/* maximum record count at this level */
-	xfs_mount_t	*mp;		/* mount structure */
-	xfs_filblks_t	rval;		/* return value */
-
-	mp = ip->i_mount;
-	maxrecs = mp->m_bmap_dmxr[0];
-	for (level = 0, rval = 0;
-	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
-	     level++) {
-		len += maxrecs - 1;
-		do_div(len, maxrecs);
-		rval += len;
-		if (len == 1)
-			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
-				level - 1;
-		if (level == 0)
-			maxrecs = mp->m_bmap_dmxr[1];
-	}
-	return rval;
-}
-
-/*
- * Calculate the default attribute fork offset for newly created inodes.
- */
-uint
-xfs_default_attroffset(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	uint			offset;
-
-	if (mp->m_sb.sb_inodesize == 256) {
-		offset = XFS_LITINO(mp, ip->i_d.di_version) -
-				XFS_BMDR_SPACE_CALC(MINABTPTRS);
-	} else {
-		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
-	}
-
-	ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
-	return offset;
-}
-
-/*
- * Helper routine to reset inode di_forkoff field when switching
- * attribute fork from local to extent format - we reset it where
- * possible to make space available for inline data fork extents.
- */
-STATIC void
-xfs_bmap_forkoff_reset(
-	xfs_inode_t	*ip,
-	int		whichfork)
-{
-	if (whichfork == XFS_ATTR_FORK &&
-	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
-	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
-	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
-		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
-
-		if (dfl_forkoff > ip->i_d.di_forkoff)
-			ip->i_d.di_forkoff = dfl_forkoff;
-	}
-}
-
-/*
- * Debug/sanity checking code
- */
-
-STATIC int
-xfs_bmap_sanity_check(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	int			level)
-{
-	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-
-	if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
-	    block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
-		return 0;
-
-	if (be16_to_cpu(block->bb_level) != level ||
-	    be16_to_cpu(block->bb_numrecs) == 0 ||
-	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
-		return 0;
-
-	return 1;
-}
-
-#ifdef DEBUG
-STATIC struct xfs_buf *
-xfs_bmap_get_bp(
-	struct xfs_btree_cur	*cur,
-	xfs_fsblock_t		bno)
-{
-	struct xfs_log_item_desc *lidp;
-	int			i;
-
-	if (!cur)
-		return NULL;
-
-	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
-		if (!cur->bc_bufs[i])
-			break;
-		if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
-			return cur->bc_bufs[i];
-	}
-
-	/* Chase down all the log items to see if the bp is there */
-	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
-		struct xfs_buf_log_item	*bip;
-		bip = (struct xfs_buf_log_item *)lidp->lid_item;
-		if (bip->bli_item.li_type == XFS_LI_BUF &&
-		    XFS_BUF_ADDR(bip->bli_buf) == bno)
-			return bip->bli_buf;
-	}
-
-	return NULL;
-}
-
-STATIC void
-xfs_check_block(
-	struct xfs_btree_block	*block,
-	xfs_mount_t		*mp,
-	int			root,
-	short			sz)
-{
-	int			i, j, dmxr;
-	__be64			*pp, *thispa;	/* pointer to block address */
-	xfs_bmbt_key_t		*prevp, *keyp;
-
-	ASSERT(be16_to_cpu(block->bb_level) > 0);
-
-	prevp = NULL;
-	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
-		dmxr = mp->m_bmap_dmxr[0];
-		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
-
-		if (prevp) {
-			ASSERT(be64_to_cpu(prevp->br_startoff) <
-			       be64_to_cpu(keyp->br_startoff));
-		}
-		prevp = keyp;
-
-		/*
-		 * Compare the block numbers to see if there are dups.
-		 */
-		if (root)
-			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
-		else
-			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
-
-		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
-			if (root)
-				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
-			else
-				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
-			if (*thispa == *pp) {
-				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
-					__func__, j, i,
-					(unsigned long long)be64_to_cpu(*thispa));
-				panic("%s: ptrs are equal in node\n",
-					__func__);
-			}
-		}
-	}
-}
-
-/*
- * Check that the extents for the inode ip are in the right order in all
- * btree leaves.
- */
-
-STATIC void
-xfs_bmap_check_leaf_extents(
-	xfs_btree_cur_t		*cur,	/* btree cursor or null */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	int			whichfork)	/* data or attr fork */
-{
-	struct xfs_btree_block	*block;	/* current btree block */
-	xfs_fsblock_t		bno;	/* block # of "block" */
-	xfs_buf_t		*bp;	/* buffer for "block" */
-	int			error;	/* error return value */
-	xfs_extnum_t		i=0, j;	/* index into the extents list */
-	xfs_ifork_t		*ifp;	/* fork structure */
-	int			level;	/* btree level, for checking */
-	xfs_mount_t		*mp;	/* file system mount structure */
-	__be64			*pp;	/* pointer to block address */
-	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
-	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
-	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
-	int			bp_release = 0;
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
-		return;
-	}
-
-	bno = NULLFSBLOCK;
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	block = ifp->if_broot;
-	/*
-	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-	 */
-	level = be16_to_cpu(block->bb_level);
-	ASSERT(level > 0);
-	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-	bno = be64_to_cpu(*pp);
-
-	ASSERT(bno != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
-	/*
-	 * Go down the tree until leaf level is reached, following the first
-	 * pointer (leftmost) at each level.
-	 */
-	while (level-- > 0) {
-		/* See if buf is in cur first */
-		bp_release = 0;
-		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-		if (!bp) {
-			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
-			if (error)
-				goto error_norelse;
-		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, level),
-			error0);
-		if (level == 0)
-			break;
-
-		/*
-		 * Check this block for basic sanity (increasing keys and
-		 * no duplicate blocks).
-		 */
-
-		xfs_check_block(block, mp, 0, 0);
-		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-		bno = be64_to_cpu(*pp);
-		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
-		if (bp_release) {
-			bp_release = 0;
-			xfs_trans_brelse(NULL, bp);
-		}
-	}
-
-	/*
-	 * Here with bp and block set to the leftmost leaf node in the tree.
-	 */
-	i = 0;
-
-	/*
-	 * Loop over all leaf nodes checking that all extents are in the right order.
-	 */
-	for (;;) {
-		xfs_fsblock_t	nextbno;
-		xfs_extnum_t	num_recs;
-
-
-		num_recs = xfs_btree_get_numrecs(block);
-
-		/*
-		 * Read-ahead the next leaf block, if any.
-		 */
-
-		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-
-		/*
-		 * Check all the extents to make sure they are OK.
-		 * If we had a previous block, the last entry should
-		 * conform with the first entry in this one.
-		 */
-
-		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
-		if (i) {
-			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
-			       xfs_bmbt_disk_get_blockcount(&last) <=
-			       xfs_bmbt_disk_get_startoff(ep));
-		}
-		for (j = 1; j < num_recs; j++) {
-			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
-			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
-			       xfs_bmbt_disk_get_blockcount(ep) <=
-			       xfs_bmbt_disk_get_startoff(nextp));
-			ep = nextp;
-		}
-
-		last = *ep;
-		i += num_recs;
-		if (bp_release) {
-			bp_release = 0;
-			xfs_trans_brelse(NULL, bp);
-		}
-		bno = nextbno;
-		/*
-		 * If we've reached the end, stop.
-		 */
-		if (bno == NULLFSBLOCK)
-			break;
-
-		bp_release = 0;
-		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-		if (!bp) {
-			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
-			if (error)
-				goto error_norelse;
-		}
-		block = XFS_BUF_TO_BLOCK(bp);
-	}
-	if (bp_release) {
-		bp_release = 0;
-		xfs_trans_brelse(NULL, bp);
-	}
-	return;
-
-error0:
-	xfs_warn(mp, "%s: at error0", __func__);
-	if (bp_release)
-		xfs_trans_brelse(NULL, bp);
-error_norelse:
-	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
-		__func__, i);
-	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
-	return;
-}
-
-/*
- * Add bmap trace insert entries for all the contents of the extent records.
- */
-void
-xfs_bmap_trace_exlist(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	cnt,		/* count of entries in the list */
-	int		whichfork,	/* data or attr fork */
-	unsigned long	caller_ip)
-{
-	xfs_extnum_t	idx;		/* extent record index */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	int		state = 0;
-
-	if (whichfork == XFS_ATTR_FORK)
-		state |= BMAP_ATTRFORK;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-	for (idx = 0; idx < cnt; idx++)
-		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
-}
-
-/*
- * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the caller's original parameters.  Specifically check the
- * ranges of the returned irecs to ensure that they only extend beyond
- * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
- */
-STATIC void
-xfs_bmap_validate_ret(
-	xfs_fileoff_t		bno,
-	xfs_filblks_t		len,
-	int			flags,
-	xfs_bmbt_irec_t		*mval,
-	int			nmap,
-	int			ret_nmap)
-{
-	int			i;		/* index to map values */
-
-	ASSERT(ret_nmap <= nmap);
-
-	for (i = 0; i < ret_nmap; i++) {
-		ASSERT(mval[i].br_blockcount > 0);
-		if (!(flags & XFS_BMAPI_ENTIRE)) {
-			ASSERT(mval[i].br_startoff >= bno);
-			ASSERT(mval[i].br_blockcount <= len);
-			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
-			       bno + len);
-		} else {
-			ASSERT(mval[i].br_startoff < bno + len);
-			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
-			       bno);
-		}
-		ASSERT(i == 0 ||
-		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
-		       mval[i].br_startoff);
-		ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
-		       mval[i].br_startblock != HOLESTARTBLOCK);
-		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
-		       mval[i].br_state == XFS_EXT_UNWRITTEN);
-	}
-}
-
-#else
-#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)		do { } while (0)
-#define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
-#endif /* DEBUG */
-
-/*
- * bmap free list manipulation functions
- */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-xfs_bmap_add_free(
-	xfs_fsblock_t		bno,		/* fs block number of extent */
-	xfs_filblks_t		len,		/* length of extent */
-	xfs_bmap_free_t		*flist,		/* list of extents */
-	xfs_mount_t		*mp)		/* mount point structure */
-{
-	xfs_bmap_free_item_t	*cur;		/* current (next) element */
-	xfs_bmap_free_item_t	*new;		/* new element */
-	xfs_bmap_free_item_t	*prev;		/* previous element */
-#ifdef DEBUG
-	xfs_agnumber_t		agno;
-	xfs_agblock_t		agbno;
-
-	ASSERT(bno != NULLFSBLOCK);
-	ASSERT(len > 0);
-	ASSERT(len <= MAXEXTLEN);
-	ASSERT(!isnullstartblock(bno));
-	agno = XFS_FSB_TO_AGNO(mp, bno);
-	agbno = XFS_FSB_TO_AGBNO(mp, bno);
-	ASSERT(agno < mp->m_sb.sb_agcount);
-	ASSERT(agbno < mp->m_sb.sb_agblocks);
-	ASSERT(len < mp->m_sb.sb_agblocks);
-	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
-	ASSERT(xfs_bmap_free_item_zone != NULL);
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
-	new->xbfi_startblock = bno;
-	new->xbfi_blockcount = (xfs_extlen_t)len;
-	for (prev = NULL, cur = flist->xbf_first;
-	     cur != NULL;
-	     prev = cur, cur = cur->xbfi_next) {
-		if (cur->xbfi_startblock >= bno)
-			break;
-	}
-	if (prev)
-		prev->xbfi_next = new;
-	else
-		flist->xbf_first = new;
-	new->xbfi_next = cur;
-	flist->xbf_count++;
-}
-
-/*
- * Remove the entry "free" from the free item list.  Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-void
-xfs_bmap_del_free(
-	xfs_bmap_free_t		*flist,	/* free item list header */
-	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
-	xfs_bmap_free_item_t	*free)	/* list item to be freed */
-{
-	if (prev)
-		prev->xbfi_next = free->xbfi_next;
-	else
-		flist->xbf_first = free->xbfi_next;
-	flist->xbf_count--;
-	kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
-	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
-{
-	xfs_bmap_free_item_t	*free;	/* free list item */
-	xfs_bmap_free_item_t	*next;
-
-	if (flist->xbf_count == 0)
-		return;
-	ASSERT(flist->xbf_first != NULL);
-	for (free = flist->xbf_first; free; free = next) {
-		next = free->xbfi_next;
-		xfs_bmap_del_free(flist, NULL, free);
-	}
-	ASSERT(flist->xbf_count == 0);
-}
-
-/*
- * Inode fork format manipulation functions
- */
-
-/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int				/* error */
-xfs_bmap_btree_to_extents(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork)  /* data or attr fork */
-{
-	/* REFERENCED */
-	struct xfs_btree_block	*cblock;/* child btree block */
-	xfs_fsblock_t		cbno;	/* child block number */
-	xfs_buf_t		*cbp;	/* child block's buffer */
-	int			error;	/* error return value */
-	xfs_ifork_t		*ifp;	/* inode fork data */
-	xfs_mount_t		*mp;	/* mount point structure */
-	__be64			*pp;	/* ptr to block address */
-	struct xfs_btree_block	*rblock;/* root btree block */
-
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-	rblock = ifp->if_broot;
-	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
-	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
-	cbno = be64_to_cpu(*pp);
-	*logflagsp = 0;
-#ifdef DEBUG
-	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
-		return error;
-#endif
-	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
-				&xfs_bmbt_buf_ops);
-	if (error)
-		return error;
-	cblock = XFS_BUF_TO_BLOCK(cbp);
-	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
-		return error;
-	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
-	ip->i_d.di_nblocks--;
-	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
-	xfs_trans_binval(tp, cbp);
-	if (cur->bc_bufs[0] == cbp)
-		cur->bc_bufs[0] = NULL;
-	xfs_iroot_realloc(ip, -1, whichfork);
-	ASSERT(ifp->if_broot == NULL);
-	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
-	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-	return 0;
-}
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int					/* error */
-xfs_bmap_extents_to_btree(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
-	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
-	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
-	int			wasdel,		/* converting a delayed alloc */
-	int			*logflagsp,	/* inode logging flags */
-	int			whichfork)	/* data or attr fork */
-{
-	struct xfs_btree_block	*ablock;	/* allocated (child) bt block */
-	xfs_buf_t		*abp;		/* buffer for ablock */
-	xfs_alloc_arg_t		args;		/* allocation arguments */
-	xfs_bmbt_rec_t		*arp;		/* child record pointer */
-	struct xfs_btree_block	*block;		/* btree root block */
-	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
-	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
-	int			error;		/* error return value */
-	xfs_extnum_t		i, cnt;		/* extent record index */
-	xfs_ifork_t		*ifp;		/* inode fork pointer */
-	xfs_bmbt_key_t		*kp;		/* root block key pointer */
-	xfs_mount_t		*mp;		/* mount structure */
-	xfs_extnum_t		nextents;	/* number of file extents */
-	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
-
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-
-	/*
-	 * Make space in the inode incore.
-	 */
-	xfs_iroot_realloc(ip, 1, whichfork);
-	ifp->if_flags |= XFS_IFBROOT;
-
-	/*
-	 * Fill in the root.
-	 */
-	block = ifp->if_broot;
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS);
-
-	/*
-	 * Need a cursor.  Can't allocate until bb_level is filled in.
-	 */
-	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-	cur->bc_private.b.firstblock = *firstblock;
-	cur->bc_private.b.flist = flist;
-	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
-	/*
-	 * Convert to a btree with two levels, one record in root.
-	 */
-	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
-	memset(&args, 0, sizeof(args));
-	args.tp = tp;
-	args.mp = mp;
-	args.firstblock = *firstblock;
-	if (*firstblock == NULLFSBLOCK) {
-		args.type = XFS_ALLOCTYPE_START_BNO;
-		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
-	} else if (flist->xbf_low) {
-		args.type = XFS_ALLOCTYPE_START_BNO;
-		args.fsbno = *firstblock;
-	} else {
-		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		args.fsbno = *firstblock;
-	}
-	args.minlen = args.maxlen = args.prod = 1;
-	args.wasdel = wasdel;
-	*logflagsp = 0;
-	if ((error = xfs_alloc_vextent(&args))) {
-		xfs_iroot_realloc(ip, -1, whichfork);
-		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-		return error;
-	}
-	/*
-	 * Allocation can't fail, the space was reserved.
-	 */
-	ASSERT(args.fsbno != NULLFSBLOCK);
-	ASSERT(*firstblock == NULLFSBLOCK ||
-	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
-	       (flist->xbf_low &&
-		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
-	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
-	cur->bc_private.b.allocated++;
-	ip->i_d.di_nblocks++;
-	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
-	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
-	/*
-	 * Fill in the child block.
-	 */
-	abp->b_ops = &xfs_bmbt_buf_ops;
-	ablock = XFS_BUF_TO_BLOCK(abp);
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
-				XFS_BTREE_LONG_PTRS);
-
-	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	for (cnt = i = 0; i < nextents; i++) {
-		ep = xfs_iext_get_ext(ifp, i);
-		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
-			arp->l0 = cpu_to_be64(ep->l0);
-			arp->l1 = cpu_to_be64(ep->l1);
-			arp++; cnt++;
-		}
-	}
-	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
-	xfs_btree_set_numrecs(ablock, cnt);
-
-	/*
-	 * Fill in the root key and pointer.
-	 */
-	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
-	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
-	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
-						be16_to_cpu(block->bb_level)));
-	*pp = cpu_to_be64(args.fsbno);
-
-	/*
-	 * Do all this logging at the end so that
-	 * the root is at the right level.
-	 */
-	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
-	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
-	ASSERT(*curp == NULL);
-	*curp = cur;
-	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
-	return 0;
-}
-
-/*
- * Convert a local file to an extents file.
- * This code is out of bounds for data forks of regular files,
- * since the file data needs to get logged so things will stay consistent.
- * (The bmap-level manipulations are ok, though).
- */
-void
-xfs_bmap_local_to_extents_empty(
-	struct xfs_inode	*ip,
-	int			whichfork)
-{
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
-
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-	ASSERT(ifp->if_bytes == 0);
-	ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
-
-	xfs_bmap_forkoff_reset(ip, whichfork);
-	ifp->if_flags &= ~XFS_IFINLINE;
-	ifp->if_flags |= XFS_IFEXTENTS;
-	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-}
-
-
-STATIC int				/* error */
-xfs_bmap_local_to_extents(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
-	xfs_extlen_t	total,		/* total blocks needed by transaction */
-	int		*logflagsp,	/* inode logging flags */
-	int		whichfork,
-	void		(*init_fn)(struct xfs_trans *tp,
-				   struct xfs_buf *bp,
-				   struct xfs_inode *ip,
-				   struct xfs_ifork *ifp))
-{
-	int		error = 0;
-	int		flags;		/* logging flags returned */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_alloc_arg_t	args;		/* allocation arguments */
-	xfs_buf_t	*bp;		/* buffer for extent block */
-	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
-
-	/*
-	 * We don't want to deal with the case of keeping inode data inline yet.
-	 * So sending the data fork of a regular inode is invalid.
-	 */
-	ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-
-	if (!ifp->if_bytes) {
-		xfs_bmap_local_to_extents_empty(ip, whichfork);
-		flags = XFS_ILOG_CORE;
-		goto done;
-	}
-
-	flags = 0;
-	error = 0;
-	ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
-								XFS_IFINLINE);
-	memset(&args, 0, sizeof(args));
-	args.tp = tp;
-	args.mp = ip->i_mount;
-	args.firstblock = *firstblock;
-	/*
-	 * Allocate a block.  We know we need only one, since the
-	 * file currently fits in an inode.
-	 */
-	if (*firstblock == NULLFSBLOCK) {
-		args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
-		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else {
-		args.fsbno = *firstblock;
-		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-	}
-	args.total = total;
-	args.minlen = args.maxlen = args.prod = 1;
-	error = xfs_alloc_vextent(&args);
-	if (error)
-		goto done;
-
-	/* Can't fail, the space was reserved. */
-	ASSERT(args.fsbno != NULLFSBLOCK);
-	ASSERT(args.len == 1);
-	*firstblock = args.fsbno;
-	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-
-	/* initialise the block and copy the data */
-	init_fn(tp, bp, ip, ifp);
-
-	/* account for the change in fork size and log everything */
-	xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
-	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
-	xfs_bmap_local_to_extents_empty(ip, whichfork);
-	flags |= XFS_ILOG_CORE;
-
-	xfs_iext_add(ifp, 0, 1);
-	ep = xfs_iext_get_ext(ifp, 0);
-	xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
-	trace_xfs_bmap_post_update(ip, 0,
-			whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
-			_THIS_IP_);
-	XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-	ip->i_d.di_nblocks = 1;
-	xfs_trans_mod_dquot_byino(tp, ip,
-		XFS_TRANS_DQ_BCOUNT, 1L);
-	flags |= xfs_ilog_fext(whichfork);
-
-done:
-	*logflagsp = flags;
-	return error;
-}
-
-/*
- * Called from xfs_bmap_add_attrfork to handle btree format files.
- */
-STATIC int					/* error */
-xfs_bmap_add_attrfork_btree(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first block allocated */
-	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
-	int			*flags)		/* inode logging flags */
-{
-	xfs_btree_cur_t		*cur;		/* btree cursor */
-	int			error;		/* error return value */
-	xfs_mount_t		*mp;		/* file system mount struct */
-	int			stat;		/* newroot status */
-
-	mp = ip->i_mount;
-	if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
-		*flags |= XFS_ILOG_DBROOT;
-	else {
-		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
-		cur->bc_private.b.flist = flist;
-		cur->bc_private.b.firstblock = *firstblock;
-		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
-			goto error0;
-		/* must be at least one entry */
-		XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
-		if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
-			goto error0;
-		if (stat == 0) {
-			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-			return ENOSPC;
-		}
-		*firstblock = cur->bc_private.b.firstblock;
-		cur->bc_private.b.allocated = 0;
-		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-	}
-	return 0;
-error0:
-	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Called from xfs_bmap_add_attrfork to handle extents format files.
- */
-STATIC int					/* error */
-xfs_bmap_add_attrfork_extents(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first block allocated */
-	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
-	int			*flags)		/* inode logging flags */
-{
-	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
-	int			error;		/* error return value */
-
-	if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
-		return 0;
-	cur = NULL;
-	error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
-		flags, XFS_DATA_FORK);
-	if (cur) {
-		cur->bc_private.b.allocated = 0;
-		xfs_btree_del_cursor(cur,
-			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-	}
-	return error;
-}
-
-/*
- * Called from xfs_bmap_add_attrfork to handle local format files. Each
- * different data fork content type needs a different callout to do the
- * conversion. Some are basic and only require special block initialisation
- * callouts for the data formating, others (directories) are so specialised they
- * handle everything themselves.
- *
- * XXX (dgc): investigate whether directory conversion can use the generic
- * formatting callout. It should be possible - it's just a very complex
- * formatter.
- */
-STATIC int					/* error */
-xfs_bmap_add_attrfork_local(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first block allocated */
-	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
-	int			*flags)		/* inode logging flags */
-{
-	xfs_da_args_t		dargs;		/* args for dir/attr code */
-
-	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
-		return 0;
-
-	if (S_ISDIR(ip->i_d.di_mode)) {
-		memset(&dargs, 0, sizeof(dargs));
-		dargs.geo = ip->i_mount->m_dir_geo;
-		dargs.dp = ip;
-		dargs.firstblock = firstblock;
-		dargs.flist = flist;
-		dargs.total = dargs.geo->fsbcount;
-		dargs.whichfork = XFS_DATA_FORK;
-		dargs.trans = tp;
-		return xfs_dir2_sf_to_block(&dargs);
-	}
-
-	if (S_ISLNK(ip->i_d.di_mode))
-		return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
-						 flags, XFS_DATA_FORK,
-						 xfs_symlink_local_to_remote);
-
-	/* should only be called for types that support local format data */
-	ASSERT(0);
-	return EFSCORRUPTED;
-}
-
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int						/* error code */
-xfs_bmap_add_attrfork(
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	int			size,		/* space new attribute needs */
-	int			rsvd)		/* xact may use reserved blks */
-{
-	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
-	xfs_bmap_free_t		flist;		/* freed extent records */
-	xfs_mount_t		*mp;		/* mount structure */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	int			blks;		/* space reservation */
-	int			version = 1;	/* superblock attr version */
-	int			committed;	/* xaction was committed */
-	int			logflags;	/* logging flags */
-	int			error;		/* error return value */
-	int			cancel_flags = 0;
-
-	ASSERT(XFS_IFORK_Q(ip) == 0);
-
-	mp = ip->i_mount;
-	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
-	blks = XFS_ADDAFORK_SPACE_RES(mp);
-	if (rsvd)
-		tp->t_flags |= XFS_TRANS_RESERVE;
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
-			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-			XFS_QMOPT_RES_REGBLKS);
-	if (error)
-		goto trans_cancel;
-	cancel_flags |= XFS_TRANS_ABORT;
-	if (XFS_IFORK_Q(ip))
-		goto trans_cancel;
-	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
-		/*
-		 * For inodes coming from pre-6.2 filesystems.
-		 */
-		ASSERT(ip->i_d.di_aformat == 0);
-		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-	}
-	ASSERT(ip->i_d.di_anextents == 0);
-
-	xfs_trans_ijoin(tp, ip, 0);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	switch (ip->i_d.di_format) {
-	case XFS_DINODE_FMT_DEV:
-		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
-		break;
-	case XFS_DINODE_FMT_UUID:
-		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
-		break;
-	case XFS_DINODE_FMT_LOCAL:
-	case XFS_DINODE_FMT_EXTENTS:
-	case XFS_DINODE_FMT_BTREE:
-		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
-		if (!ip->i_d.di_forkoff)
-			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
-		else if (mp->m_flags & XFS_MOUNT_ATTR2)
-			version = 2;
-		break;
-	default:
-		ASSERT(0);
-		error = EINVAL;
-		goto trans_cancel;
-	}
-
-	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
-	ip->i_afp->if_flags = XFS_IFEXTENTS;
-	logflags = 0;
-	xfs_bmap_init(&flist, &firstblock);
-	switch (ip->i_d.di_format) {
-	case XFS_DINODE_FMT_LOCAL:
-		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
-			&logflags);
-		break;
-	case XFS_DINODE_FMT_EXTENTS:
-		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
-			&flist, &logflags);
-		break;
-	case XFS_DINODE_FMT_BTREE:
-		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
-			&logflags);
-		break;
-	default:
-		error = 0;
-		break;
-	}
-	if (logflags)
-		xfs_trans_log_inode(tp, ip, logflags);
-	if (error)
-		goto bmap_cancel;
-	if (!xfs_sb_version_hasattr(&mp->m_sb) ||
-	   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
-		__int64_t sbfields = 0;
-
-		spin_lock(&mp->m_sb_lock);
-		if (!xfs_sb_version_hasattr(&mp->m_sb)) {
-			xfs_sb_version_addattr(&mp->m_sb);
-			sbfields |= XFS_SB_VERSIONNUM;
-		}
-		if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
-			xfs_sb_version_addattr2(&mp->m_sb);
-			sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
-		}
-		if (sbfields) {
-			spin_unlock(&mp->m_sb_lock);
-			xfs_mod_sb(tp, sbfields);
-		} else
-			spin_unlock(&mp->m_sb_lock);
-	}
-
-	error = xfs_bmap_finish(&tp, &flist, &committed);
-	if (error)
-		goto bmap_cancel;
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
-
-bmap_cancel:
-	xfs_bmap_cancel(&flist);
-trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
-}
-
-/*
- * Internal and external extent tree search functions.
- */
-
-/*
- * Read in the extents to if_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked for no "state" flags.
- */
-int					/* error */
-xfs_bmap_read_extents(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode */
-	int			whichfork) /* data or attr fork */
-{
-	struct xfs_btree_block	*block;	/* current btree block */
-	xfs_fsblock_t		bno;	/* block # of "block" */
-	xfs_buf_t		*bp;	/* buffer for "block" */
-	int			error;	/* error return value */
-	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
-	xfs_extnum_t		i, j;	/* index into the extents list */
-	xfs_ifork_t		*ifp;	/* fork structure */
-	int			level;	/* btree level, for checking */
-	xfs_mount_t		*mp;	/* file system mount structure */
-	__be64			*pp;	/* pointer to block address */
-	/* REFERENCED */
-	xfs_extnum_t		room;	/* number of entries there's room for */
-
-	bno = NULLFSBLOCK;
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
-					XFS_EXTFMT_INODE(ip);
-	block = ifp->if_broot;
-	/*
-	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-	 */
-	level = be16_to_cpu(block->bb_level);
-	ASSERT(level > 0);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-	bno = be64_to_cpu(*pp);
-	ASSERT(bno != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-	/*
-	 * Go down the tree until leaf level is reached, following the first
-	 * pointer (leftmost) at each level.
-	 */
-	while (level-- > 0) {
-		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
-		if (error)
-			return error;
-		block = XFS_BUF_TO_BLOCK(bp);
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, level),
-			error0);
-		if (level == 0)
-			break;
-		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-		bno = be64_to_cpu(*pp);
-		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
-		xfs_trans_brelse(tp, bp);
-	}
-	/*
-	 * Here with bp and block set to the leftmost leaf node in the tree.
-	 */
-	room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	i = 0;
-	/*
-	 * Loop over all leaf nodes.  Copy information to the extent records.
-	 */
-	for (;;) {
-		xfs_bmbt_rec_t	*frp;
-		xfs_fsblock_t	nextbno;
-		xfs_extnum_t	num_recs;
-		xfs_extnum_t	start;
-
-		num_recs = xfs_btree_get_numrecs(block);
-		if (unlikely(i + num_recs > room)) {
-			ASSERT(i + num_recs <= room);
-			xfs_warn(ip->i_mount,
-				"corrupt dinode %Lu, (btree extents).",
-				(unsigned long long) ip->i_ino);
-			XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
-				XFS_ERRLEVEL_LOW, ip->i_mount, block);
-			goto error0;
-		}
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, 0),
-			error0);
-		/*
-		 * Read-ahead the next leaf block, if any.
-		 */
-		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-		if (nextbno != NULLFSBLOCK)
-			xfs_btree_reada_bufl(mp, nextbno, 1,
-					     &xfs_bmbt_buf_ops);
-		/*
-		 * Copy records into the extent records.
-		 */
-		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
-		start = i;
-		for (j = 0; j < num_recs; j++, i++, frp++) {
-			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
-			trp->l0 = be64_to_cpu(frp->l0);
-			trp->l1 = be64_to_cpu(frp->l1);
-		}
-		if (exntf == XFS_EXTFMT_NOSTATE) {
-			/*
-			 * Check all attribute bmap btree records and
-			 * any "older" data bmap btree records for a
-			 * set bit in the "extent flag" position.
-			 */
-			if (unlikely(xfs_check_nostate_extents(ifp,
-					start, num_recs))) {
-				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
-						 XFS_ERRLEVEL_LOW,
-						 ip->i_mount);
-				goto error0;
-			}
-		}
-		xfs_trans_brelse(tp, bp);
-		bno = nextbno;
-		/*
-		 * If we've reached the end, stop.
-		 */
-		if (bno == NULLFSBLOCK)
-			break;
-		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
-		if (error)
-			return error;
-		block = XFS_BUF_TO_BLOCK(bp);
-	}
-	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
-	XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
-	return 0;
-error0:
-	xfs_trans_brelse(tp, bp);
-	return EFSCORRUPTED;
-}
-
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry.  If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none).  Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_fileoff_t	bno,		/* block number searched for */
-	int		*eofp,		/* out: end of file found */
-	xfs_extnum_t	*lastxp,	/* out: last extent index */
-	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
-	xfs_bmbt_irec_t	*prevp)		/* out: previous extent entry found */
-{
-	xfs_bmbt_rec_host_t *ep;		/* extent record pointer */
-	xfs_extnum_t	lastx;		/* last extent index */
-
-	/*
-	 * Initialize the extent entry structure to catch access to
-	 * uninitialized br_startblock field.
-	 */
-	gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
-	gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
-	gotp->br_state = XFS_EXT_INVALID;
-#if XFS_BIG_BLKNOS
-	gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
-	gotp->br_startblock = 0xffffa5a5;
-#endif
-	prevp->br_startoff = NULLFILEOFF;
-
-	ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
-	if (lastx > 0) {
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
-	}
-	if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
-		xfs_bmbt_get_all(ep, gotp);
-		*eofp = 0;
-	} else {
-		if (lastx > 0) {
-			*gotp = *prevp;
-		}
-		*eofp = 1;
-		ep = NULL;
-	}
-	*lastxp = lastx;
-	return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry.  If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
-xfs_bmap_search_extents(
-	xfs_inode_t     *ip,            /* incore inode pointer */
-	xfs_fileoff_t   bno,            /* block number searched for */
-	int             fork,      	/* data or attr fork */
-	int             *eofp,          /* out: end of file found */
-	xfs_extnum_t    *lastxp,        /* out: last extent index */
-	xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
-	xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
-{
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
-
-	XFS_STATS_INC(xs_look_exlist);
-	ifp = XFS_IFORK_PTR(ip, fork);
-
-	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
-	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
-		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
-		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-				"Access to block zero in inode %llu "
-				"start_block: %llx start_off: %llx "
-				"blkcnt: %llx extent-state: %x lastx: %x",
-			(unsigned long long)ip->i_ino,
-			(unsigned long long)gotp->br_startblock,
-			(unsigned long long)gotp->br_startoff,
-			(unsigned long long)gotp->br_blockcount,
-			gotp->br_state, *lastxp);
-		*lastxp = NULLEXTNUM;
-		*eofp = 1;
-		return NULL;
-	}
-	return ep;
-}
-
-/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Return 0 if the file is currently local (in-inode).
- */
-int						/* error */
-xfs_bmap_first_unused(
-	xfs_trans_t	*tp,			/* transaction pointer */
-	xfs_inode_t	*ip,			/* incore inode */
-	xfs_extlen_t	len,			/* size of hole to find */
-	xfs_fileoff_t	*first_unused,		/* unused block */
-	int		whichfork)		/* data or attr fork */
-{
-	int		error;			/* error return value */
-	int		idx;			/* extent record index */
-	xfs_ifork_t	*ifp;			/* inode fork pointer */
-	xfs_fileoff_t	lastaddr;		/* last block number seen */
-	xfs_fileoff_t	lowest;			/* lowest useful block */
-	xfs_fileoff_t	max;			/* starting useful block */
-	xfs_fileoff_t	off;			/* offset for this block */
-	xfs_extnum_t	nextents;		/* number of extent entries */
-
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
-	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
-	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-		*first_unused = 0;
-		return 0;
-	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		return error;
-	lowest = *first_unused;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
-		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
-		off = xfs_bmbt_get_startoff(ep);
-		/*
-		 * See if the hole before this extent will work.
-		 */
-		if (off >= lowest + len && off - max >= len) {
-			*first_unused = max;
-			return 0;
-		}
-		lastaddr = off + xfs_bmbt_get_blockcount(ep);
-		max = XFS_FILEOFF_MAX(lastaddr, lowest);
-	}
-	*first_unused = max;
-	return 0;
-}
-
-/*
- * Returns the file-relative block number of the last block - 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int						/* error */
-xfs_bmap_last_before(
-	xfs_trans_t	*tp,			/* transaction pointer */
-	xfs_inode_t	*ip,			/* incore inode */
-	xfs_fileoff_t	*last_block,		/* last block */
-	int		whichfork)		/* data or attr fork */
-{
-	xfs_fileoff_t	bno;			/* input file offset */
-	int		eof;			/* hit end of file */
-	xfs_bmbt_rec_host_t *ep;		/* pointer to last extent */
-	int		error;			/* error return value */
-	xfs_bmbt_irec_t	got;			/* current extent value */
-	xfs_ifork_t	*ifp;			/* inode fork pointer */
-	xfs_extnum_t	lastx;			/* last extent used */
-	xfs_bmbt_irec_t	prev;			/* previous extent value */
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
-	       return EIO;
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-		*last_block = 0;
-		return 0;
-	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		return error;
-	bno = *last_block - 1;
-	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-		&prev);
-	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
-		if (prev.br_startoff == NULLFILEOFF)
-			*last_block = 0;
-		else
-			*last_block = prev.br_startoff + prev.br_blockcount;
-	}
-	/*
-	 * Otherwise *last_block is already the right answer.
-	 */
-	return 0;
-}
-
-int
-xfs_bmap_last_extent(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	int			whichfork,
-	struct xfs_bmbt_irec	*rec,
-	int			*is_empty)
-{
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
-	int			error;
-	int			nextents;
-
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(tp, ip, whichfork);
-		if (error)
-			return error;
-	}
-
-	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-	if (nextents == 0) {
-		*is_empty = 1;
-		return 0;
-	}
-
-	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
-	*is_empty = 0;
-	return 0;
-}
-
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- *
- * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
- * at, or past the EOF.
- */
-STATIC int
-xfs_bmap_isaeof(
-	struct xfs_bmalloca	*bma,
-	int			whichfork)
-{
-	struct xfs_bmbt_irec	rec;
-	int			is_empty;
-	int			error;
-
-	bma->aeof = 0;
-	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
-				     &is_empty);
-	if (error)
-		return error;
-
-	if (is_empty) {
-		bma->aeof = 1;
-		return 0;
-	}
-
-	/*
-	 * Check if we are allocation or past the last extent, or at least into
-	 * the last delayed allocated extent.
-	 */
-	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
-		(bma->offset >= rec.br_startoff &&
-		 isnullstartblock(rec.br_startblock));
-	return 0;
-}
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file.  This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int
-xfs_bmap_last_offset(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		*last_block,
-	int			whichfork)
-{
-	struct xfs_bmbt_irec	rec;
-	int			is_empty;
-	int			error;
-
-	*last_block = 0;
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
-		return 0;
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-	       return EIO;
-
-	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
-	if (error || is_empty)
-		return error;
-
-	*last_block = rec.br_startoff + rec.br_blockcount;
-	return 0;
-}
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not.  For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int					/* 1=>1 block, 0=>otherwise */
-xfs_bmap_one_block(
-	xfs_inode_t	*ip,		/* incore inode */
-	int		whichfork)	/* data or attr fork */
-{
-	xfs_bmbt_rec_host_t *ep;	/* ptr to fork's extent */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	int		rval;		/* return value */
-	xfs_bmbt_irec_t	s;		/* internal version of extent */
-
-#ifndef DEBUG
-	if (whichfork == XFS_DATA_FORK)
-		return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
-#endif	/* !DEBUG */
-	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
-		return 0;
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		return 0;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-	ep = xfs_iext_get_ext(ifp, 0);
-	xfs_bmbt_get_all(ep, &s);
-	rval = s.br_startoff == 0 && s.br_blockcount == 1;
-	if (rval && whichfork == XFS_DATA_FORK)
-		ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
-	return rval;
-}
-
-/*
- * Extent tree manipulation functions used during allocation.
- */
-
-/*
- * Convert a delayed allocation to a real allocation.
- */
-STATIC int				/* error */
-xfs_bmap_add_extent_delay_real(
-	struct xfs_bmalloca	*bma)
-{
-	struct xfs_bmbt_irec	*new = &bma->got;
-	int			diff;	/* temp value */
-	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
-	int			error;	/* error return value */
-	int			i;	/* temp state */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
-	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
-					/* left is 0, right is 1, prev is 2 */
-	int			rval=0;	/* return value (logging flags) */
-	int			state = 0;/* state bits, accessed thru macros */
-	xfs_filblks_t		da_new; /* new count del alloc blocks used */
-	xfs_filblks_t		da_old; /* old count del alloc blocks used */
-	xfs_filblks_t		temp=0;	/* value for da_new calculations */
-	xfs_filblks_t		temp2=0;/* value for da_new calculations */
-	int			tmp_rval;	/* partial logging flags */
-
-	ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
-
-	ASSERT(bma->idx >= 0);
-	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
-	ASSERT(!isnullstartblock(new->br_startblock));
-	ASSERT(!bma->cur ||
-	       (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
-
-	XFS_STATS_INC(xs_add_exlist);
-
-#define	LEFT		r[0]
-#define	RIGHT		r[1]
-#define	PREV		r[2]
-
-	/*
-	 * Set up a bunch of variables to make the tests simpler.
-	 */
-	ep = xfs_iext_get_ext(ifp, bma->idx);
-	xfs_bmbt_get_all(ep, &PREV);
-	new_endoff = new->br_startoff + new->br_blockcount;
-	ASSERT(PREV.br_startoff <= new->br_startoff);
-	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
-
-	da_old = startblockval(PREV.br_startblock);
-	da_new = 0;
-
-	/*
-	 * Set flags determining what part of the previous delayed allocation
-	 * extent is being replaced by a real allocation.
-	 */
-	if (PREV.br_startoff == new->br_startoff)
-		state |= BMAP_LEFT_FILLING;
-	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
-		state |= BMAP_RIGHT_FILLING;
-
-	/*
-	 * Check and set flags if this segment has a left neighbor.
-	 * Don't set contiguous if the combined extent would be too large.
-	 */
-	if (bma->idx > 0) {
-		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
-
-		if (isnullstartblock(LEFT.br_startblock))
-			state |= BMAP_LEFT_DELAY;
-	}
-
-	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
-	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-	    LEFT.br_state == new->br_state &&
-	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-		state |= BMAP_LEFT_CONTIG;
-
-	/*
-	 * Check and set flags if this segment has a right neighbor.
-	 * Don't set contiguous if the combined extent would be too large.
-	 * Also check for all-three-contiguous being too large.
-	 */
-	if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
-		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
-
-		if (isnullstartblock(RIGHT.br_startblock))
-			state |= BMAP_RIGHT_DELAY;
-	}
-
-	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
-	    new_endoff == RIGHT.br_startoff &&
-	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
-	    new->br_state == RIGHT.br_state &&
-	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-		       BMAP_RIGHT_FILLING)) !=
-		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-		       BMAP_RIGHT_FILLING) ||
-	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-			<= MAXEXTLEN))
-		state |= BMAP_RIGHT_CONTIG;
-
-	error = 0;
-	/*
-	 * Switch out based on the FILLING and CONTIG state bits.
-	 */
-	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
-	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-		/*
-		 * Filling in all of a previously delayed allocation extent.
-		 * The left and right neighbors are both contiguous with new.
-		 */
-		bma->idx--;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-			LEFT.br_blockcount + PREV.br_blockcount +
-			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
-		bma->ip->i_d.di_nextents--;
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_btree_delete(bma->cur, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_btree_decrement(bma->cur, 0, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
-					LEFT.br_startblock,
-					LEFT.br_blockcount +
-					PREV.br_blockcount +
-					RIGHT.br_blockcount, LEFT.br_state);
-			if (error)
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-		/*
-		 * Filling in all of a previously delayed allocation extent.
-		 * The left neighbor is contiguous, the right is not.
-		 */
-		bma->idx--;
-
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_DEXT;
-		else {
-			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
-					LEFT.br_startblock, LEFT.br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
-					LEFT.br_startblock,
-					LEFT.br_blockcount +
-					PREV.br_blockcount, LEFT.br_state);
-			if (error)
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-		/*
-		 * Filling in all of a previously delayed allocation extent.
-		 * The right neighbor is contiguous, the left is not.
-		 */
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		xfs_bmbt_set_blockcount(ep,
-			PREV.br_blockcount + RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_DEXT;
-		else {
-			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
-					new->br_startblock,
-					PREV.br_blockcount +
-					RIGHT.br_blockcount, PREV.br_state);
-			if (error)
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
-		/*
-		 * Filling in all of a previously delayed allocation extent.
-		 * Neither the left nor right neighbors are contiguous with
-		 * the new one.
-		 */
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(ep, new->br_startblock);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		bma->ip->i_d.di_nextents++;
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			error = xfs_btree_insert(bma->cur, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
-		/*
-		 * Filling in the first part of a previous delayed allocation.
-		 * The left neighbor is contiguous.
-		 */
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
-			LEFT.br_blockcount + new->br_blockcount);
-		xfs_bmbt_set_startoff(ep,
-			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
-
-		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_DEXT;
-		else {
-			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
-					LEFT.br_startblock, LEFT.br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
-					LEFT.br_startblock,
-					LEFT.br_blockcount +
-					new->br_blockcount,
-					LEFT.br_state);
-			if (error)
-				goto done;
-		}
-		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-			startblockval(PREV.br_startblock));
-		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		bma->idx--;
-		break;
-
-	case BMAP_LEFT_FILLING:
-		/*
-		 * Filling in the first part of a previous delayed allocation.
-		 * The left neighbor is not contiguous.
-		 */
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_startoff(ep, new_endoff);
-		temp = PREV.br_blockcount - new->br_blockcount;
-		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
-		bma->ip->i_d.di_nextents++;
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			error = xfs_btree_insert(bma->cur, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-
-		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-					bma->firstblock, bma->flist,
-					&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
-			rval |= tmp_rval;
-			if (error)
-				goto done;
-		}
-		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-			startblockval(PREV.br_startblock) -
-			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, bma->idx + 1);
-		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
-		break;
-
-	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-		/*
-		 * Filling in the last part of a previous delayed allocation.
-		 * The right neighbor is contiguous with the new allocation.
-		 */
-		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
-			new->br_startoff, new->br_startblock,
-			new->br_blockcount + RIGHT.br_blockcount,
-			RIGHT.br_state);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_DEXT;
-		else {
-			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, new->br_startoff,
-					new->br_startblock,
-					new->br_blockcount +
-					RIGHT.br_blockcount,
-					RIGHT.br_state);
-			if (error)
-				goto done;
-		}
-
-		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-			startblockval(PREV.br_startblock));
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		bma->idx++;
-		break;
-
-	case BMAP_RIGHT_FILLING:
-		/*
-		 * Filling in the last part of a previous delayed allocation.
-		 * The right neighbor is not contiguous.
-		 */
-		temp = PREV.br_blockcount - new->br_blockcount;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
-		bma->ip->i_d.di_nextents++;
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			error = xfs_btree_insert(bma->cur, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-
-		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-				bma->firstblock, bma->flist, &bma->cur, 1,
-				&tmp_rval, XFS_DATA_FORK);
-			rval |= tmp_rval;
-			if (error)
-				goto done;
-		}
-		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
-			startblockval(PREV.br_startblock) -
-			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
-		ep = xfs_iext_get_ext(ifp, bma->idx);
-		xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		bma->idx++;
-		break;
-
-	case 0:
-		/*
-		 * Filling in the middle part of a previous delayed allocation.
-		 * Contiguity is impossible here.
-		 * This case is avoided almost all the time.
-		 *
-		 * We start with a delayed allocation:
-		 *
-		 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
-		 *  PREV @ idx
-		 *
-	         * and we are allocating:
-		 *                     +rrrrrrrrrrrrrrrrr+
-		 *			      new
-		 *
-		 * and we set it up for insertion as:
-		 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
-		 *                            new
-		 *  PREV @ idx          LEFT              RIGHT
-		 *                      inserted at idx + 1
-		 */
-		temp = new->br_startoff - PREV.br_startoff;
-		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
-		LEFT = *new;
-		RIGHT.br_state = PREV.br_state;
-		RIGHT.br_startblock = nullstartblock(
-				(int)xfs_bmap_worst_indlen(bma->ip, temp2));
-		RIGHT.br_startoff = new_endoff;
-		RIGHT.br_blockcount = temp2;
-		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
-		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
-		bma->ip->i_d.di_nextents++;
-		if (bma->cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			error = xfs_btree_insert(bma->cur, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-
-		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-					bma->firstblock, bma->flist, &bma->cur,
-					1, &tmp_rval, XFS_DATA_FORK);
-			rval |= tmp_rval;
-			if (error)
-				goto done;
-		}
-		temp = xfs_bmap_worst_indlen(bma->ip, temp);
-		temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
-		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
-			(bma->cur ? bma->cur->bc_private.b.allocated : 0));
-		if (diff > 0) {
-			error = xfs_icsb_modify_counters(bma->ip->i_mount,
-					XFS_SBS_FDBLOCKS,
-					-((int64_t)diff), 0);
-			ASSERT(!error);
-			if (error)
-				goto done;
-		}
-
-		ep = xfs_iext_get_ext(ifp, bma->idx);
-		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
-			nullstartblock((int)temp2));
-		trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
-
-		bma->idx++;
-		da_new = temp + temp2;
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
-	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-	case BMAP_LEFT_CONTIG:
-	case BMAP_RIGHT_CONTIG:
-		/*
-		 * These cases are all impossible.
-		 */
-		ASSERT(0);
-	}
-
-	/* convert to a btree if necessary */
-	if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
-		int	tmp_logflags;	/* partial log flag return val */
-
-		ASSERT(bma->cur == NULL);
-		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-				bma->firstblock, bma->flist, &bma->cur,
-				da_old > 0, &tmp_logflags, XFS_DATA_FORK);
-		bma->logflags |= tmp_logflags;
-		if (error)
-			goto done;
-	}
-
-	/* adjust for changes in reserved delayed indirect blocks */
-	if (da_old || da_new) {
-		temp = da_new;
-		if (bma->cur)
-			temp += bma->cur->bc_private.b.allocated;
-		ASSERT(temp <= da_old);
-		if (temp < da_old)
-			xfs_icsb_modify_counters(bma->ip->i_mount,
-					XFS_SBS_FDBLOCKS,
-					(int64_t)(da_old - temp), 0);
-	}
-
-	/* clear out the allocated field, done with it now in any case. */
-	if (bma->cur)
-		bma->cur->bc_private.b.allocated = 0;
-
-	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
-done:
-	bma->logflags |= rval;
-	return error;
-#undef	LEFT
-#undef	RIGHT
-#undef	PREV
-}
-
-/*
- * Convert an unwritten allocation to a real allocation or vice versa.
- */
-STATIC int				/* error */
-xfs_bmap_add_extent_unwritten_real(
-	struct xfs_trans	*tp,
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
-	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	int			*logflagsp) /* inode logging flags */
-{
-	xfs_btree_cur_t		*cur;	/* btree cursor */
-	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
-	int			error;	/* error return value */
-	int			i;	/* temp state */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
-	xfs_exntst_t		newext;	/* new extent state */
-	xfs_exntst_t		oldext;	/* old extent state */
-	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
-					/* left is 0, right is 1, prev is 2 */
-	int			rval=0;	/* return value (logging flags) */
-	int			state = 0;/* state bits, accessed thru macros */
-
-	*logflagsp = 0;
-
-	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-
-	ASSERT(*idx >= 0);
-	ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
-	ASSERT(!isnullstartblock(new->br_startblock));
-
-	XFS_STATS_INC(xs_add_exlist);
-
-#define	LEFT		r[0]
-#define	RIGHT		r[1]
-#define	PREV		r[2]
-
-	/*
-	 * Set up a bunch of variables to make the tests simpler.
-	 */
-	error = 0;
-	ep = xfs_iext_get_ext(ifp, *idx);
-	xfs_bmbt_get_all(ep, &PREV);
-	newext = new->br_state;
-	oldext = (newext == XFS_EXT_UNWRITTEN) ?
-		XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
-	ASSERT(PREV.br_state == oldext);
-	new_endoff = new->br_startoff + new->br_blockcount;
-	ASSERT(PREV.br_startoff <= new->br_startoff);
-	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
-
-	/*
-	 * Set flags determining what part of the previous oldext allocation
-	 * extent is being replaced by a newext allocation.
-	 */
-	if (PREV.br_startoff == new->br_startoff)
-		state |= BMAP_LEFT_FILLING;
-	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
-		state |= BMAP_RIGHT_FILLING;
-
-	/*
-	 * Check and set flags if this segment has a left neighbor.
-	 * Don't set contiguous if the combined extent would be too large.
-	 */
-	if (*idx > 0) {
-		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
-
-		if (isnullstartblock(LEFT.br_startblock))
-			state |= BMAP_LEFT_DELAY;
-	}
-
-	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
-	    LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
-	    LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
-	    LEFT.br_state == newext &&
-	    LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-		state |= BMAP_LEFT_CONTIG;
-
-	/*
-	 * Check and set flags if this segment has a right neighbor.
-	 * Don't set contiguous if the combined extent would be too large.
-	 * Also check for all-three-contiguous being too large.
-	 */
-	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
-		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
-		if (isnullstartblock(RIGHT.br_startblock))
-			state |= BMAP_RIGHT_DELAY;
-	}
-
-	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
-	    new_endoff == RIGHT.br_startoff &&
-	    new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
-	    newext == RIGHT.br_state &&
-	    new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
-	    ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-		       BMAP_RIGHT_FILLING)) !=
-		      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
-		       BMAP_RIGHT_FILLING) ||
-	     LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
-			<= MAXEXTLEN))
-		state |= BMAP_RIGHT_CONTIG;
-
-	/*
-	 * Switch out based on the FILLING and CONTIG state bits.
-	 */
-	switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-			 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
-	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
-	     BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-		/*
-		 * Setting all of a previous oldext extent to newext.
-		 * The left and right neighbors are both contiguous with new.
-		 */
-		--*idx;
-
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
-			LEFT.br_blockcount + PREV.br_blockcount +
-			RIGHT.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		xfs_iext_remove(ip, *idx + 1, 2, state);
-		ip->i_d.di_nextents -= 2;
-		if (cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
-				LEFT.br_startblock,
-				LEFT.br_blockcount + PREV.br_blockcount +
-				RIGHT.br_blockcount, LEFT.br_state)))
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-		/*
-		 * Setting all of a previous oldext extent to newext.
-		 * The left neighbor is contiguous, the right is not.
-		 */
-		--*idx;
-
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
-			LEFT.br_blockcount + PREV.br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		xfs_iext_remove(ip, *idx + 1, 1, state);
-		ip->i_d.di_nextents--;
-		if (cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-					PREV.br_startblock, PREV.br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
-				LEFT.br_startblock,
-				LEFT.br_blockcount + PREV.br_blockcount,
-				LEFT.br_state)))
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-		/*
-		 * Setting all of a previous oldext extent to newext.
-		 * The right neighbor is contiguous, the left is not.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep,
-			PREV.br_blockcount + RIGHT.br_blockcount);
-		xfs_bmbt_set_state(ep, newext);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		xfs_iext_remove(ip, *idx + 1, 1, state);
-		ip->i_d.di_nextents--;
-		if (cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
-					RIGHT.br_startblock,
-					RIGHT.br_blockcount, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_delete(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, new->br_startoff,
-				new->br_startblock,
-				new->br_blockcount + RIGHT.br_blockcount,
-				newext)))
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
-		/*
-		 * Setting all of a previous oldext extent to newext.
-		 * Neither the left nor right neighbors are contiguous with
-		 * the new one.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_state(ep, newext);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		if (cur == NULL)
-			rval = XFS_ILOG_DEXT;
-		else {
-			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, new->br_startoff,
-				new->br_startblock, new->br_blockcount,
-				newext)))
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
-		/*
-		 * Setting the first part of a previous oldext extent to newext.
-		 * The left neighbor is contiguous.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
-			LEFT.br_blockcount + new->br_blockcount);
-		xfs_bmbt_set_startoff(ep,
-			PREV.br_startoff + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
-
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_startblock(ep,
-			new->br_startblock + new->br_blockcount);
-		xfs_bmbt_set_blockcount(ep,
-			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		--*idx;
-
-		if (cur == NULL)
-			rval = XFS_ILOG_DEXT;
-		else {
-			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-					PREV.br_startblock, PREV.br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur,
-				PREV.br_startoff + new->br_blockcount,
-				PREV.br_startblock + new->br_blockcount,
-				PREV.br_blockcount - new->br_blockcount,
-				oldext)))
-				goto done;
-			if ((error = xfs_btree_decrement(cur, 0, &i)))
-				goto done;
-			error = xfs_bmbt_update(cur, LEFT.br_startoff,
-				LEFT.br_startblock,
-				LEFT.br_blockcount + new->br_blockcount,
-				LEFT.br_state);
-			if (error)
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_FILLING:
-		/*
-		 * Setting the first part of a previous oldext extent to newext.
-		 * The left neighbor is not contiguous.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
-		xfs_bmbt_set_startoff(ep, new_endoff);
-		xfs_bmbt_set_blockcount(ep,
-			PREV.br_blockcount - new->br_blockcount);
-		xfs_bmbt_set_startblock(ep,
-			new->br_startblock + new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		xfs_iext_insert(ip, *idx, 1, new, state);
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-					PREV.br_startblock, PREV.br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur,
-				PREV.br_startoff + new->br_blockcount,
-				PREV.br_startblock + new->br_blockcount,
-				PREV.br_blockcount - new->br_blockcount,
-				oldext)))
-				goto done;
-			cur->bc_rec.b = *new;
-			if ((error = xfs_btree_insert(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-		break;
-
-	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
-		/*
-		 * Setting the last part of a previous oldext extent to newext.
-		 * The right neighbor is contiguous with the new allocation.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep,
-			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		++*idx;
-
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
-			new->br_startoff, new->br_startblock,
-			new->br_blockcount + RIGHT.br_blockcount, newext);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		if (cur == NULL)
-			rval = XFS_ILOG_DEXT;
-		else {
-			rval = 0;
-			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-					PREV.br_startblock,
-					PREV.br_blockcount, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
-				PREV.br_startblock,
-				PREV.br_blockcount - new->br_blockcount,
-				oldext)))
-				goto done;
-			if ((error = xfs_btree_increment(cur, 0, &i)))
-				goto done;
-			if ((error = xfs_bmbt_update(cur, new->br_startoff,
-				new->br_startblock,
-				new->br_blockcount + RIGHT.br_blockcount,
-				newext)))
-				goto done;
-		}
-		break;
-
-	case BMAP_RIGHT_FILLING:
-		/*
-		 * Setting the last part of a previous oldext extent to newext.
-		 * The right neighbor is not contiguous.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep,
-			PREV.br_blockcount - new->br_blockcount);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		++*idx;
-		xfs_iext_insert(ip, *idx, 1, new, state);
-
-		ip->i_d.di_nextents++;
-		if (cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-					PREV.br_startblock, PREV.br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
-				PREV.br_startblock,
-				PREV.br_blockcount - new->br_blockcount,
-				oldext)))
-				goto done;
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_btree_insert(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-		break;
-
-	case 0:
-		/*
-		 * Setting the middle part of a previous oldext extent to
-		 * newext.  Contiguity is impossible here.
-		 * One extent becomes three extents.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep,
-			new->br_startoff - PREV.br_startoff);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		r[0] = *new;
-		r[1].br_startoff = new_endoff;
-		r[1].br_blockcount =
-			PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		r[1].br_startblock = new->br_startblock + new->br_blockcount;
-		r[1].br_state = oldext;
-
-		++*idx;
-		xfs_iext_insert(ip, *idx, 2, &r[0], state);
-
-		ip->i_d.di_nextents += 2;
-		if (cur == NULL)
-			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
-		else {
-			rval = XFS_ILOG_CORE;
-			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
-					PREV.br_startblock, PREV.br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			/* new right extent - oldext */
-			if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
-				r[1].br_startblock, r[1].br_blockcount,
-				r[1].br_state)))
-				goto done;
-			/* new left extent - oldext */
-			cur->bc_rec.b = PREV;
-			cur->bc_rec.b.br_blockcount =
-				new->br_startoff - PREV.br_startoff;
-			if ((error = xfs_btree_insert(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			/*
-			 * Reset the cursor to the position of the new extent
-			 * we are about to insert as we can't trust it after
-			 * the previous insert.
-			 */
-			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-					new->br_startblock, new->br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			/* new middle extent - newext */
-			cur->bc_rec.b.br_state = new->br_state;
-			if ((error = xfs_btree_insert(cur, &i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-		break;
-
-	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
-	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
-	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-	case BMAP_LEFT_CONTIG:
-	case BMAP_RIGHT_CONTIG:
-		/*
-		 * These cases are all impossible.
-		 */
-		ASSERT(0);
-	}
-
-	/* convert to a btree if necessary */
-	if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
-		int	tmp_logflags;	/* partial log flag return val */
-
-		ASSERT(cur == NULL);
-		error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
-				0, &tmp_logflags, XFS_DATA_FORK);
-		*logflagsp |= tmp_logflags;
-		if (error)
-			goto done;
-	}
-
-	/* clear out the allocated field, done with it now in any case. */
-	if (cur) {
-		cur->bc_private.b.allocated = 0;
-		*curp = cur;
-	}
-
-	xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
-done:
-	*logflagsp |= rval;
-	return error;
-#undef	LEFT
-#undef	RIGHT
-#undef	PREV
-}
-
-/*
- * Convert a hole to a delayed allocation.
- */
-STATIC void
-xfs_bmap_add_extent_hole_delay(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/insert */
-	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
-{
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
-	xfs_filblks_t		newlen=0;	/* new indirect size */
-	xfs_filblks_t		oldlen=0;	/* old indirect size */
-	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
-	int			state;  /* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
-
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	state = 0;
-	ASSERT(isnullstartblock(new->br_startblock));
-
-	/*
-	 * Check and set flags if this segment has a left neighbor
-	 */
-	if (*idx > 0) {
-		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
-
-		if (isnullstartblock(left.br_startblock))
-			state |= BMAP_LEFT_DELAY;
-	}
-
-	/*
-	 * Check and set flags if the current (right) segment exists.
-	 * If it doesn't exist, we're converting the hole at end-of-file.
-	 */
-	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
-		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
-
-		if (isnullstartblock(right.br_startblock))
-			state |= BMAP_RIGHT_DELAY;
-	}
-
-	/*
-	 * Set contiguity flags on the left and right neighbors.
-	 * Don't let extents get too large, even if the pieces are contiguous.
-	 */
-	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
-	    left.br_startoff + left.br_blockcount == new->br_startoff &&
-	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-		state |= BMAP_LEFT_CONTIG;
-
-	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
-	    new->br_startoff + new->br_blockcount == right.br_startoff &&
-	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-	    (!(state & BMAP_LEFT_CONTIG) ||
-	     (left.br_blockcount + new->br_blockcount +
-	      right.br_blockcount <= MAXEXTLEN)))
-		state |= BMAP_RIGHT_CONTIG;
-
-	/*
-	 * Switch out based on the contiguity flags.
-	 */
-	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
-	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-		/*
-		 * New allocation is contiguous with delayed allocations
-		 * on the left and on the right.
-		 * Merge all three into a single extent record.
-		 */
-		--*idx;
-		temp = left.br_blockcount + new->br_blockcount +
-			right.br_blockcount;
-
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
-		oldlen = startblockval(left.br_startblock) +
-			startblockval(new->br_startblock) +
-			startblockval(right.br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
-			nullstartblock((int)newlen));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
-		xfs_iext_remove(ip, *idx + 1, 1, state);
-		break;
-
-	case BMAP_LEFT_CONTIG:
-		/*
-		 * New allocation is contiguous with a delayed allocation
-		 * on the left.
-		 * Merge the new allocation with the left neighbor.
-		 */
-		--*idx;
-		temp = left.br_blockcount + new->br_blockcount;
-
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
-		oldlen = startblockval(left.br_startblock) +
-			startblockval(new->br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
-			nullstartblock((int)newlen));
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		break;
-
-	case BMAP_RIGHT_CONTIG:
-		/*
-		 * New allocation is contiguous with a delayed allocation
-		 * on the right.
-		 * Merge the new allocation with the right neighbor.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		temp = new->br_blockcount + right.br_blockcount;
-		oldlen = startblockval(new->br_startblock) +
-			startblockval(right.br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
-			new->br_startoff,
-			nullstartblock((int)newlen), temp, right.br_state);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		break;
-
-	case 0:
-		/*
-		 * New allocation is not contiguous with another
-		 * delayed allocation.
-		 * Insert a new entry.
-		 */
-		oldlen = newlen = 0;
-		xfs_iext_insert(ip, *idx, 1, new, state);
-		break;
-	}
-	if (oldlen != newlen) {
-		ASSERT(oldlen > newlen);
-		xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-			(int64_t)(oldlen - newlen), 0);
-		/*
-		 * Nothing to do for disk quota accounting here.
-		 */
-	}
-}
-
-/*
- * Convert a hole to a real allocation.
- */
-STATIC int				/* error */
-xfs_bmap_add_extent_hole_real(
-	struct xfs_bmalloca	*bma,
-	int			whichfork)
-{
-	struct xfs_bmbt_irec	*new = &bma->got;
-	int			error;	/* error return value */
-	int			i;	/* temp state */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
-	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
-	int			rval=0;	/* return value (logging flags) */
-	int			state;	/* state bits, accessed thru macros */
-
-	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-
-	ASSERT(bma->idx >= 0);
-	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
-	ASSERT(!isnullstartblock(new->br_startblock));
-	ASSERT(!bma->cur ||
-	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
-
-	XFS_STATS_INC(xs_add_exlist);
-
-	state = 0;
-	if (whichfork == XFS_ATTR_FORK)
-		state |= BMAP_ATTRFORK;
-
-	/*
-	 * Check and set flags if this segment has a left neighbor.
-	 */
-	if (bma->idx > 0) {
-		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
-		if (isnullstartblock(left.br_startblock))
-			state |= BMAP_LEFT_DELAY;
-	}
-
-	/*
-	 * Check and set flags if this segment has a current value.
-	 * Not true if we're inserting into the "hole" at eof.
-	 */
-	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
-		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
-		if (isnullstartblock(right.br_startblock))
-			state |= BMAP_RIGHT_DELAY;
-	}
-
-	/*
-	 * We're inserting a real allocation between "left" and "right".
-	 * Set the contiguity flags.  Don't let extents get too large.
-	 */
-	if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
-	    left.br_startoff + left.br_blockcount == new->br_startoff &&
-	    left.br_startblock + left.br_blockcount == new->br_startblock &&
-	    left.br_state == new->br_state &&
-	    left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
-		state |= BMAP_LEFT_CONTIG;
-
-	if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
-	    new->br_startoff + new->br_blockcount == right.br_startoff &&
-	    new->br_startblock + new->br_blockcount == right.br_startblock &&
-	    new->br_state == right.br_state &&
-	    new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
-	    (!(state & BMAP_LEFT_CONTIG) ||
-	     left.br_blockcount + new->br_blockcount +
-	     right.br_blockcount <= MAXEXTLEN))
-		state |= BMAP_RIGHT_CONTIG;
-
-	error = 0;
-	/*
-	 * Select which case we're in here, and implement it.
-	 */
-	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
-	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
-		/*
-		 * New allocation is contiguous with real allocations on the
-		 * left and on the right.
-		 * Merge all three into a single extent record.
-		 */
-		--bma->idx;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-			left.br_blockcount + new->br_blockcount +
-			right.br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
-
-		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
-			XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
-		if (bma->cur == NULL) {
-			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-		} else {
-			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
-					right.br_startblock, right.br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_btree_delete(bma->cur, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_btree_decrement(bma->cur, 0, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, left.br_startoff,
-					left.br_startblock,
-					left.br_blockcount +
-						new->br_blockcount +
-						right.br_blockcount,
-					left.br_state);
-			if (error)
-				goto done;
-		}
-		break;
-
-	case BMAP_LEFT_CONTIG:
-		/*
-		 * New allocation is contiguous with a real allocation
-		 * on the left.
-		 * Merge the new allocation with the left neighbor.
-		 */
-		--bma->idx;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
-			left.br_blockcount + new->br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		if (bma->cur == NULL) {
-			rval = xfs_ilog_fext(whichfork);
-		} else {
-			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
-					left.br_startblock, left.br_blockcount,
-					&i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, left.br_startoff,
-					left.br_startblock,
-					left.br_blockcount +
-						new->br_blockcount,
-					left.br_state);
-			if (error)
-				goto done;
-		}
-		break;
-
-	case BMAP_RIGHT_CONTIG:
-		/*
-		 * New allocation is contiguous with a real allocation
-		 * on the right.
-		 * Merge the new allocation with the right neighbor.
-		 */
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
-			new->br_startoff, new->br_startblock,
-			new->br_blockcount + right.br_blockcount,
-			right.br_state);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
-		if (bma->cur == NULL) {
-			rval = xfs_ilog_fext(whichfork);
-		} else {
-			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur,
-					right.br_startoff,
-					right.br_startblock,
-					right.br_blockcount, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			error = xfs_bmbt_update(bma->cur, new->br_startoff,
-					new->br_startblock,
-					new->br_blockcount +
-						right.br_blockcount,
-					right.br_state);
-			if (error)
-				goto done;
-		}
-		break;
-
-	case 0:
-		/*
-		 * New allocation is not contiguous with another
-		 * real allocation.
-		 * Insert a new entry.
-		 */
-		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
-		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
-			XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
-		if (bma->cur == NULL) {
-			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-		} else {
-			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur,
-					new->br_startoff,
-					new->br_startblock,
-					new->br_blockcount, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
-			bma->cur->bc_rec.b.br_state = new->br_state;
-			error = xfs_btree_insert(bma->cur, &i);
-			if (error)
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-		break;
-	}
-
-	/* convert to a btree if necessary */
-	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
-		int	tmp_logflags;	/* partial log flag return val */
-
-		ASSERT(bma->cur == NULL);
-		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-				bma->firstblock, bma->flist, &bma->cur,
-				0, &tmp_logflags, whichfork);
-		bma->logflags |= tmp_logflags;
-		if (error)
-			goto done;
-	}
-
-	/* clear out the allocated field, done with it now in any case. */
-	if (bma->cur)
-		bma->cur->bc_private.b.allocated = 0;
-
-	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
-done:
-	bma->logflags |= rval;
-	return error;
-}
-
-/*
- * Functions used in the extent read, allocate and remove paths
- */
-
-/*
- * Adjust the size of the new extent based on di_extsize and rt extsize.
- */
-int
-xfs_bmap_extsize_align(
-	xfs_mount_t	*mp,
-	xfs_bmbt_irec_t	*gotp,		/* next extent pointer */
-	xfs_bmbt_irec_t	*prevp,		/* previous extent pointer */
-	xfs_extlen_t	extsz,		/* align to this extent size */
-	int		rt,		/* is this a realtime inode? */
-	int		eof,		/* is extent at end-of-file? */
-	int		delay,		/* creating delalloc extent? */
-	int		convert,	/* overwriting unwritten extent? */
-	xfs_fileoff_t	*offp,		/* in/out: aligned offset */
-	xfs_extlen_t	*lenp)		/* in/out: aligned length */
-{
-	xfs_fileoff_t	orig_off;	/* original offset */
-	xfs_extlen_t	orig_alen;	/* original length */
-	xfs_fileoff_t	orig_end;	/* original off+len */
-	xfs_fileoff_t	nexto;		/* next file offset */
-	xfs_fileoff_t	prevo;		/* previous file offset */
-	xfs_fileoff_t	align_off;	/* temp for offset */
-	xfs_extlen_t	align_alen;	/* temp for length */
-	xfs_extlen_t	temp;		/* temp for calculations */
-
-	if (convert)
-		return 0;
-
-	orig_off = align_off = *offp;
-	orig_alen = align_alen = *lenp;
-	orig_end = orig_off + orig_alen;
-
-	/*
-	 * If this request overlaps an existing extent, then don't
-	 * attempt to perform any additional alignment.
-	 */
-	if (!delay && !eof &&
-	    (orig_off >= gotp->br_startoff) &&
-	    (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
-		return 0;
-	}
-
-	/*
-	 * If the file offset is unaligned vs. the extent size
-	 * we need to align it.  This will be possible unless
-	 * the file was previously written with a kernel that didn't
-	 * perform this alignment, or if a truncate shot us in the
-	 * foot.
-	 */
-	temp = do_mod(orig_off, extsz);
-	if (temp) {
-		align_alen += temp;
-		align_off -= temp;
-	}
-	/*
-	 * Same adjustment for the end of the requested area.
-	 */
-	if ((temp = (align_alen % extsz))) {
-		align_alen += extsz - temp;
-	}
-	/*
-	 * If the previous block overlaps with this proposed allocation
-	 * then move the start forward without adjusting the length.
-	 */
-	if (prevp->br_startoff != NULLFILEOFF) {
-		if (prevp->br_startblock == HOLESTARTBLOCK)
-			prevo = prevp->br_startoff;
-		else
-			prevo = prevp->br_startoff + prevp->br_blockcount;
-	} else
-		prevo = 0;
-	if (align_off != orig_off && align_off < prevo)
-		align_off = prevo;
-	/*
-	 * If the next block overlaps with this proposed allocation
-	 * then move the start back without adjusting the length,
-	 * but not before offset 0.
-	 * This may of course make the start overlap previous block,
-	 * and if we hit the offset 0 limit then the next block
-	 * can still overlap too.
-	 */
-	if (!eof && gotp->br_startoff != NULLFILEOFF) {
-		if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
-		    (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
-			nexto = gotp->br_startoff + gotp->br_blockcount;
-		else
-			nexto = gotp->br_startoff;
-	} else
-		nexto = NULLFILEOFF;
-	if (!eof &&
-	    align_off + align_alen != orig_end &&
-	    align_off + align_alen > nexto)
-		align_off = nexto > align_alen ? nexto - align_alen : 0;
-	/*
-	 * If we're now overlapping the next or previous extent that
-	 * means we can't fit an extsz piece in this hole.  Just move
-	 * the start forward to the first valid spot and set
-	 * the length so we hit the end.
-	 */
-	if (align_off != orig_off && align_off < prevo)
-		align_off = prevo;
-	if (align_off + align_alen != orig_end &&
-	    align_off + align_alen > nexto &&
-	    nexto != NULLFILEOFF) {
-		ASSERT(nexto > prevo);
-		align_alen = nexto - align_off;
-	}
-
-	/*
-	 * If realtime, and the result isn't a multiple of the realtime
-	 * extent size we need to remove blocks until it is.
-	 */
-	if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
-		/*
-		 * We're not covering the original request, or
-		 * we won't be able to once we fix the length.
-		 */
-		if (orig_off < align_off ||
-		    orig_end > align_off + align_alen ||
-		    align_alen - temp < orig_alen)
-			return EINVAL;
-		/*
-		 * Try to fix it by moving the start up.
-		 */
-		if (align_off + temp <= orig_off) {
-			align_alen -= temp;
-			align_off += temp;
-		}
-		/*
-		 * Try to fix it by moving the end in.
-		 */
-		else if (align_off + align_alen - temp >= orig_end)
-			align_alen -= temp;
-		/*
-		 * Set the start to the minimum then trim the length.
-		 */
-		else {
-			align_alen -= orig_off - align_off;
-			align_off = orig_off;
-			align_alen -= align_alen % mp->m_sb.sb_rextsize;
-		}
-		/*
-		 * Result doesn't cover the request, fail it.
-		 */
-		if (orig_off < align_off || orig_end > align_off + align_alen)
-			return EINVAL;
-	} else {
-		ASSERT(orig_off >= align_off);
-		ASSERT(orig_end <= align_off + align_alen);
-	}
-
-#ifdef DEBUG
-	if (!eof && gotp->br_startoff != NULLFILEOFF)
-		ASSERT(align_off + align_alen <= gotp->br_startoff);
-	if (prevp->br_startoff != NULLFILEOFF)
-		ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
-#endif
-
-	*lenp = align_alen;
-	*offp = align_off;
-	return 0;
-}
-
-#define XFS_ALLOC_GAP_UNITS	4
-
-void
-xfs_bmap_adjacent(
-	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
-{
-	xfs_fsblock_t	adjust;		/* adjustment to block numbers */
-	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
-	xfs_mount_t	*mp;		/* mount point structure */
-	int		nullfb;		/* true if ap->firstblock isn't set */
-	int		rt;		/* true if inode is realtime */
-
-#define	ISVALID(x,y)	\
-	(rt ? \
-		(x) < mp->m_sb.sb_rblocks : \
-		XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
-		XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
-		XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
-
-	mp = ap->ip->i_mount;
-	nullfb = *ap->firstblock == NULLFSBLOCK;
-	rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
-	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
-	/*
-	 * If allocating at eof, and there's a previous real block,
-	 * try to use its last block as our starting point.
-	 */
-	if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
-	    !isnullstartblock(ap->prev.br_startblock) &&
-	    ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
-		    ap->prev.br_startblock)) {
-		ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
-		/*
-		 * Adjust for the gap between prevp and us.
-		 */
-		adjust = ap->offset -
-			(ap->prev.br_startoff + ap->prev.br_blockcount);
-		if (adjust &&
-		    ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
-			ap->blkno += adjust;
-	}
-	/*
-	 * If not at eof, then compare the two neighbor blocks.
-	 * Figure out whether either one gives us a good starting point,
-	 * and pick the better one.
-	 */
-	else if (!ap->eof) {
-		xfs_fsblock_t	gotbno;		/* right side block number */
-		xfs_fsblock_t	gotdiff=0;	/* right side difference */
-		xfs_fsblock_t	prevbno;	/* left side block number */
-		xfs_fsblock_t	prevdiff=0;	/* left side difference */
-
-		/*
-		 * If there's a previous (left) block, select a requested
-		 * start block based on it.
-		 */
-		if (ap->prev.br_startoff != NULLFILEOFF &&
-		    !isnullstartblock(ap->prev.br_startblock) &&
-		    (prevbno = ap->prev.br_startblock +
-			       ap->prev.br_blockcount) &&
-		    ISVALID(prevbno, ap->prev.br_startblock)) {
-			/*
-			 * Calculate gap to end of previous block.
-			 */
-			adjust = prevdiff = ap->offset -
-				(ap->prev.br_startoff +
-				 ap->prev.br_blockcount);
-			/*
-			 * Figure the startblock based on the previous block's
-			 * end and the gap size.
-			 * Heuristic!
-			 * If the gap is large relative to the piece we're
-			 * allocating, or using it gives us an invalid block
-			 * number, then just use the end of the previous block.
-			 */
-			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
-			    ISVALID(prevbno + prevdiff,
-				    ap->prev.br_startblock))
-				prevbno += adjust;
-			else
-				prevdiff += adjust;
-			/*
-			 * If the firstblock forbids it, can't use it,
-			 * must use default.
-			 */
-			if (!rt && !nullfb &&
-			    XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
-				prevbno = NULLFSBLOCK;
-		}
-		/*
-		 * No previous block or can't follow it, just default.
-		 */
-		else
-			prevbno = NULLFSBLOCK;
-		/*
-		 * If there's a following (right) block, select a requested
-		 * start block based on it.
-		 */
-		if (!isnullstartblock(ap->got.br_startblock)) {
-			/*
-			 * Calculate gap to start of next block.
-			 */
-			adjust = gotdiff = ap->got.br_startoff - ap->offset;
-			/*
-			 * Figure the startblock based on the next block's
-			 * start and the gap size.
-			 */
-			gotbno = ap->got.br_startblock;
-			/*
-			 * Heuristic!
-			 * If the gap is large relative to the piece we're
-			 * allocating, or using it gives us an invalid block
-			 * number, then just use the start of the next block
-			 * offset by our length.
-			 */
-			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
-			    ISVALID(gotbno - gotdiff, gotbno))
-				gotbno -= adjust;
-			else if (ISVALID(gotbno - ap->length, gotbno)) {
-				gotbno -= ap->length;
-				gotdiff += adjust - ap->length;
-			} else
-				gotdiff += adjust;
-			/*
-			 * If the firstblock forbids it, can't use it,
-			 * must use default.
-			 */
-			if (!rt && !nullfb &&
-			    XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
-				gotbno = NULLFSBLOCK;
-		}
-		/*
-		 * No next block, just default.
-		 */
-		else
-			gotbno = NULLFSBLOCK;
-		/*
-		 * If both valid, pick the better one, else the only good
-		 * one, else ap->blkno is already set (to 0 or the inode block).
-		 */
-		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
-			ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
-		else if (prevbno != NULLFSBLOCK)
-			ap->blkno = prevbno;
-		else if (gotbno != NULLFSBLOCK)
-			ap->blkno = gotbno;
-	}
-#undef ISVALID
-}
-
-static int
-xfs_bmap_longest_free_extent(
-	struct xfs_trans	*tp,
-	xfs_agnumber_t		ag,
-	xfs_extlen_t		*blen,
-	int			*notinit)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_perag	*pag;
-	xfs_extlen_t		longest;
-	int			error = 0;
-
-	pag = xfs_perag_get(mp, ag);
-	if (!pag->pagf_init) {
-		error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
-		if (error)
-			goto out;
-
-		if (!pag->pagf_init) {
-			*notinit = 1;
-			goto out;
-		}
-	}
-
-	longest = xfs_alloc_longest_free_extent(mp, pag);
-	if (*blen < longest)
-		*blen = longest;
-
-out:
-	xfs_perag_put(pag);
-	return error;
-}
-
-static void
-xfs_bmap_select_minlen(
-	struct xfs_bmalloca	*ap,
-	struct xfs_alloc_arg	*args,
-	xfs_extlen_t		*blen,
-	int			notinit)
-{
-	if (notinit || *blen < ap->minlen) {
-		/*
-		 * Since we did a BUF_TRYLOCK above, it is possible that
-		 * there is space for this request.
-		 */
-		args->minlen = ap->minlen;
-	} else if (*blen < args->maxlen) {
-		/*
-		 * If the best seen length is less than the request length,
-		 * use the best as the minimum.
-		 */
-		args->minlen = *blen;
-	} else {
-		/*
-		 * Otherwise we've seen an extent as big as maxlen, use that
-		 * as the minimum.
-		 */
-		args->minlen = args->maxlen;
-	}
-}
-
-STATIC int
-xfs_bmap_btalloc_nullfb(
-	struct xfs_bmalloca	*ap,
-	struct xfs_alloc_arg	*args,
-	xfs_extlen_t		*blen)
-{
-	struct xfs_mount	*mp = ap->ip->i_mount;
-	xfs_agnumber_t		ag, startag;
-	int			notinit = 0;
-	int			error;
-
-	args->type = XFS_ALLOCTYPE_START_BNO;
-	args->total = ap->total;
-
-	startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
-	if (startag == NULLAGNUMBER)
-		startag = ag = 0;
-
-	while (*blen < args->maxlen) {
-		error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
-						     &notinit);
-		if (error)
-			return error;
-
-		if (++ag == mp->m_sb.sb_agcount)
-			ag = 0;
-		if (ag == startag)
-			break;
-	}
-
-	xfs_bmap_select_minlen(ap, args, blen, notinit);
-	return 0;
-}
-
-STATIC int
-xfs_bmap_btalloc_filestreams(
-	struct xfs_bmalloca	*ap,
-	struct xfs_alloc_arg	*args,
-	xfs_extlen_t		*blen)
-{
-	struct xfs_mount	*mp = ap->ip->i_mount;
-	xfs_agnumber_t		ag;
-	int			notinit = 0;
-	int			error;
-
-	args->type = XFS_ALLOCTYPE_NEAR_BNO;
-	args->total = ap->total;
-
-	ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
-	if (ag == NULLAGNUMBER)
-		ag = 0;
-
-	error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
-	if (error)
-		return error;
-
-	if (*blen < args->maxlen) {
-		error = xfs_filestream_new_ag(ap, &ag);
-		if (error)
-			return error;
-
-		error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
-						     &notinit);
-		if (error)
-			return error;
-
-	}
-
-	xfs_bmap_select_minlen(ap, args, blen, notinit);
-
-	/*
-	 * Set the failure fallback case to look in the selected AG as stream
-	 * may have moved.
-	 */
-	ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
-	return 0;
-}
-
-STATIC int
-xfs_bmap_btalloc(
-	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
-{
-	xfs_mount_t	*mp;		/* mount point structure */
-	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
-	xfs_extlen_t	align;		/* minimum allocation alignment */
-	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
-	xfs_agnumber_t	ag;
-	xfs_alloc_arg_t	args;
-	xfs_extlen_t	blen;
-	xfs_extlen_t	nextminlen = 0;
-	int		nullfb;		/* true if ap->firstblock isn't set */
-	int		isaligned;
-	int		tryagain;
-	int		error;
-	int		stripe_align;
-
-	ASSERT(ap->length);
-
-	mp = ap->ip->i_mount;
-
-	/* stripe alignment for allocation is determined by mount parameters */
-	stripe_align = 0;
-	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
-		stripe_align = mp->m_swidth;
-	else if (mp->m_dalign)
-		stripe_align = mp->m_dalign;
-
-	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
-	if (unlikely(align)) {
-		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
-						align, 0, ap->eof, 0, ap->conv,
-						&ap->offset, &ap->length);
-		ASSERT(!error);
-		ASSERT(ap->length);
-	}
-
-
-	nullfb = *ap->firstblock == NULLFSBLOCK;
-	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
-	if (nullfb) {
-		if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
-			ag = xfs_filestream_lookup_ag(ap->ip);
-			ag = (ag != NULLAGNUMBER) ? ag : 0;
-			ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
-		} else {
-			ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
-		}
-	} else
-		ap->blkno = *ap->firstblock;
-
-	xfs_bmap_adjacent(ap);
-
-	/*
-	 * If allowed, use ap->blkno; otherwise must use firstblock since
-	 * it's in the right allocation group.
-	 */
-	if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
-		;
-	else
-		ap->blkno = *ap->firstblock;
-	/*
-	 * Normal allocation, done through xfs_alloc_vextent.
-	 */
-	tryagain = isaligned = 0;
-	memset(&args, 0, sizeof(args));
-	args.tp = ap->tp;
-	args.mp = mp;
-	args.fsbno = ap->blkno;
-
-	/* Trim the allocation back to the maximum an AG can fit. */
-	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
-	args.firstblock = *ap->firstblock;
-	blen = 0;
-	if (nullfb) {
-		/*
-		 * Search for an allocation group with a single extent large
-		 * enough for the request.  If one isn't found, then adjust
-		 * the minimum allocation size to the largest space found.
-		 */
-		if (ap->userdata && xfs_inode_is_filestream(ap->ip))
-			error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
-		else
-			error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
-		if (error)
-			return error;
-	} else if (ap->flist->xbf_low) {
-		if (xfs_inode_is_filestream(ap->ip))
-			args.type = XFS_ALLOCTYPE_FIRST_AG;
-		else
-			args.type = XFS_ALLOCTYPE_START_BNO;
-		args.total = args.minlen = ap->minlen;
-	} else {
-		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		args.total = ap->total;
-		args.minlen = ap->minlen;
-	}
-	/* apply extent size hints if obtained earlier */
-	if (unlikely(align)) {
-		args.prod = align;
-		if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
-			args.mod = (xfs_extlen_t)(args.prod - args.mod);
-	} else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
-		args.prod = 1;
-		args.mod = 0;
-	} else {
-		args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
-		if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
-			args.mod = (xfs_extlen_t)(args.prod - args.mod);
-	}
-	/*
-	 * If we are not low on available data blocks, and the
-	 * underlying logical volume manager is a stripe, and
-	 * the file offset is zero then try to allocate data
-	 * blocks on stripe unit boundary.
-	 * NOTE: ap->aeof is only set if the allocation length
-	 * is >= the stripe unit and the allocation offset is
-	 * at the end of file.
-	 */
-	if (!ap->flist->xbf_low && ap->aeof) {
-		if (!ap->offset) {
-			args.alignment = stripe_align;
-			atype = args.type;
-			isaligned = 1;
-			/*
-			 * Adjust for alignment
-			 */
-			if (blen > args.alignment && blen <= args.maxlen)
-				args.minlen = blen - args.alignment;
-			args.minalignslop = 0;
-		} else {
-			/*
-			 * First try an exact bno allocation.
-			 * If it fails then do a near or start bno
-			 * allocation with alignment turned on.
-			 */
-			atype = args.type;
-			tryagain = 1;
-			args.type = XFS_ALLOCTYPE_THIS_BNO;
-			args.alignment = 1;
-			/*
-			 * Compute the minlen+alignment for the
-			 * next case.  Set slop so that the value
-			 * of minlen+alignment+slop doesn't go up
-			 * between the calls.
-			 */
-			if (blen > stripe_align && blen <= args.maxlen)
-				nextminlen = blen - stripe_align;
-			else
-				nextminlen = args.minlen;
-			if (nextminlen + stripe_align > args.minlen + 1)
-				args.minalignslop =
-					nextminlen + stripe_align -
-					args.minlen - 1;
-			else
-				args.minalignslop = 0;
-		}
-	} else {
-		args.alignment = 1;
-		args.minalignslop = 0;
-	}
-	args.minleft = ap->minleft;
-	args.wasdel = ap->wasdel;
-	args.isfl = 0;
-	args.userdata = ap->userdata;
-	if ((error = xfs_alloc_vextent(&args)))
-		return error;
-	if (tryagain && args.fsbno == NULLFSBLOCK) {
-		/*
-		 * Exact allocation failed. Now try with alignment
-		 * turned on.
-		 */
-		args.type = atype;
-		args.fsbno = ap->blkno;
-		args.alignment = stripe_align;
-		args.minlen = nextminlen;
-		args.minalignslop = 0;
-		isaligned = 1;
-		if ((error = xfs_alloc_vextent(&args)))
-			return error;
-	}
-	if (isaligned && args.fsbno == NULLFSBLOCK) {
-		/*
-		 * allocation failed, so turn off alignment and
-		 * try again.
-		 */
-		args.type = atype;
-		args.fsbno = ap->blkno;
-		args.alignment = 0;
-		if ((error = xfs_alloc_vextent(&args)))
-			return error;
-	}
-	if (args.fsbno == NULLFSBLOCK && nullfb &&
-	    args.minlen > ap->minlen) {
-		args.minlen = ap->minlen;
-		args.type = XFS_ALLOCTYPE_START_BNO;
-		args.fsbno = ap->blkno;
-		if ((error = xfs_alloc_vextent(&args)))
-			return error;
-	}
-	if (args.fsbno == NULLFSBLOCK && nullfb) {
-		args.fsbno = 0;
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		args.total = ap->minlen;
-		args.minleft = 0;
-		if ((error = xfs_alloc_vextent(&args)))
-			return error;
-		ap->flist->xbf_low = 1;
-	}
-	if (args.fsbno != NULLFSBLOCK) {
-		/*
-		 * check the allocation happened at the same or higher AG than
-		 * the first block that was allocated.
-		 */
-		ASSERT(*ap->firstblock == NULLFSBLOCK ||
-		       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
-		       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
-		       (ap->flist->xbf_low &&
-			XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
-			XFS_FSB_TO_AGNO(mp, args.fsbno)));
-
-		ap->blkno = args.fsbno;
-		if (*ap->firstblock == NULLFSBLOCK)
-			*ap->firstblock = args.fsbno;
-		ASSERT(nullfb || fb_agno == args.agno ||
-		       (ap->flist->xbf_low && fb_agno < args.agno));
-		ap->length = args.len;
-		ap->ip->i_d.di_nblocks += args.len;
-		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-		if (ap->wasdel)
-			ap->ip->i_delayed_blks -= args.len;
-		/*
-		 * Adjust the disk quota also. This was reserved
-		 * earlier.
-		 */
-		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
-			ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
-					XFS_TRANS_DQ_BCOUNT,
-			(long) args.len);
-	} else {
-		ap->blkno = NULLFSBLOCK;
-		ap->length = 0;
-	}
-	return 0;
-}
-
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int
-xfs_bmap_alloc(
-	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
-{
-	if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
-		return xfs_bmap_rtalloc(ap);
-	return xfs_bmap_btalloc(ap);
-}
-
-/*
- * Trim the returned map to the required bounds
- */
-STATIC void
-xfs_bmapi_trim_map(
-	struct xfs_bmbt_irec	*mval,
-	struct xfs_bmbt_irec	*got,
-	xfs_fileoff_t		*bno,
-	xfs_filblks_t		len,
-	xfs_fileoff_t		obno,
-	xfs_fileoff_t		end,
-	int			n,
-	int			flags)
-{
-	if ((flags & XFS_BMAPI_ENTIRE) ||
-	    got->br_startoff + got->br_blockcount <= obno) {
-		*mval = *got;
-		if (isnullstartblock(got->br_startblock))
-			mval->br_startblock = DELAYSTARTBLOCK;
-		return;
-	}
-
-	if (obno > *bno)
-		*bno = obno;
-	ASSERT((*bno >= obno) || (n == 0));
-	ASSERT(*bno < end);
-	mval->br_startoff = *bno;
-	if (isnullstartblock(got->br_startblock))
-		mval->br_startblock = DELAYSTARTBLOCK;
-	else
-		mval->br_startblock = got->br_startblock +
-					(*bno - got->br_startoff);
-	/*
-	 * Return the minimum of what we got and what we asked for for
-	 * the length.  We can use the len variable here because it is
-	 * modified below and we could have been there before coming
-	 * here if the first part of the allocation didn't overlap what
-	 * was asked for.
-	 */
-	mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
-			got->br_blockcount - (*bno - got->br_startoff));
-	mval->br_state = got->br_state;
-	ASSERT(mval->br_blockcount <= len);
-	return;
-}
-
-/*
- * Update and validate the extent map to return
- */
-STATIC void
-xfs_bmapi_update_map(
-	struct xfs_bmbt_irec	**map,
-	xfs_fileoff_t		*bno,
-	xfs_filblks_t		*len,
-	xfs_fileoff_t		obno,
-	xfs_fileoff_t		end,
-	int			*n,
-	int			flags)
-{
-	xfs_bmbt_irec_t	*mval = *map;
-
-	ASSERT((flags & XFS_BMAPI_ENTIRE) ||
-	       ((mval->br_startoff + mval->br_blockcount) <= end));
-	ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
-	       (mval->br_startoff < obno));
-
-	*bno = mval->br_startoff + mval->br_blockcount;
-	*len = end - *bno;
-	if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
-		/* update previous map with new information */
-		ASSERT(mval->br_startblock == mval[-1].br_startblock);
-		ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
-		ASSERT(mval->br_state == mval[-1].br_state);
-		mval[-1].br_blockcount = mval->br_blockcount;
-		mval[-1].br_state = mval->br_state;
-	} else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
-		   mval[-1].br_startblock != DELAYSTARTBLOCK &&
-		   mval[-1].br_startblock != HOLESTARTBLOCK &&
-		   mval->br_startblock == mval[-1].br_startblock +
-					  mval[-1].br_blockcount &&
-		   ((flags & XFS_BMAPI_IGSTATE) ||
-			mval[-1].br_state == mval->br_state)) {
-		ASSERT(mval->br_startoff ==
-		       mval[-1].br_startoff + mval[-1].br_blockcount);
-		mval[-1].br_blockcount += mval->br_blockcount;
-	} else if (*n > 0 &&
-		   mval->br_startblock == DELAYSTARTBLOCK &&
-		   mval[-1].br_startblock == DELAYSTARTBLOCK &&
-		   mval->br_startoff ==
-		   mval[-1].br_startoff + mval[-1].br_blockcount) {
-		mval[-1].br_blockcount += mval->br_blockcount;
-		mval[-1].br_state = mval->br_state;
-	} else if (!((*n == 0) &&
-		     ((mval->br_startoff + mval->br_blockcount) <=
-		      obno))) {
-		mval++;
-		(*n)++;
-	}
-	*map = mval;
-}
-
-/*
- * Map file blocks to filesystem blocks without allocation.
- */
-int
-xfs_bmapi_read(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		bno,
-	xfs_filblks_t		len,
-	struct xfs_bmbt_irec	*mval,
-	int			*nmap,
-	int			flags)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp;
-	struct xfs_bmbt_irec	got;
-	struct xfs_bmbt_irec	prev;
-	xfs_fileoff_t		obno;
-	xfs_fileoff_t		end;
-	xfs_extnum_t		lastx;
-	int			error;
-	int			eof;
-	int			n = 0;
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
-
-	ASSERT(*nmap >= 1);
-	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
-			   XFS_BMAPI_IGSTATE)));
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
-
-	if (unlikely(XFS_TEST_ERROR(
-	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
-	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-		XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return EIO;
-
-	XFS_STATS_INC(xs_blk_mapr);
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(NULL, ip, whichfork);
-		if (error)
-			return error;
-	}
-
-	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
-	end = bno + len;
-	obno = bno;
-
-	while (bno < end && n < *nmap) {
-		/* Reading past eof, act as though there's a hole up to end. */
-		if (eof)
-			got.br_startoff = end;
-		if (got.br_startoff > bno) {
-			/* Reading in a hole.  */
-			mval->br_startoff = bno;
-			mval->br_startblock = HOLESTARTBLOCK;
-			mval->br_blockcount =
-				XFS_FILBLKS_MIN(len, got.br_startoff - bno);
-			mval->br_state = XFS_EXT_NORM;
-			bno += mval->br_blockcount;
-			len -= mval->br_blockcount;
-			mval++;
-			n++;
-			continue;
-		}
-
-		/* set up the extent map to return. */
-		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
-		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-		/* If we're done, stop now. */
-		if (bno >= end || n >= *nmap)
-			break;
-
-		/* Else go on to the next record. */
-		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
-		else
-			eof = 1;
-	}
-	*nmap = n;
-	return 0;
-}
-
-STATIC int
-xfs_bmapi_reserve_delalloc(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		aoff,
-	xfs_filblks_t		len,
-	struct xfs_bmbt_irec	*got,
-	struct xfs_bmbt_irec	*prev,
-	xfs_extnum_t		*lastx,
-	int			eof)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	xfs_extlen_t		alen;
-	xfs_extlen_t		indlen;
-	char			rt = XFS_IS_REALTIME_INODE(ip);
-	xfs_extlen_t		extsz;
-	int			error;
-
-	alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
-	if (!eof)
-		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
-
-	/* Figure out the extent size, adjust alen */
-	extsz = xfs_get_extsz_hint(ip);
-	if (extsz) {
-		/*
-		 * Make sure we don't exceed a single extent length when we
-		 * align the extent by reducing length we are going to
-		 * allocate by the maximum amount extent size aligment may
-		 * require.
-		 */
-		alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
-		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
-					       1, 0, &aoff, &alen);
-		ASSERT(!error);
-	}
-
-	if (rt)
-		extsz = alen / mp->m_sb.sb_rextsize;
-
-	/*
-	 * Make a transaction-less quota reservation for delayed allocation
-	 * blocks.  This number gets adjusted later.  We return if we haven't
-	 * allocated blocks already inside this loop.
-	 */
-	error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
-			rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
-	if (error)
-		return error;
-
-	/*
-	 * Split changing sb for alen and indlen since they could be coming
-	 * from different places.
-	 */
-	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
-	ASSERT(indlen > 0);
-
-	if (rt) {
-		error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
-					  -((int64_t)extsz), 0);
-	} else {
-		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-						 -((int64_t)alen), 0);
-	}
-
-	if (error)
-		goto out_unreserve_quota;
-
-	error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-					 -((int64_t)indlen), 0);
-	if (error)
-		goto out_unreserve_blocks;
-
-
-	ip->i_delayed_blks += alen;
-
-	got->br_startoff = aoff;
-	got->br_startblock = nullstartblock(indlen);
-	got->br_blockcount = alen;
-	got->br_state = XFS_EXT_NORM;
-	xfs_bmap_add_extent_hole_delay(ip, lastx, got);
-
-	/*
-	 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
-	 * might have merged it into one of the neighbouring ones.
-	 */
-	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
-
-	ASSERT(got->br_startoff <= aoff);
-	ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
-	ASSERT(isnullstartblock(got->br_startblock));
-	ASSERT(got->br_state == XFS_EXT_NORM);
-	return 0;
-
-out_unreserve_blocks:
-	if (rt)
-		xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
-	else
-		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
-out_unreserve_quota:
-	if (XFS_IS_QUOTA_ON(mp))
-		xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
-				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
-	return error;
-}
-
-/*
- * Map file blocks to filesystem blocks, adding delayed allocations as needed.
- */
-int
-xfs_bmapi_delay(
-	struct xfs_inode	*ip,	/* incore inode */
-	xfs_fileoff_t		bno,	/* starting file offs. mapped */
-	xfs_filblks_t		len,	/* length to map in file */
-	struct xfs_bmbt_irec	*mval,	/* output: map values */
-	int			*nmap,	/* i/o: mval size/count */
-	int			flags)	/* XFS_BMAPI_... */
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	struct xfs_bmbt_irec	got;	/* current file extent record */
-	struct xfs_bmbt_irec	prev;	/* previous file extent record */
-	xfs_fileoff_t		obno;	/* old block number (offset) */
-	xfs_fileoff_t		end;	/* end of mapped file region */
-	xfs_extnum_t		lastx;	/* last useful extent number */
-	int			eof;	/* we've hit the end of extents */
-	int			n = 0;	/* current extent index */
-	int			error = 0;
-
-	ASSERT(*nmap >= 1);
-	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	if (unlikely(XFS_TEST_ERROR(
-	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
-	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
-	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-		XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return EIO;
-
-	XFS_STATS_INC(xs_blk_mapw);
-
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
-		if (error)
-			return error;
-	}
-
-	xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
-	end = bno + len;
-	obno = bno;
-
-	while (bno < end && n < *nmap) {
-		if (eof || got.br_startoff > bno) {
-			error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
-							   &prev, &lastx, eof);
-			if (error) {
-				if (n == 0) {
-					*nmap = 0;
-					return error;
-				}
-				break;
-			}
-		}
-
-		/* set up the extent map to return. */
-		xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
-		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-		/* If we're done, stop now. */
-		if (bno >= end || n >= *nmap)
-			break;
-
-		/* Else go on to the next record. */
-		prev = got;
-		if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
-		else
-			eof = 1;
-	}
-
-	*nmap = n;
-	return 0;
-}
-
-
-int
-__xfs_bmapi_allocate(
-	struct xfs_bmalloca	*bma)
-{
-	struct xfs_mount	*mp = bma->ip->i_mount;
-	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-	int			tmp_logflags = 0;
-	int			error;
-
-	ASSERT(bma->length > 0);
-
-	/*
-	 * For the wasdelay case, we could also just allocate the stuff asked
-	 * for in this bmap call but that wouldn't be as good.
-	 */
-	if (bma->wasdel) {
-		bma->length = (xfs_extlen_t)bma->got.br_blockcount;
-		bma->offset = bma->got.br_startoff;
-		if (bma->idx != NULLEXTNUM && bma->idx) {
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
-					 &bma->prev);
-		}
-	} else {
-		bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
-		if (!bma->eof)
-			bma->length = XFS_FILBLKS_MIN(bma->length,
-					bma->got.br_startoff - bma->offset);
-	}
-
-	/*
-	 * Indicate if this is the first user data in the file, or just any
-	 * user data.
-	 */
-	if (!(bma->flags & XFS_BMAPI_METADATA)) {
-		bma->userdata = (bma->offset == 0) ?
-			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
-	}
-
-	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
-
-	/*
-	 * Only want to do the alignment at the eof if it is userdata and
-	 * allocation length is larger than a stripe unit.
-	 */
-	if (mp->m_dalign && bma->length >= mp->m_dalign &&
-	    !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
-		error = xfs_bmap_isaeof(bma, whichfork);
-		if (error)
-			return error;
-	}
-
-	error = xfs_bmap_alloc(bma);
-	if (error)
-		return error;
-
-	if (bma->flist->xbf_low)
-		bma->minleft = 0;
-	if (bma->cur)
-		bma->cur->bc_private.b.firstblock = *bma->firstblock;
-	if (bma->blkno == NULLFSBLOCK)
-		return 0;
-	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
-		bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
-		bma->cur->bc_private.b.firstblock = *bma->firstblock;
-		bma->cur->bc_private.b.flist = bma->flist;
-	}
-	/*
-	 * Bump the number of extents we've allocated
-	 * in this call.
-	 */
-	bma->nallocs++;
-
-	if (bma->cur)
-		bma->cur->bc_private.b.flags =
-			bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
-
-	bma->got.br_startoff = bma->offset;
-	bma->got.br_startblock = bma->blkno;
-	bma->got.br_blockcount = bma->length;
-	bma->got.br_state = XFS_EXT_NORM;
-
-	/*
-	 * A wasdelay extent has been initialized, so shouldn't be flagged
-	 * as unwritten.
-	 */
-	if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
-	    xfs_sb_version_hasextflgbit(&mp->m_sb))
-		bma->got.br_state = XFS_EXT_UNWRITTEN;
-
-	if (bma->wasdel)
-		error = xfs_bmap_add_extent_delay_real(bma);
-	else
-		error = xfs_bmap_add_extent_hole_real(bma, whichfork);
-
-	bma->logflags |= tmp_logflags;
-	if (error)
-		return error;
-
-	/*
-	 * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
-	 * or xfs_bmap_add_extent_hole_real might have merged it into one of
-	 * the neighbouring ones.
-	 */
-	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
-
-	ASSERT(bma->got.br_startoff <= bma->offset);
-	ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
-	       bma->offset + bma->length);
-	ASSERT(bma->got.br_state == XFS_EXT_NORM ||
-	       bma->got.br_state == XFS_EXT_UNWRITTEN);
-	return 0;
-}
-
-STATIC int
-xfs_bmapi_convert_unwritten(
-	struct xfs_bmalloca	*bma,
-	struct xfs_bmbt_irec	*mval,
-	xfs_filblks_t		len,
-	int			flags)
-{
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-	int			tmp_logflags = 0;
-	int			error;
-
-	/* check if we need to do unwritten->real conversion */
-	if (mval->br_state == XFS_EXT_UNWRITTEN &&
-	    (flags & XFS_BMAPI_PREALLOC))
-		return 0;
-
-	/* check if we need to do real->unwritten conversion */
-	if (mval->br_state == XFS_EXT_NORM &&
-	    (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
-			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
-		return 0;
-
-	/*
-	 * Modify (by adding) the state flag, if writing.
-	 */
-	ASSERT(mval->br_blockcount <= len);
-	if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
-		bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
-					bma->ip, whichfork);
-		bma->cur->bc_private.b.firstblock = *bma->firstblock;
-		bma->cur->bc_private.b.flist = bma->flist;
-	}
-	mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
-				? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
-
-	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
-			&bma->cur, mval, bma->firstblock, bma->flist,
-			&tmp_logflags);
-	bma->logflags |= tmp_logflags;
-	if (error)
-		return error;
-
-	/*
-	 * Update our extent pointer, given that
-	 * xfs_bmap_add_extent_unwritten_real might have merged it into one
-	 * of the neighbouring ones.
-	 */
-	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
-
-	/*
-	 * We may have combined previously unwritten space with written space,
-	 * so generate another request.
-	 */
-	if (mval->br_blockcount < len)
-		return EAGAIN;
-	return 0;
-}
-
-/*
- * Map file blocks to filesystem blocks, and allocate blocks or convert the
- * extent state if necessary.  Details behaviour is controlled by the flags
- * parameter.  Only allocates blocks from a single allocation group, to avoid
- * locking problems.
- *
- * The returned value in "firstblock" from the first call in a transaction
- * must be remembered and presented to subsequent calls in "firstblock".
- * An upper bound for the number of blocks to be allocated is supplied to
- * the first call in "total"; if no allocation group has that many free
- * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
- */
-int
-xfs_bmapi_write(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	xfs_fileoff_t		bno,		/* starting file offs. mapped */
-	xfs_filblks_t		len,		/* length to map in file */
-	int			flags,		/* XFS_BMAPI_... */
-	xfs_fsblock_t		*firstblock,	/* first allocated block
-						   controls a.g. for allocs */
-	xfs_extlen_t		total,		/* total blocks needed */
-	struct xfs_bmbt_irec	*mval,		/* output: map values */
-	int			*nmap,		/* i/o: mval size/count */
-	struct xfs_bmap_free	*flist)		/* i/o: list extents to free */
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp;
-	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
-	xfs_fileoff_t		end;		/* end of mapped file region */
-	int			eof;		/* after the end of extents */
-	int			error;		/* error return */
-	int			n;		/* current extent index */
-	xfs_fileoff_t		obno;		/* old block number (offset) */
-	int			whichfork;	/* data or attr fork */
-	char			inhole;		/* current location is hole in file */
-	char			wasdelay;	/* old extent was delayed */
-
-#ifdef DEBUG
-	xfs_fileoff_t		orig_bno;	/* original block number value */
-	int			orig_flags;	/* original flags arg value */
-	xfs_filblks_t		orig_len;	/* original value of len arg */
-	struct xfs_bmbt_irec	*orig_mval;	/* original value of mval */
-	int			orig_nmap;	/* original value of *nmap */
-
-	orig_bno = bno;
-	orig_len = len;
-	orig_flags = flags;
-	orig_mval = mval;
-	orig_nmap = *nmap;
-#endif
-	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-		XFS_ATTR_FORK : XFS_DATA_FORK;
-
-	ASSERT(*nmap >= 1);
-	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-	ASSERT(!(flags & XFS_BMAPI_IGSTATE));
-	ASSERT(tp != NULL);
-	ASSERT(len > 0);
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	if (unlikely(XFS_TEST_ERROR(
-	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
-	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-		XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return EIO;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-
-	XFS_STATS_INC(xs_blk_mapw);
-
-	if (*firstblock == NULLFSBLOCK) {
-		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-		else
-			bma.minleft = 1;
-	} else {
-		bma.minleft = 0;
-	}
-
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(tp, ip, whichfork);
-		if (error)
-			goto error0;
-	}
-
-	xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
-				&bma.prev);
-	n = 0;
-	end = bno + len;
-	obno = bno;
-
-	bma.tp = tp;
-	bma.ip = ip;
-	bma.total = total;
-	bma.userdata = 0;
-	bma.flist = flist;
-	bma.firstblock = firstblock;
-
-	if (flags & XFS_BMAPI_STACK_SWITCH)
-		bma.stack_switch = 1;
-
-	while (bno < end && n < *nmap) {
-		inhole = eof || bma.got.br_startoff > bno;
-		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
-
-		/*
-		 * First, deal with the hole before the allocated space
-		 * that we found, if any.
-		 */
-		if (inhole || wasdelay) {
-			bma.eof = eof;
-			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
-			bma.wasdel = wasdelay;
-			bma.offset = bno;
-			bma.flags = flags;
-
-			/*
-			 * There's a 32/64 bit type mismatch between the
-			 * allocation length request (which can be 64 bits in
-			 * length) and the bma length request, which is
-			 * xfs_extlen_t and therefore 32 bits. Hence we have to
-			 * check for 32-bit overflows and handle them here.
-			 */
-			if (len > (xfs_filblks_t)MAXEXTLEN)
-				bma.length = MAXEXTLEN;
-			else
-				bma.length = len;
-
-			ASSERT(len > 0);
-			ASSERT(bma.length > 0);
-			error = xfs_bmapi_allocate(&bma);
-			if (error)
-				goto error0;
-			if (bma.blkno == NULLFSBLOCK)
-				break;
-		}
-
-		/* Deal with the allocated space we found.  */
-		xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
-							end, n, flags);
-
-		/* Execute unwritten extent conversion if necessary */
-		error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
-		if (error == EAGAIN)
-			continue;
-		if (error)
-			goto error0;
-
-		/* update the extent map to return */
-		xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-		/*
-		 * If we're done, stop now.  Stop when we've allocated
-		 * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
-		 * the transaction may get too big.
-		 */
-		if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
-			break;
-
-		/* Else go on to the next record. */
-		bma.prev = bma.got;
-		if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
-					 &bma.got);
-		} else
-			eof = 1;
-	}
-	*nmap = n;
-
-	/*
-	 * Transform from btree to extents, give it cur.
-	 */
-	if (xfs_bmap_wants_extents(ip, whichfork)) {
-		int		tmp_logflags = 0;
-
-		ASSERT(bma.cur);
-		error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
-			&tmp_logflags, whichfork);
-		bma.logflags |= tmp_logflags;
-		if (error)
-			goto error0;
-	}
-
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
-	       XFS_IFORK_NEXTENTS(ip, whichfork) >
-		XFS_IFORK_MAXEXT(ip, whichfork));
-	error = 0;
-error0:
-	/*
-	 * Log everything.  Do this after conversion, there's no point in
-	 * logging the extent records if we've converted to btree format.
-	 */
-	if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		bma.logflags &= ~xfs_ilog_fext(whichfork);
-	else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
-		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		bma.logflags &= ~xfs_ilog_fbroot(whichfork);
-	/*
-	 * Log whatever the flags say, even if error.  Otherwise we might miss
-	 * detecting a case where the data is changed, there's an error,
-	 * and it's not logged so we don't shutdown when we should.
-	 */
-	if (bma.logflags)
-		xfs_trans_log_inode(tp, ip, bma.logflags);
-
-	if (bma.cur) {
-		if (!error) {
-			ASSERT(*firstblock == NULLFSBLOCK ||
-			       XFS_FSB_TO_AGNO(mp, *firstblock) ==
-			       XFS_FSB_TO_AGNO(mp,
-				       bma.cur->bc_private.b.firstblock) ||
-			       (flist->xbf_low &&
-				XFS_FSB_TO_AGNO(mp, *firstblock) <
-				XFS_FSB_TO_AGNO(mp,
-					bma.cur->bc_private.b.firstblock)));
-			*firstblock = bma.cur->bc_private.b.firstblock;
-		}
-		xfs_btree_del_cursor(bma.cur,
-			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-	}
-	if (!error)
-		xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
-			orig_nmap, *nmap);
-	return error;
-}
-
-/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int				/* error */
-xfs_bmap_del_extent(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_trans_t		*tp,	/* current transaction pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/delete */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
-	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork) /* data or attr fork */
-{
-	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
-	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
-	xfs_fsblock_t		del_endblock=0;	/* first block past del */
-	xfs_fileoff_t		del_endoff;	/* first offset past del */
-	int			delay;	/* current block is delayed allocated */
-	int			do_fx;	/* free extent at end of routine */
-	xfs_bmbt_rec_host_t	*ep;	/* current extent entry pointer */
-	int			error;	/* error return value */
-	int			flags;	/* inode logging flags */
-	xfs_bmbt_irec_t		got;	/* current extent entry */
-	xfs_fileoff_t		got_endoff;	/* first offset past got */
-	int			i;	/* temp state */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	xfs_mount_t		*mp;	/* mount structure */
-	xfs_filblks_t		nblks;	/* quota/sb block count */
-	xfs_bmbt_irec_t		new;	/* new record to be inserted */
-	/* REFERENCED */
-	uint			qfield;	/* quota field to update */
-	xfs_filblks_t		temp;	/* for indirect length calculations */
-	xfs_filblks_t		temp2;	/* for indirect length calculations */
-	int			state = 0;
-
-	XFS_STATS_INC(xs_del_exlist);
-
-	if (whichfork == XFS_ATTR_FORK)
-		state |= BMAP_ATTRFORK;
-
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
-		(uint)sizeof(xfs_bmbt_rec_t)));
-	ASSERT(del->br_blockcount > 0);
-	ep = xfs_iext_get_ext(ifp, *idx);
-	xfs_bmbt_get_all(ep, &got);
-	ASSERT(got.br_startoff <= del->br_startoff);
-	del_endoff = del->br_startoff + del->br_blockcount;
-	got_endoff = got.br_startoff + got.br_blockcount;
-	ASSERT(got_endoff >= del_endoff);
-	delay = isnullstartblock(got.br_startblock);
-	ASSERT(isnullstartblock(del->br_startblock) == delay);
-	flags = 0;
-	qfield = 0;
-	error = 0;
-	/*
-	 * If deleting a real allocation, must free up the disk space.
-	 */
-	if (!delay) {
-		flags = XFS_ILOG_CORE;
-		/*
-		 * Realtime allocation.  Free it and record di_nblocks update.
-		 */
-		if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
-			xfs_fsblock_t	bno;
-			xfs_filblks_t	len;
-
-			ASSERT(do_mod(del->br_blockcount,
-				      mp->m_sb.sb_rextsize) == 0);
-			ASSERT(do_mod(del->br_startblock,
-				      mp->m_sb.sb_rextsize) == 0);
-			bno = del->br_startblock;
-			len = del->br_blockcount;
-			do_div(bno, mp->m_sb.sb_rextsize);
-			do_div(len, mp->m_sb.sb_rextsize);
-			error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
-			if (error)
-				goto done;
-			do_fx = 0;
-			nblks = len * mp->m_sb.sb_rextsize;
-			qfield = XFS_TRANS_DQ_RTBCOUNT;
-		}
-		/*
-		 * Ordinary allocation.
-		 */
-		else {
-			do_fx = 1;
-			nblks = del->br_blockcount;
-			qfield = XFS_TRANS_DQ_BCOUNT;
-		}
-		/*
-		 * Set up del_endblock and cur for later.
-		 */
-		del_endblock = del->br_startblock + del->br_blockcount;
-		if (cur) {
-			if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-					got.br_startblock, got.br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-		da_old = da_new = 0;
-	} else {
-		da_old = startblockval(got.br_startblock);
-		da_new = 0;
-		nblks = 0;
-		do_fx = 0;
-	}
-	/*
-	 * Set flag value to use in switch statement.
-	 * Left-contig is 2, right-contig is 1.
-	 */
-	switch (((got.br_startoff == del->br_startoff) << 1) |
-		(got_endoff == del_endoff)) {
-	case 3:
-		/*
-		 * Matches the whole extent.  Delete the entry.
-		 */
-		xfs_iext_remove(ip, *idx, 1,
-				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-		--*idx;
-		if (delay)
-			break;
-
-		XFS_IFORK_NEXT_SET(ip, whichfork,
-			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-		flags |= XFS_ILOG_CORE;
-		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
-			break;
-		}
-		if ((error = xfs_btree_delete(cur, &i)))
-			goto done;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		break;
-
-	case 2:
-		/*
-		 * Deleting the first part of the extent.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_startoff(ep, del_endoff);
-		temp = got.br_blockcount - del->br_blockcount;
-		xfs_bmbt_set_blockcount(ep, temp);
-		if (delay) {
-			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-				da_old);
-			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-			da_new = temp;
-			break;
-		}
-		xfs_bmbt_set_startblock(ep, del_endblock);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
-			break;
-		}
-		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
-				got.br_blockcount - del->br_blockcount,
-				got.br_state)))
-			goto done;
-		break;
-
-	case 1:
-		/*
-		 * Deleting the last part of the extent.
-		 */
-		temp = got.br_blockcount - del->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		if (delay) {
-			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-				da_old);
-			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-			da_new = temp;
-			break;
-		}
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
-			break;
-		}
-		if ((error = xfs_bmbt_update(cur, got.br_startoff,
-				got.br_startblock,
-				got.br_blockcount - del->br_blockcount,
-				got.br_state)))
-			goto done;
-		break;
-
-	case 0:
-		/*
-		 * Deleting the middle of the extent.
-		 */
-		temp = del->br_startoff - got.br_startoff;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		new.br_startoff = del_endoff;
-		temp2 = got_endoff - del_endoff;
-		new.br_blockcount = temp2;
-		new.br_state = got.br_state;
-		if (!delay) {
-			new.br_startblock = del_endblock;
-			flags |= XFS_ILOG_CORE;
-			if (cur) {
-				if ((error = xfs_bmbt_update(cur,
-						got.br_startoff,
-						got.br_startblock, temp,
-						got.br_state)))
-					goto done;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto done;
-				cur->bc_rec.b = new;
-				error = xfs_btree_insert(cur, &i);
-				if (error && error != ENOSPC)
-					goto done;
-				/*
-				 * If get no-space back from btree insert,
-				 * it tried a split, and we have a zero
-				 * block reservation.
-				 * Fix up our state and return the error.
-				 */
-				if (error == ENOSPC) {
-					/*
-					 * Reset the cursor, don't trust
-					 * it after any insert operation.
-					 */
-					if ((error = xfs_bmbt_lookup_eq(cur,
-							got.br_startoff,
-							got.br_startblock,
-							temp, &i)))
-						goto done;
-					XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-					/*
-					 * Update the btree record back
-					 * to the original value.
-					 */
-					if ((error = xfs_bmbt_update(cur,
-							got.br_startoff,
-							got.br_startblock,
-							got.br_blockcount,
-							got.br_state)))
-						goto done;
-					/*
-					 * Reset the extent record back
-					 * to the original value.
-					 */
-					xfs_bmbt_set_blockcount(ep,
-						got.br_blockcount);
-					flags = 0;
-					error = ENOSPC;
-					goto done;
-				}
-				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			} else
-				flags |= xfs_ilog_fext(whichfork);
-			XFS_IFORK_NEXT_SET(ip, whichfork,
-				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
-		} else {
-			ASSERT(whichfork == XFS_DATA_FORK);
-			temp = xfs_bmap_worst_indlen(ip, temp);
-			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			temp2 = xfs_bmap_worst_indlen(ip, temp2);
-			new.br_startblock = nullstartblock((int)temp2);
-			da_new = temp + temp2;
-			while (da_new > da_old) {
-				if (temp) {
-					temp--;
-					da_new--;
-					xfs_bmbt_set_startblock(ep,
-						nullstartblock((int)temp));
-				}
-				if (da_new == da_old)
-					break;
-				if (temp2) {
-					temp2--;
-					da_new--;
-					new.br_startblock =
-						nullstartblock((int)temp2);
-				}
-			}
-		}
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		xfs_iext_insert(ip, *idx + 1, 1, &new, state);
-		++*idx;
-		break;
-	}
-	/*
-	 * If we need to, add to list of extents to delete.
-	 */
-	if (do_fx)
-		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
-			mp);
-	/*
-	 * Adjust inode # blocks in the file.
-	 */
-	if (nblks)
-		ip->i_d.di_nblocks -= nblks;
-	/*
-	 * Adjust quota data.
-	 */
-	if (qfield)
-		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-
-	/*
-	 * Account for change in delayed indirect blocks.
-	 * Nothing to do for disk quota accounting here.
-	 */
-	ASSERT(da_old >= da_new);
-	if (da_old > da_new) {
-		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-			(int64_t)(da_old - da_new), 0);
-	}
-done:
-	*logflagsp = flags;
-	return error;
-}
-
-/*
- * Unmap (remove) blocks from a file.
- * If nexts is nonzero then the number of extents to remove is limited to
- * that value.  If not all extents in the block range can be removed then
- * *done is set.
- */
-int						/* error */
-xfs_bunmapi(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* incore inode */
-	xfs_fileoff_t		bno,		/* starting offset to unmap */
-	xfs_filblks_t		len,		/* length to unmap in file */
-	int			flags,		/* misc flags */
-	xfs_extnum_t		nexts,		/* number of extents max */
-	xfs_fsblock_t		*firstblock,	/* first allocated block
-						   controls a.g. for allocs */
-	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	int			*done)		/* set if not done yet */
-{
-	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
-	xfs_bmbt_irec_t		del;		/* extent being deleted */
-	int			eof;		/* is deleting at eof */
-	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
-	int			error;		/* error return value */
-	xfs_extnum_t		extno;		/* extent number in list */
-	xfs_bmbt_irec_t		got;		/* current extent record */
-	xfs_ifork_t		*ifp;		/* inode fork pointer */
-	int			isrt;		/* freeing in rt area */
-	xfs_extnum_t		lastx;		/* last extent index used */
-	int			logflags;	/* transaction logging flags */
-	xfs_extlen_t		mod;		/* rt extent offset */
-	xfs_mount_t		*mp;		/* mount structure */
-	xfs_extnum_t		nextents;	/* number of file extents */
-	xfs_bmbt_irec_t		prev;		/* previous extent record */
-	xfs_fileoff_t		start;		/* first file offset deleted */
-	int			tmp_logflags;	/* partial logging flags */
-	int			wasdel;		/* was a delayed alloc extent */
-	int			whichfork;	/* data or attribute fork */
-	xfs_fsblock_t		sum;
-
-	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
-
-	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-		XFS_ATTR_FORK : XFS_DATA_FORK;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (unlikely(
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
-		XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
-				 ip->i_mount);
-		return EFSCORRUPTED;
-	}
-	mp = ip->i_mount;
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return EIO;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(len > 0);
-	ASSERT(nexts >= 0);
-
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		return error;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	if (nextents == 0) {
-		*done = 1;
-		return 0;
-	}
-	XFS_STATS_INC(xs_blk_unmap);
-	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
-	start = bno;
-	bno = start + len - 1;
-	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-		&prev);
-
-	/*
-	 * Check to see if the given block number is past the end of the
-	 * file, back up to the last block if so...
-	 */
-	if (eof) {
-		ep = xfs_iext_get_ext(ifp, --lastx);
-		xfs_bmbt_get_all(ep, &got);
-		bno = got.br_startoff + got.br_blockcount - 1;
-	}
-	logflags = 0;
-	if (ifp->if_flags & XFS_IFBROOT) {
-		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_private.b.firstblock = *firstblock;
-		cur->bc_private.b.flist = flist;
-		cur->bc_private.b.flags = 0;
-	} else
-		cur = NULL;
-
-	if (isrt) {
-		/*
-		 * Synchronize by locking the bitmap inode.
-		 */
-		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-	}
-
-	extno = 0;
-	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
-	       (nexts == 0 || extno < nexts)) {
-		/*
-		 * Is the found extent after a hole in which bno lives?
-		 * Just back up to the previous extent, if so.
-		 */
-		if (got.br_startoff > bno) {
-			if (--lastx < 0)
-				break;
-			ep = xfs_iext_get_ext(ifp, lastx);
-			xfs_bmbt_get_all(ep, &got);
-		}
-		/*
-		 * Is the last block of this extent before the range
-		 * we're supposed to delete?  If so, we're done.
-		 */
-		bno = XFS_FILEOFF_MIN(bno,
-			got.br_startoff + got.br_blockcount - 1);
-		if (bno < start)
-			break;
-		/*
-		 * Then deal with the (possibly delayed) allocated space
-		 * we found.
-		 */
-		ASSERT(ep != NULL);
-		del = got;
-		wasdel = isnullstartblock(del.br_startblock);
-		if (got.br_startoff < start) {
-			del.br_startoff = start;
-			del.br_blockcount -= start - got.br_startoff;
-			if (!wasdel)
-				del.br_startblock += start - got.br_startoff;
-		}
-		if (del.br_startoff + del.br_blockcount > bno + 1)
-			del.br_blockcount = bno + 1 - del.br_startoff;
-		sum = del.br_startblock + del.br_blockcount;
-		if (isrt &&
-		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
-			/*
-			 * Realtime extent not lined up at the end.
-			 * The extent could have been split into written
-			 * and unwritten pieces, or we could just be
-			 * unmapping part of it.  But we can't really
-			 * get rid of part of a realtime extent.
-			 */
-			if (del.br_state == XFS_EXT_UNWRITTEN ||
-			    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-				/*
-				 * This piece is unwritten, or we're not
-				 * using unwritten extents.  Skip over it.
-				 */
-				ASSERT(bno >= mod);
-				bno -= mod > del.br_blockcount ?
-					del.br_blockcount : mod;
-				if (bno < got.br_startoff) {
-					if (--lastx >= 0)
-						xfs_bmbt_get_all(xfs_iext_get_ext(
-							ifp, lastx), &got);
-				}
-				continue;
-			}
-			/*
-			 * It's written, turn it unwritten.
-			 * This is better than zeroing it.
-			 */
-			ASSERT(del.br_state == XFS_EXT_NORM);
-			ASSERT(xfs_trans_get_block_res(tp) > 0);
-			/*
-			 * If this spans a realtime extent boundary,
-			 * chop it back to the start of the one we end at.
-			 */
-			if (del.br_blockcount > mod) {
-				del.br_startoff += del.br_blockcount - mod;
-				del.br_startblock += del.br_blockcount - mod;
-				del.br_blockcount = mod;
-			}
-			del.br_state = XFS_EXT_UNWRITTEN;
-			error = xfs_bmap_add_extent_unwritten_real(tp, ip,
-					&lastx, &cur, &del, firstblock, flist,
-					&logflags);
-			if (error)
-				goto error0;
-			goto nodelete;
-		}
-		if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
-			/*
-			 * Realtime extent is lined up at the end but not
-			 * at the front.  We'll get rid of full extents if
-			 * we can.
-			 */
-			mod = mp->m_sb.sb_rextsize - mod;
-			if (del.br_blockcount > mod) {
-				del.br_blockcount -= mod;
-				del.br_startoff += mod;
-				del.br_startblock += mod;
-			} else if ((del.br_startoff == start &&
-				    (del.br_state == XFS_EXT_UNWRITTEN ||
-				     xfs_trans_get_block_res(tp) == 0)) ||
-				   !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-				/*
-				 * Can't make it unwritten.  There isn't
-				 * a full extent here so just skip it.
-				 */
-				ASSERT(bno >= del.br_blockcount);
-				bno -= del.br_blockcount;
-				if (got.br_startoff > bno) {
-					if (--lastx >= 0) {
-						ep = xfs_iext_get_ext(ifp,
-								      lastx);
-						xfs_bmbt_get_all(ep, &got);
-					}
-				}
-				continue;
-			} else if (del.br_state == XFS_EXT_UNWRITTEN) {
-				/*
-				 * This one is already unwritten.
-				 * It must have a written left neighbor.
-				 * Unwrite the killed part of that one and
-				 * try again.
-				 */
-				ASSERT(lastx > 0);
-				xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
-						lastx - 1), &prev);
-				ASSERT(prev.br_state == XFS_EXT_NORM);
-				ASSERT(!isnullstartblock(prev.br_startblock));
-				ASSERT(del.br_startblock ==
-				       prev.br_startblock + prev.br_blockcount);
-				if (prev.br_startoff < start) {
-					mod = start - prev.br_startoff;
-					prev.br_blockcount -= mod;
-					prev.br_startblock += mod;
-					prev.br_startoff = start;
-				}
-				prev.br_state = XFS_EXT_UNWRITTEN;
-				lastx--;
-				error = xfs_bmap_add_extent_unwritten_real(tp,
-						ip, &lastx, &cur, &prev,
-						firstblock, flist, &logflags);
-				if (error)
-					goto error0;
-				goto nodelete;
-			} else {
-				ASSERT(del.br_state == XFS_EXT_NORM);
-				del.br_state = XFS_EXT_UNWRITTEN;
-				error = xfs_bmap_add_extent_unwritten_real(tp,
-						ip, &lastx, &cur, &del,
-						firstblock, flist, &logflags);
-				if (error)
-					goto error0;
-				goto nodelete;
-			}
-		}
-		if (wasdel) {
-			ASSERT(startblockval(del.br_startblock) > 0);
-			/* Update realtime/data freespace, unreserve quota */
-			if (isrt) {
-				xfs_filblks_t rtexts;
-
-				rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
-				do_div(rtexts, mp->m_sb.sb_rextsize);
-				xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
-						(int64_t)rtexts, 0);
-				(void)xfs_trans_reserve_quota_nblks(NULL,
-					ip, -((long)del.br_blockcount), 0,
-					XFS_QMOPT_RES_RTBLKS);
-			} else {
-				xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-						(int64_t)del.br_blockcount, 0);
-				(void)xfs_trans_reserve_quota_nblks(NULL,
-					ip, -((long)del.br_blockcount), 0,
-					XFS_QMOPT_RES_REGBLKS);
-			}
-			ip->i_delayed_blks -= del.br_blockcount;
-			if (cur)
-				cur->bc_private.b.flags |=
-					XFS_BTCUR_BPRV_WASDEL;
-		} else if (cur)
-			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
-		/*
-		 * If it's the case where the directory code is running
-		 * with no block reservation, and the deleted block is in
-		 * the middle of its extent, and the resulting insert
-		 * of an extent would cause transformation to btree format,
-		 * then reject it.  The calling code will then swap
-		 * blocks around instead.
-		 * We have to do this now, rather than waiting for the
-		 * conversion to btree format, since the transaction
-		 * will be dirty.
-		 */
-		if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
-		    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-		    XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
-			XFS_IFORK_MAXEXT(ip, whichfork) &&
-		    del.br_startoff > got.br_startoff &&
-		    del.br_startoff + del.br_blockcount <
-		    got.br_startoff + got.br_blockcount) {
-			error = ENOSPC;
-			goto error0;
-		}
-		error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
-				&tmp_logflags, whichfork);
-		logflags |= tmp_logflags;
-		if (error)
-			goto error0;
-		bno = del.br_startoff - 1;
-nodelete:
-		/*
-		 * If not done go on to the next (previous) record.
-		 */
-		if (bno != (xfs_fileoff_t)-1 && bno >= start) {
-			if (lastx >= 0) {
-				ep = xfs_iext_get_ext(ifp, lastx);
-				if (xfs_bmbt_get_startoff(ep) > bno) {
-					if (--lastx >= 0)
-						ep = xfs_iext_get_ext(ifp,
-								      lastx);
-				}
-				xfs_bmbt_get_all(ep, &got);
-			}
-			extno++;
-		}
-	}
-	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
-
-	/*
-	 * Convert to a btree if necessary.
-	 */
-	if (xfs_bmap_needs_btree(ip, whichfork)) {
-		ASSERT(cur == NULL);
-		error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
-			&cur, 0, &tmp_logflags, whichfork);
-		logflags |= tmp_logflags;
-		if (error)
-			goto error0;
-	}
-	/*
-	 * transform from btree to extents, give it cur
-	 */
-	else if (xfs_bmap_wants_extents(ip, whichfork)) {
-		ASSERT(cur != NULL);
-		error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
-			whichfork);
-		logflags |= tmp_logflags;
-		if (error)
-			goto error0;
-	}
-	/*
-	 * transform from extents to local?
-	 */
-	error = 0;
-error0:
-	/*
-	 * Log everything.  Do this after conversion, there's no point in
-	 * logging the extent records if we've converted to btree format.
-	 */
-	if ((logflags & xfs_ilog_fext(whichfork)) &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		logflags &= ~xfs_ilog_fext(whichfork);
-	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
-		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		logflags &= ~xfs_ilog_fbroot(whichfork);
-	/*
-	 * Log inode even in the error case, if the transaction
-	 * is dirty we'll need to shut down the filesystem.
-	 */
-	if (logflags)
-		xfs_trans_log_inode(tp, ip, logflags);
-	if (cur) {
-		if (!error) {
-			*firstblock = cur->bc_private.b.firstblock;
-			cur->bc_private.b.allocated = 0;
-		}
-		xfs_btree_del_cursor(cur,
-			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-	}
-	return error;
-}
-
-/*
- * Shift extent records to the left to cover a hole.
- *
- * The maximum number of extents to be shifted in a single operation
- * is @num_exts, and @current_ext keeps track of the current extent
- * index we have shifted. @offset_shift_fsb is the length by which each
- * extent is shifted. If there is no hole to shift the extents
- * into, this will be considered invalid operation and we abort immediately.
- */
-int
-xfs_bmap_shift_extents(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	int			*done,
-	xfs_fileoff_t		start_fsb,
-	xfs_fileoff_t		offset_shift_fsb,
-	xfs_extnum_t		*current_ext,
-	xfs_fsblock_t		*firstblock,
-	struct xfs_bmap_free	*flist,
-	int			num_exts)
-{
-	struct xfs_btree_cur		*cur;
-	struct xfs_bmbt_rec_host	*gotp;
-	struct xfs_bmbt_irec            got;
-	struct xfs_bmbt_irec		left;
-	struct xfs_mount		*mp = ip->i_mount;
-	struct xfs_ifork		*ifp;
-	xfs_extnum_t			nexts = 0;
-	xfs_fileoff_t			startoff;
-	int				error = 0;
-	int				i;
-	int				whichfork = XFS_DATA_FORK;
-	int				logflags;
-	xfs_filblks_t			blockcount = 0;
-	int				total_extents;
-
-	if (unlikely(XFS_TEST_ERROR(
-	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
-	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
-				 XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return EIO;
-
-	ASSERT(current_ext != NULL);
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		/* Read in all the extents */
-		error = xfs_iread_extents(tp, ip, whichfork);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * If *current_ext is 0, we would need to lookup the extent
-	 * from where we would start shifting and store it in gotp.
-	 */
-	if (!*current_ext) {
-		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
-		/*
-		 * gotp can be null in 2 cases: 1) if there are no extents
-		 * or 2) start_fsb lies in a hole beyond which there are
-		 * no extents. Either way, we are done.
-		 */
-		if (!gotp) {
-			*done = 1;
-			return 0;
-		}
-	}
-
-	/* We are going to change core inode */
-	logflags = XFS_ILOG_CORE;
-	if (ifp->if_flags & XFS_IFBROOT) {
-		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_private.b.firstblock = *firstblock;
-		cur->bc_private.b.flist = flist;
-		cur->bc_private.b.flags = 0;
-	} else {
-		cur = NULL;
-		logflags |= XFS_ILOG_DEXT;
-	}
-
-	/*
-	 * There may be delalloc extents in the data fork before the range we
-	 * are collapsing out, so we cannot
-	 * use the count of real extents here. Instead we have to calculate it
-	 * from the incore fork.
-	 */
-	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-	while (nexts++ < num_exts && *current_ext < total_extents) {
-
-		gotp = xfs_iext_get_ext(ifp, *current_ext);
-		xfs_bmbt_get_all(gotp, &got);
-		startoff = got.br_startoff - offset_shift_fsb;
-
-		/*
-		 * Before shifting extent into hole, make sure that the hole
-		 * is large enough to accomodate the shift.
-		 */
-		if (*current_ext) {
-			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
-						*current_ext - 1), &left);
-
-			if (startoff < left.br_startoff + left.br_blockcount)
-				error = EINVAL;
-		} else if (offset_shift_fsb > got.br_startoff) {
-			/*
-			 * When first extent is shifted, offset_shift_fsb
-			 * should be less than the stating offset of
-			 * the first extent.
-			 */
-			error = EINVAL;
-		}
-
-		if (error)
-			goto del_cursor;
-
-		if (cur) {
-			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-						   got.br_startblock,
-						   got.br_blockcount,
-						   &i);
-			if (error)
-				goto del_cursor;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-		}
-
-		/* Check if we can merge 2 adjacent extents */
-		if (*current_ext &&
-		    left.br_startoff + left.br_blockcount == startoff &&
-		    left.br_startblock + left.br_blockcount ==
-				got.br_startblock &&
-		    left.br_state == got.br_state &&
-		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
-			blockcount = left.br_blockcount +
-				got.br_blockcount;
-			xfs_iext_remove(ip, *current_ext, 1, 0);
-			if (cur) {
-				error = xfs_btree_delete(cur, &i);
-				if (error)
-					goto del_cursor;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-			}
-			XFS_IFORK_NEXT_SET(ip, whichfork,
-				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-			gotp = xfs_iext_get_ext(ifp, --*current_ext);
-			xfs_bmbt_get_all(gotp, &got);
-
-			/* Make cursor point to the extent we will update */
-			if (cur) {
-				error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-							   got.br_startblock,
-							   got.br_blockcount,
-							   &i);
-				if (error)
-					goto del_cursor;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
-			}
-
-			xfs_bmbt_set_blockcount(gotp, blockcount);
-			got.br_blockcount = blockcount;
-		} else {
-			/* We have to update the startoff */
-			xfs_bmbt_set_startoff(gotp, startoff);
-			got.br_startoff = startoff;
-		}
-
-		if (cur) {
-			error = xfs_bmbt_update(cur, got.br_startoff,
-						got.br_startblock,
-						got.br_blockcount,
-						got.br_state);
-			if (error)
-				goto del_cursor;
-		}
-
-		(*current_ext)++;
-		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-	}
-
-	/* Check if we are done */
-	if (*current_ext == total_extents)
-		*done = 1;
-
-del_cursor:
-	if (cur)
-		xfs_btree_del_cursor(cur,
-			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-
-	xfs_trans_log_inode(tp, ip, logflags);
-	return error;
-}
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
deleted file mode 100644
index de65bb8bab04..000000000000
--- a/fs/xfs/xfs_bmap_btree.c
+++ /dev/null
@@ -1,967 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_dinode.h"
-
-/*
- * Determine the extent state.
- */
-/* ARGSUSED */
-STATIC xfs_exntst_t
-xfs_extent_state(
-	xfs_filblks_t		blks,
-	int			extent_flag)
-{
-	if (extent_flag) {
-		ASSERT(blks != 0);	/* saved for DMIG */
-		return XFS_EXT_UNWRITTEN;
-	}
-	return XFS_EXT_NORM;
-}
-
-/*
- * Convert on-disk form of btree root to in-memory form.
- */
-void
-xfs_bmdr_to_bmbt(
-	struct xfs_inode	*ip,
-	xfs_bmdr_block_t	*dblock,
-	int			dblocklen,
-	struct xfs_btree_block	*rblock,
-	int			rblocklen)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	int			dmxr;
-	xfs_bmbt_key_t		*fkp;
-	__be64			*fpp;
-	xfs_bmbt_key_t		*tkp;
-	__be64			*tpp;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS);
-
-	rblock->bb_level = dblock->bb_level;
-	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
-	rblock->bb_numrecs = dblock->bb_numrecs;
-	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-	fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-	tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-	dmxr = be16_to_cpu(dblock->bb_numrecs);
-	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
-	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
-}
-
-/*
- * Convert a compressed bmap extent record to an uncompressed form.
- * This code must be in sync with the routines xfs_bmbt_get_startoff,
- * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
- */
-STATIC void
-__xfs_bmbt_get_all(
-		__uint64_t l0,
-		__uint64_t l1,
-		xfs_bmbt_irec_t *s)
-{
-	int	ext_flag;
-	xfs_exntst_t st;
-
-	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
-	s->br_startoff = ((xfs_fileoff_t)l0 &
-			   xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-#if XFS_BIG_BLKNOS
-	s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
-			   (((xfs_fsblock_t)l1) >> 21);
-#else
-#ifdef DEBUG
-	{
-		xfs_dfsbno_t	b;
-
-		b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
-		    (((xfs_dfsbno_t)l1) >> 21);
-		ASSERT((b >> 32) == 0 || isnulldstartblock(b));
-		s->br_startblock = (xfs_fsblock_t)b;
-	}
-#else	/* !DEBUG */
-	s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
-#endif	/* DEBUG */
-#endif	/* XFS_BIG_BLKNOS */
-	s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
-	/* This is xfs_extent_state() in-line */
-	if (ext_flag) {
-		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
-		st = XFS_EXT_UNWRITTEN;
-	} else
-		st = XFS_EXT_NORM;
-	s->br_state = st;
-}
-
-void
-xfs_bmbt_get_all(
-	xfs_bmbt_rec_host_t *r,
-	xfs_bmbt_irec_t *s)
-{
-	__xfs_bmbt_get_all(r->l0, r->l1, s);
-}
-
-/*
- * Extract the blockcount field from an in memory bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_get_blockcount(
-	xfs_bmbt_rec_host_t	*r)
-{
-	return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
-}
-
-/*
- * Extract the startblock field from an in memory bmap extent record.
- */
-xfs_fsblock_t
-xfs_bmbt_get_startblock(
-	xfs_bmbt_rec_host_t	*r)
-{
-#if XFS_BIG_BLKNOS
-	return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
-	       (((xfs_fsblock_t)r->l1) >> 21);
-#else
-#ifdef DEBUG
-	xfs_dfsbno_t	b;
-
-	b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
-	    (((xfs_dfsbno_t)r->l1) >> 21);
-	ASSERT((b >> 32) == 0 || isnulldstartblock(b));
-	return (xfs_fsblock_t)b;
-#else	/* !DEBUG */
-	return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
-#endif	/* DEBUG */
-#endif	/* XFS_BIG_BLKNOS */
-}
-
-/*
- * Extract the startoff field from an in memory bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_get_startoff(
-	xfs_bmbt_rec_host_t	*r)
-{
-	return ((xfs_fileoff_t)r->l0 &
-		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-xfs_exntst_t
-xfs_bmbt_get_state(
-	xfs_bmbt_rec_host_t	*r)
-{
-	int	ext_flag;
-
-	ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
-	return xfs_extent_state(xfs_bmbt_get_blockcount(r),
-				ext_flag);
-}
-
-/*
- * Extract the blockcount field from an on disk bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_disk_get_blockcount(
-	xfs_bmbt_rec_t	*r)
-{
-	return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
-}
-
-/*
- * Extract the startoff field from a disk format bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_disk_get_startoff(
-	xfs_bmbt_rec_t	*r)
-{
-	return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
-		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-
-/*
- * Set all the fields in a bmap extent record from the arguments.
- */
-void
-xfs_bmbt_set_allf(
-	xfs_bmbt_rec_host_t	*r,
-	xfs_fileoff_t		startoff,
-	xfs_fsblock_t		startblock,
-	xfs_filblks_t		blockcount,
-	xfs_exntst_t		state)
-{
-	int		extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
-	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
-	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
-#if XFS_BIG_BLKNOS
-	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
-	r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-		((xfs_bmbt_rec_base_t)startoff << 9) |
-		((xfs_bmbt_rec_base_t)startblock >> 43);
-	r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
-		((xfs_bmbt_rec_base_t)blockcount &
-		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-#else	/* !XFS_BIG_BLKNOS */
-	if (isnullstartblock(startblock)) {
-		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			((xfs_bmbt_rec_base_t)startoff << 9) |
-			 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-		r->l1 = xfs_mask64hi(11) |
-			  ((xfs_bmbt_rec_base_t)startblock << 21) |
-			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	} else {
-		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			((xfs_bmbt_rec_base_t)startoff << 9);
-		r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
-			 ((xfs_bmbt_rec_base_t)blockcount &
-			 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	}
-#endif	/* XFS_BIG_BLKNOS */
-}
-
-/*
- * Set all the fields in a bmap extent record from the uncompressed form.
- */
-void
-xfs_bmbt_set_all(
-	xfs_bmbt_rec_host_t *r,
-	xfs_bmbt_irec_t	*s)
-{
-	xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
-			     s->br_blockcount, s->br_state);
-}
-
-
-/*
- * Set all the fields in a disk format bmap extent record from the arguments.
- */
-void
-xfs_bmbt_disk_set_allf(
-	xfs_bmbt_rec_t		*r,
-	xfs_fileoff_t		startoff,
-	xfs_fsblock_t		startblock,
-	xfs_filblks_t		blockcount,
-	xfs_exntst_t		state)
-{
-	int			extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
-	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
-	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
-#if XFS_BIG_BLKNOS
-	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
-	r->l0 = cpu_to_be64(
-		((xfs_bmbt_rec_base_t)extent_flag << 63) |
-		 ((xfs_bmbt_rec_base_t)startoff << 9) |
-		 ((xfs_bmbt_rec_base_t)startblock >> 43));
-	r->l1 = cpu_to_be64(
-		((xfs_bmbt_rec_base_t)startblock << 21) |
-		 ((xfs_bmbt_rec_base_t)blockcount &
-		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-#else	/* !XFS_BIG_BLKNOS */
-	if (isnullstartblock(startblock)) {
-		r->l0 = cpu_to_be64(
-			((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			 ((xfs_bmbt_rec_base_t)startoff << 9) |
-			  (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-		r->l1 = cpu_to_be64(xfs_mask64hi(11) |
-			  ((xfs_bmbt_rec_base_t)startblock << 21) |
-			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-	} else {
-		r->l0 = cpu_to_be64(
-			((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			 ((xfs_bmbt_rec_base_t)startoff << 9));
-		r->l1 = cpu_to_be64(
-			((xfs_bmbt_rec_base_t)startblock << 21) |
-			 ((xfs_bmbt_rec_base_t)blockcount &
-			  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-	}
-#endif	/* XFS_BIG_BLKNOS */
-}
-
-/*
- * Set all the fields in a bmap extent record from the uncompressed form.
- */
-STATIC void
-xfs_bmbt_disk_set_all(
-	xfs_bmbt_rec_t	*r,
-	xfs_bmbt_irec_t *s)
-{
-	xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
-				  s->br_blockcount, s->br_state);
-}
-
-/*
- * Set the blockcount field in a bmap extent record.
- */
-void
-xfs_bmbt_set_blockcount(
-	xfs_bmbt_rec_host_t *r,
-	xfs_filblks_t	v)
-{
-	ASSERT((v & xfs_mask64hi(43)) == 0);
-	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
-		  (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
-}
-
-/*
- * Set the startblock field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startblock(
-	xfs_bmbt_rec_host_t *r,
-	xfs_fsblock_t	v)
-{
-#if XFS_BIG_BLKNOS
-	ASSERT((v & xfs_mask64hi(12)) == 0);
-	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
-		  (xfs_bmbt_rec_base_t)(v >> 43);
-	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
-		  (xfs_bmbt_rec_base_t)(v << 21);
-#else	/* !XFS_BIG_BLKNOS */
-	if (isnullstartblock(v)) {
-		r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-		r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
-			  ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	} else {
-		r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-		r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	}
-#endif	/* XFS_BIG_BLKNOS */
-}
-
-/*
- * Set the startoff field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startoff(
-	xfs_bmbt_rec_host_t *r,
-	xfs_fileoff_t	v)
-{
-	ASSERT((v & xfs_mask64hi(9)) == 0);
-	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
-		((xfs_bmbt_rec_base_t)v << 9) |
-		  (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-}
-
-/*
- * Set the extent state field in a bmap extent record.
- */
-void
-xfs_bmbt_set_state(
-	xfs_bmbt_rec_host_t *r,
-	xfs_exntst_t	v)
-{
-	ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
-	if (v == XFS_EXT_NORM)
-		r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
-	else
-		r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
-}
-
-/*
- * Convert in-memory form of btree root to on-disk form.
- */
-void
-xfs_bmbt_to_bmdr(
-	struct xfs_mount	*mp,
-	struct xfs_btree_block	*rblock,
-	int			rblocklen,
-	xfs_bmdr_block_t	*dblock,
-	int			dblocklen)
-{
-	int			dmxr;
-	xfs_bmbt_key_t		*fkp;
-	__be64			*fpp;
-	xfs_bmbt_key_t		*tkp;
-	__be64			*tpp;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
-		ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
-		ASSERT(rblock->bb_u.l.bb_blkno ==
-		       cpu_to_be64(XFS_BUF_DADDR_NULL));
-	} else
-		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
-	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
-	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
-	ASSERT(rblock->bb_level != 0);
-	dblock->bb_level = rblock->bb_level;
-	dblock->bb_numrecs = rblock->bb_numrecs;
-	dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
-	fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-	tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-	fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-	tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-	dmxr = be16_to_cpu(dblock->bb_numrecs);
-	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
-	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
-}
-
-/*
- * Check extent records, which have just been read, for
- * any bit in the extent flag field. ASSERT on debug
- * kernels, as this condition should not occur.
- * Return an error condition (1) if any flags found,
- * otherwise return 0.
- */
-
-int
-xfs_check_nostate_extents(
-	xfs_ifork_t		*ifp,
-	xfs_extnum_t		idx,
-	xfs_extnum_t		num)
-{
-	for (; num > 0; num--, idx++) {
-		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
-		if ((ep->l0 >>
-		     (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
-			ASSERT(0);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-
-STATIC struct xfs_btree_cur *
-xfs_bmbt_dup_cursor(
-	struct xfs_btree_cur	*cur)
-{
-	struct xfs_btree_cur	*new;
-
-	new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.b.ip, cur->bc_private.b.whichfork);
-
-	/*
-	 * Copy the firstblock, flist, and flags values,
-	 * since init cursor doesn't get them.
-	 */
-	new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
-	new->bc_private.b.flist = cur->bc_private.b.flist;
-	new->bc_private.b.flags = cur->bc_private.b.flags;
-
-	return new;
-}
-
-STATIC void
-xfs_bmbt_update_cursor(
-	struct xfs_btree_cur	*src,
-	struct xfs_btree_cur	*dst)
-{
-	ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
-	       (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
-	ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
-
-	dst->bc_private.b.allocated += src->bc_private.b.allocated;
-	dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
-
-	src->bc_private.b.allocated = 0;
-}
-
-STATIC int
-xfs_bmbt_alloc_block(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*start,
-	union xfs_btree_ptr	*new,
-	int			*stat)
-{
-	xfs_alloc_arg_t		args;		/* block allocation args */
-	int			error;		/* error return value */
-
-	memset(&args, 0, sizeof(args));
-	args.tp = cur->bc_tp;
-	args.mp = cur->bc_mp;
-	args.fsbno = cur->bc_private.b.firstblock;
-	args.firstblock = args.fsbno;
-
-	if (args.fsbno == NULLFSBLOCK) {
-		args.fsbno = be64_to_cpu(start->l);
-		args.type = XFS_ALLOCTYPE_START_BNO;
-		/*
-		 * Make sure there is sufficient room left in the AG to
-		 * complete a full tree split for an extent insert.  If
-		 * we are converting the middle part of an extent then
-		 * we may need space for two tree splits.
-		 *
-		 * We are relying on the caller to make the correct block
-		 * reservation for this operation to succeed.  If the
-		 * reservation amount is insufficient then we may fail a
-		 * block allocation here and corrupt the filesystem.
-		 */
-		args.minleft = xfs_trans_get_block_res(args.tp);
-	} else if (cur->bc_private.b.flist->xbf_low) {
-		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else {
-		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-	}
-
-	args.minlen = args.maxlen = args.prod = 1;
-	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
-	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
-		error = ENOSPC;
-		goto error0;
-	}
-	error = xfs_alloc_vextent(&args);
-	if (error)
-		goto error0;
-
-	if (args.fsbno == NULLFSBLOCK && args.minleft) {
-		/*
-		 * Could not find an AG with enough free space to satisfy
-		 * a full btree split.  Try again without minleft and if
-		 * successful activate the lowspace algorithm.
-		 */
-		args.fsbno = 0;
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		args.minleft = 0;
-		error = xfs_alloc_vextent(&args);
-		if (error)
-			goto error0;
-		cur->bc_private.b.flist->xbf_low = 1;
-	}
-	if (args.fsbno == NULLFSBLOCK) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		*stat = 0;
-		return 0;
-	}
-	ASSERT(args.len == 1);
-	cur->bc_private.b.firstblock = args.fsbno;
-	cur->bc_private.b.allocated++;
-	cur->bc_private.b.ip->i_d.di_nblocks++;
-	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-	xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
-			XFS_TRANS_DQ_BCOUNT, 1L);
-
-	new->l = cpu_to_be64(args.fsbno);
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-
- error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-STATIC int
-xfs_bmbt_free_block(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = cur->bc_mp;
-	struct xfs_inode	*ip = cur->bc_private.b.ip;
-	struct xfs_trans	*tp = cur->bc_tp;
-	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
-
-	xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
-	ip->i_d.di_nblocks--;
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
-	xfs_trans_binval(tp, bp);
-	return 0;
-}
-
-STATIC int
-xfs_bmbt_get_minrecs(
-	struct xfs_btree_cur	*cur,
-	int			level)
-{
-	if (level == cur->bc_nlevels - 1) {
-		struct xfs_ifork	*ifp;
-
-		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
-				    cur->bc_private.b.whichfork);
-
-		return xfs_bmbt_maxrecs(cur->bc_mp,
-					ifp->if_broot_bytes, level == 0) / 2;
-	}
-
-	return cur->bc_mp->m_bmap_dmnr[level != 0];
-}
-
-int
-xfs_bmbt_get_maxrecs(
-	struct xfs_btree_cur	*cur,
-	int			level)
-{
-	if (level == cur->bc_nlevels - 1) {
-		struct xfs_ifork	*ifp;
-
-		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
-				    cur->bc_private.b.whichfork);
-
-		return xfs_bmbt_maxrecs(cur->bc_mp,
-					ifp->if_broot_bytes, level == 0);
-	}
-
-	return cur->bc_mp->m_bmap_dmxr[level != 0];
-
-}
-
-/*
- * Get the maximum records we could store in the on-disk format.
- *
- * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
- * for the root node this checks the available space in the dinode fork
- * so that we can resize the in-memory buffer to match it.  After a
- * resize to the maximum size this function returns the same value
- * as xfs_bmbt_get_maxrecs for the root node, too.
- */
-STATIC int
-xfs_bmbt_get_dmaxrecs(
-	struct xfs_btree_cur	*cur,
-	int			level)
-{
-	if (level != cur->bc_nlevels - 1)
-		return cur->bc_mp->m_bmap_dmxr[level != 0];
-	return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
-}
-
-STATIC void
-xfs_bmbt_init_key_from_rec(
-	union xfs_btree_key	*key,
-	union xfs_btree_rec	*rec)
-{
-	key->bmbt.br_startoff =
-		cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
-}
-
-STATIC void
-xfs_bmbt_init_rec_from_key(
-	union xfs_btree_key	*key,
-	union xfs_btree_rec	*rec)
-{
-	ASSERT(key->bmbt.br_startoff != 0);
-
-	xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
-			       0, 0, XFS_EXT_NORM);
-}
-
-STATIC void
-xfs_bmbt_init_rec_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*rec)
-{
-	xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
-}
-
-STATIC void
-xfs_bmbt_init_ptr_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	ptr->l = 0;
-}
-
-STATIC __int64_t
-xfs_bmbt_key_diff(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*key)
-{
-	return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
-				      cur->bc_rec.b.br_startoff;
-}
-
-static bool
-xfs_bmbt_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	unsigned int		level;
-
-	switch (block->bb_magic) {
-	case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			return false;
-		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
-			return false;
-		/*
-		 * XXX: need a better way of verifying the owner here. Right now
-		 * just make sure there has been one set.
-		 */
-		if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
-			return false;
-		/* fall through */
-	case cpu_to_be32(XFS_BMAP_MAGIC):
-		break;
-	default:
-		return false;
-	}
-
-	/*
-	 * numrecs and level verification.
-	 *
-	 * We don't know what fork we belong to, so just verify that the level
-	 * is less than the maximum of the two. Later checks will be more
-	 * precise.
-	 */
-	level = be16_to_cpu(block->bb_level);
-	if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
-		return false;
-	if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
-		return false;
-
-	/* sibling pointer verification */
-	if (!block->bb_u.l.bb_leftsib ||
-	    (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
-	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
-		return false;
-	if (!block->bb_u.l.bb_rightsib ||
-	    (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
-	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
-		return false;
-
-	return true;
-}
-
-static void
-xfs_bmbt_read_verify(
-	struct xfs_buf	*bp)
-{
-	if (!xfs_btree_lblock_verify_crc(bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_bmbt_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_verifier_error(bp);
-	}
-}
-
-static void
-xfs_bmbt_write_verify(
-	struct xfs_buf	*bp)
-{
-	if (!xfs_bmbt_verify(bp)) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-	xfs_btree_lblock_calc_crc(bp);
-}
-
-const struct xfs_buf_ops xfs_bmbt_buf_ops = {
-	.verify_read = xfs_bmbt_read_verify,
-	.verify_write = xfs_bmbt_write_verify,
-};
-
-
-#if defined(DEBUG) || defined(XFS_WARN)
-STATIC int
-xfs_bmbt_keys_inorder(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*k1,
-	union xfs_btree_key	*k2)
-{
-	return be64_to_cpu(k1->bmbt.br_startoff) <
-		be64_to_cpu(k2->bmbt.br_startoff);
-}
-
-STATIC int
-xfs_bmbt_recs_inorder(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*r1,
-	union xfs_btree_rec	*r2)
-{
-	return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
-		xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
-		xfs_bmbt_disk_get_startoff(&r2->bmbt);
-}
-#endif	/* DEBUG */
-
-static const struct xfs_btree_ops xfs_bmbt_ops = {
-	.rec_len		= sizeof(xfs_bmbt_rec_t),
-	.key_len		= sizeof(xfs_bmbt_key_t),
-
-	.dup_cursor		= xfs_bmbt_dup_cursor,
-	.update_cursor		= xfs_bmbt_update_cursor,
-	.alloc_block		= xfs_bmbt_alloc_block,
-	.free_block		= xfs_bmbt_free_block,
-	.get_maxrecs		= xfs_bmbt_get_maxrecs,
-	.get_minrecs		= xfs_bmbt_get_minrecs,
-	.get_dmaxrecs		= xfs_bmbt_get_dmaxrecs,
-	.init_key_from_rec	= xfs_bmbt_init_key_from_rec,
-	.init_rec_from_key	= xfs_bmbt_init_rec_from_key,
-	.init_rec_from_cur	= xfs_bmbt_init_rec_from_cur,
-	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
-	.key_diff		= xfs_bmbt_key_diff,
-	.buf_ops		= &xfs_bmbt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-	.keys_inorder		= xfs_bmbt_keys_inorder,
-	.recs_inorder		= xfs_bmbt_recs_inorder,
-#endif
-};
-
-/*
- * Allocate a new bmap btree cursor.
- */
-struct xfs_btree_cur *				/* new bmap btree cursor */
-xfs_bmbt_init_cursor(
-	struct xfs_mount	*mp,		/* file system mount point */
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_inode	*ip,		/* inode owning the btree */
-	int			whichfork)	/* data or attr fork */
-{
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
-	struct xfs_btree_cur	*cur;
-
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-
-	cur->bc_tp = tp;
-	cur->bc_mp = mp;
-	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-	cur->bc_btnum = XFS_BTNUM_BMAP;
-	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
-	cur->bc_ops = &xfs_bmbt_ops;
-	cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
-	cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
-	cur->bc_private.b.ip = ip;
-	cur->bc_private.b.firstblock = NULLFSBLOCK;
-	cur->bc_private.b.flist = NULL;
-	cur->bc_private.b.allocated = 0;
-	cur->bc_private.b.flags = 0;
-	cur->bc_private.b.whichfork = whichfork;
-
-	return cur;
-}
-
-/*
- * Calculate number of records in a bmap btree block.
- */
-int
-xfs_bmbt_maxrecs(
-	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
-{
-	blocklen -= XFS_BMBT_BLOCK_LEN(mp);
-
-	if (leaf)
-		return blocklen / sizeof(xfs_bmbt_rec_t);
-	return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
-}
-
-/*
- * Calculate number of records in a bmap btree inode root.
- */
-int
-xfs_bmdr_maxrecs(
-	int			blocklen,
-	int			leaf)
-{
-	blocklen -= sizeof(xfs_bmdr_block_t);
-
-	if (leaf)
-		return blocklen / sizeof(xfs_bmdr_rec_t);
-	return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
-}
-
-/*
- * Change the owner of a btree format fork fo the inode passed in. Change it to
- * the owner of that is passed in so that we can change owners before or after
- * we switch forks between inodes. The operation that the caller is doing will
- * determine whether is needs to change owner before or after the switch.
- *
- * For demand paged transactional modification, the fork switch should be done
- * after reading in all the blocks, modifying them and pinning them in the
- * transaction. For modification when the buffers are already pinned in memory,
- * the fork switch can be done before changing the owner as we won't need to
- * validate the owner until the btree buffers are unpinned and writes can occur
- * again.
- *
- * For recovery based ownership change, there is no transactional context and
- * so a buffer list must be supplied so that we can record the buffers that we
- * modified for the caller to issue IO on.
- */
-int
-xfs_bmbt_change_owner(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	int			whichfork,
-	xfs_ino_t		new_owner,
-	struct list_head	*buffer_list)
-{
-	struct xfs_btree_cur	*cur;
-	int			error;
-
-	ASSERT(tp || buffer_list);
-	ASSERT(!(tp && buffer_list));
-	if (whichfork == XFS_DATA_FORK)
-		ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
-	else
-		ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
-
-	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
-	if (!cur)
-		return ENOMEM;
-
-	error = xfs_btree_change_owner(cur, new_owner, buffer_list);
-	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-	return error;
-}
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
deleted file mode 100644
index 036b4fd34bf7..000000000000
--- a/fs/xfs/xfs_btree.c
+++ /dev/null
@@ -1,3989 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_buf_item.h"
-#include "xfs_btree.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-
-/*
- * Cursor allocation zone.
- */
-kmem_zone_t	*xfs_btree_cur_zone;
-
-/*
- * Btree magic numbers.
- */
-static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
-	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
-	  XFS_FIBT_MAGIC },
-	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
-	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
-};
-#define xfs_btree_magic(cur) \
-	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
-
-
-STATIC int				/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	struct xfs_btree_block	*block,	/* btree long form block pointer */
-	int			level,	/* level of the btree block */
-	struct xfs_buf		*bp)	/* buffer for block, if any */
-{
-	int			lblock_ok = 1; /* block passes checks */
-	struct xfs_mount	*mp;	/* file system mount point */
-
-	mp = cur->bc_mp;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		lblock_ok = lblock_ok &&
-			uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
-			block->bb_u.l.bb_blkno == cpu_to_be64(
-				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
-	}
-
-	lblock_ok = lblock_ok &&
-		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
-		be16_to_cpu(block->bb_level) == level &&
-		be16_to_cpu(block->bb_numrecs) <=
-			cur->bc_ops->get_maxrecs(cur, level) &&
-		block->bb_u.l.bb_leftsib &&
-		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
-		 XFS_FSB_SANITY_CHECK(mp,
-			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
-		block->bb_u.l.bb_rightsib &&
-		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
-		 XFS_FSB_SANITY_CHECK(mp,
-			be64_to_cpu(block->bb_u.l.bb_rightsib)));
-
-	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
-			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
-			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
-		if (bp)
-			trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-	return 0;
-}
-
-STATIC int				/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	struct xfs_btree_block	*block,	/* btree short form block pointer */
-	int			level,	/* level of the btree block */
-	struct xfs_buf		*bp)	/* buffer containing block */
-{
-	struct xfs_mount	*mp;	/* file system mount point */
-	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
-	struct xfs_agf		*agf;	/* ag. freespace structure */
-	xfs_agblock_t		agflen;	/* native ag. freespace length */
-	int			sblock_ok = 1; /* block passes checks */
-
-	mp = cur->bc_mp;
-	agbp = cur->bc_private.a.agbp;
-	agf = XFS_BUF_TO_AGF(agbp);
-	agflen = be32_to_cpu(agf->agf_length);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		sblock_ok = sblock_ok &&
-			uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
-			block->bb_u.s.bb_blkno == cpu_to_be64(
-				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
-	}
-
-	sblock_ok = sblock_ok &&
-		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
-		be16_to_cpu(block->bb_level) == level &&
-		be16_to_cpu(block->bb_numrecs) <=
-			cur->bc_ops->get_maxrecs(cur, level) &&
-		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
-		 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
-		block->bb_u.s.bb_leftsib &&
-		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
-		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
-		block->bb_u.s.bb_rightsib;
-
-	if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
-			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
-			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
-		if (bp)
-			trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-	return 0;
-}
-
-/*
- * Debug routine: check that block header is ok.
- */
-int
-xfs_btree_check_block(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	struct xfs_btree_block	*block,	/* generic btree block pointer */
-	int			level,	/* level of the btree block */
-	struct xfs_buf		*bp)	/* buffer containing block, if any */
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return xfs_btree_check_lblock(cur, block, level, bp);
-	else
-		return xfs_btree_check_sblock(cur, block, level, bp);
-}
-
-/*
- * Check that (long) pointer is ok.
- */
-int					/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_dfsbno_t		bno,	/* btree block disk address */
-	int			level)	/* btree block level */
-{
-	XFS_WANT_CORRUPTED_RETURN(
-		level > 0 &&
-		bno != NULLDFSBNO &&
-		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
-	return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that (short) pointer is ok.
- */
-STATIC int				/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* btree block disk address */
-	int			level)	/* btree block level */
-{
-	xfs_agblock_t		agblocks = cur->bc_mp->m_sb.sb_agblocks;
-
-	XFS_WANT_CORRUPTED_RETURN(
-		level > 0 &&
-		bno != NULLAGBLOCK &&
-		bno != 0 &&
-		bno < agblocks);
-	return 0;
-}
-
-/*
- * Check that block ptr is ok.
- */
-STATIC int				/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_ptr(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	union xfs_btree_ptr	*ptr,	/* btree block disk address */
-	int			index,	/* offset from ptr to check */
-	int			level)	/* btree block level */
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		return xfs_btree_check_lptr(cur,
-				be64_to_cpu((&ptr->l)[index]), level);
-	} else {
-		return xfs_btree_check_sptr(cur,
-				be32_to_cpu((&ptr->s)[index]), level);
-	}
-}
-#endif
-
-/*
- * Calculate CRC on the whole btree block and stuff it into the
- * long-form btree header.
- *
- * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
- * it to disk.
- */
-void
-xfs_btree_lblock_calc_crc(
-	struct xfs_buf		*bp)
-{
-	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-
-	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-		return;
-	if (bip)
-		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
-}
-
-bool
-xfs_btree_lblock_verify_crc(
-	struct xfs_buf		*bp)
-{
-	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
-
-	return true;
-}
-
-/*
- * Calculate CRC on the whole btree block and stuff it into the
- * short-form btree header.
- *
- * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
- * it to disk.
- */
-void
-xfs_btree_sblock_calc_crc(
-	struct xfs_buf		*bp)
-{
-	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-
-	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-		return;
-	if (bip)
-		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
-}
-
-bool
-xfs_btree_sblock_verify_crc(
-	struct xfs_buf		*bp)
-{
-	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
-		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
-
-	return true;
-}
-
-/*
- * Delete the btree cursor.
- */
-void
-xfs_btree_del_cursor(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	int		error)		/* del because of error */
-{
-	int		i;		/* btree level */
-
-	/*
-	 * Clear the buffer pointers, and release the buffers.
-	 * If we're doing this in the face of an error, we
-	 * need to make sure to inspect all of the entries
-	 * in the bc_bufs array for buffers to be unlocked.
-	 * This is because some of the btree code works from
-	 * level n down to 0, and if we get an error along
-	 * the way we won't have initialized all the entries
-	 * down to 0.
-	 */
-	for (i = 0; i < cur->bc_nlevels; i++) {
-		if (cur->bc_bufs[i])
-			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
-		else if (!error)
-			break;
-	}
-	/*
-	 * Can't free a bmap cursor without having dealt with the
-	 * allocated indirect blocks' accounting.
-	 */
-	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
-	       cur->bc_private.b.allocated == 0);
-	/*
-	 * Free the cursor.
-	 */
-	kmem_zone_free(xfs_btree_cur_zone, cur);
-}
-
-/*
- * Duplicate the btree cursor.
- * Allocate a new one, copy the record, re-get the buffers.
- */
-int					/* error */
-xfs_btree_dup_cursor(
-	xfs_btree_cur_t	*cur,		/* input cursor */
-	xfs_btree_cur_t	**ncur)		/* output cursor */
-{
-	xfs_buf_t	*bp;		/* btree block's buffer pointer */
-	int		error;		/* error return value */
-	int		i;		/* level number of btree block */
-	xfs_mount_t	*mp;		/* mount structure for filesystem */
-	xfs_btree_cur_t	*new;		/* new cursor value */
-	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
-
-	tp = cur->bc_tp;
-	mp = cur->bc_mp;
-
-	/*
-	 * Allocate a new cursor like the old one.
-	 */
-	new = cur->bc_ops->dup_cursor(cur);
-
-	/*
-	 * Copy the record currently in the cursor.
-	 */
-	new->bc_rec = cur->bc_rec;
-
-	/*
-	 * For each level current, re-get the buffer and copy the ptr value.
-	 */
-	for (i = 0; i < new->bc_nlevels; i++) {
-		new->bc_ptrs[i] = cur->bc_ptrs[i];
-		new->bc_ra[i] = cur->bc_ra[i];
-		bp = cur->bc_bufs[i];
-		if (bp) {
-			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-						   XFS_BUF_ADDR(bp), mp->m_bsize,
-						   0, &bp,
-						   cur->bc_ops->buf_ops);
-			if (error) {
-				xfs_btree_del_cursor(new, error);
-				*ncur = NULL;
-				return error;
-			}
-		}
-		new->bc_bufs[i] = bp;
-	}
-	*ncur = new;
-	return 0;
-}
-
-/*
- * XFS btree block layout and addressing:
- *
- * There are two types of blocks in the btree: leaf and non-leaf blocks.
- *
- * The leaf record start with a header then followed by records containing
- * the values.  A non-leaf block also starts with the same header, and
- * then first contains lookup keys followed by an equal number of pointers
- * to the btree blocks at the previous level.
- *
- *		+--------+-------+-------+-------+-------+-------+-------+
- * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
- *		+--------+-------+-------+-------+-------+-------+-------+
- *
- *		+--------+-------+-------+-------+-------+-------+-------+
- * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
- *		+--------+-------+-------+-------+-------+-------+-------+
- *
- * The header is called struct xfs_btree_block for reasons better left unknown
- * and comes in different versions for short (32bit) and long (64bit) block
- * pointers.  The record and key structures are defined by the btree instances
- * and opaque to the btree core.  The block pointers are simple disk endian
- * integers, available in a short (32bit) and long (64bit) variant.
- *
- * The helpers below calculate the offset of a given record, key or pointer
- * into a btree block (xfs_btree_*_offset) or return a pointer to the given
- * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
- * inside the btree block is done using indices starting at one, not zero!
- */
-
-/*
- * Return size of the btree block header for this btree instance.
- */
-static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
-			return XFS_BTREE_LBLOCK_CRC_LEN;
-		return XFS_BTREE_LBLOCK_LEN;
-	}
-	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
-		return XFS_BTREE_SBLOCK_CRC_LEN;
-	return XFS_BTREE_SBLOCK_LEN;
-}
-
-/*
- * Return size of btree block pointers for this btree instance.
- */
-static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
-{
-	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
-		sizeof(__be64) : sizeof(__be32);
-}
-
-/*
- * Calculate offset of the n-th record in a btree block.
- */
-STATIC size_t
-xfs_btree_rec_offset(
-	struct xfs_btree_cur	*cur,
-	int			n)
-{
-	return xfs_btree_block_len(cur) +
-		(n - 1) * cur->bc_ops->rec_len;
-}
-
-/*
- * Calculate offset of the n-th key in a btree block.
- */
-STATIC size_t
-xfs_btree_key_offset(
-	struct xfs_btree_cur	*cur,
-	int			n)
-{
-	return xfs_btree_block_len(cur) +
-		(n - 1) * cur->bc_ops->key_len;
-}
-
-/*
- * Calculate offset of the n-th block pointer in a btree block.
- */
-STATIC size_t
-xfs_btree_ptr_offset(
-	struct xfs_btree_cur	*cur,
-	int			n,
-	int			level)
-{
-	return xfs_btree_block_len(cur) +
-		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
-		(n - 1) * xfs_btree_ptr_len(cur);
-}
-
-/*
- * Return a pointer to the n-th record in the btree block.
- */
-STATIC union xfs_btree_rec *
-xfs_btree_rec_addr(
-	struct xfs_btree_cur	*cur,
-	int			n,
-	struct xfs_btree_block	*block)
-{
-	return (union xfs_btree_rec *)
-		((char *)block + xfs_btree_rec_offset(cur, n));
-}
-
-/*
- * Return a pointer to the n-th key in the btree block.
- */
-STATIC union xfs_btree_key *
-xfs_btree_key_addr(
-	struct xfs_btree_cur	*cur,
-	int			n,
-	struct xfs_btree_block	*block)
-{
-	return (union xfs_btree_key *)
-		((char *)block + xfs_btree_key_offset(cur, n));
-}
-
-/*
- * Return a pointer to the n-th block pointer in the btree block.
- */
-STATIC union xfs_btree_ptr *
-xfs_btree_ptr_addr(
-	struct xfs_btree_cur	*cur,
-	int			n,
-	struct xfs_btree_block	*block)
-{
-	int			level = xfs_btree_get_level(block);
-
-	ASSERT(block->bb_level != 0);
-
-	return (union xfs_btree_ptr *)
-		((char *)block + xfs_btree_ptr_offset(cur, n, level));
-}
-
-/*
- * Get the root block which is stored in the inode.
- *
- * For now this btree implementation assumes the btree root is always
- * stored in the if_broot field of an inode fork.
- */
-STATIC struct xfs_btree_block *
-xfs_btree_get_iroot(
-       struct xfs_btree_cur    *cur)
-{
-       struct xfs_ifork        *ifp;
-
-       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
-       return (struct xfs_btree_block *)ifp->if_broot;
-}
-
-/*
- * Retrieve the block pointer from the cursor at the given level.
- * This may be an inode btree root or from a buffer.
- */
-STATIC struct xfs_btree_block *		/* generic btree block pointer */
-xfs_btree_get_block(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	int			level,	/* level in btree */
-	struct xfs_buf		**bpp)	/* buffer containing the block */
-{
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level == cur->bc_nlevels - 1)) {
-		*bpp = NULL;
-		return xfs_btree_get_iroot(cur);
-	}
-
-	*bpp = cur->bc_bufs[level];
-	return XFS_BUF_TO_BLOCK(*bpp);
-}
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Long-form addressing.
- */
-xfs_buf_t *				/* buffer for fsbno */
-xfs_btree_get_bufl(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_fsblock_t	fsbno,		/* file system block number */
-	uint		lock)		/* lock flags for get_buf */
-{
-	xfs_daddr_t		d;		/* real disk block address */
-
-	ASSERT(fsbno != NULLFSBLOCK);
-	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
-}
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Short-form addressing.
- */
-xfs_buf_t *				/* buffer for agno/agbno */
-xfs_btree_get_bufs(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* allocation group block number */
-	uint		lock)		/* lock flags for get_buf */
-{
-	xfs_daddr_t		d;		/* real disk block address */
-
-	ASSERT(agno != NULLAGNUMBER);
-	ASSERT(agbno != NULLAGBLOCK);
-	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
-}
-
-/*
- * Check for the cursor referring to the last block at the given level.
- */
-int					/* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level)	/* level to check */
-{
-	struct xfs_btree_block	*block;	/* generic btree block pointer */
-	xfs_buf_t		*bp;	/* buffer containing block */
-
-	block = xfs_btree_get_block(cur, level, &bp);
-	xfs_btree_check_block(cur, block, level, bp);
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
-	else
-		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
-}
-
-/*
- * Change the cursor to point to the first record at the given level.
- * Other levels are unaffected.
- */
-STATIC int				/* success=1, failure=0 */
-xfs_btree_firstrec(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level)	/* level to change */
-{
-	struct xfs_btree_block	*block;	/* generic btree block pointer */
-	xfs_buf_t		*bp;	/* buffer containing block */
-
-	/*
-	 * Get the block pointer for this level.
-	 */
-	block = xfs_btree_get_block(cur, level, &bp);
-	xfs_btree_check_block(cur, block, level, bp);
-	/*
-	 * It's empty, there is no such record.
-	 */
-	if (!block->bb_numrecs)
-		return 0;
-	/*
-	 * Set the ptr value to 1, that's the first record/key.
-	 */
-	cur->bc_ptrs[level] = 1;
-	return 1;
-}
-
-/*
- * Change the cursor to point to the last record in the current block
- * at the given level.  Other levels are unaffected.
- */
-STATIC int				/* success=1, failure=0 */
-xfs_btree_lastrec(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level)	/* level to change */
-{
-	struct xfs_btree_block	*block;	/* generic btree block pointer */
-	xfs_buf_t		*bp;	/* buffer containing block */
-
-	/*
-	 * Get the block pointer for this level.
-	 */
-	block = xfs_btree_get_block(cur, level, &bp);
-	xfs_btree_check_block(cur, block, level, bp);
-	/*
-	 * It's empty, there is no such record.
-	 */
-	if (!block->bb_numrecs)
-		return 0;
-	/*
-	 * Set the ptr value to numrecs, that's the last record/key.
-	 */
-	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
-	return 1;
-}
-
-/*
- * Compute first and last byte offsets for the fields given.
- * Interprets the offsets table, which contains struct field offsets.
- */
-void
-xfs_btree_offsets(
-	__int64_t	fields,		/* bitmask of fields */
-	const short	*offsets,	/* table of field offsets */
-	int		nbits,		/* number of bits to inspect */
-	int		*first,		/* output: first byte offset */
-	int		*last)		/* output: last byte offset */
-{
-	int		i;		/* current bit number */
-	__int64_t	imask;		/* mask for current bit number */
-
-	ASSERT(fields != 0);
-	/*
-	 * Find the lowest bit, so the first byte offset.
-	 */
-	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
-		if (imask & fields) {
-			*first = offsets[i];
-			break;
-		}
-	}
-	/*
-	 * Find the highest bit, so the last byte offset.
-	 */
-	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
-		if (imask & fields) {
-			*last = offsets[i + 1] - 1;
-			break;
-		}
-	}
-}
-
-/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int
-xfs_btree_read_bufl(
-	struct xfs_mount	*mp,		/* file system mount point */
-	struct xfs_trans	*tp,		/* transaction pointer */
-	xfs_fsblock_t		fsbno,		/* file system block number */
-	uint			lock,		/* lock flags for read_buf */
-	struct xfs_buf		**bpp,		/* buffer for fsbno */
-	int			refval,		/* ref count value for buffer */
-	const struct xfs_buf_ops *ops)
-{
-	struct xfs_buf		*bp;		/* return value */
-	xfs_daddr_t		d;		/* real disk block address */
-	int			error;
-
-	ASSERT(fsbno != NULLFSBLOCK);
-	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, lock, &bp, ops);
-	if (error)
-		return error;
-	if (bp)
-		xfs_buf_set_ref(bp, refval);
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufl(
-	struct xfs_mount	*mp,		/* file system mount point */
-	xfs_fsblock_t		fsbno,		/* file system block number */
-	xfs_extlen_t		count,		/* count of filesystem blocks */
-	const struct xfs_buf_ops *ops)
-{
-	xfs_daddr_t		d;
-
-	ASSERT(fsbno != NULLFSBLOCK);
-	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufs(
-	struct xfs_mount	*mp,		/* file system mount point */
-	xfs_agnumber_t		agno,		/* allocation group number */
-	xfs_agblock_t		agbno,		/* allocation group block number */
-	xfs_extlen_t		count,		/* count of filesystem blocks */
-	const struct xfs_buf_ops *ops)
-{
-	xfs_daddr_t		d;
-
-	ASSERT(agno != NULLAGNUMBER);
-	ASSERT(agbno != NULLAGBLOCK);
-	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
-
-STATIC int
-xfs_btree_readahead_lblock(
-	struct xfs_btree_cur	*cur,
-	int			lr,
-	struct xfs_btree_block	*block)
-{
-	int			rval = 0;
-	xfs_dfsbno_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
-	xfs_dfsbno_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
-
-	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
-		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
-				     cur->bc_ops->buf_ops);
-		rval++;
-	}
-
-	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
-		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
-				     cur->bc_ops->buf_ops);
-		rval++;
-	}
-
-	return rval;
-}
-
-STATIC int
-xfs_btree_readahead_sblock(
-	struct xfs_btree_cur	*cur,
-	int			lr,
-	struct xfs_btree_block *block)
-{
-	int			rval = 0;
-	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
-	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
-
-
-	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
-		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				     left, 1, cur->bc_ops->buf_ops);
-		rval++;
-	}
-
-	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
-		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				     right, 1, cur->bc_ops->buf_ops);
-		rval++;
-	}
-
-	return rval;
-}
-
-/*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
- */
-STATIC int
-xfs_btree_readahead(
-	struct xfs_btree_cur	*cur,		/* btree cursor */
-	int			lev,		/* level in btree */
-	int			lr)		/* left/right bits */
-{
-	struct xfs_btree_block	*block;
-
-	/*
-	 * No readahead needed if we are at the root level and the
-	 * btree root is stored in the inode.
-	 */
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (lev == cur->bc_nlevels - 1))
-		return 0;
-
-	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
-		return 0;
-
-	cur->bc_ra[lev] |= lr;
-	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
-
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return xfs_btree_readahead_lblock(cur, lr, block);
-	return xfs_btree_readahead_sblock(cur, lr, block);
-}
-
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
-		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
-	} else {
-		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
-		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
-		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
-					be32_to_cpu(ptr->s));
-	}
-}
-
-/*
- * Readahead @count btree blocks at the given @ptr location.
- *
- * We don't need to care about long or short form btrees here as we have a
- * method of converting the ptr directly to a daddr available to us.
- */
-STATIC void
-xfs_btree_readahead_ptr(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr,
-	xfs_extlen_t		count)
-{
-	xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
-			  xfs_btree_ptr_to_daddr(cur, ptr),
-			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
-}
-
-/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
- */
-STATIC void
-xfs_btree_setbuf(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			lev,	/* level in btree */
-	xfs_buf_t		*bp)	/* new buffer to set */
-{
-	struct xfs_btree_block	*b;	/* btree block */
-
-	if (cur->bc_bufs[lev])
-		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
-	cur->bc_bufs[lev] = bp;
-	cur->bc_ra[lev] = 0;
-
-	b = XFS_BUF_TO_BLOCK(bp);
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
-			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
-		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
-			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
-	} else {
-		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
-			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
-		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
-			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
-	}
-}
-
-STATIC int
-xfs_btree_ptr_is_null(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return ptr->l == cpu_to_be64(NULLDFSBNO);
-	else
-		return ptr->s == cpu_to_be32(NULLAGBLOCK);
-}
-
-STATIC void
-xfs_btree_set_ptr_null(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		ptr->l = cpu_to_be64(NULLDFSBNO);
-	else
-		ptr->s = cpu_to_be32(NULLAGBLOCK);
-}
-
-/*
- * Get/set/init sibling pointers
- */
-STATIC void
-xfs_btree_get_sibling(
-	struct xfs_btree_cur	*cur,
-	struct xfs_btree_block	*block,
-	union xfs_btree_ptr	*ptr,
-	int			lr)
-{
-	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
-
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (lr == XFS_BB_RIGHTSIB)
-			ptr->l = block->bb_u.l.bb_rightsib;
-		else
-			ptr->l = block->bb_u.l.bb_leftsib;
-	} else {
-		if (lr == XFS_BB_RIGHTSIB)
-			ptr->s = block->bb_u.s.bb_rightsib;
-		else
-			ptr->s = block->bb_u.s.bb_leftsib;
-	}
-}
-
-STATIC void
-xfs_btree_set_sibling(
-	struct xfs_btree_cur	*cur,
-	struct xfs_btree_block	*block,
-	union xfs_btree_ptr	*ptr,
-	int			lr)
-{
-	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
-
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (lr == XFS_BB_RIGHTSIB)
-			block->bb_u.l.bb_rightsib = ptr->l;
-		else
-			block->bb_u.l.bb_leftsib = ptr->l;
-	} else {
-		if (lr == XFS_BB_RIGHTSIB)
-			block->bb_u.s.bb_rightsib = ptr->s;
-		else
-			block->bb_u.s.bb_leftsib = ptr->s;
-	}
-}
-
-void
-xfs_btree_init_block_int(
-	struct xfs_mount	*mp,
-	struct xfs_btree_block	*buf,
-	xfs_daddr_t		blkno,
-	__u32			magic,
-	__u16			level,
-	__u16			numrecs,
-	__u64			owner,
-	unsigned int		flags)
-{
-	buf->bb_magic = cpu_to_be32(magic);
-	buf->bb_level = cpu_to_be16(level);
-	buf->bb_numrecs = cpu_to_be16(numrecs);
-
-	if (flags & XFS_BTREE_LONG_PTRS) {
-		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-		if (flags & XFS_BTREE_CRC_BLOCKS) {
-			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
-			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
-			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
-			buf->bb_u.l.bb_pad = 0;
-			buf->bb_u.l.bb_lsn = 0;
-		}
-	} else {
-		/* owner is a 32 bit value on short blocks */
-		__u32 __owner = (__u32)owner;
-
-		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		if (flags & XFS_BTREE_CRC_BLOCKS) {
-			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
-			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
-			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
-			buf->bb_u.s.bb_lsn = 0;
-		}
-	}
-}
-
-void
-xfs_btree_init_block(
-	struct xfs_mount *mp,
-	struct xfs_buf	*bp,
-	__u32		magic,
-	__u16		level,
-	__u16		numrecs,
-	__u64		owner,
-	unsigned int	flags)
-{
-	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-				 magic, level, numrecs, owner, flags);
-}
-
-STATIC void
-xfs_btree_init_block_cur(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			level,
-	int			numrecs)
-{
-	__u64 owner;
-
-	/*
-	 * we can pull the owner from the cursor right now as the different
-	 * owners align directly with the pointer size of the btree. This may
-	 * change in future, but is safe for current users of the generic btree
-	 * code.
-	 */
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		owner = cur->bc_private.b.ip->i_ino;
-	else
-		owner = cur->bc_private.a.agno;
-
-	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-				 xfs_btree_magic(cur), level, numrecs,
-				 owner, cur->bc_flags);
-}
-
-/*
- * Return true if ptr is the last record in the btree and
- * we need to track updates to this record.  The decision
- * will be further refined in the update_lastrec method.
- */
-STATIC int
-xfs_btree_is_lastrec(
-	struct xfs_btree_cur	*cur,
-	struct xfs_btree_block	*block,
-	int			level)
-{
-	union xfs_btree_ptr	ptr;
-
-	if (level > 0)
-		return 0;
-	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
-		return 0;
-
-	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-	if (!xfs_btree_ptr_is_null(cur, &ptr))
-		return 0;
-	return 1;
-}
-
-STATIC void
-xfs_btree_buf_to_ptr(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	union xfs_btree_ptr	*ptr)
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
-					XFS_BUF_ADDR(bp)));
-	else {
-		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
-					XFS_BUF_ADDR(bp)));
-	}
-}
-
-STATIC void
-xfs_btree_set_refs(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp)
-{
-	switch (cur->bc_btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
-		break;
-	case XFS_BTNUM_INO:
-	case XFS_BTNUM_FINO:
-		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
-		break;
-	case XFS_BTNUM_BMAP:
-		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
-		break;
-	default:
-		ASSERT(0);
-	}
-}
-
-STATIC int
-xfs_btree_get_buf_block(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr,
-	int			flags,
-	struct xfs_btree_block	**block,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_daddr_t		d;
-
-	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XBF_TRYLOCK));
-
-	d = xfs_btree_ptr_to_daddr(cur, ptr);
-	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
-				 mp->m_bsize, flags);
-
-	if (!*bpp)
-		return ENOMEM;
-
-	(*bpp)->b_ops = cur->bc_ops->buf_ops;
-	*block = XFS_BUF_TO_BLOCK(*bpp);
-	return 0;
-}
-
-/*
- * Read in the buffer at the given ptr and return the buffer and
- * the block pointer within the buffer.
- */
-STATIC int
-xfs_btree_read_buf_block(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr,
-	int			flags,
-	struct xfs_btree_block	**block,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_daddr_t		d;
-	int			error;
-
-	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XBF_TRYLOCK));
-
-	d = xfs_btree_ptr_to_daddr(cur, ptr);
-	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, flags, bpp,
-				   cur->bc_ops->buf_ops);
-	if (error)
-		return error;
-
-	xfs_btree_set_refs(cur, *bpp);
-	*block = XFS_BUF_TO_BLOCK(*bpp);
-	return 0;
-}
-
-/*
- * Copy keys from one btree block to another.
- */
-STATIC void
-xfs_btree_copy_keys(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*dst_key,
-	union xfs_btree_key	*src_key,
-	int			numkeys)
-{
-	ASSERT(numkeys >= 0);
-	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
-}
-
-/*
- * Copy records from one btree block to another.
- */
-STATIC void
-xfs_btree_copy_recs(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*dst_rec,
-	union xfs_btree_rec	*src_rec,
-	int			numrecs)
-{
-	ASSERT(numrecs >= 0);
-	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
-}
-
-/*
- * Copy block pointers from one btree block to another.
- */
-STATIC void
-xfs_btree_copy_ptrs(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*dst_ptr,
-	union xfs_btree_ptr	*src_ptr,
-	int			numptrs)
-{
-	ASSERT(numptrs >= 0);
-	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
-}
-
-/*
- * Shift keys one index left/right inside a single btree block.
- */
-STATIC void
-xfs_btree_shift_keys(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*key,
-	int			dir,
-	int			numkeys)
-{
-	char			*dst_key;
-
-	ASSERT(numkeys >= 0);
-	ASSERT(dir == 1 || dir == -1);
-
-	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
-	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
-}
-
-/*
- * Shift records one index left/right inside a single btree block.
- */
-STATIC void
-xfs_btree_shift_recs(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*rec,
-	int			dir,
-	int			numrecs)
-{
-	char			*dst_rec;
-
-	ASSERT(numrecs >= 0);
-	ASSERT(dir == 1 || dir == -1);
-
-	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
-	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
-}
-
-/*
- * Shift block pointers one index left/right inside a single btree block.
- */
-STATIC void
-xfs_btree_shift_ptrs(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr,
-	int			dir,
-	int			numptrs)
-{
-	char			*dst_ptr;
-
-	ASSERT(numptrs >= 0);
-	ASSERT(dir == 1 || dir == -1);
-
-	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
-	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
-}
-
-/*
- * Log key values from the btree block.
- */
-STATIC void
-xfs_btree_log_keys(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			first,
-	int			last)
-{
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
-
-	if (bp) {
-		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-		xfs_trans_log_buf(cur->bc_tp, bp,
-				  xfs_btree_key_offset(cur, first),
-				  xfs_btree_key_offset(cur, last + 1) - 1);
-	} else {
-		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Log record values from the btree block.
- */
-void
-xfs_btree_log_recs(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			first,
-	int			last)
-{
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
-
-	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-	xfs_trans_log_buf(cur->bc_tp, bp,
-			  xfs_btree_rec_offset(cur, first),
-			  xfs_btree_rec_offset(cur, last + 1) - 1);
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
-STATIC void
-xfs_btree_log_ptrs(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	struct xfs_buf		*bp,	/* buffer containing btree block */
-	int			first,	/* index of first pointer to log */
-	int			last)	/* index of last pointer to log */
-{
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
-
-	if (bp) {
-		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-		int			level = xfs_btree_get_level(block);
-
-		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-		xfs_trans_log_buf(cur->bc_tp, bp,
-				xfs_btree_ptr_offset(cur, first, level),
-				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
-	} else {
-		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Log fields from a btree block header.
- */
-void
-xfs_btree_log_block(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	struct xfs_buf		*bp,	/* buffer containing btree block */
-	int			fields)	/* mask of fields: XFS_BB_... */
-{
-	int			first;	/* first byte offset logged */
-	int			last;	/* last byte offset logged */
-	static const short	soffsets[] = {	/* table of offsets (short) */
-		offsetof(struct xfs_btree_block, bb_magic),
-		offsetof(struct xfs_btree_block, bb_level),
-		offsetof(struct xfs_btree_block, bb_numrecs),
-		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
-		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
-		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
-		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
-		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
-		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
-		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
-		XFS_BTREE_SBLOCK_CRC_LEN
-	};
-	static const short	loffsets[] = {	/* table of offsets (long) */
-		offsetof(struct xfs_btree_block, bb_magic),
-		offsetof(struct xfs_btree_block, bb_level),
-		offsetof(struct xfs_btree_block, bb_numrecs),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
-		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
-		XFS_BTREE_LBLOCK_CRC_LEN
-	};
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
-
-	if (bp) {
-		int nbits;
-
-		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
-			/*
-			 * We don't log the CRC when updating a btree
-			 * block but instead recreate it during log
-			 * recovery.  As the log buffers have checksums
-			 * of their own this is safe and avoids logging a crc
-			 * update in a lot of places.
-			 */
-			if (fields == XFS_BB_ALL_BITS)
-				fields = XFS_BB_ALL_BITS_CRC;
-			nbits = XFS_BB_NUM_BITS_CRC;
-		} else {
-			nbits = XFS_BB_NUM_BITS;
-		}
-		xfs_btree_offsets(fields,
-				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
-					loffsets : soffsets,
-				  nbits, &first, &last);
-		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
-		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-	} else {
-		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-}
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int						/* error */
-xfs_btree_increment(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	int			*stat)		/* success/failure */
-{
-	struct xfs_btree_block	*block;
-	union xfs_btree_ptr	ptr;
-	struct xfs_buf		*bp;
-	int			error;		/* error return value */
-	int			lev;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGI(cur, level);
-
-	ASSERT(level < cur->bc_nlevels);
-
-	/* Read-ahead to the right at this level. */
-	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-
-	/* Get a pointer to the btree block. */
-	block = xfs_btree_get_block(cur, level, &bp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, level, bp);
-	if (error)
-		goto error0;
-#endif
-
-	/* We're done if we remain in the block after the increment. */
-	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
-		goto out1;
-
-	/* Fail if we just went off the right edge of the tree. */
-	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-	if (xfs_btree_ptr_is_null(cur, &ptr))
-		goto out0;
-
-	XFS_BTREE_STATS_INC(cur, increment);
-
-	/*
-	 * March up the tree incrementing pointers.
-	 * Stop when we don't go off the right edge of a block.
-	 */
-	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-		block = xfs_btree_get_block(cur, lev, &bp);
-
-#ifdef DEBUG
-		error = xfs_btree_check_block(cur, block, lev, bp);
-		if (error)
-			goto error0;
-#endif
-
-		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
-			break;
-
-		/* Read-ahead the right block for the next loop. */
-		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
-	}
-
-	/*
-	 * If we went off the root then we are either seriously
-	 * confused or have the tree root in an inode.
-	 */
-	if (lev == cur->bc_nlevels) {
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
-			goto out0;
-		ASSERT(0);
-		error = EFSCORRUPTED;
-		goto error0;
-	}
-	ASSERT(lev < cur->bc_nlevels);
-
-	/*
-	 * Now walk back down the tree, fixing up the cursor's buffer
-	 * pointers and key numbers.
-	 */
-	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
-		union xfs_btree_ptr	*ptrp;
-
-		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
-		--lev;
-		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
-		if (error)
-			goto error0;
-
-		xfs_btree_setbuf(cur, lev, bp);
-		cur->bc_ptrs[lev] = 1;
-	}
-out1:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-
-out0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 0;
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int						/* error */
-xfs_btree_decrement(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	int			*stat)		/* success/failure */
-{
-	struct xfs_btree_block	*block;
-	xfs_buf_t		*bp;
-	int			error;		/* error return value */
-	int			lev;
-	union xfs_btree_ptr	ptr;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGI(cur, level);
-
-	ASSERT(level < cur->bc_nlevels);
-
-	/* Read-ahead to the left at this level. */
-	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-
-	/* We're done if we remain in the block after the decrement. */
-	if (--cur->bc_ptrs[level] > 0)
-		goto out1;
-
-	/* Get a pointer to the btree block. */
-	block = xfs_btree_get_block(cur, level, &bp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, level, bp);
-	if (error)
-		goto error0;
-#endif
-
-	/* Fail if we just went off the left edge of the tree. */
-	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
-	if (xfs_btree_ptr_is_null(cur, &ptr))
-		goto out0;
-
-	XFS_BTREE_STATS_INC(cur, decrement);
-
-	/*
-	 * March up the tree decrementing pointers.
-	 * Stop when we don't go off the left edge of a block.
-	 */
-	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-		if (--cur->bc_ptrs[lev] > 0)
-			break;
-		/* Read-ahead the left block for the next loop. */
-		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-	}
-
-	/*
-	 * If we went off the root then we are seriously confused.
-	 * or the root of the tree is in an inode.
-	 */
-	if (lev == cur->bc_nlevels) {
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
-			goto out0;
-		ASSERT(0);
-		error = EFSCORRUPTED;
-		goto error0;
-	}
-	ASSERT(lev < cur->bc_nlevels);
-
-	/*
-	 * Now walk back down the tree, fixing up the cursor's buffer
-	 * pointers and key numbers.
-	 */
-	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
-		union xfs_btree_ptr	*ptrp;
-
-		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
-		--lev;
-		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
-		if (error)
-			goto error0;
-		xfs_btree_setbuf(cur, lev, bp);
-		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
-	}
-out1:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-
-out0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 0;
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-STATIC int
-xfs_btree_lookup_get_block(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	int			level,	/* level in the btree */
-	union xfs_btree_ptr	*pp,	/* ptr to btree block */
-	struct xfs_btree_block	**blkp) /* return btree block */
-{
-	struct xfs_buf		*bp;	/* buffer pointer for btree block */
-	int			error = 0;
-
-	/* special case the root block if in an inode */
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level == cur->bc_nlevels - 1)) {
-		*blkp = xfs_btree_get_iroot(cur);
-		return 0;
-	}
-
-	/*
-	 * If the old buffer at this level for the disk address we are
-	 * looking for re-use it.
-	 *
-	 * Otherwise throw it away and get a new one.
-	 */
-	bp = cur->bc_bufs[level];
-	if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
-		*blkp = XFS_BUF_TO_BLOCK(bp);
-		return 0;
-	}
-
-	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
-	if (error)
-		return error;
-
-	xfs_btree_setbuf(cur, level, bp);
-	return 0;
-}
-
-/*
- * Get current search key.  For level 0 we don't actually have a key
- * structure so we make one up from the record.  For all other levels
- * we just return the right key.
- */
-STATIC union xfs_btree_key *
-xfs_lookup_get_search_key(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	int			keyno,
-	struct xfs_btree_block	*block,
-	union xfs_btree_key	*kp)
-{
-	if (level == 0) {
-		cur->bc_ops->init_key_from_rec(kp,
-				xfs_btree_rec_addr(cur, keyno, block));
-		return kp;
-	}
-
-	return xfs_btree_key_addr(cur, keyno, block);
-}
-
-/*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- * stat is set to 0 if can't find any such record, 1 for success.
- */
-int					/* error */
-xfs_btree_lookup(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_lookup_t		dir,	/* <=, ==, or >= */
-	int			*stat)	/* success/failure */
-{
-	struct xfs_btree_block	*block;	/* current btree block */
-	__int64_t		diff;	/* difference for the current key */
-	int			error;	/* error return value */
-	int			keyno;	/* current key number */
-	int			level;	/* level in the btree */
-	union xfs_btree_ptr	*pp;	/* ptr to btree block */
-	union xfs_btree_ptr	ptr;	/* ptr to btree block */
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGI(cur, dir);
-
-	XFS_BTREE_STATS_INC(cur, lookup);
-
-	block = NULL;
-	keyno = 0;
-
-	/* initialise start pointer from cursor */
-	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
-	pp = &ptr;
-
-	/*
-	 * Iterate over each level in the btree, starting at the root.
-	 * For each level above the leaves, find the key we need, based
-	 * on the lookup record, then follow the corresponding block
-	 * pointer down to the next level.
-	 */
-	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-		/* Get the block we need to do the lookup on. */
-		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
-		if (error)
-			goto error0;
-
-		if (diff == 0) {
-			/*
-			 * If we already had a key match at a higher level, we
-			 * know we need to use the first entry in this block.
-			 */
-			keyno = 1;
-		} else {
-			/* Otherwise search this block. Do a binary search. */
-
-			int	high;	/* high entry number */
-			int	low;	/* low entry number */
-
-			/* Set low and high entry numbers, 1-based. */
-			low = 1;
-			high = xfs_btree_get_numrecs(block);
-			if (!high) {
-				/* Block is empty, must be an empty leaf. */
-				ASSERT(level == 0 && cur->bc_nlevels == 1);
-
-				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-				*stat = 0;
-				return 0;
-			}
-
-			/* Binary search the block. */
-			while (low <= high) {
-				union xfs_btree_key	key;
-				union xfs_btree_key	*kp;
-
-				XFS_BTREE_STATS_INC(cur, compare);
-
-				/* keyno is average of low and high. */
-				keyno = (low + high) >> 1;
-
-				/* Get current search key */
-				kp = xfs_lookup_get_search_key(cur, level,
-						keyno, block, &key);
-
-				/*
-				 * Compute difference to get next direction:
-				 *  - less than, move right
-				 *  - greater than, move left
-				 *  - equal, we're done
-				 */
-				diff = cur->bc_ops->key_diff(cur, kp);
-				if (diff < 0)
-					low = keyno + 1;
-				else if (diff > 0)
-					high = keyno - 1;
-				else
-					break;
-			}
-		}
-
-		/*
-		 * If there are more levels, set up for the next level
-		 * by getting the block number and filling in the cursor.
-		 */
-		if (level > 0) {
-			/*
-			 * If we moved left, need the previous key number,
-			 * unless there isn't one.
-			 */
-			if (diff > 0 && --keyno < 1)
-				keyno = 1;
-			pp = xfs_btree_ptr_addr(cur, keyno, block);
-
-#ifdef DEBUG
-			error = xfs_btree_check_ptr(cur, pp, 0, level);
-			if (error)
-				goto error0;
-#endif
-			cur->bc_ptrs[level] = keyno;
-		}
-	}
-
-	/* Done with the search. See if we need to adjust the results. */
-	if (dir != XFS_LOOKUP_LE && diff < 0) {
-		keyno++;
-		/*
-		 * If ge search and we went off the end of the block, but it's
-		 * not the last block, we're in the wrong block.
-		 */
-		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-		if (dir == XFS_LOOKUP_GE &&
-		    keyno > xfs_btree_get_numrecs(block) &&
-		    !xfs_btree_ptr_is_null(cur, &ptr)) {
-			int	i;
-
-			cur->bc_ptrs[0] = keyno;
-			error = xfs_btree_increment(cur, 0, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_RETURN(i == 1);
-			XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-			*stat = 1;
-			return 0;
-		}
-	} else if (dir == XFS_LOOKUP_LE && diff > 0)
-		keyno--;
-	cur->bc_ptrs[0] = keyno;
-
-	/* Return if we succeeded or not. */
-	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
-		*stat = 0;
-	else if (dir != XFS_LOOKUP_EQ || diff == 0)
-		*stat = 1;
-	else
-		*stat = 0;
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int
-xfs_btree_updkey(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*keyp,
-	int			level)
-{
-	struct xfs_btree_block	*block;
-	struct xfs_buf		*bp;
-	union xfs_btree_key	*kp;
-	int			ptr;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
-
-	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
-
-	/*
-	 * Go up the tree from this level toward the root.
-	 * At each level, update the key value to the value input.
-	 * Stop when we reach a level where the cursor isn't pointing
-	 * at the first entry in the block.
-	 */
-	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-#ifdef DEBUG
-		int		error;
-#endif
-		block = xfs_btree_get_block(cur, level, &bp);
-#ifdef DEBUG
-		error = xfs_btree_check_block(cur, block, level, bp);
-		if (error) {
-			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-			return error;
-		}
-#endif
-		ptr = cur->bc_ptrs[level];
-		kp = xfs_btree_key_addr(cur, ptr, block);
-		xfs_btree_copy_keys(cur, kp, keyp, 1);
-		xfs_btree_log_keys(cur, bp, ptr, ptr);
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
-/*
- * Update the record referred to by cur to the value in the
- * given record. This either works (return 0) or gets an
- * EFSCORRUPTED error.
- */
-int
-xfs_btree_update(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*rec)
-{
-	struct xfs_btree_block	*block;
-	struct xfs_buf		*bp;
-	int			error;
-	int			ptr;
-	union xfs_btree_rec	*rp;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGR(cur, rec);
-
-	/* Pick up the current block. */
-	block = xfs_btree_get_block(cur, 0, &bp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, 0, bp);
-	if (error)
-		goto error0;
-#endif
-	/* Get the address of the rec to be updated. */
-	ptr = cur->bc_ptrs[0];
-	rp = xfs_btree_rec_addr(cur, ptr, block);
-
-	/* Fill in the new contents and log them. */
-	xfs_btree_copy_recs(cur, rp, rec, 1);
-	xfs_btree_log_recs(cur, bp, ptr, ptr);
-
-	/*
-	 * If we are tracking the last record in the tree and
-	 * we are at the far right edge of the tree, update it.
-	 */
-	if (xfs_btree_is_lastrec(cur, block, 0)) {
-		cur->bc_ops->update_lastrec(cur, block, rec,
-					    ptr, LASTREC_UPDATE);
-	}
-
-	/* Updating first rec in leaf. Pass new key value up to our parent. */
-	if (ptr == 1) {
-		union xfs_btree_key	key;
-
-		cur->bc_ops->init_key_from_rec(&key, rec);
-		error = xfs_btree_updkey(cur, &key, 1);
-		if (error)
-			goto error0;
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int					/* error */
-xfs_btree_lshift(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	int			*stat)		/* success/failure */
-{
-	union xfs_btree_key	key;		/* btree key */
-	struct xfs_buf		*lbp;		/* left buffer pointer */
-	struct xfs_btree_block	*left;		/* left btree block */
-	int			lrecs;		/* left record count */
-	struct xfs_buf		*rbp;		/* right buffer pointer */
-	struct xfs_btree_block	*right;		/* right btree block */
-	int			rrecs;		/* right record count */
-	union xfs_btree_ptr	lptr;		/* left btree pointer */
-	union xfs_btree_key	*rkp = NULL;	/* right btree key */
-	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
-	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
-	int			error;		/* error return value */
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGI(cur, level);
-
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    level == cur->bc_nlevels - 1)
-		goto out0;
-
-	/* Set up variables for this block as "right". */
-	right = xfs_btree_get_block(cur, level, &rbp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, right, level, rbp);
-	if (error)
-		goto error0;
-#endif
-
-	/* If we've got no left sibling then we can't shift an entry left. */
-	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
-	if (xfs_btree_ptr_is_null(cur, &lptr))
-		goto out0;
-
-	/*
-	 * If the cursor entry is the one that would be moved, don't
-	 * do it... it's too complicated.
-	 */
-	if (cur->bc_ptrs[level] <= 1)
-		goto out0;
-
-	/* Set up the left neighbor as "left". */
-	error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
-	if (error)
-		goto error0;
-
-	/* If it's full, it can't take another entry. */
-	lrecs = xfs_btree_get_numrecs(left);
-	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
-		goto out0;
-
-	rrecs = xfs_btree_get_numrecs(right);
-
-	/*
-	 * We add one entry to the left side and remove one for the right side.
-	 * Account for it here, the changes will be updated on disk and logged
-	 * later.
-	 */
-	lrecs++;
-	rrecs--;
-
-	XFS_BTREE_STATS_INC(cur, lshift);
-	XFS_BTREE_STATS_ADD(cur, moves, 1);
-
-	/*
-	 * If non-leaf, copy a key and a ptr to the left block.
-	 * Log the changes to the left block.
-	 */
-	if (level > 0) {
-		/* It's a non-leaf.  Move keys and pointers. */
-		union xfs_btree_key	*lkp;	/* left btree key */
-		union xfs_btree_ptr	*lpp;	/* left address pointer */
-
-		lkp = xfs_btree_key_addr(cur, lrecs, left);
-		rkp = xfs_btree_key_addr(cur, 1, right);
-
-		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
-		rpp = xfs_btree_ptr_addr(cur, 1, right);
-#ifdef DEBUG
-		error = xfs_btree_check_ptr(cur, rpp, 0, level);
-		if (error)
-			goto error0;
-#endif
-		xfs_btree_copy_keys(cur, lkp, rkp, 1);
-		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
-
-		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
-		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
-
-		ASSERT(cur->bc_ops->keys_inorder(cur,
-			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
-	} else {
-		/* It's a leaf.  Move records.  */
-		union xfs_btree_rec	*lrp;	/* left record pointer */
-
-		lrp = xfs_btree_rec_addr(cur, lrecs, left);
-		rrp = xfs_btree_rec_addr(cur, 1, right);
-
-		xfs_btree_copy_recs(cur, lrp, rrp, 1);
-		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
-
-		ASSERT(cur->bc_ops->recs_inorder(cur,
-			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
-	}
-
-	xfs_btree_set_numrecs(left, lrecs);
-	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
-
-	xfs_btree_set_numrecs(right, rrecs);
-	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
-
-	/*
-	 * Slide the contents of right down one entry.
-	 */
-	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
-	if (level > 0) {
-		/* It's a nonleaf. operate on keys and ptrs */
-#ifdef DEBUG
-		int			i;		/* loop index */
-
-		for (i = 0; i < rrecs; i++) {
-			error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
-			if (error)
-				goto error0;
-		}
-#endif
-		xfs_btree_shift_keys(cur,
-				xfs_btree_key_addr(cur, 2, right),
-				-1, rrecs);
-		xfs_btree_shift_ptrs(cur,
-				xfs_btree_ptr_addr(cur, 2, right),
-				-1, rrecs);
-
-		xfs_btree_log_keys(cur, rbp, 1, rrecs);
-		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
-	} else {
-		/* It's a leaf. operate on records */
-		xfs_btree_shift_recs(cur,
-			xfs_btree_rec_addr(cur, 2, right),
-			-1, rrecs);
-		xfs_btree_log_recs(cur, rbp, 1, rrecs);
-
-		/*
-		 * If it's the first record in the block, we'll need a key
-		 * structure to pass up to the next level (updkey).
-		 */
-		cur->bc_ops->init_key_from_rec(&key,
-			xfs_btree_rec_addr(cur, 1, right));
-		rkp = &key;
-	}
-
-	/* Update the parent key values of right. */
-	error = xfs_btree_updkey(cur, rkp, level + 1);
-	if (error)
-		goto error0;
-
-	/* Slide the cursor value left one. */
-	cur->bc_ptrs[level]--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-
-out0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 0;
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int					/* error */
-xfs_btree_rshift(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	int			*stat)		/* success/failure */
-{
-	union xfs_btree_key	key;		/* btree key */
-	struct xfs_buf		*lbp;		/* left buffer pointer */
-	struct xfs_btree_block	*left;		/* left btree block */
-	struct xfs_buf		*rbp;		/* right buffer pointer */
-	struct xfs_btree_block	*right;		/* right btree block */
-	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
-	union xfs_btree_ptr	rptr;		/* right block pointer */
-	union xfs_btree_key	*rkp;		/* right btree key */
-	int			rrecs;		/* right record count */
-	int			lrecs;		/* left record count */
-	int			error;		/* error return value */
-	int			i;		/* loop counter */
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGI(cur, level);
-
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level == cur->bc_nlevels - 1))
-		goto out0;
-
-	/* Set up variables for this block as "left". */
-	left = xfs_btree_get_block(cur, level, &lbp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, left, level, lbp);
-	if (error)
-		goto error0;
-#endif
-
-	/* If we've got no right sibling then we can't shift an entry right. */
-	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
-	if (xfs_btree_ptr_is_null(cur, &rptr))
-		goto out0;
-
-	/*
-	 * If the cursor entry is the one that would be moved, don't
-	 * do it... it's too complicated.
-	 */
-	lrecs = xfs_btree_get_numrecs(left);
-	if (cur->bc_ptrs[level] >= lrecs)
-		goto out0;
-
-	/* Set up the right neighbor as "right". */
-	error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
-	if (error)
-		goto error0;
-
-	/* If it's full, it can't take another entry. */
-	rrecs = xfs_btree_get_numrecs(right);
-	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
-		goto out0;
-
-	XFS_BTREE_STATS_INC(cur, rshift);
-	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
-
-	/*
-	 * Make a hole at the start of the right neighbor block, then
-	 * copy the last left block entry to the hole.
-	 */
-	if (level > 0) {
-		/* It's a nonleaf. make a hole in the keys and ptrs */
-		union xfs_btree_key	*lkp;
-		union xfs_btree_ptr	*lpp;
-		union xfs_btree_ptr	*rpp;
-
-		lkp = xfs_btree_key_addr(cur, lrecs, left);
-		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
-		rkp = xfs_btree_key_addr(cur, 1, right);
-		rpp = xfs_btree_ptr_addr(cur, 1, right);
-
-#ifdef DEBUG
-		for (i = rrecs - 1; i >= 0; i--) {
-			error = xfs_btree_check_ptr(cur, rpp, i, level);
-			if (error)
-				goto error0;
-		}
-#endif
-
-		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
-		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
-
-#ifdef DEBUG
-		error = xfs_btree_check_ptr(cur, lpp, 0, level);
-		if (error)
-			goto error0;
-#endif
-
-		/* Now put the new data in, and log it. */
-		xfs_btree_copy_keys(cur, rkp, lkp, 1);
-		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
-
-		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
-		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
-
-		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
-			xfs_btree_key_addr(cur, 2, right)));
-	} else {
-		/* It's a leaf. make a hole in the records */
-		union xfs_btree_rec	*lrp;
-		union xfs_btree_rec	*rrp;
-
-		lrp = xfs_btree_rec_addr(cur, lrecs, left);
-		rrp = xfs_btree_rec_addr(cur, 1, right);
-
-		xfs_btree_shift_recs(cur, rrp, 1, rrecs);
-
-		/* Now put the new data in, and log it. */
-		xfs_btree_copy_recs(cur, rrp, lrp, 1);
-		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
-
-		cur->bc_ops->init_key_from_rec(&key, rrp);
-		rkp = &key;
-
-		ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
-			xfs_btree_rec_addr(cur, 2, right)));
-	}
-
-	/*
-	 * Decrement and log left's numrecs, bump and log right's numrecs.
-	 */
-	xfs_btree_set_numrecs(left, --lrecs);
-	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
-
-	xfs_btree_set_numrecs(right, ++rrecs);
-	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
-
-	/*
-	 * Using a temporary cursor, update the parent key values of the
-	 * block on the right.
-	 */
-	error = xfs_btree_dup_cursor(cur, &tcur);
-	if (error)
-		goto error0;
-	i = xfs_btree_lastrec(tcur, level);
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-	error = xfs_btree_increment(tcur, level, &i);
-	if (error)
-		goto error1;
-
-	error = xfs_btree_updkey(tcur, rkp, level + 1);
-	if (error)
-		goto error1;
-
-	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-
-out0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 0;
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-
-error1:
-	XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
-	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Split cur/level block in half.
- * Return new block number and the key to its first
- * record (to be inserted into parent).
- */
-STATIC int					/* error */
-xfs_btree_split(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	union xfs_btree_ptr	*ptrp,
-	union xfs_btree_key	*key,
-	struct xfs_btree_cur	**curp,
-	int			*stat)		/* success/failure */
-{
-	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
-	struct xfs_buf		*lbp;		/* left buffer pointer */
-	struct xfs_btree_block	*left;		/* left btree block */
-	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
-	struct xfs_buf		*rbp;		/* right buffer pointer */
-	struct xfs_btree_block	*right;		/* right btree block */
-	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
-	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
-	struct xfs_btree_block	*rrblock;	/* right-right btree block */
-	int			lrecs;
-	int			rrecs;
-	int			src_index;
-	int			error;		/* error return value */
-#ifdef DEBUG
-	int			i;
-#endif
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
-
-	XFS_BTREE_STATS_INC(cur, split);
-
-	/* Set up left block (current one). */
-	left = xfs_btree_get_block(cur, level, &lbp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, left, level, lbp);
-	if (error)
-		goto error0;
-#endif
-
-	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
-
-	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
-	if (error)
-		goto error0;
-	if (*stat == 0)
-		goto out0;
-	XFS_BTREE_STATS_INC(cur, alloc);
-
-	/* Set up the new block as "right". */
-	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
-	if (error)
-		goto error0;
-
-	/* Fill in the btree header for the new right block. */
-	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
-
-	/*
-	 * Split the entries between the old and the new block evenly.
-	 * Make sure that if there's an odd number of entries now, that
-	 * each new block will have the same number of entries.
-	 */
-	lrecs = xfs_btree_get_numrecs(left);
-	rrecs = lrecs / 2;
-	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
-		rrecs++;
-	src_index = (lrecs - rrecs + 1);
-
-	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
-
-	/*
-	 * Copy btree block entries from the left block over to the
-	 * new block, the right. Update the right block and log the
-	 * changes.
-	 */
-	if (level > 0) {
-		/* It's a non-leaf.  Move keys and pointers. */
-		union xfs_btree_key	*lkp;	/* left btree key */
-		union xfs_btree_ptr	*lpp;	/* left address pointer */
-		union xfs_btree_key	*rkp;	/* right btree key */
-		union xfs_btree_ptr	*rpp;	/* right address pointer */
-
-		lkp = xfs_btree_key_addr(cur, src_index, left);
-		lpp = xfs_btree_ptr_addr(cur, src_index, left);
-		rkp = xfs_btree_key_addr(cur, 1, right);
-		rpp = xfs_btree_ptr_addr(cur, 1, right);
-
-#ifdef DEBUG
-		for (i = src_index; i < rrecs; i++) {
-			error = xfs_btree_check_ptr(cur, lpp, i, level);
-			if (error)
-				goto error0;
-		}
-#endif
-
-		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
-		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
-
-		xfs_btree_log_keys(cur, rbp, 1, rrecs);
-		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
-
-		/* Grab the keys to the entries moved to the right block */
-		xfs_btree_copy_keys(cur, key, rkp, 1);
-	} else {
-		/* It's a leaf.  Move records.  */
-		union xfs_btree_rec	*lrp;	/* left record pointer */
-		union xfs_btree_rec	*rrp;	/* right record pointer */
-
-		lrp = xfs_btree_rec_addr(cur, src_index, left);
-		rrp = xfs_btree_rec_addr(cur, 1, right);
-
-		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
-		xfs_btree_log_recs(cur, rbp, 1, rrecs);
-
-		cur->bc_ops->init_key_from_rec(key,
-			xfs_btree_rec_addr(cur, 1, right));
-	}
-
-
-	/*
-	 * Find the left block number by looking in the buffer.
-	 * Adjust numrecs, sibling pointers.
-	 */
-	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
-	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
-	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
-	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
-
-	lrecs -= rrecs;
-	xfs_btree_set_numrecs(left, lrecs);
-	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
-
-	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
-	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-
-	/*
-	 * If there's a block to the new block's right, make that block
-	 * point back to right instead of to left.
-	 */
-	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
-		error = xfs_btree_read_buf_block(cur, &rrptr,
-							0, &rrblock, &rrbp);
-		if (error)
-			goto error0;
-		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
-		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-	}
-	/*
-	 * If the cursor is really in the right block, move it there.
-	 * If it's just pointing past the last entry in left, then we'll
-	 * insert there, so don't change anything in that case.
-	 */
-	if (cur->bc_ptrs[level] > lrecs + 1) {
-		xfs_btree_setbuf(cur, level, rbp);
-		cur->bc_ptrs[level] -= lrecs;
-	}
-	/*
-	 * If there are more levels, we'll need another cursor which refers
-	 * the right block, no matter where this cursor was.
-	 */
-	if (level + 1 < cur->bc_nlevels) {
-		error = xfs_btree_dup_cursor(cur, curp);
-		if (error)
-			goto error0;
-		(*curp)->bc_ptrs[level + 1]++;
-	}
-	*ptrp = rptr;
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-out0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 0;
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Copy the old inode root contents into a real block and make the
- * broot point to it.
- */
-int						/* error */
-xfs_btree_new_iroot(
-	struct xfs_btree_cur	*cur,		/* btree cursor */
-	int			*logflags,	/* logging flags for inode */
-	int			*stat)		/* return status - 0 fail */
-{
-	struct xfs_buf		*cbp;		/* buffer for cblock */
-	struct xfs_btree_block	*block;		/* btree block */
-	struct xfs_btree_block	*cblock;	/* child btree block */
-	union xfs_btree_key	*ckp;		/* child key pointer */
-	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
-	union xfs_btree_key	*kp;		/* pointer to btree key */
-	union xfs_btree_ptr	*pp;		/* pointer to block addr */
-	union xfs_btree_ptr	nptr;		/* new block addr */
-	int			level;		/* btree level */
-	int			error;		/* error return code */
-#ifdef DEBUG
-	int			i;		/* loop counter */
-#endif
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, newroot);
-
-	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-
-	level = cur->bc_nlevels - 1;
-
-	block = xfs_btree_get_iroot(cur);
-	pp = xfs_btree_ptr_addr(cur, 1, block);
-
-	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
-	if (error)
-		goto error0;
-	if (*stat == 0) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		return 0;
-	}
-	XFS_BTREE_STATS_INC(cur, alloc);
-
-	/* Copy the root into a real block. */
-	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
-	if (error)
-		goto error0;
-
-	/*
-	 * we can't just memcpy() the root in for CRC enabled btree blocks.
-	 * In that case have to also ensure the blkno remains correct
-	 */
-	memcpy(cblock, block, xfs_btree_block_len(cur));
-	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
-		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
-		else
-			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
-	}
-
-	be16_add_cpu(&block->bb_level, 1);
-	xfs_btree_set_numrecs(block, 1);
-	cur->bc_nlevels++;
-	cur->bc_ptrs[level + 1] = 1;
-
-	kp = xfs_btree_key_addr(cur, 1, block);
-	ckp = xfs_btree_key_addr(cur, 1, cblock);
-	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
-
-	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-#ifdef DEBUG
-	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-		error = xfs_btree_check_ptr(cur, pp, i, level);
-		if (error)
-			goto error0;
-	}
-#endif
-	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
-
-#ifdef DEBUG
-	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
-	if (error)
-		goto error0;
-#endif
-	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
-
-	xfs_iroot_realloc(cur->bc_private.b.ip,
-			  1 - xfs_btree_get_numrecs(cblock),
-			  cur->bc_private.b.whichfork);
-
-	xfs_btree_setbuf(cur, level, cbp);
-
-	/*
-	 * Do all this logging at the end so that
-	 * the root is at the right level.
-	 */
-	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
-	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
-
-	*logflags |=
-		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
-	*stat = 1;
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int				/* error */
-xfs_btree_new_root(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	int			*stat)	/* success/failure */
-{
-	struct xfs_btree_block	*block;	/* one half of the old root block */
-	struct xfs_buf		*bp;	/* buffer containing block */
-	int			error;	/* error return value */
-	struct xfs_buf		*lbp;	/* left buffer pointer */
-	struct xfs_btree_block	*left;	/* left btree block */
-	struct xfs_buf		*nbp;	/* new (root) buffer */
-	struct xfs_btree_block	*new;	/* new (root) btree block */
-	int			nptr;	/* new value for key index, 1 or 2 */
-	struct xfs_buf		*rbp;	/* right buffer pointer */
-	struct xfs_btree_block	*right;	/* right btree block */
-	union xfs_btree_ptr	rptr;
-	union xfs_btree_ptr	lptr;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, newroot);
-
-	/* initialise our start point from the cursor */
-	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
-
-	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
-	if (error)
-		goto error0;
-	if (*stat == 0)
-		goto out0;
-	XFS_BTREE_STATS_INC(cur, alloc);
-
-	/* Set up the new block. */
-	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
-	if (error)
-		goto error0;
-
-	/* Set the root in the holding structure  increasing the level by 1. */
-	cur->bc_ops->set_root(cur, &lptr, 1);
-
-	/*
-	 * At the previous root level there are now two blocks: the old root,
-	 * and the new block generated when it was split.  We don't know which
-	 * one the cursor is pointing at, so we set up variables "left" and
-	 * "right" for each case.
-	 */
-	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
-	if (error)
-		goto error0;
-#endif
-
-	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
-	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
-		/* Our block is left, pick up the right block. */
-		lbp = bp;
-		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
-		left = block;
-		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
-		if (error)
-			goto error0;
-		bp = rbp;
-		nptr = 1;
-	} else {
-		/* Our block is right, pick up the left block. */
-		rbp = bp;
-		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
-		right = block;
-		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
-		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
-		if (error)
-			goto error0;
-		bp = lbp;
-		nptr = 2;
-	}
-	/* Fill in the new block's btree header and log it. */
-	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
-	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
-	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
-			!xfs_btree_ptr_is_null(cur, &rptr));
-
-	/* Fill in the key data in the new root. */
-	if (xfs_btree_get_level(left) > 0) {
-		xfs_btree_copy_keys(cur,
-				xfs_btree_key_addr(cur, 1, new),
-				xfs_btree_key_addr(cur, 1, left), 1);
-		xfs_btree_copy_keys(cur,
-				xfs_btree_key_addr(cur, 2, new),
-				xfs_btree_key_addr(cur, 1, right), 1);
-	} else {
-		cur->bc_ops->init_key_from_rec(
-				xfs_btree_key_addr(cur, 1, new),
-				xfs_btree_rec_addr(cur, 1, left));
-		cur->bc_ops->init_key_from_rec(
-				xfs_btree_key_addr(cur, 2, new),
-				xfs_btree_rec_addr(cur, 1, right));
-	}
-	xfs_btree_log_keys(cur, nbp, 1, 2);
-
-	/* Fill in the pointer data in the new root. */
-	xfs_btree_copy_ptrs(cur,
-		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
-	xfs_btree_copy_ptrs(cur,
-		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
-	xfs_btree_log_ptrs(cur, nbp, 1, 2);
-
-	/* Fix up the cursor. */
-	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
-	cur->bc_ptrs[cur->bc_nlevels] = nptr;
-	cur->bc_nlevels++;
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-out0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 0;
-	return 0;
-}
-
-STATIC int
-xfs_btree_make_block_unfull(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	int			level,	/* btree level */
-	int			numrecs,/* # of recs in block */
-	int			*oindex,/* old tree index */
-	int			*index,	/* new tree index */
-	union xfs_btree_ptr	*nptr,	/* new btree ptr */
-	struct xfs_btree_cur	**ncur,	/* new btree cursor */
-	union xfs_btree_rec	*nrec,	/* new record */
-	int			*stat)
-{
-	union xfs_btree_key	key;	/* new btree key value */
-	int			error = 0;
-
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    level == cur->bc_nlevels - 1) {
-	    	struct xfs_inode *ip = cur->bc_private.b.ip;
-
-		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
-			/* A root block that can be made bigger. */
-			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
-		} else {
-			/* A root block that needs replacing */
-			int	logflags = 0;
-
-			error = xfs_btree_new_iroot(cur, &logflags, stat);
-			if (error || *stat == 0)
-				return error;
-
-			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
-		}
-
-		return 0;
-	}
-
-	/* First, try shifting an entry to the right neighbor. */
-	error = xfs_btree_rshift(cur, level, stat);
-	if (error || *stat)
-		return error;
-
-	/* Next, try shifting an entry to the left neighbor. */
-	error = xfs_btree_lshift(cur, level, stat);
-	if (error)
-		return error;
-
-	if (*stat) {
-		*oindex = *index = cur->bc_ptrs[level];
-		return 0;
-	}
-
-	/*
-	 * Next, try splitting the current block in half.
-	 *
-	 * If this works we have to re-set our variables because we
-	 * could be in a different block now.
-	 */
-	error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
-	if (error || *stat == 0)
-		return error;
-
-
-	*index = cur->bc_ptrs[level];
-	cur->bc_ops->init_rec_from_key(&key, nrec);
-	return 0;
-}
-
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int
-xfs_btree_insrec(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	int			level,	/* level to insert record at */
-	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
-	union xfs_btree_rec	*recp,	/* i/o: record data inserted */
-	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
-	int			*stat)	/* success/failure */
-{
-	struct xfs_btree_block	*block;	/* btree block */
-	struct xfs_buf		*bp;	/* buffer for block */
-	union xfs_btree_key	key;	/* btree key */
-	union xfs_btree_ptr	nptr;	/* new block ptr */
-	struct xfs_btree_cur	*ncur;	/* new btree cursor */
-	union xfs_btree_rec	nrec;	/* new record count */
-	int			optr;	/* old key/record index */
-	int			ptr;	/* key/record index */
-	int			numrecs;/* number of records */
-	int			error;	/* error return value */
-#ifdef DEBUG
-	int			i;
-#endif
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
-
-	ncur = NULL;
-
-	/*
-	 * If we have an external root pointer, and we've made it to the
-	 * root level, allocate a new root block and we're done.
-	 */
-	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level >= cur->bc_nlevels)) {
-		error = xfs_btree_new_root(cur, stat);
-		xfs_btree_set_ptr_null(cur, ptrp);
-
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		return error;
-	}
-
-	/* If we're off the left edge, return failure. */
-	ptr = cur->bc_ptrs[level];
-	if (ptr == 0) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		*stat = 0;
-		return 0;
-	}
-
-	/* Make a key out of the record data to be inserted, and save it. */
-	cur->bc_ops->init_key_from_rec(&key, recp);
-
-	optr = ptr;
-
-	XFS_BTREE_STATS_INC(cur, insrec);
-
-	/* Get pointers to the btree buffer and block. */
-	block = xfs_btree_get_block(cur, level, &bp);
-	numrecs = xfs_btree_get_numrecs(block);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, level, bp);
-	if (error)
-		goto error0;
-
-	/* Check that the new entry is being inserted in the right place. */
-	if (ptr <= numrecs) {
-		if (level == 0) {
-			ASSERT(cur->bc_ops->recs_inorder(cur, recp,
-				xfs_btree_rec_addr(cur, ptr, block)));
-		} else {
-			ASSERT(cur->bc_ops->keys_inorder(cur, &key,
-				xfs_btree_key_addr(cur, ptr, block)));
-		}
-	}
-#endif
-
-	/*
-	 * If the block is full, we can't insert the new entry until we
-	 * make the block un-full.
-	 */
-	xfs_btree_set_ptr_null(cur, &nptr);
-	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
-		error = xfs_btree_make_block_unfull(cur, level, numrecs,
-					&optr, &ptr, &nptr, &ncur, &nrec, stat);
-		if (error || *stat == 0)
-			goto error0;
-	}
-
-	/*
-	 * The current block may have changed if the block was
-	 * previously full and we have just made space in it.
-	 */
-	block = xfs_btree_get_block(cur, level, &bp);
-	numrecs = xfs_btree_get_numrecs(block);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, level, bp);
-	if (error)
-		return error;
-#endif
-
-	/*
-	 * At this point we know there's room for our new entry in the block
-	 * we're pointing at.
-	 */
-	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
-
-	if (level > 0) {
-		/* It's a nonleaf. make a hole in the keys and ptrs */
-		union xfs_btree_key	*kp;
-		union xfs_btree_ptr	*pp;
-
-		kp = xfs_btree_key_addr(cur, ptr, block);
-		pp = xfs_btree_ptr_addr(cur, ptr, block);
-
-#ifdef DEBUG
-		for (i = numrecs - ptr; i >= 0; i--) {
-			error = xfs_btree_check_ptr(cur, pp, i, level);
-			if (error)
-				return error;
-		}
-#endif
-
-		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
-		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
-
-#ifdef DEBUG
-		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
-		if (error)
-			goto error0;
-#endif
-
-		/* Now put the new data in, bump numrecs and log it. */
-		xfs_btree_copy_keys(cur, kp, &key, 1);
-		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
-		numrecs++;
-		xfs_btree_set_numrecs(block, numrecs);
-		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
-		xfs_btree_log_keys(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-		if (ptr < numrecs) {
-			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
-				xfs_btree_key_addr(cur, ptr + 1, block)));
-		}
-#endif
-	} else {
-		/* It's a leaf. make a hole in the records */
-		union xfs_btree_rec             *rp;
-
-		rp = xfs_btree_rec_addr(cur, ptr, block);
-
-		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
-
-		/* Now put the new data in, bump numrecs and log it. */
-		xfs_btree_copy_recs(cur, rp, recp, 1);
-		xfs_btree_set_numrecs(block, ++numrecs);
-		xfs_btree_log_recs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-		if (ptr < numrecs) {
-			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
-				xfs_btree_rec_addr(cur, ptr + 1, block)));
-		}
-#endif
-	}
-
-	/* Log the new number of records in the btree header. */
-	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
-
-	/* If we inserted at the start of a block, update the parents' keys. */
-	if (optr == 1) {
-		error = xfs_btree_updkey(cur, &key, level + 1);
-		if (error)
-			goto error0;
-	}
-
-	/*
-	 * If we are tracking the last record in the tree and
-	 * we are at the far right edge of the tree, update it.
-	 */
-	if (xfs_btree_is_lastrec(cur, block, level)) {
-		cur->bc_ops->update_lastrec(cur, block, recp,
-					    ptr, LASTREC_INSREC);
-	}
-
-	/*
-	 * Return the new block number, if any.
-	 * If there is one, give back a record value and a cursor too.
-	 */
-	*ptrp = nptr;
-	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
-		*recp = nrec;
-		*curp = ncur;
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Insert the record at the point referenced by cur.
- *
- * A multi-level split of the tree on insert will invalidate the original
- * cursor.  All callers of this function should assume that the cursor is
- * no longer valid and revalidate it.
- */
-int
-xfs_btree_insert(
-	struct xfs_btree_cur	*cur,
-	int			*stat)
-{
-	int			error;	/* error return value */
-	int			i;	/* result value, 0 for failure */
-	int			level;	/* current level number in btree */
-	union xfs_btree_ptr	nptr;	/* new block number (split result) */
-	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
-	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
-	union xfs_btree_rec	rec;	/* record to insert */
-
-	level = 0;
-	ncur = NULL;
-	pcur = cur;
-
-	xfs_btree_set_ptr_null(cur, &nptr);
-	cur->bc_ops->init_rec_from_cur(cur, &rec);
-
-	/*
-	 * Loop going up the tree, starting at the leaf level.
-	 * Stop when we don't get a split block, that must mean that
-	 * the insert is finished with this level.
-	 */
-	do {
-		/*
-		 * Insert nrec/nptr into this level of the tree.
-		 * Note if we fail, nptr will be null.
-		 */
-		error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
-		if (error) {
-			if (pcur != cur)
-				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-			goto error0;
-		}
-
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		level++;
-
-		/*
-		 * See if the cursor we just used is trash.
-		 * Can't trash the caller's cursor, but otherwise we should
-		 * if ncur is a new cursor or we're about to be done.
-		 */
-		if (pcur != cur &&
-		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
-			/* Save the state from the cursor before we trash it */
-			if (cur->bc_ops->update_cursor)
-				cur->bc_ops->update_cursor(pcur, cur);
-			cur->bc_nlevels = pcur->bc_nlevels;
-			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-		}
-		/* If we got a new cursor, switch to it. */
-		if (ncur) {
-			pcur = ncur;
-			ncur = NULL;
-		}
-	} while (!xfs_btree_ptr_is_null(cur, &nptr));
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = i;
-	return 0;
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Try to merge a non-leaf block back into the inode root.
- *
- * Note: the killroot names comes from the fact that we're effectively
- * killing the old root block.  But because we can't just delete the
- * inode we have to copy the single block it was pointing to into the
- * inode.
- */
-STATIC int
-xfs_btree_kill_iroot(
-	struct xfs_btree_cur	*cur)
-{
-	int			whichfork = cur->bc_private.b.whichfork;
-	struct xfs_inode	*ip = cur->bc_private.b.ip;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
-	struct xfs_btree_block	*block;
-	struct xfs_btree_block	*cblock;
-	union xfs_btree_key	*kp;
-	union xfs_btree_key	*ckp;
-	union xfs_btree_ptr	*pp;
-	union xfs_btree_ptr	*cpp;
-	struct xfs_buf		*cbp;
-	int			level;
-	int			index;
-	int			numrecs;
-#ifdef DEBUG
-	union xfs_btree_ptr	ptr;
-	int			i;
-#endif
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-	ASSERT(cur->bc_nlevels > 1);
-
-	/*
-	 * Don't deal with the root block needs to be a leaf case.
-	 * We're just going to turn the thing back into extents anyway.
-	 */
-	level = cur->bc_nlevels - 1;
-	if (level == 1)
-		goto out0;
-
-	/*
-	 * Give up if the root has multiple children.
-	 */
-	block = xfs_btree_get_iroot(cur);
-	if (xfs_btree_get_numrecs(block) != 1)
-		goto out0;
-
-	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
-	numrecs = xfs_btree_get_numrecs(cblock);
-
-	/*
-	 * Only do this if the next level will fit.
-	 * Then the data must be copied up to the inode,
-	 * instead of freeing the root you free the next level.
-	 */
-	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
-		goto out0;
-
-	XFS_BTREE_STATS_INC(cur, killroot);
-
-#ifdef DEBUG
-	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
-	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
-	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
-	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
-#endif
-
-	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
-	if (index) {
-		xfs_iroot_realloc(cur->bc_private.b.ip, index,
-				  cur->bc_private.b.whichfork);
-		block = ifp->if_broot;
-	}
-
-	be16_add_cpu(&block->bb_numrecs, index);
-	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-
-	kp = xfs_btree_key_addr(cur, 1, block);
-	ckp = xfs_btree_key_addr(cur, 1, cblock);
-	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
-
-	pp = xfs_btree_ptr_addr(cur, 1, block);
-	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
-#ifdef DEBUG
-	for (i = 0; i < numrecs; i++) {
-		int		error;
-
-		error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
-		if (error) {
-			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-			return error;
-		}
-	}
-#endif
-	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
-
-	cur->bc_ops->free_block(cur, cbp);
-	XFS_BTREE_STATS_INC(cur, free);
-
-	cur->bc_bufs[level - 1] = NULL;
-	be16_add_cpu(&block->bb_level, -1);
-	xfs_trans_log_inode(cur->bc_tp, ip,
-		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
-	cur->bc_nlevels--;
-out0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
-/*
- * Kill the current root node, and replace it with it's only child node.
- */
-STATIC int
-xfs_btree_kill_root(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			level,
-	union xfs_btree_ptr	*newroot)
-{
-	int			error;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, killroot);
-
-	/*
-	 * Update the root pointer, decreasing the level by 1 and then
-	 * free the old root.
-	 */
-	cur->bc_ops->set_root(cur, newroot, -1);
-
-	error = cur->bc_ops->free_block(cur, bp);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-
-	XFS_BTREE_STATS_INC(cur, free);
-
-	cur->bc_bufs[level] = NULL;
-	cur->bc_ra[level] = 0;
-	cur->bc_nlevels--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
-STATIC int
-xfs_btree_dec_cursor(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	int			*stat)
-{
-	int			error;
-	int			i;
-
-	if (level > 0) {
-		error = xfs_btree_decrement(cur, level, &i);
-		if (error)
-			return error;
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = 1;
-	return 0;
-}
-
-/*
- * Single level of the btree record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int					/* error */
-xfs_btree_delrec(
-	struct xfs_btree_cur	*cur,		/* btree cursor */
-	int			level,		/* level removing record from */
-	int			*stat)		/* fail/done/go-on */
-{
-	struct xfs_btree_block	*block;		/* btree block */
-	union xfs_btree_ptr	cptr;		/* current block ptr */
-	struct xfs_buf		*bp;		/* buffer for block */
-	int			error;		/* error return value */
-	int			i;		/* loop counter */
-	union xfs_btree_key	key;		/* storage for keyp */
-	union xfs_btree_key	*keyp = &key;	/* passed to the next level */
-	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
-	struct xfs_buf		*lbp;		/* left buffer pointer */
-	struct xfs_btree_block	*left;		/* left btree block */
-	int			lrecs = 0;	/* left record count */
-	int			ptr;		/* key/record index */
-	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
-	struct xfs_buf		*rbp;		/* right buffer pointer */
-	struct xfs_btree_block	*right;		/* right btree block */
-	struct xfs_btree_block	*rrblock;	/* right-right btree block */
-	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
-	int			rrecs = 0;	/* right record count */
-	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
-	int			numrecs;	/* temporary numrec count */
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_TRACE_ARGI(cur, level);
-
-	tcur = NULL;
-
-	/* Get the index of the entry being deleted, check for nothing there. */
-	ptr = cur->bc_ptrs[level];
-	if (ptr == 0) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		*stat = 0;
-		return 0;
-	}
-
-	/* Get the buffer & block containing the record or key/ptr. */
-	block = xfs_btree_get_block(cur, level, &bp);
-	numrecs = xfs_btree_get_numrecs(block);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, level, bp);
-	if (error)
-		goto error0;
-#endif
-
-	/* Fail if we're off the end of the block. */
-	if (ptr > numrecs) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		*stat = 0;
-		return 0;
-	}
-
-	XFS_BTREE_STATS_INC(cur, delrec);
-	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
-
-	/* Excise the entries being deleted. */
-	if (level > 0) {
-		/* It's a nonleaf. operate on keys and ptrs */
-		union xfs_btree_key	*lkp;
-		union xfs_btree_ptr	*lpp;
-
-		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
-		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
-
-#ifdef DEBUG
-		for (i = 0; i < numrecs - ptr; i++) {
-			error = xfs_btree_check_ptr(cur, lpp, i, level);
-			if (error)
-				goto error0;
-		}
-#endif
-
-		if (ptr < numrecs) {
-			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
-			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
-			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
-			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
-		}
-
-		/*
-		 * If it's the first record in the block, we'll need to pass a
-		 * key up to the next level (updkey).
-		 */
-		if (ptr == 1)
-			keyp = xfs_btree_key_addr(cur, 1, block);
-	} else {
-		/* It's a leaf. operate on records */
-		if (ptr < numrecs) {
-			xfs_btree_shift_recs(cur,
-				xfs_btree_rec_addr(cur, ptr + 1, block),
-				-1, numrecs - ptr);
-			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
-		}
-
-		/*
-		 * If it's the first record in the block, we'll need a key
-		 * structure to pass up to the next level (updkey).
-		 */
-		if (ptr == 1) {
-			cur->bc_ops->init_key_from_rec(&key,
-					xfs_btree_rec_addr(cur, 1, block));
-			keyp = &key;
-		}
-	}
-
-	/*
-	 * Decrement and log the number of entries in the block.
-	 */
-	xfs_btree_set_numrecs(block, --numrecs);
-	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
-
-	/*
-	 * If we are tracking the last record in the tree and
-	 * we are at the far right edge of the tree, update it.
-	 */
-	if (xfs_btree_is_lastrec(cur, block, level)) {
-		cur->bc_ops->update_lastrec(cur, block, NULL,
-					    ptr, LASTREC_DELREC);
-	}
-
-	/*
-	 * We're at the root level.  First, shrink the root block in-memory.
-	 * Try to get rid of the next level down.  If we can't then there's
-	 * nothing left to do.
-	 */
-	if (level == cur->bc_nlevels - 1) {
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-			xfs_iroot_realloc(cur->bc_private.b.ip, -1,
-					  cur->bc_private.b.whichfork);
-
-			error = xfs_btree_kill_iroot(cur);
-			if (error)
-				goto error0;
-
-			error = xfs_btree_dec_cursor(cur, level, stat);
-			if (error)
-				goto error0;
-			*stat = 1;
-			return 0;
-		}
-
-		/*
-		 * If this is the root level, and there's only one entry left,
-		 * and it's NOT the leaf level, then we can get rid of this
-		 * level.
-		 */
-		if (numrecs == 1 && level > 0) {
-			union xfs_btree_ptr	*pp;
-			/*
-			 * pp is still set to the first pointer in the block.
-			 * Make it the new root of the btree.
-			 */
-			pp = xfs_btree_ptr_addr(cur, 1, block);
-			error = xfs_btree_kill_root(cur, bp, level, pp);
-			if (error)
-				goto error0;
-		} else if (level > 0) {
-			error = xfs_btree_dec_cursor(cur, level, stat);
-			if (error)
-				goto error0;
-		}
-		*stat = 1;
-		return 0;
-	}
-
-	/*
-	 * If we deleted the leftmost entry in the block, update the
-	 * key values above us in the tree.
-	 */
-	if (ptr == 1) {
-		error = xfs_btree_updkey(cur, keyp, level + 1);
-		if (error)
-			goto error0;
-	}
-
-	/*
-	 * If the number of records remaining in the block is at least
-	 * the minimum, we're done.
-	 */
-	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
-		error = xfs_btree_dec_cursor(cur, level, stat);
-		if (error)
-			goto error0;
-		return 0;
-	}
-
-	/*
-	 * Otherwise, we have to move some records around to keep the
-	 * tree balanced.  Look at the left and right sibling blocks to
-	 * see if we can re-balance by moving only one record.
-	 */
-	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
-	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
-
-	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-		/*
-		 * One child of root, need to get a chance to copy its contents
-		 * into the root and delete it. Can't go up to next level,
-		 * there's nothing to delete there.
-		 */
-		if (xfs_btree_ptr_is_null(cur, &rptr) &&
-		    xfs_btree_ptr_is_null(cur, &lptr) &&
-		    level == cur->bc_nlevels - 2) {
-			error = xfs_btree_kill_iroot(cur);
-			if (!error)
-				error = xfs_btree_dec_cursor(cur, level, stat);
-			if (error)
-				goto error0;
-			return 0;
-		}
-	}
-
-	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
-	       !xfs_btree_ptr_is_null(cur, &lptr));
-
-	/*
-	 * Duplicate the cursor so our btree manipulations here won't
-	 * disrupt the next level up.
-	 */
-	error = xfs_btree_dup_cursor(cur, &tcur);
-	if (error)
-		goto error0;
-
-	/*
-	 * If there's a right sibling, see if it's ok to shift an entry
-	 * out of it.
-	 */
-	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
-		/*
-		 * Move the temp cursor to the last entry in the next block.
-		 * Actually any entry but the first would suffice.
-		 */
-		i = xfs_btree_lastrec(tcur, level);
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		error = xfs_btree_increment(tcur, level, &i);
-		if (error)
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		i = xfs_btree_lastrec(tcur, level);
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		/* Grab a pointer to the block. */
-		right = xfs_btree_get_block(tcur, level, &rbp);
-#ifdef DEBUG
-		error = xfs_btree_check_block(tcur, right, level, rbp);
-		if (error)
-			goto error0;
-#endif
-		/* Grab the current block number, for future use. */
-		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
-
-		/*
-		 * If right block is full enough so that removing one entry
-		 * won't make it too empty, and left-shifting an entry out
-		 * of right to us works, we're done.
-		 */
-		if (xfs_btree_get_numrecs(right) - 1 >=
-		    cur->bc_ops->get_minrecs(tcur, level)) {
-			error = xfs_btree_lshift(tcur, level, &i);
-			if (error)
-				goto error0;
-			if (i) {
-				ASSERT(xfs_btree_get_numrecs(block) >=
-				       cur->bc_ops->get_minrecs(tcur, level));
-
-				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-				tcur = NULL;
-
-				error = xfs_btree_dec_cursor(cur, level, stat);
-				if (error)
-					goto error0;
-				return 0;
-			}
-		}
-
-		/*
-		 * Otherwise, grab the number of records in right for
-		 * future reference, and fix up the temp cursor to point
-		 * to our block again (last record).
-		 */
-		rrecs = xfs_btree_get_numrecs(right);
-		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
-			i = xfs_btree_firstrec(tcur, level);
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-			error = xfs_btree_decrement(tcur, level, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		}
-	}
-
-	/*
-	 * If there's a left sibling, see if it's ok to shift an entry
-	 * out of it.
-	 */
-	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
-		/*
-		 * Move the temp cursor to the first entry in the
-		 * previous block.
-		 */
-		i = xfs_btree_firstrec(tcur, level);
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		error = xfs_btree_decrement(tcur, level, &i);
-		if (error)
-			goto error0;
-		i = xfs_btree_firstrec(tcur, level);
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		/* Grab a pointer to the block. */
-		left = xfs_btree_get_block(tcur, level, &lbp);
-#ifdef DEBUG
-		error = xfs_btree_check_block(cur, left, level, lbp);
-		if (error)
-			goto error0;
-#endif
-		/* Grab the current block number, for future use. */
-		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
-
-		/*
-		 * If left block is full enough so that removing one entry
-		 * won't make it too empty, and right-shifting an entry out
-		 * of left to us works, we're done.
-		 */
-		if (xfs_btree_get_numrecs(left) - 1 >=
-		    cur->bc_ops->get_minrecs(tcur, level)) {
-			error = xfs_btree_rshift(tcur, level, &i);
-			if (error)
-				goto error0;
-			if (i) {
-				ASSERT(xfs_btree_get_numrecs(block) >=
-				       cur->bc_ops->get_minrecs(tcur, level));
-				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-				tcur = NULL;
-				if (level == 0)
-					cur->bc_ptrs[0]++;
-				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-				*stat = 1;
-				return 0;
-			}
-		}
-
-		/*
-		 * Otherwise, grab the number of records in right for
-		 * future reference.
-		 */
-		lrecs = xfs_btree_get_numrecs(left);
-	}
-
-	/* Delete the temp cursor, we're done with it. */
-	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-	tcur = NULL;
-
-	/* If here, we need to do a join to keep the tree balanced. */
-	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
-
-	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
-	    lrecs + xfs_btree_get_numrecs(block) <=
-			cur->bc_ops->get_maxrecs(cur, level)) {
-		/*
-		 * Set "right" to be the starting block,
-		 * "left" to be the left neighbor.
-		 */
-		rptr = cptr;
-		right = block;
-		rbp = bp;
-		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
-		if (error)
-			goto error0;
-
-	/*
-	 * If that won't work, see if we can join with the right neighbor block.
-	 */
-	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
-		   rrecs + xfs_btree_get_numrecs(block) <=
-			cur->bc_ops->get_maxrecs(cur, level)) {
-		/*
-		 * Set "left" to be the starting block,
-		 * "right" to be the right neighbor.
-		 */
-		lptr = cptr;
-		left = block;
-		lbp = bp;
-		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
-		if (error)
-			goto error0;
-
-	/*
-	 * Otherwise, we can't fix the imbalance.
-	 * Just return.  This is probably a logic error, but it's not fatal.
-	 */
-	} else {
-		error = xfs_btree_dec_cursor(cur, level, stat);
-		if (error)
-			goto error0;
-		return 0;
-	}
-
-	rrecs = xfs_btree_get_numrecs(right);
-	lrecs = xfs_btree_get_numrecs(left);
-
-	/*
-	 * We're now going to join "left" and "right" by moving all the stuff
-	 * in "right" to "left" and deleting "right".
-	 */
-	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
-	if (level > 0) {
-		/* It's a non-leaf.  Move keys and pointers. */
-		union xfs_btree_key	*lkp;	/* left btree key */
-		union xfs_btree_ptr	*lpp;	/* left address pointer */
-		union xfs_btree_key	*rkp;	/* right btree key */
-		union xfs_btree_ptr	*rpp;	/* right address pointer */
-
-		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
-		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
-		rkp = xfs_btree_key_addr(cur, 1, right);
-		rpp = xfs_btree_ptr_addr(cur, 1, right);
-#ifdef DEBUG
-		for (i = 1; i < rrecs; i++) {
-			error = xfs_btree_check_ptr(cur, rpp, i, level);
-			if (error)
-				goto error0;
-		}
-#endif
-		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
-		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
-
-		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
-		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
-	} else {
-		/* It's a leaf.  Move records.  */
-		union xfs_btree_rec	*lrp;	/* left record pointer */
-		union xfs_btree_rec	*rrp;	/* right record pointer */
-
-		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
-		rrp = xfs_btree_rec_addr(cur, 1, right);
-
-		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
-		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
-	}
-
-	XFS_BTREE_STATS_INC(cur, join);
-
-	/*
-	 * Fix up the number of records and right block pointer in the
-	 * surviving block, and log it.
-	 */
-	xfs_btree_set_numrecs(left, lrecs + rrecs);
-	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
-	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
-	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-
-	/* If there is a right sibling, point it to the remaining block. */
-	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
-	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
-		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
-		if (error)
-			goto error0;
-		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
-		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-	}
-
-	/* Free the deleted block. */
-	error = cur->bc_ops->free_block(cur, rbp);
-	if (error)
-		goto error0;
-	XFS_BTREE_STATS_INC(cur, free);
-
-	/*
-	 * If we joined with the left neighbor, set the buffer in the
-	 * cursor to the left block, and fix up the index.
-	 */
-	if (bp != lbp) {
-		cur->bc_bufs[level] = lbp;
-		cur->bc_ptrs[level] += lrecs;
-		cur->bc_ra[level] = 0;
-	}
-	/*
-	 * If we joined with the right neighbor and there's a level above
-	 * us, increment the cursor at that level.
-	 */
-	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
-		   (level + 1 < cur->bc_nlevels)) {
-		error = xfs_btree_increment(cur, level + 1, &i);
-		if (error)
-			goto error0;
-	}
-
-	/*
-	 * Readjust the ptr at this level if it's not a leaf, since it's
-	 * still pointing at the deletion point, which makes the cursor
-	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
-	 * We can't use decrement because it would change the next level up.
-	 */
-	if (level > 0)
-		cur->bc_ptrs[level]--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	/* Return value means the next level up has something to do. */
-	*stat = 2;
-	return 0;
-
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	if (tcur)
-		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int					/* error */
-xfs_btree_delete(
-	struct xfs_btree_cur	*cur,
-	int			*stat)	/* success/failure */
-{
-	int			error;	/* error return value */
-	int			level;
-	int			i;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-	/*
-	 * Go up the tree, starting at leaf level.
-	 *
-	 * If 2 is returned then a join was done; go to the next level.
-	 * Otherwise we are done.
-	 */
-	for (level = 0, i = 2; i == 2; level++) {
-		error = xfs_btree_delrec(cur, level, &i);
-		if (error)
-			goto error0;
-	}
-
-	if (i == 0) {
-		for (level = 1; level < cur->bc_nlevels; level++) {
-			if (cur->bc_ptrs[level] == 0) {
-				error = xfs_btree_decrement(cur, level, &i);
-				if (error)
-					goto error0;
-				break;
-			}
-		}
-	}
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	*stat = i;
-	return 0;
-error0:
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-	return error;
-}
-
-/*
- * Get the data from the pointed-to record.
- */
-int					/* error */
-xfs_btree_get_rec(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	union xfs_btree_rec	**recp,	/* output: btree record */
-	int			*stat)	/* output: success/failure */
-{
-	struct xfs_btree_block	*block;	/* btree block */
-	struct xfs_buf		*bp;	/* buffer pointer */
-	int			ptr;	/* record number */
-#ifdef DEBUG
-	int			error;	/* error return value */
-#endif
-
-	ptr = cur->bc_ptrs[0];
-	block = xfs_btree_get_block(cur, 0, &bp);
-
-#ifdef DEBUG
-	error = xfs_btree_check_block(cur, block, 0, bp);
-	if (error)
-		return error;
-#endif
-
-	/*
-	 * Off the right end or left end, return failure.
-	 */
-	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
-		*stat = 0;
-		return 0;
-	}
-
-	/*
-	 * Point to the record and extract its data.
-	 */
-	*recp = xfs_btree_rec_addr(cur, ptr, block);
-	*stat = 1;
-	return 0;
-}
-
-/*
- * Change the owner of a btree.
- *
- * The mechanism we use here is ordered buffer logging. Because we don't know
- * how many buffers were are going to need to modify, we don't really want to
- * have to make transaction reservations for the worst case of every buffer in a
- * full size btree as that may be more space that we can fit in the log....
- *
- * We do the btree walk in the most optimal manner possible - we have sibling
- * pointers so we can just walk all the blocks on each level from left to right
- * in a single pass, and then move to the next level and do the same. We can
- * also do readahead on the sibling pointers to get IO moving more quickly,
- * though for slow disks this is unlikely to make much difference to performance
- * as the amount of CPU work we have to do before moving to the next block is
- * relatively small.
- *
- * For each btree block that we load, modify the owner appropriately, set the
- * buffer as an ordered buffer and log it appropriately. We need to ensure that
- * we mark the region we change dirty so that if the buffer is relogged in
- * a subsequent transaction the changes we make here as an ordered buffer are
- * correctly relogged in that transaction.  If we are in recovery context, then
- * just queue the modified buffer as delayed write buffer so the transaction
- * recovery completion writes the changes to disk.
- */
-static int
-xfs_btree_block_change_owner(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	__uint64_t		new_owner,
-	struct list_head	*buffer_list)
-{
-	struct xfs_btree_block	*block;
-	struct xfs_buf		*bp;
-	union xfs_btree_ptr     rptr;
-
-	/* do right sibling readahead */
-	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-
-	/* modify the owner */
-	block = xfs_btree_get_block(cur, level, &bp);
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
-	else
-		block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
-
-	/*
-	 * If the block is a root block hosted in an inode, we might not have a
-	 * buffer pointer here and we shouldn't attempt to log the change as the
-	 * information is already held in the inode and discarded when the root
-	 * block is formatted into the on-disk inode fork. We still change it,
-	 * though, so everything is consistent in memory.
-	 */
-	if (bp) {
-		if (cur->bc_tp) {
-			xfs_trans_ordered_buf(cur->bc_tp, bp);
-			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
-		} else {
-			xfs_buf_delwri_queue(bp, buffer_list);
-		}
-	} else {
-		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
-		ASSERT(level == cur->bc_nlevels - 1);
-	}
-
-	/* now read rh sibling block for next iteration */
-	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
-	if (xfs_btree_ptr_is_null(cur, &rptr))
-		return ENOENT;
-
-	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
-}
-
-int
-xfs_btree_change_owner(
-	struct xfs_btree_cur	*cur,
-	__uint64_t		new_owner,
-	struct list_head	*buffer_list)
-{
-	union xfs_btree_ptr     lptr;
-	int			level;
-	struct xfs_btree_block	*block = NULL;
-	int			error = 0;
-
-	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
-
-	/* for each level */
-	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
-		/* grab the left hand block */
-		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
-		if (error)
-			return error;
-
-		/* readahead the left most block for the next level down */
-		if (level > 0) {
-			union xfs_btree_ptr     *ptr;
-
-			ptr = xfs_btree_ptr_addr(cur, 1, block);
-			xfs_btree_readahead_ptr(cur, ptr, 1);
-
-			/* save for the next iteration of the loop */
-			lptr = *ptr;
-		}
-
-		/* for each buffer in the level */
-		do {
-			error = xfs_btree_block_change_owner(cur, level,
-							     new_owner,
-							     buffer_list);
-		} while (!error);
-
-		if (error != ENOENT)
-			return error;
-	}
-
-	return 0;
-}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
deleted file mode 100644
index a1a4e3e47a1e..000000000000
--- a/fs/xfs/xfs_da_btree.c
+++ /dev/null
@@ -1,2665 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
-
-/*
- * xfs_da_btree.c
- *
- * Routines to implement directories as Btrees of hashed names.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-STATIC int xfs_da3_root_split(xfs_da_state_t *state,
-					    xfs_da_state_blk_t *existing_root,
-					    xfs_da_state_blk_t *new_child);
-STATIC int xfs_da3_node_split(xfs_da_state_t *state,
-					    xfs_da_state_blk_t *existing_blk,
-					    xfs_da_state_blk_t *split_blk,
-					    xfs_da_state_blk_t *blk_to_add,
-					    int treelevel,
-					    int *result);
-STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
-					 xfs_da_state_blk_t *node_blk_1,
-					 xfs_da_state_blk_t *node_blk_2);
-STATIC void xfs_da3_node_add(xfs_da_state_t *state,
-				   xfs_da_state_blk_t *old_node_blk,
-				   xfs_da_state_blk_t *new_node_blk);
-
-/*
- * Routines used for shrinking the Btree.
- */
-STATIC int xfs_da3_root_join(xfs_da_state_t *state,
-					   xfs_da_state_blk_t *root_blk);
-STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
-STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
-					      xfs_da_state_blk_t *drop_blk);
-STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
-					 xfs_da_state_blk_t *src_node_blk,
-					 xfs_da_state_blk_t *dst_node_blk);
-
-/*
- * Utility routines.
- */
-STATIC int	xfs_da3_blk_unlink(xfs_da_state_t *state,
-				  xfs_da_state_blk_t *drop_blk,
-				  xfs_da_state_blk_t *save_blk);
-
-
-kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
-
-/*
- * Allocate a dir-state structure.
- * We don't put them on the stack since they're large.
- */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
-{
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
-}
-
-/*
- * Kill the altpath contents of a da-state structure.
- */
-STATIC void
-xfs_da_state_kill_altpath(xfs_da_state_t *state)
-{
-	int	i;
-
-	for (i = 0; i < state->altpath.active; i++)
-		state->altpath.blk[i].bp = NULL;
-	state->altpath.active = 0;
-}
-
-/*
- * Free a da-state structure.
- */
-void
-xfs_da_state_free(xfs_da_state_t *state)
-{
-	xfs_da_state_kill_altpath(state);
-#ifdef DEBUG
-	memset((char *)state, 0, sizeof(*state));
-#endif /* DEBUG */
-	kmem_zone_free(xfs_da_state_zone, state);
-}
-
-static bool
-xfs_da3_node_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_da_intnode	*hdr = bp->b_addr;
-	struct xfs_da3_icnode_hdr ichdr;
-	const struct xfs_dir_ops *ops;
-
-	ops = xfs_dir_get_ops(mp, NULL);
-
-	ops->node_hdr_from_disk(&ichdr, hdr);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-		if (ichdr.magic != XFS_DA3_NODE_MAGIC)
-			return false;
-
-		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-			return false;
-	} else {
-		if (ichdr.magic != XFS_DA_NODE_MAGIC)
-			return false;
-	}
-	if (ichdr.level == 0)
-		return false;
-	if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
-		return false;
-	if (ichdr.count == 0)
-		return false;
-
-	/*
-	 * we don't know if the node is for and attribute or directory tree,
-	 * so only fail if the count is outside both bounds
-	 */
-	if (ichdr.count > mp->m_dir_geo->node_ents &&
-	    ichdr.count > mp->m_attr_geo->node_ents)
-		return false;
-
-	/* XXX: hash order check? */
-
-	return true;
-}
-
-static void
-xfs_da3_node_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-	struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-	if (!xfs_da3_node_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
-}
-
-/*
- * leaf/node format detection on trees is sketchy, so a node read can be done on
- * leaf level blocks when detection identifies the tree as a node format tree
- * incorrectly. In this case, we need to swap the verifier to match the correct
- * format of the block being read.
- */
-static void
-xfs_da3_node_read_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_da_blkinfo	*info = bp->b_addr;
-
-	switch (be16_to_cpu(info->magic)) {
-		case XFS_DA3_NODE_MAGIC:
-			if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
-				xfs_buf_ioerror(bp, EFSBADCRC);
-				break;
-			}
-			/* fall through */
-		case XFS_DA_NODE_MAGIC:
-			if (!xfs_da3_node_verify(bp)) {
-				xfs_buf_ioerror(bp, EFSCORRUPTED);
-				break;
-			}
-			return;
-		case XFS_ATTR_LEAF_MAGIC:
-		case XFS_ATTR3_LEAF_MAGIC:
-			bp->b_ops = &xfs_attr3_leaf_buf_ops;
-			bp->b_ops->verify_read(bp);
-			return;
-		case XFS_DIR2_LEAFN_MAGIC:
-		case XFS_DIR3_LEAFN_MAGIC:
-			bp->b_ops = &xfs_dir3_leafn_buf_ops;
-			bp->b_ops->verify_read(bp);
-			return;
-		default:
-			break;
-	}
-
-	/* corrupt block */
-	xfs_verifier_error(bp);
-}
-
-const struct xfs_buf_ops xfs_da3_node_buf_ops = {
-	.verify_read = xfs_da3_node_read_verify,
-	.verify_write = xfs_da3_node_write_verify,
-};
-
-int
-xfs_da3_node_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mappedbno,
-	struct xfs_buf		**bpp,
-	int			which_fork)
-{
-	int			err;
-
-	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-					which_fork, &xfs_da3_node_buf_ops);
-	if (!err && tp) {
-		struct xfs_da_blkinfo	*info = (*bpp)->b_addr;
-		int			type;
-
-		switch (be16_to_cpu(info->magic)) {
-		case XFS_DA_NODE_MAGIC:
-		case XFS_DA3_NODE_MAGIC:
-			type = XFS_BLFT_DA_NODE_BUF;
-			break;
-		case XFS_ATTR_LEAF_MAGIC:
-		case XFS_ATTR3_LEAF_MAGIC:
-			type = XFS_BLFT_ATTR_LEAF_BUF;
-			break;
-		case XFS_DIR2_LEAFN_MAGIC:
-		case XFS_DIR3_LEAFN_MAGIC:
-			type = XFS_BLFT_DIR_LEAFN_BUF;
-			break;
-		default:
-			type = 0;
-			ASSERT(0);
-			break;
-		}
-		xfs_trans_buf_set_type(tp, *bpp, type);
-	}
-	return err;
-}
-
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-
-/*
- * Create the initial contents of an intermediate node.
- */
-int
-xfs_da3_node_create(
-	struct xfs_da_args	*args,
-	xfs_dablk_t		blkno,
-	int			level,
-	struct xfs_buf		**bpp,
-	int			whichfork)
-{
-	struct xfs_da_intnode	*node;
-	struct xfs_trans	*tp = args->trans;
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_da3_icnode_hdr ichdr = {0};
-	struct xfs_buf		*bp;
-	int			error;
-	struct xfs_inode	*dp = args->dp;
-
-	trace_xfs_da_node_create(args);
-	ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
-
-	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
-	if (error)
-		return error;
-	bp->b_ops = &xfs_da3_node_buf_ops;
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
-	node = bp->b_addr;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-		ichdr.magic = XFS_DA3_NODE_MAGIC;
-		hdr3->info.blkno = cpu_to_be64(bp->b_bn);
-		hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
-		uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
-	} else {
-		ichdr.magic = XFS_DA_NODE_MAGIC;
-	}
-	ichdr.level = level;
-
-	dp->d_ops->node_hdr_to_disk(node, &ichdr);
-	xfs_trans_log_buf(tp, bp,
-		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
-
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Split a leaf node, rebalance, then possibly split
- * intermediate nodes, rebalance, etc.
- */
-int							/* error */
-xfs_da3_split(
-	struct xfs_da_state	*state)
-{
-	struct xfs_da_state_blk	*oldblk;
-	struct xfs_da_state_blk	*newblk;
-	struct xfs_da_state_blk	*addblk;
-	struct xfs_da_intnode	*node;
-	struct xfs_buf		*bp;
-	int			max;
-	int			action = 0;
-	int			error;
-	int			i;
-
-	trace_xfs_da_split(state->args);
-
-	/*
-	 * Walk back up the tree splitting/inserting/adjusting as necessary.
-	 * If we need to insert and there isn't room, split the node, then
-	 * decide which fragment to insert the new block from below into.
-	 * Note that we may split the root this way, but we need more fixup.
-	 */
-	max = state->path.active - 1;
-	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
-	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
-	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
-
-	addblk = &state->path.blk[max];		/* initial dummy value */
-	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
-		oldblk = &state->path.blk[i];
-		newblk = &state->altpath.blk[i];
-
-		/*
-		 * If a leaf node then
-		 *     Allocate a new leaf node, then rebalance across them.
-		 * else if an intermediate node then
-		 *     We split on the last layer, must we split the node?
-		 */
-		switch (oldblk->magic) {
-		case XFS_ATTR_LEAF_MAGIC:
-			error = xfs_attr3_leaf_split(state, oldblk, newblk);
-			if ((error != 0) && (error != ENOSPC)) {
-				return error;	/* GROT: attr is inconsistent */
-			}
-			if (!error) {
-				addblk = newblk;
-				break;
-			}
-			/*
-			 * Entry wouldn't fit, split the leaf again.
-			 */
-			state->extravalid = 1;
-			if (state->inleaf) {
-				state->extraafter = 0;	/* before newblk */
-				trace_xfs_attr_leaf_split_before(state->args);
-				error = xfs_attr3_leaf_split(state, oldblk,
-							    &state->extrablk);
-			} else {
-				state->extraafter = 1;	/* after newblk */
-				trace_xfs_attr_leaf_split_after(state->args);
-				error = xfs_attr3_leaf_split(state, newblk,
-							    &state->extrablk);
-			}
-			if (error)
-				return error;	/* GROT: attr inconsistent */
-			addblk = newblk;
-			break;
-		case XFS_DIR2_LEAFN_MAGIC:
-			error = xfs_dir2_leafn_split(state, oldblk, newblk);
-			if (error)
-				return error;
-			addblk = newblk;
-			break;
-		case XFS_DA_NODE_MAGIC:
-			error = xfs_da3_node_split(state, oldblk, newblk, addblk,
-							 max - i, &action);
-			addblk->bp = NULL;
-			if (error)
-				return error;	/* GROT: dir is inconsistent */
-			/*
-			 * Record the newly split block for the next time thru?
-			 */
-			if (action)
-				addblk = newblk;
-			else
-				addblk = NULL;
-			break;
-		}
-
-		/*
-		 * Update the btree to show the new hashval for this child.
-		 */
-		xfs_da3_fixhashpath(state, &state->path);
-	}
-	if (!addblk)
-		return 0;
-
-	/*
-	 * Split the root node.
-	 */
-	ASSERT(state->path.active == 0);
-	oldblk = &state->path.blk[0];
-	error = xfs_da3_root_split(state, oldblk, addblk);
-	if (error) {
-		addblk->bp = NULL;
-		return error;	/* GROT: dir is inconsistent */
-	}
-
-	/*
-	 * Update pointers to the node which used to be block 0 and
-	 * just got bumped because of the addition of a new root node.
-	 * There might be three blocks involved if a double split occurred,
-	 * and the original block 0 could be at any position in the list.
-	 *
-	 * Note: the magic numbers and sibling pointers are in the same
-	 * physical place for both v2 and v3 headers (by design). Hence it
-	 * doesn't matter which version of the xfs_da_intnode structure we use
-	 * here as the result will be the same using either structure.
-	 */
-	node = oldblk->bp->b_addr;
-	if (node->hdr.info.forw) {
-		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
-			bp = addblk->bp;
-		} else {
-			ASSERT(state->extravalid);
-			bp = state->extrablk.bp;
-		}
-		node = bp->b_addr;
-		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
-		xfs_trans_log_buf(state->args->trans, bp,
-		    XFS_DA_LOGRANGE(node, &node->hdr.info,
-		    sizeof(node->hdr.info)));
-	}
-	node = oldblk->bp->b_addr;
-	if (node->hdr.info.back) {
-		if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
-			bp = addblk->bp;
-		} else {
-			ASSERT(state->extravalid);
-			bp = state->extrablk.bp;
-		}
-		node = bp->b_addr;
-		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
-		xfs_trans_log_buf(state->args->trans, bp,
-		    XFS_DA_LOGRANGE(node, &node->hdr.info,
-		    sizeof(node->hdr.info)));
-	}
-	addblk->bp = NULL;
-	return 0;
-}
-
-/*
- * Split the root.  We have to create a new root and point to the two
- * parts (the split old root) that we just created.  Copy block zero to
- * the EOF, extending the inode in process.
- */
-STATIC int						/* error */
-xfs_da3_root_split(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*blk1,
-	struct xfs_da_state_blk	*blk2)
-{
-	struct xfs_da_intnode	*node;
-	struct xfs_da_intnode	*oldroot;
-	struct xfs_da_node_entry *btree;
-	struct xfs_da3_icnode_hdr nodehdr;
-	struct xfs_da_args	*args;
-	struct xfs_buf		*bp;
-	struct xfs_inode	*dp;
-	struct xfs_trans	*tp;
-	struct xfs_mount	*mp;
-	struct xfs_dir2_leaf	*leaf;
-	xfs_dablk_t		blkno;
-	int			level;
-	int			error;
-	int			size;
-
-	trace_xfs_da_root_split(state->args);
-
-	/*
-	 * Copy the existing (incorrect) block from the root node position
-	 * to a free space somewhere.
-	 */
-	args = state->args;
-	error = xfs_da_grow_inode(args, &blkno);
-	if (error)
-		return error;
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = state->mp;
-	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
-	if (error)
-		return error;
-	node = bp->b_addr;
-	oldroot = blk1->bp->b_addr;
-	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
-		struct xfs_da3_icnode_hdr nodehdr;
-
-		dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
-		btree = dp->d_ops->node_tree_p(oldroot);
-		size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
-		level = nodehdr.level;
-
-		/*
-		 * we are about to copy oldroot to bp, so set up the type
-		 * of bp while we know exactly what it will be.
-		 */
-		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
-	} else {
-		struct xfs_dir3_icleaf_hdr leafhdr;
-		struct xfs_dir2_leaf_entry *ents;
-
-		leaf = (xfs_dir2_leaf_t *)oldroot;
-		dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-		ents = dp->d_ops->leaf_ents_p(leaf);
-
-		ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
-		       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
-		size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
-		level = 0;
-
-		/*
-		 * we are about to copy oldroot to bp, so set up the type
-		 * of bp while we know exactly what it will be.
-		 */
-		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
-	}
-
-	/*
-	 * we can copy most of the information in the node from one block to
-	 * another, but for CRC enabled headers we have to make sure that the
-	 * block specific identifiers are kept intact. We update the buffer
-	 * directly for this.
-	 */
-	memcpy(node, oldroot, size);
-	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
-	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-		struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
-
-		node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
-	}
-	xfs_trans_log_buf(tp, bp, 0, size - 1);
-
-	bp->b_ops = blk1->bp->b_ops;
-	xfs_trans_buf_copy_type(bp, blk1->bp);
-	blk1->bp = bp;
-	blk1->blkno = blkno;
-
-	/*
-	 * Set up the new root node.
-	 */
-	error = xfs_da3_node_create(args,
-		(args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
-		level + 1, &bp, args->whichfork);
-	if (error)
-		return error;
-
-	node = bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-	btree = dp->d_ops->node_tree_p(node);
-	btree[0].hashval = cpu_to_be32(blk1->hashval);
-	btree[0].before = cpu_to_be32(blk1->blkno);
-	btree[1].hashval = cpu_to_be32(blk2->hashval);
-	btree[1].before = cpu_to_be32(blk2->blkno);
-	nodehdr.count = 2;
-	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
-
-#ifdef DEBUG
-	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-		ASSERT(blk1->blkno >= args->geo->leafblk &&
-		       blk1->blkno < args->geo->freeblk);
-		ASSERT(blk2->blkno >= args->geo->leafblk &&
-		       blk2->blkno < args->geo->freeblk);
-	}
-#endif
-
-	/* Header is already logged by xfs_da_node_create */
-	xfs_trans_log_buf(tp, bp,
-		XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
-
-	return 0;
-}
-
-/*
- * Split the node, rebalance, then add the new entry.
- */
-STATIC int						/* error */
-xfs_da3_node_split(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*oldblk,
-	struct xfs_da_state_blk	*newblk,
-	struct xfs_da_state_blk	*addblk,
-	int			treelevel,
-	int			*result)
-{
-	struct xfs_da_intnode	*node;
-	struct xfs_da3_icnode_hdr nodehdr;
-	xfs_dablk_t		blkno;
-	int			newcount;
-	int			error;
-	int			useextra;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_node_split(state->args);
-
-	node = oldblk->bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-
-	/*
-	 * With V2 dirs the extra block is data or freespace.
-	 */
-	useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
-	newcount = 1 + useextra;
-	/*
-	 * Do we have to split the node?
-	 */
-	if (nodehdr.count + newcount > state->args->geo->node_ents) {
-		/*
-		 * Allocate a new node, add to the doubly linked chain of
-		 * nodes, then move some of our excess entries into it.
-		 */
-		error = xfs_da_grow_inode(state->args, &blkno);
-		if (error)
-			return error;	/* GROT: dir is inconsistent */
-
-		error = xfs_da3_node_create(state->args, blkno, treelevel,
-					   &newblk->bp, state->args->whichfork);
-		if (error)
-			return error;	/* GROT: dir is inconsistent */
-		newblk->blkno = blkno;
-		newblk->magic = XFS_DA_NODE_MAGIC;
-		xfs_da3_node_rebalance(state, oldblk, newblk);
-		error = xfs_da3_blk_link(state, oldblk, newblk);
-		if (error)
-			return error;
-		*result = 1;
-	} else {
-		*result = 0;
-	}
-
-	/*
-	 * Insert the new entry(s) into the correct block
-	 * (updating last hashval in the process).
-	 *
-	 * xfs_da3_node_add() inserts BEFORE the given index,
-	 * and as a result of using node_lookup_int() we always
-	 * point to a valid entry (not after one), but a split
-	 * operation always results in a new block whose hashvals
-	 * FOLLOW the current block.
-	 *
-	 * If we had double-split op below us, then add the extra block too.
-	 */
-	node = oldblk->bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-	if (oldblk->index <= nodehdr.count) {
-		oldblk->index++;
-		xfs_da3_node_add(state, oldblk, addblk);
-		if (useextra) {
-			if (state->extraafter)
-				oldblk->index++;
-			xfs_da3_node_add(state, oldblk, &state->extrablk);
-			state->extravalid = 0;
-		}
-	} else {
-		newblk->index++;
-		xfs_da3_node_add(state, newblk, addblk);
-		if (useextra) {
-			if (state->extraafter)
-				newblk->index++;
-			xfs_da3_node_add(state, newblk, &state->extrablk);
-			state->extravalid = 0;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Balance the btree elements between two intermediate nodes,
- * usually one full and one empty.
- *
- * NOTE: if blk2 is empty, then it will get the upper half of blk1.
- */
-STATIC void
-xfs_da3_node_rebalance(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*blk1,
-	struct xfs_da_state_blk	*blk2)
-{
-	struct xfs_da_intnode	*node1;
-	struct xfs_da_intnode	*node2;
-	struct xfs_da_intnode	*tmpnode;
-	struct xfs_da_node_entry *btree1;
-	struct xfs_da_node_entry *btree2;
-	struct xfs_da_node_entry *btree_s;
-	struct xfs_da_node_entry *btree_d;
-	struct xfs_da3_icnode_hdr nodehdr1;
-	struct xfs_da3_icnode_hdr nodehdr2;
-	struct xfs_trans	*tp;
-	int			count;
-	int			tmp;
-	int			swap = 0;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_node_rebalance(state->args);
-
-	node1 = blk1->bp->b_addr;
-	node2 = blk2->bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
-	dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
-	btree1 = dp->d_ops->node_tree_p(node1);
-	btree2 = dp->d_ops->node_tree_p(node2);
-
-	/*
-	 * Figure out how many entries need to move, and in which direction.
-	 * Swap the nodes around if that makes it simpler.
-	 */
-	if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
-	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
-	     (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
-			be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
-		tmpnode = node1;
-		node1 = node2;
-		node2 = tmpnode;
-		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
-		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
-		btree1 = dp->d_ops->node_tree_p(node1);
-		btree2 = dp->d_ops->node_tree_p(node2);
-		swap = 1;
-	}
-
-	count = (nodehdr1.count - nodehdr2.count) / 2;
-	if (count == 0)
-		return;
-	tp = state->args->trans;
-	/*
-	 * Two cases: high-to-low and low-to-high.
-	 */
-	if (count > 0) {
-		/*
-		 * Move elements in node2 up to make a hole.
-		 */
-		tmp = nodehdr2.count;
-		if (tmp > 0) {
-			tmp *= (uint)sizeof(xfs_da_node_entry_t);
-			btree_s = &btree2[0];
-			btree_d = &btree2[count];
-			memmove(btree_d, btree_s, tmp);
-		}
-
-		/*
-		 * Move the req'd B-tree elements from high in node1 to
-		 * low in node2.
-		 */
-		nodehdr2.count += count;
-		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
-		btree_s = &btree1[nodehdr1.count - count];
-		btree_d = &btree2[0];
-		memcpy(btree_d, btree_s, tmp);
-		nodehdr1.count -= count;
-	} else {
-		/*
-		 * Move the req'd B-tree elements from low in node2 to
-		 * high in node1.
-		 */
-		count = -count;
-		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
-		btree_s = &btree2[0];
-		btree_d = &btree1[nodehdr1.count];
-		memcpy(btree_d, btree_s, tmp);
-		nodehdr1.count += count;
-
-		xfs_trans_log_buf(tp, blk1->bp,
-			XFS_DA_LOGRANGE(node1, btree_d, tmp));
-
-		/*
-		 * Move elements in node2 down to fill the hole.
-		 */
-		tmp  = nodehdr2.count - count;
-		tmp *= (uint)sizeof(xfs_da_node_entry_t);
-		btree_s = &btree2[count];
-		btree_d = &btree2[0];
-		memmove(btree_d, btree_s, tmp);
-		nodehdr2.count -= count;
-	}
-
-	/*
-	 * Log header of node 1 and all current bits of node 2.
-	 */
-	dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
-	xfs_trans_log_buf(tp, blk1->bp,
-		XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
-
-	dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
-	xfs_trans_log_buf(tp, blk2->bp,
-		XFS_DA_LOGRANGE(node2, &node2->hdr,
-				dp->d_ops->node_hdr_size +
-				(sizeof(btree2[0]) * nodehdr2.count)));
-
-	/*
-	 * Record the last hashval from each block for upward propagation.
-	 * (note: don't use the swapped node pointers)
-	 */
-	if (swap) {
-		node1 = blk1->bp->b_addr;
-		node2 = blk2->bp->b_addr;
-		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
-		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
-		btree1 = dp->d_ops->node_tree_p(node1);
-		btree2 = dp->d_ops->node_tree_p(node2);
-	}
-	blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
-	blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
-
-	/*
-	 * Adjust the expected index for insertion.
-	 */
-	if (blk1->index >= nodehdr1.count) {
-		blk2->index = blk1->index - nodehdr1.count;
-		blk1->index = nodehdr1.count + 1;	/* make it invalid */
-	}
-}
-
-/*
- * Add a new entry to an intermediate node.
- */
-STATIC void
-xfs_da3_node_add(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*oldblk,
-	struct xfs_da_state_blk	*newblk)
-{
-	struct xfs_da_intnode	*node;
-	struct xfs_da3_icnode_hdr nodehdr;
-	struct xfs_da_node_entry *btree;
-	int			tmp;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_node_add(state->args);
-
-	node = oldblk->bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-	btree = dp->d_ops->node_tree_p(node);
-
-	ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
-	ASSERT(newblk->blkno != 0);
-	if (state->args->whichfork == XFS_DATA_FORK)
-		ASSERT(newblk->blkno >= state->args->geo->leafblk &&
-		       newblk->blkno < state->args->geo->freeblk);
-
-	/*
-	 * We may need to make some room before we insert the new node.
-	 */
-	tmp = 0;
-	if (oldblk->index < nodehdr.count) {
-		tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
-		memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
-	}
-	btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
-	btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
-	xfs_trans_log_buf(state->args->trans, oldblk->bp,
-		XFS_DA_LOGRANGE(node, &btree[oldblk->index],
-				tmp + sizeof(*btree)));
-
-	nodehdr.count += 1;
-	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
-	xfs_trans_log_buf(state->args->trans, oldblk->bp,
-		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
-
-	/*
-	 * Copy the last hash value from the oldblk to propagate upwards.
-	 */
-	oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
-}
-
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-
-/*
- * Deallocate an empty leaf node, remove it from its parent,
- * possibly deallocating that block, etc...
- */
-int
-xfs_da3_join(
-	struct xfs_da_state	*state)
-{
-	struct xfs_da_state_blk	*drop_blk;
-	struct xfs_da_state_blk	*save_blk;
-	int			action = 0;
-	int			error;
-
-	trace_xfs_da_join(state->args);
-
-	drop_blk = &state->path.blk[ state->path.active-1 ];
-	save_blk = &state->altpath.blk[ state->path.active-1 ];
-	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
-	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
-	       drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
-
-	/*
-	 * Walk back up the tree joining/deallocating as necessary.
-	 * When we stop dropping blocks, break out.
-	 */
-	for (  ; state->path.active >= 2; drop_blk--, save_blk--,
-		 state->path.active--) {
-		/*
-		 * See if we can combine the block with a neighbor.
-		 *   (action == 0) => no options, just leave
-		 *   (action == 1) => coalesce, then unlink
-		 *   (action == 2) => block empty, unlink it
-		 */
-		switch (drop_blk->magic) {
-		case XFS_ATTR_LEAF_MAGIC:
-			error = xfs_attr3_leaf_toosmall(state, &action);
-			if (error)
-				return error;
-			if (action == 0)
-				return 0;
-			xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
-			break;
-		case XFS_DIR2_LEAFN_MAGIC:
-			error = xfs_dir2_leafn_toosmall(state, &action);
-			if (error)
-				return error;
-			if (action == 0)
-				return 0;
-			xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
-			break;
-		case XFS_DA_NODE_MAGIC:
-			/*
-			 * Remove the offending node, fixup hashvals,
-			 * check for a toosmall neighbor.
-			 */
-			xfs_da3_node_remove(state, drop_blk);
-			xfs_da3_fixhashpath(state, &state->path);
-			error = xfs_da3_node_toosmall(state, &action);
-			if (error)
-				return error;
-			if (action == 0)
-				return 0;
-			xfs_da3_node_unbalance(state, drop_blk, save_blk);
-			break;
-		}
-		xfs_da3_fixhashpath(state, &state->altpath);
-		error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
-		xfs_da_state_kill_altpath(state);
-		if (error)
-			return error;
-		error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
-							 drop_blk->bp);
-		drop_blk->bp = NULL;
-		if (error)
-			return error;
-	}
-	/*
-	 * We joined all the way to the top.  If it turns out that
-	 * we only have one entry in the root, make the child block
-	 * the new root.
-	 */
-	xfs_da3_node_remove(state, drop_blk);
-	xfs_da3_fixhashpath(state, &state->path);
-	error = xfs_da3_root_join(state, &state->path.blk[0]);
-	return error;
-}
-
-#ifdef	DEBUG
-static void
-xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
-{
-	__be16	magic = blkinfo->magic;
-
-	if (level == 1) {
-		ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-		       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
-		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
-		       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
-	} else {
-		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-		       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
-	}
-	ASSERT(!blkinfo->forw);
-	ASSERT(!blkinfo->back);
-}
-#else	/* !DEBUG */
-#define	xfs_da_blkinfo_onlychild_validate(blkinfo, level)
-#endif	/* !DEBUG */
-
-/*
- * We have only one entry in the root.  Copy the only remaining child of
- * the old root to block 0 as the new root node.
- */
-STATIC int
-xfs_da3_root_join(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*root_blk)
-{
-	struct xfs_da_intnode	*oldroot;
-	struct xfs_da_args	*args;
-	xfs_dablk_t		child;
-	struct xfs_buf		*bp;
-	struct xfs_da3_icnode_hdr oldroothdr;
-	struct xfs_da_node_entry *btree;
-	int			error;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_root_join(state->args);
-
-	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
-
-	args = state->args;
-	oldroot = root_blk->bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
-	ASSERT(oldroothdr.forw == 0);
-	ASSERT(oldroothdr.back == 0);
-
-	/*
-	 * If the root has more than one child, then don't do anything.
-	 */
-	if (oldroothdr.count > 1)
-		return 0;
-
-	/*
-	 * Read in the (only) child block, then copy those bytes into
-	 * the root block's buffer and free the original child block.
-	 */
-	btree = dp->d_ops->node_tree_p(oldroot);
-	child = be32_to_cpu(btree[0].before);
-	ASSERT(child != 0);
-	error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
-					     args->whichfork);
-	if (error)
-		return error;
-	xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
-
-	/*
-	 * This could be copying a leaf back into the root block in the case of
-	 * there only being a single leaf block left in the tree. Hence we have
-	 * to update the b_ops pointer as well to match the buffer type change
-	 * that could occur. For dir3 blocks we also need to update the block
-	 * number in the buffer header.
-	 */
-	memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
-	root_blk->bp->b_ops = bp->b_ops;
-	xfs_trans_buf_copy_type(root_blk->bp, bp);
-	if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
-		struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
-		da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
-	}
-	xfs_trans_log_buf(args->trans, root_blk->bp, 0,
-			  args->geo->blksize - 1);
-	error = xfs_da_shrink_inode(args, child, bp);
-	return error;
-}
-
-/*
- * Check a node block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- */
-STATIC int
-xfs_da3_node_toosmall(
-	struct xfs_da_state	*state,
-	int			*action)
-{
-	struct xfs_da_intnode	*node;
-	struct xfs_da_state_blk	*blk;
-	struct xfs_da_blkinfo	*info;
-	xfs_dablk_t		blkno;
-	struct xfs_buf		*bp;
-	struct xfs_da3_icnode_hdr nodehdr;
-	int			count;
-	int			forward;
-	int			error;
-	int			retval;
-	int			i;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_node_toosmall(state->args);
-
-	/*
-	 * Check for the degenerate case of the block being over 50% full.
-	 * If so, it's not worth even looking to see if we might be able
-	 * to coalesce with a sibling.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	info = blk->bp->b_addr;
-	node = (xfs_da_intnode_t *)info;
-	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-	if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
-		*action = 0;	/* blk over 50%, don't try to join */
-		return 0;	/* blk over 50%, don't try to join */
-	}
-
-	/*
-	 * Check for the degenerate case of the block being empty.
-	 * If the block is empty, we'll simply delete it, no need to
-	 * coalesce it with a sibling block.  We choose (arbitrarily)
-	 * to merge with the forward block unless it is NULL.
-	 */
-	if (nodehdr.count == 0) {
-		/*
-		 * Make altpath point to the block we want to keep and
-		 * path point to the block we want to drop (this one).
-		 */
-		forward = (info->forw != 0);
-		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da3_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-		if (error)
-			return error;
-		if (retval) {
-			*action = 0;
-		} else {
-			*action = 2;
-		}
-		return 0;
-	}
-
-	/*
-	 * Examine each sibling block to see if we can coalesce with
-	 * at least 25% free space to spare.  We need to figure out
-	 * whether to merge with the forward or the backward block.
-	 * We prefer coalescing with the lower numbered sibling so as
-	 * to shrink a directory over time.
-	 */
-	count  = state->args->geo->node_ents;
-	count -= state->args->geo->node_ents >> 2;
-	count -= nodehdr.count;
-
-	/* start with smaller blk num */
-	forward = nodehdr.forw < nodehdr.back;
-	for (i = 0; i < 2; forward = !forward, i++) {
-		struct xfs_da3_icnode_hdr thdr;
-		if (forward)
-			blkno = nodehdr.forw;
-		else
-			blkno = nodehdr.back;
-		if (blkno == 0)
-			continue;
-		error = xfs_da3_node_read(state->args->trans, dp,
-					blkno, -1, &bp, state->args->whichfork);
-		if (error)
-			return error;
-
-		node = bp->b_addr;
-		dp->d_ops->node_hdr_from_disk(&thdr, node);
-		xfs_trans_brelse(state->args->trans, bp);
-
-		if (count - thdr.count >= 0)
-			break;	/* fits with at least 25% to spare */
-	}
-	if (i >= 2) {
-		*action = 0;
-		return 0;
-	}
-
-	/*
-	 * Make altpath point to the block we want to keep (the lower
-	 * numbered block) and path point to the block we want to drop.
-	 */
-	memcpy(&state->altpath, &state->path, sizeof(state->path));
-	if (blkno < blk->blkno) {
-		error = xfs_da3_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-	} else {
-		error = xfs_da3_path_shift(state, &state->path, forward,
-						 0, &retval);
-	}
-	if (error)
-		return error;
-	if (retval) {
-		*action = 0;
-		return 0;
-	}
-	*action = 1;
-	return 0;
-}
-
-/*
- * Pick up the last hashvalue from an intermediate node.
- */
-STATIC uint
-xfs_da3_node_lasthash(
-	struct xfs_inode	*dp,
-	struct xfs_buf		*bp,
-	int			*count)
-{
-	struct xfs_da_intnode	 *node;
-	struct xfs_da_node_entry *btree;
-	struct xfs_da3_icnode_hdr nodehdr;
-
-	node = bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-	if (count)
-		*count = nodehdr.count;
-	if (!nodehdr.count)
-		return 0;
-	btree = dp->d_ops->node_tree_p(node);
-	return be32_to_cpu(btree[nodehdr.count - 1].hashval);
-}
-
-/*
- * Walk back up the tree adjusting hash values as necessary,
- * when we stop making changes, return.
- */
-void
-xfs_da3_fixhashpath(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_path *path)
-{
-	struct xfs_da_state_blk	*blk;
-	struct xfs_da_intnode	*node;
-	struct xfs_da_node_entry *btree;
-	xfs_dahash_t		lasthash=0;
-	int			level;
-	int			count;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_fixhashpath(state->args);
-
-	level = path->active-1;
-	blk = &path->blk[ level ];
-	switch (blk->magic) {
-	case XFS_ATTR_LEAF_MAGIC:
-		lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
-		if (count == 0)
-			return;
-		break;
-	case XFS_DIR2_LEAFN_MAGIC:
-		lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
-		if (count == 0)
-			return;
-		break;
-	case XFS_DA_NODE_MAGIC:
-		lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
-		if (count == 0)
-			return;
-		break;
-	}
-	for (blk--, level--; level >= 0; blk--, level--) {
-		struct xfs_da3_icnode_hdr nodehdr;
-
-		node = blk->bp->b_addr;
-		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-		btree = dp->d_ops->node_tree_p(node);
-		if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
-			break;
-		blk->hashval = lasthash;
-		btree[blk->index].hashval = cpu_to_be32(lasthash);
-		xfs_trans_log_buf(state->args->trans, blk->bp,
-				  XFS_DA_LOGRANGE(node, &btree[blk->index],
-						  sizeof(*btree)));
-
-		lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
-	}
-}
-
-/*
- * Remove an entry from an intermediate node.
- */
-STATIC void
-xfs_da3_node_remove(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*drop_blk)
-{
-	struct xfs_da_intnode	*node;
-	struct xfs_da3_icnode_hdr nodehdr;
-	struct xfs_da_node_entry *btree;
-	int			index;
-	int			tmp;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_node_remove(state->args);
-
-	node = drop_blk->bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-	ASSERT(drop_blk->index < nodehdr.count);
-	ASSERT(drop_blk->index >= 0);
-
-	/*
-	 * Copy over the offending entry, or just zero it out.
-	 */
-	index = drop_blk->index;
-	btree = dp->d_ops->node_tree_p(node);
-	if (index < nodehdr.count - 1) {
-		tmp  = nodehdr.count - index - 1;
-		tmp *= (uint)sizeof(xfs_da_node_entry_t);
-		memmove(&btree[index], &btree[index + 1], tmp);
-		xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-		    XFS_DA_LOGRANGE(node, &btree[index], tmp));
-		index = nodehdr.count - 1;
-	}
-	memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
-	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-	    XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
-	nodehdr.count -= 1;
-	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
-	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-	    XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
-
-	/*
-	 * Copy the last hash value from the block to propagate upwards.
-	 */
-	drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
-}
-
-/*
- * Unbalance the elements between two intermediate nodes,
- * move all Btree elements from one node into another.
- */
-STATIC void
-xfs_da3_node_unbalance(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*drop_blk,
-	struct xfs_da_state_blk	*save_blk)
-{
-	struct xfs_da_intnode	*drop_node;
-	struct xfs_da_intnode	*save_node;
-	struct xfs_da_node_entry *drop_btree;
-	struct xfs_da_node_entry *save_btree;
-	struct xfs_da3_icnode_hdr drop_hdr;
-	struct xfs_da3_icnode_hdr save_hdr;
-	struct xfs_trans	*tp;
-	int			sindex;
-	int			tmp;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_node_unbalance(state->args);
-
-	drop_node = drop_blk->bp->b_addr;
-	save_node = save_blk->bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
-	dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
-	drop_btree = dp->d_ops->node_tree_p(drop_node);
-	save_btree = dp->d_ops->node_tree_p(save_node);
-	tp = state->args->trans;
-
-	/*
-	 * If the dying block has lower hashvals, then move all the
-	 * elements in the remaining block up to make a hole.
-	 */
-	if ((be32_to_cpu(drop_btree[0].hashval) <
-			be32_to_cpu(save_btree[0].hashval)) ||
-	    (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
-			be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
-		/* XXX: check this - is memmove dst correct? */
-		tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
-		memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
-
-		sindex = 0;
-		xfs_trans_log_buf(tp, save_blk->bp,
-			XFS_DA_LOGRANGE(save_node, &save_btree[0],
-				(save_hdr.count + drop_hdr.count) *
-						sizeof(xfs_da_node_entry_t)));
-	} else {
-		sindex = save_hdr.count;
-		xfs_trans_log_buf(tp, save_blk->bp,
-			XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
-				drop_hdr.count * sizeof(xfs_da_node_entry_t)));
-	}
-
-	/*
-	 * Move all the B-tree elements from drop_blk to save_blk.
-	 */
-	tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
-	memcpy(&save_btree[sindex], &drop_btree[0], tmp);
-	save_hdr.count += drop_hdr.count;
-
-	dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
-	xfs_trans_log_buf(tp, save_blk->bp,
-		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
-				dp->d_ops->node_hdr_size));
-
-	/*
-	 * Save the last hashval in the remaining block for upward propagation.
-	 */
-	save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
-}
-
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-
-/*
- * Walk down the Btree looking for a particular filename, filling
- * in the state structure as we go.
- *
- * We will set the state structure to point to each of the elements
- * in each of the nodes where either the hashval is or should be.
- *
- * We support duplicate hashval's so for each entry in the current
- * node that could contain the desired hashval, descend.  This is a
- * pruned depth-first tree search.
- */
-int							/* error */
-xfs_da3_node_lookup_int(
-	struct xfs_da_state	*state,
-	int			*result)
-{
-	struct xfs_da_state_blk	*blk;
-	struct xfs_da_blkinfo	*curr;
-	struct xfs_da_intnode	*node;
-	struct xfs_da_node_entry *btree;
-	struct xfs_da3_icnode_hdr nodehdr;
-	struct xfs_da_args	*args;
-	xfs_dablk_t		blkno;
-	xfs_dahash_t		hashval;
-	xfs_dahash_t		btreehashval;
-	int			probe;
-	int			span;
-	int			max;
-	int			error;
-	int			retval;
-	struct xfs_inode	*dp = state->args->dp;
-
-	args = state->args;
-
-	/*
-	 * Descend thru the B-tree searching each level for the right
-	 * node to use, until the right hashval is found.
-	 */
-	blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
-	for (blk = &state->path.blk[0], state->path.active = 1;
-			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
-			 blk++, state->path.active++) {
-		/*
-		 * Read the next node down in the tree.
-		 */
-		blk->blkno = blkno;
-		error = xfs_da3_node_read(args->trans, args->dp, blkno,
-					-1, &blk->bp, args->whichfork);
-		if (error) {
-			blk->blkno = 0;
-			state->path.active--;
-			return error;
-		}
-		curr = blk->bp->b_addr;
-		blk->magic = be16_to_cpu(curr->magic);
-
-		if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
-		    blk->magic == XFS_ATTR3_LEAF_MAGIC) {
-			blk->magic = XFS_ATTR_LEAF_MAGIC;
-			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
-			break;
-		}
-
-		if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
-		    blk->magic == XFS_DIR3_LEAFN_MAGIC) {
-			blk->magic = XFS_DIR2_LEAFN_MAGIC;
-			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
-							       blk->bp, NULL);
-			break;
-		}
-
-		blk->magic = XFS_DA_NODE_MAGIC;
-
-
-		/*
-		 * Search an intermediate node for a match.
-		 */
-		node = blk->bp->b_addr;
-		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-		btree = dp->d_ops->node_tree_p(node);
-
-		max = nodehdr.count;
-		blk->hashval = be32_to_cpu(btree[max - 1].hashval);
-
-		/*
-		 * Binary search.  (note: small blocks will skip loop)
-		 */
-		probe = span = max / 2;
-		hashval = args->hashval;
-		while (span > 4) {
-			span /= 2;
-			btreehashval = be32_to_cpu(btree[probe].hashval);
-			if (btreehashval < hashval)
-				probe += span;
-			else if (btreehashval > hashval)
-				probe -= span;
-			else
-				break;
-		}
-		ASSERT((probe >= 0) && (probe < max));
-		ASSERT((span <= 4) ||
-			(be32_to_cpu(btree[probe].hashval) == hashval));
-
-		/*
-		 * Since we may have duplicate hashval's, find the first
-		 * matching hashval in the node.
-		 */
-		while (probe > 0 &&
-		       be32_to_cpu(btree[probe].hashval) >= hashval) {
-			probe--;
-		}
-		while (probe < max &&
-		       be32_to_cpu(btree[probe].hashval) < hashval) {
-			probe++;
-		}
-
-		/*
-		 * Pick the right block to descend on.
-		 */
-		if (probe == max) {
-			blk->index = max - 1;
-			blkno = be32_to_cpu(btree[max - 1].before);
-		} else {
-			blk->index = probe;
-			blkno = be32_to_cpu(btree[probe].before);
-		}
-	}
-
-	/*
-	 * A leaf block that ends in the hashval that we are interested in
-	 * (final hashval == search hashval) means that the next block may
-	 * contain more entries with the same hashval, shift upward to the
-	 * next leaf and keep searching.
-	 */
-	for (;;) {
-		if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
-			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
-							&blk->index, state);
-		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
-			retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
-			blk->index = args->index;
-			args->blkno = blk->blkno;
-		} else {
-			ASSERT(0);
-			return EFSCORRUPTED;
-		}
-		if (((retval == ENOENT) || (retval == ENOATTR)) &&
-		    (blk->hashval == args->hashval)) {
-			error = xfs_da3_path_shift(state, &state->path, 1, 1,
-							 &retval);
-			if (error)
-				return error;
-			if (retval == 0) {
-				continue;
-			} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
-				/* path_shift() gives ENOENT */
-				retval = ENOATTR;
-			}
-		}
-		break;
-	}
-	*result = retval;
-	return 0;
-}
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Compare two intermediate nodes for "order".
- */
-STATIC int
-xfs_da3_node_order(
-	struct xfs_inode *dp,
-	struct xfs_buf	*node1_bp,
-	struct xfs_buf	*node2_bp)
-{
-	struct xfs_da_intnode	*node1;
-	struct xfs_da_intnode	*node2;
-	struct xfs_da_node_entry *btree1;
-	struct xfs_da_node_entry *btree2;
-	struct xfs_da3_icnode_hdr node1hdr;
-	struct xfs_da3_icnode_hdr node2hdr;
-
-	node1 = node1_bp->b_addr;
-	node2 = node2_bp->b_addr;
-	dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
-	dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
-	btree1 = dp->d_ops->node_tree_p(node1);
-	btree2 = dp->d_ops->node_tree_p(node2);
-
-	if (node1hdr.count > 0 && node2hdr.count > 0 &&
-	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
-	     (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
-	      be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * Link a new block into a doubly linked list of blocks (of whatever type).
- */
-int							/* error */
-xfs_da3_blk_link(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*old_blk,
-	struct xfs_da_state_blk	*new_blk)
-{
-	struct xfs_da_blkinfo	*old_info;
-	struct xfs_da_blkinfo	*new_info;
-	struct xfs_da_blkinfo	*tmp_info;
-	struct xfs_da_args	*args;
-	struct xfs_buf		*bp;
-	int			before = 0;
-	int			error;
-	struct xfs_inode	*dp = state->args->dp;
-
-	/*
-	 * Set up environment.
-	 */
-	args = state->args;
-	ASSERT(args != NULL);
-	old_info = old_blk->bp->b_addr;
-	new_info = new_blk->bp->b_addr;
-	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
-	       old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
-	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-	switch (old_blk->magic) {
-	case XFS_ATTR_LEAF_MAGIC:
-		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
-		break;
-	case XFS_DIR2_LEAFN_MAGIC:
-		before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
-		break;
-	case XFS_DA_NODE_MAGIC:
-		before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
-		break;
-	}
-
-	/*
-	 * Link blocks in appropriate order.
-	 */
-	if (before) {
-		/*
-		 * Link new block in before existing block.
-		 */
-		trace_xfs_da_link_before(args);
-		new_info->forw = cpu_to_be32(old_blk->blkno);
-		new_info->back = old_info->back;
-		if (old_info->back) {
-			error = xfs_da3_node_read(args->trans, dp,
-						be32_to_cpu(old_info->back),
-						-1, &bp, args->whichfork);
-			if (error)
-				return error;
-			ASSERT(bp != NULL);
-			tmp_info = bp->b_addr;
-			ASSERT(tmp_info->magic == old_info->magic);
-			ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
-			tmp_info->forw = cpu_to_be32(new_blk->blkno);
-			xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
-		}
-		old_info->back = cpu_to_be32(new_blk->blkno);
-	} else {
-		/*
-		 * Link new block in after existing block.
-		 */
-		trace_xfs_da_link_after(args);
-		new_info->forw = old_info->forw;
-		new_info->back = cpu_to_be32(old_blk->blkno);
-		if (old_info->forw) {
-			error = xfs_da3_node_read(args->trans, dp,
-						be32_to_cpu(old_info->forw),
-						-1, &bp, args->whichfork);
-			if (error)
-				return error;
-			ASSERT(bp != NULL);
-			tmp_info = bp->b_addr;
-			ASSERT(tmp_info->magic == old_info->magic);
-			ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
-			tmp_info->back = cpu_to_be32(new_blk->blkno);
-			xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
-		}
-		old_info->forw = cpu_to_be32(new_blk->blkno);
-	}
-
-	xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
-	xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
-	return 0;
-}
-
-/*
- * Unlink a block from a doubly linked list of blocks.
- */
-STATIC int						/* error */
-xfs_da3_blk_unlink(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_blk	*drop_blk,
-	struct xfs_da_state_blk	*save_blk)
-{
-	struct xfs_da_blkinfo	*drop_info;
-	struct xfs_da_blkinfo	*save_info;
-	struct xfs_da_blkinfo	*tmp_info;
-	struct xfs_da_args	*args;
-	struct xfs_buf		*bp;
-	int			error;
-
-	/*
-	 * Set up environment.
-	 */
-	args = state->args;
-	ASSERT(args != NULL);
-	save_info = save_blk->bp->b_addr;
-	drop_info = drop_blk->bp->b_addr;
-	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
-	       save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
-	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
-	ASSERT(save_blk->magic == drop_blk->magic);
-	ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
-	       (be32_to_cpu(save_info->back) == drop_blk->blkno));
-	ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
-	       (be32_to_cpu(drop_info->back) == save_blk->blkno));
-
-	/*
-	 * Unlink the leaf block from the doubly linked chain of leaves.
-	 */
-	if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
-		trace_xfs_da_unlink_back(args);
-		save_info->back = drop_info->back;
-		if (drop_info->back) {
-			error = xfs_da3_node_read(args->trans, args->dp,
-						be32_to_cpu(drop_info->back),
-						-1, &bp, args->whichfork);
-			if (error)
-				return error;
-			ASSERT(bp != NULL);
-			tmp_info = bp->b_addr;
-			ASSERT(tmp_info->magic == save_info->magic);
-			ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
-			tmp_info->forw = cpu_to_be32(save_blk->blkno);
-			xfs_trans_log_buf(args->trans, bp, 0,
-						    sizeof(*tmp_info) - 1);
-		}
-	} else {
-		trace_xfs_da_unlink_forward(args);
-		save_info->forw = drop_info->forw;
-		if (drop_info->forw) {
-			error = xfs_da3_node_read(args->trans, args->dp,
-						be32_to_cpu(drop_info->forw),
-						-1, &bp, args->whichfork);
-			if (error)
-				return error;
-			ASSERT(bp != NULL);
-			tmp_info = bp->b_addr;
-			ASSERT(tmp_info->magic == save_info->magic);
-			ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
-			tmp_info->back = cpu_to_be32(save_blk->blkno);
-			xfs_trans_log_buf(args->trans, bp, 0,
-						    sizeof(*tmp_info) - 1);
-		}
-	}
-
-	xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
-	return 0;
-}
-
-/*
- * Move a path "forward" or "!forward" one block at the current level.
- *
- * This routine will adjust a "path" to point to the next block
- * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
- * Btree, including updating pointers to the intermediate nodes between
- * the new bottom and the root.
- */
-int							/* error */
-xfs_da3_path_shift(
-	struct xfs_da_state	*state,
-	struct xfs_da_state_path *path,
-	int			forward,
-	int			release,
-	int			*result)
-{
-	struct xfs_da_state_blk	*blk;
-	struct xfs_da_blkinfo	*info;
-	struct xfs_da_intnode	*node;
-	struct xfs_da_args	*args;
-	struct xfs_da_node_entry *btree;
-	struct xfs_da3_icnode_hdr nodehdr;
-	xfs_dablk_t		blkno = 0;
-	int			level;
-	int			error;
-	struct xfs_inode	*dp = state->args->dp;
-
-	trace_xfs_da_path_shift(state->args);
-
-	/*
-	 * Roll up the Btree looking for the first block where our
-	 * current index is not at the edge of the block.  Note that
-	 * we skip the bottom layer because we want the sibling block.
-	 */
-	args = state->args;
-	ASSERT(args != NULL);
-	ASSERT(path != NULL);
-	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
-	level = (path->active-1) - 1;	/* skip bottom layer in path */
-	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
-		node = blk->bp->b_addr;
-		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-		btree = dp->d_ops->node_tree_p(node);
-
-		if (forward && (blk->index < nodehdr.count - 1)) {
-			blk->index++;
-			blkno = be32_to_cpu(btree[blk->index].before);
-			break;
-		} else if (!forward && (blk->index > 0)) {
-			blk->index--;
-			blkno = be32_to_cpu(btree[blk->index].before);
-			break;
-		}
-	}
-	if (level < 0) {
-		*result = ENOENT;	/* we're out of our tree */
-		ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-		return 0;
-	}
-
-	/*
-	 * Roll down the edge of the subtree until we reach the
-	 * same depth we were at originally.
-	 */
-	for (blk++, level++; level < path->active; blk++, level++) {
-		/*
-		 * Release the old block.
-		 * (if it's dirty, trans won't actually let go)
-		 */
-		if (release)
-			xfs_trans_brelse(args->trans, blk->bp);
-
-		/*
-		 * Read the next child block.
-		 */
-		blk->blkno = blkno;
-		error = xfs_da3_node_read(args->trans, dp, blkno, -1,
-					&blk->bp, args->whichfork);
-		if (error)
-			return error;
-		info = blk->bp->b_addr;
-		ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-		       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
-		       info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-		       info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
-		       info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
-		       info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
-
-
-		/*
-		 * Note: we flatten the magic number to a single type so we
-		 * don't have to compare against crc/non-crc types elsewhere.
-		 */
-		switch (be16_to_cpu(info->magic)) {
-		case XFS_DA_NODE_MAGIC:
-		case XFS_DA3_NODE_MAGIC:
-			blk->magic = XFS_DA_NODE_MAGIC;
-			node = (xfs_da_intnode_t *)info;
-			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
-			btree = dp->d_ops->node_tree_p(node);
-			blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
-			if (forward)
-				blk->index = 0;
-			else
-				blk->index = nodehdr.count - 1;
-			blkno = be32_to_cpu(btree[blk->index].before);
-			break;
-		case XFS_ATTR_LEAF_MAGIC:
-		case XFS_ATTR3_LEAF_MAGIC:
-			blk->magic = XFS_ATTR_LEAF_MAGIC;
-			ASSERT(level == path->active-1);
-			blk->index = 0;
-			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
-			break;
-		case XFS_DIR2_LEAFN_MAGIC:
-		case XFS_DIR3_LEAFN_MAGIC:
-			blk->magic = XFS_DIR2_LEAFN_MAGIC;
-			ASSERT(level == path->active-1);
-			blk->index = 0;
-			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
-							       blk->bp, NULL);
-			break;
-		default:
-			ASSERT(0);
-			break;
-		}
-	}
-	*result = 0;
-	return 0;
-}
-
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Implement a simple hash on a character string.
- * Rotate the hash value by 7 bits, then XOR each character in.
- * This is implemented with some source-level loop unrolling.
- */
-xfs_dahash_t
-xfs_da_hashname(const __uint8_t *name, int namelen)
-{
-	xfs_dahash_t hash;
-
-	/*
-	 * Do four characters at a time as long as we can.
-	 */
-	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
-		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
-		       (name[3] << 0) ^ rol32(hash, 7 * 4);
-
-	/*
-	 * Now do the rest of the characters.
-	 */
-	switch (namelen) {
-	case 3:
-		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
-		       rol32(hash, 7 * 3);
-	case 2:
-		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
-	case 1:
-		return (name[0] << 0) ^ rol32(hash, 7 * 1);
-	default: /* case 0: */
-		return hash;
-	}
-}
-
-enum xfs_dacmp
-xfs_da_compname(
-	struct xfs_da_args *args,
-	const unsigned char *name,
-	int		len)
-{
-	return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
-					XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
-}
-
-static xfs_dahash_t
-xfs_default_hashname(
-	struct xfs_name	*name)
-{
-	return xfs_da_hashname(name->name, name->len);
-}
-
-const struct xfs_nameops xfs_default_nameops = {
-	.hashname	= xfs_default_hashname,
-	.compname	= xfs_da_compname
-};
-
-int
-xfs_da_grow_inode_int(
-	struct xfs_da_args	*args,
-	xfs_fileoff_t		*bno,
-	int			count)
-{
-	struct xfs_trans	*tp = args->trans;
-	struct xfs_inode	*dp = args->dp;
-	int			w = args->whichfork;
-	xfs_drfsbno_t		nblks = dp->i_d.di_nblocks;
-	struct xfs_bmbt_irec	map, *mapp;
-	int			nmap, error, got, i, mapi;
-
-	/*
-	 * Find a spot in the file space to put the new block.
-	 */
-	error = xfs_bmap_first_unused(tp, dp, count, bno, w);
-	if (error)
-		return error;
-
-	/*
-	 * Try mapping it in one filesystem block.
-	 */
-	nmap = 1;
-	ASSERT(args->firstblock != NULL);
-	error = xfs_bmapi_write(tp, dp, *bno, count,
-			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
-			args->firstblock, args->total, &map, &nmap,
-			args->flist);
-	if (error)
-		return error;
-
-	ASSERT(nmap <= 1);
-	if (nmap == 1) {
-		mapp = &map;
-		mapi = 1;
-	} else if (nmap == 0 && count > 1) {
-		xfs_fileoff_t		b;
-		int			c;
-
-		/*
-		 * If we didn't get it and the block might work if fragmented,
-		 * try without the CONTIG flag.  Loop until we get it all.
-		 */
-		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
-		for (b = *bno, mapi = 0; b < *bno + count; ) {
-			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
-			c = (int)(*bno + count - b);
-			error = xfs_bmapi_write(tp, dp, b, c,
-					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->flist);
-			if (error)
-				goto out_free_map;
-			if (nmap < 1)
-				break;
-			mapi += nmap;
-			b = mapp[mapi - 1].br_startoff +
-			    mapp[mapi - 1].br_blockcount;
-		}
-	} else {
-		mapi = 0;
-		mapp = NULL;
-	}
-
-	/*
-	 * Count the blocks we got, make sure it matches the total.
-	 */
-	for (i = 0, got = 0; i < mapi; i++)
-		got += mapp[i].br_blockcount;
-	if (got != count || mapp[0].br_startoff != *bno ||
-	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
-	    *bno + count) {
-		error = ENOSPC;
-		goto out_free_map;
-	}
-
-	/* account for newly allocated blocks in reserved blocks total */
-	args->total -= dp->i_d.di_nblocks - nblks;
-
-out_free_map:
-	if (mapp != &map)
-		kmem_free(mapp);
-	return error;
-}
-
-/*
- * Add a block to the btree ahead of the file.
- * Return the new block number to the caller.
- */
-int
-xfs_da_grow_inode(
-	struct xfs_da_args	*args,
-	xfs_dablk_t		*new_blkno)
-{
-	xfs_fileoff_t		bno;
-	int			error;
-
-	trace_xfs_da_grow_inode(args);
-
-	bno = args->geo->leafblk;
-	error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
-	if (!error)
-		*new_blkno = (xfs_dablk_t)bno;
-	return error;
-}
-
-/*
- * Ick.  We need to always be able to remove a btree block, even
- * if there's no space reservation because the filesystem is full.
- * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
- * It swaps the target block with the last block in the file.  The
- * last block in the file can always be removed since it can't cause
- * a bmap btree split to do that.
- */
-STATIC int
-xfs_da3_swap_lastblock(
-	struct xfs_da_args	*args,
-	xfs_dablk_t		*dead_blknop,
-	struct xfs_buf		**dead_bufp)
-{
-	struct xfs_da_blkinfo	*dead_info;
-	struct xfs_da_blkinfo	*sib_info;
-	struct xfs_da_intnode	*par_node;
-	struct xfs_da_intnode	*dead_node;
-	struct xfs_dir2_leaf	*dead_leaf2;
-	struct xfs_da_node_entry *btree;
-	struct xfs_da3_icnode_hdr par_hdr;
-	struct xfs_inode	*dp;
-	struct xfs_trans	*tp;
-	struct xfs_mount	*mp;
-	struct xfs_buf		*dead_buf;
-	struct xfs_buf		*last_buf;
-	struct xfs_buf		*sib_buf;
-	struct xfs_buf		*par_buf;
-	xfs_dahash_t		dead_hash;
-	xfs_fileoff_t		lastoff;
-	xfs_dablk_t		dead_blkno;
-	xfs_dablk_t		last_blkno;
-	xfs_dablk_t		sib_blkno;
-	xfs_dablk_t		par_blkno;
-	int			error;
-	int			w;
-	int			entno;
-	int			level;
-	int			dead_level;
-
-	trace_xfs_da_swap_lastblock(args);
-
-	dead_buf = *dead_bufp;
-	dead_blkno = *dead_blknop;
-	tp = args->trans;
-	dp = args->dp;
-	w = args->whichfork;
-	ASSERT(w == XFS_DATA_FORK);
-	mp = dp->i_mount;
-	lastoff = args->geo->freeblk;
-	error = xfs_bmap_last_before(tp, dp, &lastoff, w);
-	if (error)
-		return error;
-	if (unlikely(lastoff == 0)) {
-		XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
-				 mp);
-		return EFSCORRUPTED;
-	}
-	/*
-	 * Read the last block in the btree space.
-	 */
-	last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
-	error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
-	if (error)
-		return error;
-	/*
-	 * Copy the last block into the dead buffer and log it.
-	 */
-	memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
-	xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
-	dead_info = dead_buf->b_addr;
-	/*
-	 * Get values from the moved block.
-	 */
-	if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-	    dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-		struct xfs_dir3_icleaf_hdr leafhdr;
-		struct xfs_dir2_leaf_entry *ents;
-
-		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
-		dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
-		ents = dp->d_ops->leaf_ents_p(dead_leaf2);
-		dead_level = 0;
-		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
-	} else {
-		struct xfs_da3_icnode_hdr deadhdr;
-
-		dead_node = (xfs_da_intnode_t *)dead_info;
-		dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
-		btree = dp->d_ops->node_tree_p(dead_node);
-		dead_level = deadhdr.level;
-		dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
-	}
-	sib_buf = par_buf = NULL;
-	/*
-	 * If the moved block has a left sibling, fix up the pointers.
-	 */
-	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
-		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
-		if (error)
-			goto done;
-		sib_info = sib_buf->b_addr;
-		if (unlikely(
-		    be32_to_cpu(sib_info->forw) != last_blkno ||
-		    sib_info->magic != dead_info->magic)) {
-			XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
-					 XFS_ERRLEVEL_LOW, mp);
-			error = EFSCORRUPTED;
-			goto done;
-		}
-		sib_info->forw = cpu_to_be32(dead_blkno);
-		xfs_trans_log_buf(tp, sib_buf,
-			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
-					sizeof(sib_info->forw)));
-		sib_buf = NULL;
-	}
-	/*
-	 * If the moved block has a right sibling, fix up the pointers.
-	 */
-	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
-		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
-		if (error)
-			goto done;
-		sib_info = sib_buf->b_addr;
-		if (unlikely(
-		       be32_to_cpu(sib_info->back) != last_blkno ||
-		       sib_info->magic != dead_info->magic)) {
-			XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
-					 XFS_ERRLEVEL_LOW, mp);
-			error = EFSCORRUPTED;
-			goto done;
-		}
-		sib_info->back = cpu_to_be32(dead_blkno);
-		xfs_trans_log_buf(tp, sib_buf,
-			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
-					sizeof(sib_info->back)));
-		sib_buf = NULL;
-	}
-	par_blkno = args->geo->leafblk;
-	level = -1;
-	/*
-	 * Walk down the tree looking for the parent of the moved block.
-	 */
-	for (;;) {
-		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
-		if (error)
-			goto done;
-		par_node = par_buf->b_addr;
-		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
-		if (level >= 0 && level != par_hdr.level + 1) {
-			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
-					 XFS_ERRLEVEL_LOW, mp);
-			error = EFSCORRUPTED;
-			goto done;
-		}
-		level = par_hdr.level;
-		btree = dp->d_ops->node_tree_p(par_node);
-		for (entno = 0;
-		     entno < par_hdr.count &&
-		     be32_to_cpu(btree[entno].hashval) < dead_hash;
-		     entno++)
-			continue;
-		if (entno == par_hdr.count) {
-			XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
-					 XFS_ERRLEVEL_LOW, mp);
-			error = EFSCORRUPTED;
-			goto done;
-		}
-		par_blkno = be32_to_cpu(btree[entno].before);
-		if (level == dead_level + 1)
-			break;
-		xfs_trans_brelse(tp, par_buf);
-		par_buf = NULL;
-	}
-	/*
-	 * We're in the right parent block.
-	 * Look for the right entry.
-	 */
-	for (;;) {
-		for (;
-		     entno < par_hdr.count &&
-		     be32_to_cpu(btree[entno].before) != last_blkno;
-		     entno++)
-			continue;
-		if (entno < par_hdr.count)
-			break;
-		par_blkno = par_hdr.forw;
-		xfs_trans_brelse(tp, par_buf);
-		par_buf = NULL;
-		if (unlikely(par_blkno == 0)) {
-			XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
-					 XFS_ERRLEVEL_LOW, mp);
-			error = EFSCORRUPTED;
-			goto done;
-		}
-		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
-		if (error)
-			goto done;
-		par_node = par_buf->b_addr;
-		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
-		if (par_hdr.level != level) {
-			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
-					 XFS_ERRLEVEL_LOW, mp);
-			error = EFSCORRUPTED;
-			goto done;
-		}
-		btree = dp->d_ops->node_tree_p(par_node);
-		entno = 0;
-	}
-	/*
-	 * Update the parent entry pointing to the moved block.
-	 */
-	btree[entno].before = cpu_to_be32(dead_blkno);
-	xfs_trans_log_buf(tp, par_buf,
-		XFS_DA_LOGRANGE(par_node, &btree[entno].before,
-				sizeof(btree[entno].before)));
-	*dead_blknop = last_blkno;
-	*dead_bufp = last_buf;
-	return 0;
-done:
-	if (par_buf)
-		xfs_trans_brelse(tp, par_buf);
-	if (sib_buf)
-		xfs_trans_brelse(tp, sib_buf);
-	xfs_trans_brelse(tp, last_buf);
-	return error;
-}
-
-/*
- * Remove a btree block from a directory or attribute.
- */
-int
-xfs_da_shrink_inode(
-	xfs_da_args_t	*args,
-	xfs_dablk_t	dead_blkno,
-	struct xfs_buf	*dead_buf)
-{
-	xfs_inode_t *dp;
-	int done, error, w, count;
-	xfs_trans_t *tp;
-	xfs_mount_t *mp;
-
-	trace_xfs_da_shrink_inode(args);
-
-	dp = args->dp;
-	w = args->whichfork;
-	tp = args->trans;
-	mp = dp->i_mount;
-	count = args->geo->fsbcount;
-	for (;;) {
-		/*
-		 * Remove extents.  If we get ENOSPC for a dir we have to move
-		 * the last block to the place we want to kill.
-		 */
-		error = xfs_bunmapi(tp, dp, dead_blkno, count,
-				    xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-				    0, args->firstblock, args->flist, &done);
-		if (error == ENOSPC) {
-			if (w != XFS_DATA_FORK)
-				break;
-			error = xfs_da3_swap_lastblock(args, &dead_blkno,
-						      &dead_buf);
-			if (error)
-				break;
-		} else {
-			break;
-		}
-	}
-	xfs_trans_binval(tp, dead_buf);
-	return error;
-}
-
-/*
- * See if the mapping(s) for this btree block are valid, i.e.
- * don't contain holes, are logically contiguous, and cover the whole range.
- */
-STATIC int
-xfs_da_map_covers_blocks(
-	int		nmap,
-	xfs_bmbt_irec_t	*mapp,
-	xfs_dablk_t	bno,
-	int		count)
-{
-	int		i;
-	xfs_fileoff_t	off;
-
-	for (i = 0, off = bno; i < nmap; i++) {
-		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
-		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
-			return 0;
-		}
-		if (off != mapp[i].br_startoff) {
-			return 0;
-		}
-		off += mapp[i].br_blockcount;
-	}
-	return off == bno + count;
-}
-
-/*
- * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
- *
- * For the single map case, it is assumed that the caller has provided a pointer
- * to a valid xfs_buf_map.  For the multiple map case, this function will
- * allocate the xfs_buf_map to hold all the maps and replace the caller's single
- * map pointer with the allocated map.
- */
-static int
-xfs_buf_map_from_irec(
-	struct xfs_mount	*mp,
-	struct xfs_buf_map	**mapp,
-	int			*nmaps,
-	struct xfs_bmbt_irec	*irecs,
-	int			nirecs)
-{
-	struct xfs_buf_map	*map;
-	int			i;
-
-	ASSERT(*nmaps == 1);
-	ASSERT(nirecs >= 1);
-
-	if (nirecs > 1) {
-		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
-				  KM_SLEEP | KM_NOFS);
-		if (!map)
-			return ENOMEM;
-		*mapp = map;
-	}
-
-	*nmaps = nirecs;
-	map = *mapp;
-	for (i = 0; i < *nmaps; i++) {
-		ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
-		       irecs[i].br_startblock != HOLESTARTBLOCK);
-		map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
-		map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
-	}
-	return 0;
-}
-
-/*
- * Map the block we are given ready for reading. There are three possible return
- * values:
- *	-1 - will be returned if we land in a hole and mappedbno == -2 so the
- *	     caller knows not to execute a subsequent read.
- *	 0 - if we mapped the block successfully
- *	>0 - positive error number if there was an error.
- */
-static int
-xfs_dabuf_map(
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mappedbno,
-	int			whichfork,
-	struct xfs_buf_map	**map,
-	int			*nmaps)
-{
-	struct xfs_mount	*mp = dp->i_mount;
-	int			nfsb;
-	int			error = 0;
-	struct xfs_bmbt_irec	irec;
-	struct xfs_bmbt_irec	*irecs = &irec;
-	int			nirecs;
-
-	ASSERT(map && *map);
-	ASSERT(*nmaps == 1);
-
-	if (whichfork == XFS_DATA_FORK)
-		nfsb = mp->m_dir_geo->fsbcount;
-	else
-		nfsb = mp->m_attr_geo->fsbcount;
-
-	/*
-	 * Caller doesn't have a mapping.  -2 means don't complain
-	 * if we land in a hole.
-	 */
-	if (mappedbno == -1 || mappedbno == -2) {
-		/*
-		 * Optimize the one-block case.
-		 */
-		if (nfsb != 1)
-			irecs = kmem_zalloc(sizeof(irec) * nfsb,
-					    KM_SLEEP | KM_NOFS);
-
-		nirecs = nfsb;
-		error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
-				       &nirecs, xfs_bmapi_aflag(whichfork));
-		if (error)
-			goto out;
-	} else {
-		irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
-		irecs->br_startoff = (xfs_fileoff_t)bno;
-		irecs->br_blockcount = nfsb;
-		irecs->br_state = 0;
-		nirecs = 1;
-	}
-
-	if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
-		error = mappedbno == -2 ? -1 : EFSCORRUPTED;
-		if (unlikely(error == EFSCORRUPTED)) {
-			if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
-				int i;
-				xfs_alert(mp, "%s: bno %lld dir: inode %lld",
-					__func__, (long long)bno,
-					(long long)dp->i_ino);
-				for (i = 0; i < *nmaps; i++) {
-					xfs_alert(mp,
-"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
-						i,
-						(long long)irecs[i].br_startoff,
-						(long long)irecs[i].br_startblock,
-						(long long)irecs[i].br_blockcount,
-						irecs[i].br_state);
-				}
-			}
-			XFS_ERROR_REPORT("xfs_da_do_buf(1)",
-					 XFS_ERRLEVEL_LOW, mp);
-		}
-		goto out;
-	}
-	error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
-out:
-	if (irecs != &irec)
-		kmem_free(irecs);
-	return error;
-}
-
-/*
- * Get a buffer for the dir/attr block.
- */
-int
-xfs_da_get_buf(
-	struct xfs_trans	*trans,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mappedbno,
-	struct xfs_buf		**bpp,
-	int			whichfork)
-{
-	struct xfs_buf		*bp;
-	struct xfs_buf_map	map;
-	struct xfs_buf_map	*mapp;
-	int			nmap;
-	int			error;
-
-	*bpp = NULL;
-	mapp = &map;
-	nmap = 1;
-	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
-				&mapp, &nmap);
-	if (error) {
-		/* mapping a hole is not an error, but we don't continue */
-		if (error == -1)
-			error = 0;
-		goto out_free;
-	}
-
-	bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
-				    mapp, nmap, 0);
-	error = bp ? bp->b_error : EIO;
-	if (error) {
-		xfs_trans_brelse(trans, bp);
-		goto out_free;
-	}
-
-	*bpp = bp;
-
-out_free:
-	if (mapp != &map)
-		kmem_free(mapp);
-
-	return error;
-}
-
-/*
- * Get a buffer for the dir/attr block, fill in the contents.
- */
-int
-xfs_da_read_buf(
-	struct xfs_trans	*trans,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mappedbno,
-	struct xfs_buf		**bpp,
-	int			whichfork,
-	const struct xfs_buf_ops *ops)
-{
-	struct xfs_buf		*bp;
-	struct xfs_buf_map	map;
-	struct xfs_buf_map	*mapp;
-	int			nmap;
-	int			error;
-
-	*bpp = NULL;
-	mapp = &map;
-	nmap = 1;
-	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
-				&mapp, &nmap);
-	if (error) {
-		/* mapping a hole is not an error, but we don't continue */
-		if (error == -1)
-			error = 0;
-		goto out_free;
-	}
-
-	error = xfs_trans_read_buf_map(dp->i_mount, trans,
-					dp->i_mount->m_ddev_targp,
-					mapp, nmap, 0, &bp, ops);
-	if (error)
-		goto out_free;
-
-	if (whichfork == XFS_ATTR_FORK)
-		xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
-	else
-		xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
-	*bpp = bp;
-out_free:
-	if (mapp != &map)
-		kmem_free(mapp);
-
-	return error;
-}
-
-/*
- * Readahead the dir/attr block.
- */
-xfs_daddr_t
-xfs_da_reada_buf(
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mappedbno,
-	int			whichfork,
-	const struct xfs_buf_ops *ops)
-{
-	struct xfs_buf_map	map;
-	struct xfs_buf_map	*mapp;
-	int			nmap;
-	int			error;
-
-	mapp = &map;
-	nmap = 1;
-	error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
-				&mapp, &nmap);
-	if (error) {
-		/* mapping a hole is not an error, but we don't continue */
-		if (error == -1)
-			error = 0;
-		goto out_free;
-	}
-
-	mappedbno = mapp[0].bm_bn;
-	xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
-
-out_free:
-	if (mapp != &map)
-		kmem_free(mapp);
-
-	if (error)
-		return -1;
-	return mappedbno;
-}
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
deleted file mode 100644
index c9aee52a37e2..000000000000
--- a/fs/xfs/xfs_da_format.c
+++ /dev/null
@@ -1,911 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-
-/*
- * Shortform directory ops
- */
-static int
-xfs_dir2_sf_entsize(
-	struct xfs_dir2_sf_hdr	*hdr,
-	int			len)
-{
-	int count = sizeof(struct xfs_dir2_sf_entry);	/* namelen + offset */
-
-	count += len;					/* name */
-	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
-				sizeof(xfs_dir2_ino4_t); /* ino # */
-	return count;
-}
-
-static int
-xfs_dir3_sf_entsize(
-	struct xfs_dir2_sf_hdr	*hdr,
-	int			len)
-{
-	return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir2_sf_nextentry(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return (struct xfs_dir2_sf_entry *)
-		((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir3_sf_nextentry(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return (struct xfs_dir2_sf_entry *)
-		((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
-}
-
-
-/*
- * For filetype enabled shortform directories, the file type field is stored at
- * the end of the name.  Because it's only a single byte, endian conversion is
- * not necessary. For non-filetype enable directories, the type is always
- * unknown and we never store the value.
- */
-static __uint8_t
-xfs_dir2_sfe_get_ftype(
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_sfe_put_ftype(
-	struct xfs_dir2_sf_entry *sfep,
-	__uint8_t		ftype)
-{
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static __uint8_t
-xfs_dir3_sfe_get_ftype(
-	struct xfs_dir2_sf_entry *sfep)
-{
-	__uint8_t	ftype;
-
-	ftype = sfep->name[sfep->namelen];
-	if (ftype >= XFS_DIR3_FT_MAX)
-		return XFS_DIR3_FT_UNKNOWN;
-	return ftype;
-}
-
-static void
-xfs_dir3_sfe_put_ftype(
-	struct xfs_dir2_sf_entry *sfep,
-	__uint8_t		ftype)
-{
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
-
-	sfep->name[sfep->namelen] = ftype;
-}
-
-/*
- * Inode numbers in short-form directories can come in two versions,
- * either 4 bytes or 8 bytes wide.  These helpers deal with the
- * two forms transparently by looking at the headers i8count field.
- *
- * For 64-bit inode number the most significant byte must be zero.
- */
-static xfs_ino_t
-xfs_dir2_sf_get_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_dir2_inou_t		*from)
-{
-	if (hdr->i8count)
-		return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
-	else
-		return get_unaligned_be32(&from->i4.i);
-}
-
-static void
-xfs_dir2_sf_put_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_dir2_inou_t		*to,
-	xfs_ino_t		ino)
-{
-	ASSERT((ino & 0xff00000000000000ULL) == 0);
-
-	if (hdr->i8count)
-		put_unaligned_be64(ino, &to->i8.i);
-	else
-		put_unaligned_be32(ino, &to->i4.i);
-}
-
-static xfs_ino_t
-xfs_dir2_sf_get_parent_ino(
-	struct xfs_dir2_sf_hdr	*hdr)
-{
-	return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
-}
-
-static void
-xfs_dir2_sf_put_parent_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_ino_t		ino)
-{
-	xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
-}
-
-/*
- * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. If the entry stores a filetype value, then it
- * sits between the name and the inode number. Hence the inode numbers may only
- * be accessed through the helpers below.
- */
-static xfs_ino_t
-xfs_dir2_sfe_get_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return xfs_dir2_sf_get_ino(hdr,
-				(xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
-}
-
-static void
-xfs_dir2_sfe_put_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep,
-	xfs_ino_t		ino)
-{
-	xfs_dir2_sf_put_ino(hdr,
-			    (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
-}
-
-static xfs_ino_t
-xfs_dir3_sfe_get_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return xfs_dir2_sf_get_ino(hdr,
-			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
-}
-
-static void
-xfs_dir3_sfe_put_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep,
-	xfs_ino_t		ino)
-{
-	xfs_dir2_sf_put_ino(hdr,
-			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
-}
-
-
-/*
- * Directory data block operations
- */
-
-/*
- * For special situations, the dirent size ends up fixed because we always know
- * what the size of the entry is. That's true for the "." and "..", and
- * therefore we know that they are a fixed size and hence their offsets are
- * constant, as is the first entry.
- *
- * Hence, this calculation is written as a macro to be able to be calculated at
- * compile time and so certain offsets can be calculated directly in the
- * structure initaliser via the macro. There are two macros - one for dirents
- * with ftype and without so there are no unresolvable conditionals in the
- * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
- * of 2 and the compiler doesn't reject it (unlike roundup()).
- */
-#define XFS_DIR2_DATA_ENTSIZE(n)					\
-	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
-		 sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
-
-#define XFS_DIR3_DATA_ENTSIZE(n)					\
-	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
-		 sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)),	\
-		XFS_DIR2_DATA_ALIGN)
-
-static int
-xfs_dir2_data_entsize(
-	int			n)
-{
-	return XFS_DIR2_DATA_ENTSIZE(n);
-}
-
-static int
-xfs_dir3_data_entsize(
-	int			n)
-{
-	return XFS_DIR3_DATA_ENTSIZE(n);
-}
-
-static __uint8_t
-xfs_dir2_data_get_ftype(
-	struct xfs_dir2_data_entry *dep)
-{
-	return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_data_put_ftype(
-	struct xfs_dir2_data_entry *dep,
-	__uint8_t		ftype)
-{
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static __uint8_t
-xfs_dir3_data_get_ftype(
-	struct xfs_dir2_data_entry *dep)
-{
-	__uint8_t	ftype = dep->name[dep->namelen];
-
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
-	if (ftype >= XFS_DIR3_FT_MAX)
-		return XFS_DIR3_FT_UNKNOWN;
-	return ftype;
-}
-
-static void
-xfs_dir3_data_put_ftype(
-	struct xfs_dir2_data_entry *dep,
-	__uint8_t		type)
-{
-	ASSERT(type < XFS_DIR3_FT_MAX);
-	ASSERT(dep->namelen != 0);
-
-	dep->name[dep->namelen] = type;
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static __be16 *
-xfs_dir2_data_entry_tag_p(
-	struct xfs_dir2_data_entry *dep)
-{
-	return (__be16 *)((char *)dep +
-		xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-static __be16 *
-xfs_dir3_data_entry_tag_p(
-	struct xfs_dir2_data_entry *dep)
-{
-	return (__be16 *)((char *)dep +
-		xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-/*
- * location of . and .. in data space (always block 0)
- */
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dot_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dotdot_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR2_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_first_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR2_DATA_ENTSIZE(1) +
-				XFS_DIR2_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_dotdot_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_first_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1) +
-				XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1) +
-				XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return hdr->bestfree;
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_unused *)
-		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_unused *)
-		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-
-/*
- * Directory Leaf block operations
- */
-static int
-xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
-{
-	return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
-		(uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
-	return lp->__ents;
-}
-
-static int
-xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
-{
-	return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
-		(uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
-	return ((struct xfs_dir3_leaf *)lp)->__ents;
-}
-
-static void
-xfs_dir2_leaf_hdr_from_disk(
-	struct xfs_dir3_icleaf_hdr	*to,
-	struct xfs_dir2_leaf		*from)
-{
-	to->forw = be32_to_cpu(from->hdr.info.forw);
-	to->back = be32_to_cpu(from->hdr.info.back);
-	to->magic = be16_to_cpu(from->hdr.info.magic);
-	to->count = be16_to_cpu(from->hdr.count);
-	to->stale = be16_to_cpu(from->hdr.stale);
-
-	ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
-	       to->magic == XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir2_leaf_hdr_to_disk(
-	struct xfs_dir2_leaf		*to,
-	struct xfs_dir3_icleaf_hdr	*from)
-{
-	ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
-	       from->magic == XFS_DIR2_LEAFN_MAGIC);
-
-	to->hdr.info.forw = cpu_to_be32(from->forw);
-	to->hdr.info.back = cpu_to_be32(from->back);
-	to->hdr.info.magic = cpu_to_be16(from->magic);
-	to->hdr.count = cpu_to_be16(from->count);
-	to->hdr.stale = cpu_to_be16(from->stale);
-}
-
-static void
-xfs_dir3_leaf_hdr_from_disk(
-	struct xfs_dir3_icleaf_hdr	*to,
-	struct xfs_dir2_leaf		*from)
-{
-	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
-
-	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-	to->back = be32_to_cpu(hdr3->info.hdr.back);
-	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-	to->count = be16_to_cpu(hdr3->count);
-	to->stale = be16_to_cpu(hdr3->stale);
-
-	ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
-	       to->magic == XFS_DIR3_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leaf_hdr_to_disk(
-	struct xfs_dir2_leaf		*to,
-	struct xfs_dir3_icleaf_hdr	*from)
-{
-	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
-
-	ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
-	       from->magic == XFS_DIR3_LEAFN_MAGIC);
-
-	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-	hdr3->info.hdr.back = cpu_to_be32(from->back);
-	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-	hdr3->count = cpu_to_be16(from->count);
-	hdr3->stale = cpu_to_be16(from->stale);
-}
-
-
-/*
- * Directory/Attribute Node block operations
- */
-static struct xfs_da_node_entry *
-xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
-{
-	return dap->__btree;
-}
-
-static struct xfs_da_node_entry *
-xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
-{
-	return ((struct xfs_da3_intnode *)dap)->__btree;
-}
-
-static void
-xfs_da2_node_hdr_from_disk(
-	struct xfs_da3_icnode_hdr	*to,
-	struct xfs_da_intnode		*from)
-{
-	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	to->forw = be32_to_cpu(from->hdr.info.forw);
-	to->back = be32_to_cpu(from->hdr.info.back);
-	to->magic = be16_to_cpu(from->hdr.info.magic);
-	to->count = be16_to_cpu(from->hdr.__count);
-	to->level = be16_to_cpu(from->hdr.__level);
-}
-
-static void
-xfs_da2_node_hdr_to_disk(
-	struct xfs_da_intnode		*to,
-	struct xfs_da3_icnode_hdr	*from)
-{
-	ASSERT(from->magic == XFS_DA_NODE_MAGIC);
-	to->hdr.info.forw = cpu_to_be32(from->forw);
-	to->hdr.info.back = cpu_to_be32(from->back);
-	to->hdr.info.magic = cpu_to_be16(from->magic);
-	to->hdr.__count = cpu_to_be16(from->count);
-	to->hdr.__level = cpu_to_be16(from->level);
-}
-
-static void
-xfs_da3_node_hdr_from_disk(
-	struct xfs_da3_icnode_hdr	*to,
-	struct xfs_da_intnode		*from)
-{
-	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
-
-	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
-	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-	to->back = be32_to_cpu(hdr3->info.hdr.back);
-	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-	to->count = be16_to_cpu(hdr3->__count);
-	to->level = be16_to_cpu(hdr3->__level);
-}
-
-static void
-xfs_da3_node_hdr_to_disk(
-	struct xfs_da_intnode		*to,
-	struct xfs_da3_icnode_hdr	*from)
-{
-	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
-
-	ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
-	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-	hdr3->info.hdr.back = cpu_to_be32(from->back);
-	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-	hdr3->__count = cpu_to_be16(from->count);
-	hdr3->__level = cpu_to_be16(from->level);
-}
-
-
-/*
- * Directory free space block operations
- */
-static int
-xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
-{
-	return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
-		sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
-{
-	return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-	return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
-			(db / xfs_dir2_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-	return db % xfs_dir2_free_max_bests(geo);
-}
-
-static int
-xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
-{
-	return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
-		sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
-{
-	return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-	return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
-			(db / xfs_dir3_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-	return db % xfs_dir3_free_max_bests(geo);
-}
-
-static void
-xfs_dir2_free_hdr_from_disk(
-	struct xfs_dir3_icfree_hdr	*to,
-	struct xfs_dir2_free		*from)
-{
-	to->magic = be32_to_cpu(from->hdr.magic);
-	to->firstdb = be32_to_cpu(from->hdr.firstdb);
-	to->nvalid = be32_to_cpu(from->hdr.nvalid);
-	to->nused = be32_to_cpu(from->hdr.nused);
-	ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
-}
-
-static void
-xfs_dir2_free_hdr_to_disk(
-	struct xfs_dir2_free		*to,
-	struct xfs_dir3_icfree_hdr	*from)
-{
-	ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
-
-	to->hdr.magic = cpu_to_be32(from->magic);
-	to->hdr.firstdb = cpu_to_be32(from->firstdb);
-	to->hdr.nvalid = cpu_to_be32(from->nvalid);
-	to->hdr.nused = cpu_to_be32(from->nused);
-}
-
-static void
-xfs_dir3_free_hdr_from_disk(
-	struct xfs_dir3_icfree_hdr	*to,
-	struct xfs_dir2_free		*from)
-{
-	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
-
-	to->magic = be32_to_cpu(hdr3->hdr.magic);
-	to->firstdb = be32_to_cpu(hdr3->firstdb);
-	to->nvalid = be32_to_cpu(hdr3->nvalid);
-	to->nused = be32_to_cpu(hdr3->nused);
-
-	ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
-}
-
-static void
-xfs_dir3_free_hdr_to_disk(
-	struct xfs_dir2_free		*to,
-	struct xfs_dir3_icfree_hdr	*from)
-{
-	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
-
-	ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
-
-	hdr3->hdr.magic = cpu_to_be32(from->magic);
-	hdr3->firstdb = cpu_to_be32(from->firstdb);
-	hdr3->nvalid = cpu_to_be32(from->nvalid);
-	hdr3->nused = cpu_to_be32(from->nused);
-}
-
-static const struct xfs_dir_ops xfs_dir2_ops = {
-	.sf_entsize = xfs_dir2_sf_entsize,
-	.sf_nextentry = xfs_dir2_sf_nextentry,
-	.sf_get_ftype = xfs_dir2_sfe_get_ftype,
-	.sf_put_ftype = xfs_dir2_sfe_put_ftype,
-	.sf_get_ino = xfs_dir2_sfe_get_ino,
-	.sf_put_ino = xfs_dir2_sfe_put_ino,
-	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
-	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
-	.data_entsize = xfs_dir2_data_entsize,
-	.data_get_ftype = xfs_dir2_data_get_ftype,
-	.data_put_ftype = xfs_dir2_data_put_ftype,
-	.data_entry_tag_p = xfs_dir2_data_entry_tag_p,
-	.data_bestfree_p = xfs_dir2_data_bestfree_p,
-
-	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
-	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR2_DATA_ENTSIZE(1),
-	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR2_DATA_ENTSIZE(1) +
-				XFS_DIR2_DATA_ENTSIZE(2),
-	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
-	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
-	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
-	.data_first_entry_p = xfs_dir2_data_first_entry_p,
-	.data_entry_p = xfs_dir2_data_entry_p,
-	.data_unused_p = xfs_dir2_data_unused_p,
-
-	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
-	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
-	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
-	.leaf_max_ents = xfs_dir2_max_leaf_ents,
-	.leaf_ents_p = xfs_dir2_leaf_ents_p,
-
-	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
-	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
-	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
-	.node_tree_p = xfs_da2_node_tree_p,
-
-	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
-	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
-	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
-	.free_max_bests = xfs_dir2_free_max_bests,
-	.free_bests_p = xfs_dir2_free_bests_p,
-	.db_to_fdb = xfs_dir2_db_to_fdb,
-	.db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
-	.sf_entsize = xfs_dir3_sf_entsize,
-	.sf_nextentry = xfs_dir3_sf_nextentry,
-	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
-	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
-	.sf_get_ino = xfs_dir3_sfe_get_ino,
-	.sf_put_ino = xfs_dir3_sfe_put_ino,
-	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
-	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
-	.data_entsize = xfs_dir3_data_entsize,
-	.data_get_ftype = xfs_dir3_data_get_ftype,
-	.data_put_ftype = xfs_dir3_data_put_ftype,
-	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
-	.data_bestfree_p = xfs_dir2_data_bestfree_p,
-
-	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
-	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1),
-	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1) +
-				XFS_DIR3_DATA_ENTSIZE(2),
-	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
-	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
-	.data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
-	.data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
-	.data_entry_p = xfs_dir2_data_entry_p,
-	.data_unused_p = xfs_dir2_data_unused_p,
-
-	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
-	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
-	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
-	.leaf_max_ents = xfs_dir2_max_leaf_ents,
-	.leaf_ents_p = xfs_dir2_leaf_ents_p,
-
-	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
-	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
-	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
-	.node_tree_p = xfs_da2_node_tree_p,
-
-	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
-	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
-	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
-	.free_max_bests = xfs_dir2_free_max_bests,
-	.free_bests_p = xfs_dir2_free_bests_p,
-	.db_to_fdb = xfs_dir2_db_to_fdb,
-	.db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir3_ops = {
-	.sf_entsize = xfs_dir3_sf_entsize,
-	.sf_nextentry = xfs_dir3_sf_nextentry,
-	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
-	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
-	.sf_get_ino = xfs_dir3_sfe_get_ino,
-	.sf_put_ino = xfs_dir3_sfe_put_ino,
-	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
-	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
-	.data_entsize = xfs_dir3_data_entsize,
-	.data_get_ftype = xfs_dir3_data_get_ftype,
-	.data_put_ftype = xfs_dir3_data_put_ftype,
-	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
-	.data_bestfree_p = xfs_dir3_data_bestfree_p,
-
-	.data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
-	.data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1),
-	.data_first_offset =  sizeof(struct xfs_dir3_data_hdr) +
-				XFS_DIR3_DATA_ENTSIZE(1) +
-				XFS_DIR3_DATA_ENTSIZE(2),
-	.data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
-
-	.data_dot_entry_p = xfs_dir3_data_dot_entry_p,
-	.data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
-	.data_first_entry_p = xfs_dir3_data_first_entry_p,
-	.data_entry_p = xfs_dir3_data_entry_p,
-	.data_unused_p = xfs_dir3_data_unused_p,
-
-	.leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
-	.leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
-	.leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
-	.leaf_max_ents = xfs_dir3_max_leaf_ents,
-	.leaf_ents_p = xfs_dir3_leaf_ents_p,
-
-	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
-	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
-	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
-	.node_tree_p = xfs_da3_node_tree_p,
-
-	.free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
-	.free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
-	.free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
-	.free_max_bests = xfs_dir3_free_max_bests,
-	.free_bests_p = xfs_dir3_free_bests_p,
-	.db_to_fdb = xfs_dir3_db_to_fdb,
-	.db_to_fdindex = xfs_dir3_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
-	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
-	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
-	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
-	.node_tree_p = xfs_da2_node_tree_p,
-};
-
-static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
-	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
-	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
-	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
-	.node_tree_p = xfs_da3_node_tree_p,
-};
-
-/*
- * Return the ops structure according to the current config.  If we are passed
- * an inode, then that overrides the default config we use which is based on
- * feature bits.
- */
-const struct xfs_dir_ops *
-xfs_dir_get_ops(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*dp)
-{
-	if (dp)
-		return dp->d_ops;
-	if (mp->m_dir_inode_ops)
-		return mp->m_dir_inode_ops;
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		return &xfs_dir3_ops;
-	if (xfs_sb_version_hasftype(&mp->m_sb))
-		return &xfs_dir2_ftype_ops;
-	return &xfs_dir2_ops;
-}
-
-const struct xfs_dir_ops *
-xfs_nondir_get_ops(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*dp)
-{
-	if (dp)
-		return dp->d_ops;
-	if (mp->m_nondir_inode_ops)
-		return mp->m_nondir_inode_ops;
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		return &xfs_dir3_nondir_ops;
-	return &xfs_dir2_nondir_ops;
-}
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
deleted file mode 100644
index a0aca734199b..000000000000
--- a/fs/xfs/xfs_dir2.c
+++ /dev/null
@@ -1,762 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_dinode.h"
-
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
-
-
-/*
- * ASCII case-insensitive (ie. A-Z) support for directories that was
- * used in IRIX.
- */
-STATIC xfs_dahash_t
-xfs_ascii_ci_hashname(
-	struct xfs_name	*name)
-{
-	xfs_dahash_t	hash;
-	int		i;
-
-	for (i = 0, hash = 0; i < name->len; i++)
-		hash = tolower(name->name[i]) ^ rol32(hash, 7);
-
-	return hash;
-}
-
-STATIC enum xfs_dacmp
-xfs_ascii_ci_compname(
-	struct xfs_da_args *args,
-	const unsigned char *name,
-	int		len)
-{
-	enum xfs_dacmp	result;
-	int		i;
-
-	if (args->namelen != len)
-		return XFS_CMP_DIFFERENT;
-
-	result = XFS_CMP_EXACT;
-	for (i = 0; i < len; i++) {
-		if (args->name[i] == name[i])
-			continue;
-		if (tolower(args->name[i]) != tolower(name[i]))
-			return XFS_CMP_DIFFERENT;
-		result = XFS_CMP_CASE;
-	}
-
-	return result;
-}
-
-static struct xfs_nameops xfs_ascii_ci_nameops = {
-	.hashname	= xfs_ascii_ci_hashname,
-	.compname	= xfs_ascii_ci_compname,
-};
-
-int
-xfs_da_mount(
-	struct xfs_mount	*mp)
-{
-	struct xfs_da_geometry	*dageo;
-	int			nodehdr_size;
-
-
-	ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
-	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
-	       XFS_MAX_BLOCKSIZE);
-
-	mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
-	mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
-
-	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
-	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				    KM_SLEEP | KM_MAYFAIL);
-	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				     KM_SLEEP | KM_MAYFAIL);
-	if (!mp->m_dir_geo || !mp->m_attr_geo) {
-		kmem_free(mp->m_dir_geo);
-		kmem_free(mp->m_attr_geo);
-		return ENOMEM;
-	}
-
-	/* set up directory geometry */
-	dageo = mp->m_dir_geo;
-	dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
-	dageo->fsblog = mp->m_sb.sb_blocklog;
-	dageo->blksize = 1 << dageo->blklog;
-	dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
-
-	/*
-	 * Now we've set up the block conversion variables, we can calculate the
-	 * segment block constants using the geometry structure.
-	 */
-	dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
-	dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
-	dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
-	dageo->node_ents = (dageo->blksize - nodehdr_size) /
-				(uint)sizeof(xfs_da_node_entry_t);
-	dageo->magicpct = (dageo->blksize * 37) / 100;
-
-	/* set up attribute geometry - single fsb only */
-	dageo = mp->m_attr_geo;
-	dageo->blklog = mp->m_sb.sb_blocklog;
-	dageo->fsblog = mp->m_sb.sb_blocklog;
-	dageo->blksize = 1 << dageo->blklog;
-	dageo->fsbcount = 1;
-	dageo->node_ents = (dageo->blksize - nodehdr_size) /
-				(uint)sizeof(xfs_da_node_entry_t);
-	dageo->magicpct = (dageo->blksize * 37) / 100;
-
-	if (xfs_sb_version_hasasciici(&mp->m_sb))
-		mp->m_dirnameops = &xfs_ascii_ci_nameops;
-	else
-		mp->m_dirnameops = &xfs_default_nameops;
-
-	return 0;
-}
-
-void
-xfs_da_unmount(
-	struct xfs_mount	*mp)
-{
-	kmem_free(mp->m_dir_geo);
-	kmem_free(mp->m_attr_geo);
-}
-
-/*
- * Return 1 if directory contains only "." and "..".
- */
-int
-xfs_dir_isempty(
-	xfs_inode_t	*dp)
-{
-	xfs_dir2_sf_hdr_t	*sfp;
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	if (dp->i_d.di_size == 0)	/* might happen during shutdown. */
-		return 1;
-	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
-		return 0;
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	return !sfp->count;
-}
-
-/*
- * Validate a given inode number.
- */
-int
-xfs_dir_ino_validate(
-	xfs_mount_t	*mp,
-	xfs_ino_t	ino)
-{
-	xfs_agblock_t	agblkno;
-	xfs_agino_t	agino;
-	xfs_agnumber_t	agno;
-	int		ino_ok;
-	int		ioff;
-
-	agno = XFS_INO_TO_AGNO(mp, ino);
-	agblkno = XFS_INO_TO_AGBNO(mp, ino);
-	ioff = XFS_INO_TO_OFFSET(mp, ino);
-	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
-	ino_ok =
-		agno < mp->m_sb.sb_agcount &&
-		agblkno < mp->m_sb.sb_agblocks &&
-		agblkno != 0 &&
-		ioff < (1 << mp->m_sb.sb_inopblog) &&
-		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
-	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
-			XFS_RANDOM_DIR_INO_VALIDATE))) {
-		xfs_warn(mp, "Invalid inode number 0x%Lx",
-				(unsigned long long) ino);
-		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-	return 0;
-}
-
-/*
- * Initialize a directory with its "." and ".." entries.
- */
-int
-xfs_dir_init(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*dp,
-	xfs_inode_t	*pdp)
-{
-	struct xfs_da_args *args;
-	int		error;
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
-	if (error)
-		return error;
-
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	if (!args)
-		return ENOMEM;
-
-	args->geo = dp->i_mount->m_dir_geo;
-	args->dp = dp;
-	args->trans = tp;
-	error = xfs_dir2_sf_create(args, pdp->i_ino);
-	kmem_free(args);
-	return error;
-}
-
-/*
-  Enter a name in a directory.
- */
-int
-xfs_dir_createname(
-	xfs_trans_t		*tp,
-	xfs_inode_t		*dp,
-	struct xfs_name		*name,
-	xfs_ino_t		inum,		/* new entry inode number */
-	xfs_fsblock_t		*first,		/* bmap's firstblock */
-	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
-	xfs_extlen_t		total)		/* bmap's total block count */
-{
-	struct xfs_da_args	*args;
-	int			rval;
-	int			v;		/* type-checking value */
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	rval = xfs_dir_ino_validate(tp->t_mountp, inum);
-	if (rval)
-		return rval;
-	XFS_STATS_INC(xs_dir_create);
-
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	if (!args)
-		return ENOMEM;
-
-	args->geo = dp->i_mount->m_dir_geo;
-	args->name = name->name;
-	args->namelen = name->len;
-	args->filetype = name->type;
-	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-	args->inumber = inum;
-	args->dp = dp;
-	args->firstblock = first;
-	args->flist = flist;
-	args->total = total;
-	args->whichfork = XFS_DATA_FORK;
-	args->trans = tp;
-	args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		rval = xfs_dir2_sf_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isblock(args, &v);
-	if (rval)
-		goto out_free;
-	if (v) {
-		rval = xfs_dir2_block_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isleaf(args, &v);
-	if (rval)
-		goto out_free;
-	if (v)
-		rval = xfs_dir2_leaf_addname(args);
-	else
-		rval = xfs_dir2_node_addname(args);
-
-out_free:
-	kmem_free(args);
-	return rval;
-}
-
-/*
- * If doing a CI lookup and case-insensitive match, dup actual name into
- * args.value. Return EEXIST for success (ie. name found) or an error.
- */
-int
-xfs_dir_cilookup_result(
-	struct xfs_da_args *args,
-	const unsigned char *name,
-	int		len)
-{
-	if (args->cmpresult == XFS_CMP_DIFFERENT)
-		return ENOENT;
-	if (args->cmpresult != XFS_CMP_CASE ||
-					!(args->op_flags & XFS_DA_OP_CILOOKUP))
-		return EEXIST;
-
-	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
-	if (!args->value)
-		return ENOMEM;
-
-	memcpy(args->value, name, len);
-	args->valuelen = len;
-	return EEXIST;
-}
-
-/*
- * Lookup a name in a directory, give back the inode number.
- * If ci_name is not NULL, returns the actual name in ci_name if it differs
- * to name, or ci_name->name is set to NULL for an exact match.
- */
-
-int
-xfs_dir_lookup(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*dp,
-	struct xfs_name	*name,
-	xfs_ino_t	*inum,		/* out: inode number */
-	struct xfs_name *ci_name)	/* out: actual name if CI match */
-{
-	struct xfs_da_args *args;
-	int		rval;
-	int		v;		/* type-checking value */
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	XFS_STATS_INC(xs_dir_lookup);
-
-	/*
-	 * We need to use KM_NOFS here so that lockdep will not throw false
-	 * positive deadlock warnings on a non-transactional lookup path. It is
-	 * safe to recurse into inode recalim in that case, but lockdep can't
-	 * easily be taught about it. Hence KM_NOFS avoids having to add more
-	 * lockdep Doing this avoids having to add a bunch of lockdep class
-	 * annotations into the reclaim path for the ilock.
-	 */
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	args->geo = dp->i_mount->m_dir_geo;
-	args->name = name->name;
-	args->namelen = name->len;
-	args->filetype = name->type;
-	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-	args->dp = dp;
-	args->whichfork = XFS_DATA_FORK;
-	args->trans = tp;
-	args->op_flags = XFS_DA_OP_OKNOENT;
-	if (ci_name)
-		args->op_flags |= XFS_DA_OP_CILOOKUP;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		rval = xfs_dir2_sf_lookup(args);
-		goto out_check_rval;
-	}
-
-	rval = xfs_dir2_isblock(args, &v);
-	if (rval)
-		goto out_free;
-	if (v) {
-		rval = xfs_dir2_block_lookup(args);
-		goto out_check_rval;
-	}
-
-	rval = xfs_dir2_isleaf(args, &v);
-	if (rval)
-		goto out_free;
-	if (v)
-		rval = xfs_dir2_leaf_lookup(args);
-	else
-		rval = xfs_dir2_node_lookup(args);
-
-out_check_rval:
-	if (rval == EEXIST)
-		rval = 0;
-	if (!rval) {
-		*inum = args->inumber;
-		if (ci_name) {
-			ci_name->name = args->value;
-			ci_name->len = args->valuelen;
-		}
-	}
-out_free:
-	kmem_free(args);
-	return rval;
-}
-
-/*
- * Remove an entry from a directory.
- */
-int
-xfs_dir_removename(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*dp,
-	struct xfs_name	*name,
-	xfs_ino_t	ino,
-	xfs_fsblock_t	*first,		/* bmap's firstblock */
-	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
-	xfs_extlen_t	total)		/* bmap's total block count */
-{
-	struct xfs_da_args *args;
-	int		rval;
-	int		v;		/* type-checking value */
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-	XFS_STATS_INC(xs_dir_remove);
-
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	if (!args)
-		return ENOMEM;
-
-	args->geo = dp->i_mount->m_dir_geo;
-	args->name = name->name;
-	args->namelen = name->len;
-	args->filetype = name->type;
-	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-	args->inumber = ino;
-	args->dp = dp;
-	args->firstblock = first;
-	args->flist = flist;
-	args->total = total;
-	args->whichfork = XFS_DATA_FORK;
-	args->trans = tp;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		rval = xfs_dir2_sf_removename(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isblock(args, &v);
-	if (rval)
-		goto out_free;
-	if (v) {
-		rval = xfs_dir2_block_removename(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isleaf(args, &v);
-	if (rval)
-		goto out_free;
-	if (v)
-		rval = xfs_dir2_leaf_removename(args);
-	else
-		rval = xfs_dir2_node_removename(args);
-out_free:
-	kmem_free(args);
-	return rval;
-}
-
-/*
- * Replace the inode number of a directory entry.
- */
-int
-xfs_dir_replace(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*dp,
-	struct xfs_name	*name,		/* name of entry to replace */
-	xfs_ino_t	inum,		/* new inode number */
-	xfs_fsblock_t	*first,		/* bmap's firstblock */
-	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
-	xfs_extlen_t	total)		/* bmap's total block count */
-{
-	struct xfs_da_args *args;
-	int		rval;
-	int		v;		/* type-checking value */
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-
-	rval = xfs_dir_ino_validate(tp->t_mountp, inum);
-	if (rval)
-		return rval;
-
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	if (!args)
-		return ENOMEM;
-
-	args->geo = dp->i_mount->m_dir_geo;
-	args->name = name->name;
-	args->namelen = name->len;
-	args->filetype = name->type;
-	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-	args->inumber = inum;
-	args->dp = dp;
-	args->firstblock = first;
-	args->flist = flist;
-	args->total = total;
-	args->whichfork = XFS_DATA_FORK;
-	args->trans = tp;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		rval = xfs_dir2_sf_replace(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isblock(args, &v);
-	if (rval)
-		goto out_free;
-	if (v) {
-		rval = xfs_dir2_block_replace(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isleaf(args, &v);
-	if (rval)
-		goto out_free;
-	if (v)
-		rval = xfs_dir2_leaf_replace(args);
-	else
-		rval = xfs_dir2_node_replace(args);
-out_free:
-	kmem_free(args);
-	return rval;
-}
-
-/*
- * See if this entry can be added to the directory without allocating space.
- * First checks that the caller couldn't reserve enough space (resblks = 0).
- */
-int
-xfs_dir_canenter(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*dp,
-	struct xfs_name	*name,		/* name of entry to add */
-	uint		resblks)
-{
-	struct xfs_da_args *args;
-	int		rval;
-	int		v;		/* type-checking value */
-
-	if (resblks)
-		return 0;
-
-	ASSERT(S_ISDIR(dp->i_d.di_mode));
-
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
-	if (!args)
-		return ENOMEM;
-
-	args->geo = dp->i_mount->m_dir_geo;
-	args->name = name->name;
-	args->namelen = name->len;
-	args->filetype = name->type;
-	args->hashval = dp->i_mount->m_dirnameops->hashname(name);
-	args->dp = dp;
-	args->whichfork = XFS_DATA_FORK;
-	args->trans = tp;
-	args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
-							XFS_DA_OP_OKNOENT;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		rval = xfs_dir2_sf_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isblock(args, &v);
-	if (rval)
-		goto out_free;
-	if (v) {
-		rval = xfs_dir2_block_addname(args);
-		goto out_free;
-	}
-
-	rval = xfs_dir2_isleaf(args, &v);
-	if (rval)
-		goto out_free;
-	if (v)
-		rval = xfs_dir2_leaf_addname(args);
-	else
-		rval = xfs_dir2_node_addname(args);
-out_free:
-	kmem_free(args);
-	return rval;
-}
-
-/*
- * Utility routines.
- */
-
-/*
- * Add a block to the directory.
- *
- * This routine is for data and free blocks, not leaf/node blocks which are
- * handled by xfs_da_grow_inode.
- */
-int
-xfs_dir2_grow_inode(
-	struct xfs_da_args	*args,
-	int			space,	/* v2 dir's space XFS_DIR2_xxx_SPACE */
-	xfs_dir2_db_t		*dbp)	/* out: block number added */
-{
-	struct xfs_inode	*dp = args->dp;
-	struct xfs_mount	*mp = dp->i_mount;
-	xfs_fileoff_t		bno;	/* directory offset of new block */
-	int			count;	/* count of filesystem blocks */
-	int			error;
-
-	trace_xfs_dir2_grow_inode(args, space);
-
-	/*
-	 * Set lowest possible block in the space requested.
-	 */
-	bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
-	count = args->geo->fsbcount;
-
-	error = xfs_da_grow_inode_int(args, &bno, count);
-	if (error)
-		return error;
-
-	*dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
-
-	/*
-	 * Update file's size if this is the data space and it grew.
-	 */
-	if (space == XFS_DIR2_DATA_SPACE) {
-		xfs_fsize_t	size;		/* directory file (data) size */
-
-		size = XFS_FSB_TO_B(mp, bno + count);
-		if (size > dp->i_d.di_size) {
-			dp->i_d.di_size = size;
-			xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
-		}
-	}
-	return 0;
-}
-
-/*
- * See if the directory is a single-block form directory.
- */
-int
-xfs_dir2_isblock(
-	struct xfs_da_args	*args,
-	int			*vp)	/* out: 1 is block, 0 is not block */
-{
-	xfs_fileoff_t		last;	/* last file offset */
-	int			rval;
-
-	if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
-		return rval;
-	rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
-	ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
-	*vp = rval;
-	return 0;
-}
-
-/*
- * See if the directory is a single-leaf form directory.
- */
-int
-xfs_dir2_isleaf(
-	struct xfs_da_args	*args,
-	int			*vp)	/* out: 1 is block, 0 is not block */
-{
-	xfs_fileoff_t		last;	/* last file offset */
-	int			rval;
-
-	if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
-		return rval;
-	*vp = last == args->geo->leafblk + args->geo->fsbcount;
-	return 0;
-}
-
-/*
- * Remove the given block from the directory.
- * This routine is used for data and free blocks, leaf/node are done
- * by xfs_da_shrink_inode.
- */
-int
-xfs_dir2_shrink_inode(
-	xfs_da_args_t	*args,
-	xfs_dir2_db_t	db,
-	struct xfs_buf	*bp)
-{
-	xfs_fileoff_t	bno;		/* directory file offset */
-	xfs_dablk_t	da;		/* directory file offset */
-	int		done;		/* bunmap is finished */
-	xfs_inode_t	*dp;
-	int		error;
-	xfs_mount_t	*mp;
-	xfs_trans_t	*tp;
-
-	trace_xfs_dir2_shrink_inode(args, db);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	da = xfs_dir2_db_to_da(args->geo, db);
-	/*
-	 * Unmap the fsblock(s).
-	 */
-	if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
-			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-			&done))) {
-		/*
-		 * ENOSPC actually can happen if we're in a removename with
-		 * no space reservation, and the resulting block removal
-		 * would cause a bmap btree split or conversion from extents
-		 * to btree.  This can only happen for un-fragmented
-		 * directory blocks, since you need to be punching out
-		 * the middle of an extent.
-		 * In this case we need to leave the block in the file,
-		 * and not binval it.
-		 * So the block has to be in a consistent empty state
-		 * and appropriately logged.
-		 * We don't free up the buffer, the caller can tell it
-		 * hasn't happened since it got an error back.
-		 */
-		return error;
-	}
-	ASSERT(done);
-	/*
-	 * Invalidate the buffer from the transaction.
-	 */
-	xfs_trans_binval(tp, bp);
-	/*
-	 * If it's not a data block, we're done.
-	 */
-	if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
-		return 0;
-	/*
-	 * If the block isn't the last one in the directory, we're done.
-	 */
-	if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
-		return 0;
-	bno = da;
-	if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
-		/*
-		 * This can't really happen unless there's kernel corruption.
-		 */
-		return error;
-	}
-	if (db == args->geo->datablk)
-		ASSERT(bno == 0);
-	else
-		ASSERT(bno > 0);
-	/*
-	 * Set the size to the new last block.
-	 */
-	dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
-	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-	return 0;
-}
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
deleted file mode 100644
index ab0bffccf5c3..000000000000
--- a/fs/xfs/xfs_dir2_block.c
+++ /dev/null
@@ -1,1265 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_buf_item.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_dinode.h"
-
-/*
- * Local function prototypes.
- */
-static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
-				    int first, int last);
-static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
-static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
-				     int *entno);
-static int xfs_dir2_block_sort(const void *a, const void *b);
-
-static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-/*
- * One-time startup routine called from xfs_init().
- */
-void
-xfs_dir_startup(void)
-{
-	xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
-	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
-}
-
-static bool
-xfs_dir3_block_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
-			return false;
-		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
-			return false;
-	} else {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
-			return false;
-	}
-	if (__xfs_dir3_data_check(NULL, bp))
-		return false;
-	return true;
-}
-
-static void
-xfs_dir3_block_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_dir3_block_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-xfs_dir3_block_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
-
-	if (!xfs_dir3_block_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
-	.verify_read = xfs_dir3_block_read_verify,
-	.verify_write = xfs_dir3_block_write_verify,
-};
-
-int
-xfs_dir3_block_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_mount	*mp = dp->i_mount;
-	int			err;
-
-	err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
-				XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
-	if (!err && tp)
-		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
-	return err;
-}
-
-static void
-xfs_dir3_block_init(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_buf		*bp,
-	struct xfs_inode	*dp)
-{
-	struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-	bp->b_ops = &xfs_dir3_block_buf_ops;
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		memset(hdr3, 0, sizeof(*hdr3));
-		hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-		hdr3->blkno = cpu_to_be64(bp->b_bn);
-		hdr3->owner = cpu_to_be64(dp->i_ino);
-		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
-		return;
-
-	}
-	hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
-}
-
-static void
-xfs_dir2_block_need_space(
-	struct xfs_inode		*dp,
-	struct xfs_dir2_data_hdr	*hdr,
-	struct xfs_dir2_block_tail	*btp,
-	struct xfs_dir2_leaf_entry	*blp,
-	__be16				**tagpp,
-	struct xfs_dir2_data_unused	**dupp,
-	struct xfs_dir2_data_unused	**enddupp,
-	int				*compact,
-	int				len)
-{
-	struct xfs_dir2_data_free	*bf;
-	__be16				*tagp = NULL;
-	struct xfs_dir2_data_unused	*dup = NULL;
-	struct xfs_dir2_data_unused	*enddup = NULL;
-
-	*compact = 0;
-	bf = dp->d_ops->data_bestfree_p(hdr);
-
-	/*
-	 * If there are stale entries we'll use one for the leaf.
-	 */
-	if (btp->stale) {
-		if (be16_to_cpu(bf[0].length) >= len) {
-			/*
-			 * The biggest entry enough to avoid compaction.
-			 */
-			dup = (xfs_dir2_data_unused_t *)
-			      ((char *)hdr + be16_to_cpu(bf[0].offset));
-			goto out;
-		}
-
-		/*
-		 * Will need to compact to make this work.
-		 * Tag just before the first leaf entry.
-		 */
-		*compact = 1;
-		tagp = (__be16 *)blp - 1;
-
-		/* Data object just before the first leaf entry.  */
-		dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-
-		/*
-		 * If it's not free then the data will go where the
-		 * leaf data starts now, if it works at all.
-		 */
-		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-			if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
-			    (uint)sizeof(*blp) < len)
-				dup = NULL;
-		} else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
-			dup = NULL;
-		else
-			dup = (xfs_dir2_data_unused_t *)blp;
-		goto out;
-	}
-
-	/*
-	 * no stale entries, so just use free space.
-	 * Tag just before the first leaf entry.
-	 */
-	tagp = (__be16 *)blp - 1;
-
-	/* Data object just before the first leaf entry.  */
-	enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-
-	/*
-	 * If it's not free then can't do this add without cleaning up:
-	 * the space before the first leaf entry needs to be free so it
-	 * can be expanded to hold the pointer to the new entry.
-	 */
-	if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-		/*
-		 * Check out the biggest freespace and see if it's the same one.
-		 */
-		dup = (xfs_dir2_data_unused_t *)
-		      ((char *)hdr + be16_to_cpu(bf[0].offset));
-		if (dup != enddup) {
-			/*
-			 * Not the same free entry, just check its length.
-			 */
-			if (be16_to_cpu(dup->length) < len)
-				dup = NULL;
-			goto out;
-		}
-
-		/*
-		 * It is the biggest freespace, can it hold the leaf too?
-		 */
-		if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
-			/*
-			 * Yes, use the second-largest entry instead if it works.
-			 */
-			if (be16_to_cpu(bf[1].length) >= len)
-				dup = (xfs_dir2_data_unused_t *)
-				      ((char *)hdr + be16_to_cpu(bf[1].offset));
-			else
-				dup = NULL;
-		}
-	}
-out:
-	*tagpp = tagp;
-	*dupp = dup;
-	*enddupp = enddup;
-}
-
-/*
- * compact the leaf entries.
- * Leave the highest-numbered stale entry stale.
- * XXX should be the one closest to mid but mid is not yet computed.
- */
-static void
-xfs_dir2_block_compact(
-	struct xfs_da_args		*args,
-	struct xfs_buf			*bp,
-	struct xfs_dir2_data_hdr	*hdr,
-	struct xfs_dir2_block_tail	*btp,
-	struct xfs_dir2_leaf_entry	*blp,
-	int				*needlog,
-	int				*lfloghigh,
-	int				*lfloglow)
-{
-	int			fromidx;	/* source leaf index */
-	int			toidx;		/* target leaf index */
-	int			needscan = 0;
-	int			highstale;	/* high stale index */
-
-	fromidx = toidx = be32_to_cpu(btp->count) - 1;
-	highstale = *lfloghigh = -1;
-	for (; fromidx >= 0; fromidx--) {
-		if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
-			if (highstale == -1)
-				highstale = toidx;
-			else {
-				if (*lfloghigh == -1)
-					*lfloghigh = toidx;
-				continue;
-			}
-		}
-		if (fromidx < toidx)
-			blp[toidx] = blp[fromidx];
-		toidx--;
-	}
-	*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
-	*lfloghigh -= be32_to_cpu(btp->stale) - 1;
-	be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
-	xfs_dir2_data_make_free(args, bp,
-		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
-		(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
-		needlog, &needscan);
-	btp->stale = cpu_to_be32(1);
-	/*
-	 * If we now need to rebuild the bestfree map, do so.
-	 * This needs to happen before the next call to use_free.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(args->dp, hdr, needlog);
-}
-
-/*
- * Add an entry to a block directory.
- */
-int						/* error */
-xfs_dir2_block_addname(
-	xfs_da_args_t		*args)		/* directory op arguments */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
-	struct xfs_buf		*bp;		/* buffer for block */
-	xfs_dir2_block_tail_t	*btp;		/* block tail */
-	int			compact;	/* need to compact leaf ents */
-	xfs_dir2_data_entry_t	*dep;		/* block data entry */
-	xfs_inode_t		*dp;		/* directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
-	int			error;		/* error return value */
-	xfs_dir2_data_unused_t	*enddup=NULL;	/* unused at end of data */
-	xfs_dahash_t		hash;		/* hash value of found entry */
-	int			high;		/* high index for binary srch */
-	int			highstale;	/* high stale index */
-	int			lfloghigh=0;	/* last final leaf to log */
-	int			lfloglow=0;	/* first final leaf to log */
-	int			len;		/* length of the new entry */
-	int			low;		/* low index for binary srch */
-	int			lowstale;	/* low stale index */
-	int			mid=0;		/* midpoint for binary srch */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log header */
-	int			needscan;	/* need to rescan freespace */
-	__be16			*tagp;		/* pointer to tag value */
-	xfs_trans_t		*tp;		/* transaction structure */
-
-	trace_xfs_dir2_block_addname(args);
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-
-	/* Read the (one and only) directory block into bp. */
-	error = xfs_dir3_block_read(tp, dp, &bp);
-	if (error)
-		return error;
-
-	len = dp->d_ops->data_entsize(args->namelen);
-
-	/*
-	 * Set up pointers to parts of the block.
-	 */
-	hdr = bp->b_addr;
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-
-	/*
-	 * Find out if we can reuse stale entries or whether we need extra
-	 * space for entry and new leaf.
-	 */
-	xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
-				  &enddup, &compact, len);
-
-	/*
-	 * Done everything we need for a space check now.
-	 */
-	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
-		xfs_trans_brelse(tp, bp);
-		if (!dup)
-			return ENOSPC;
-		return 0;
-	}
-
-	/*
-	 * If we don't have space for the new entry & leaf ...
-	 */
-	if (!dup) {
-		/* Don't have a space reservation: return no-space.  */
-		if (args->total == 0)
-			return ENOSPC;
-		/*
-		 * Convert to the next larger format.
-		 * Then add the new entry in that format.
-		 */
-		error = xfs_dir2_block_to_leaf(args, bp);
-		if (error)
-			return error;
-		return xfs_dir2_leaf_addname(args);
-	}
-
-	needlog = needscan = 0;
-
-	/*
-	 * If need to compact the leaf entries, do it now.
-	 */
-	if (compact) {
-		xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
-				      &lfloghigh, &lfloglow);
-		/* recalculate blp post-compaction */
-		blp = xfs_dir2_block_leaf_p(btp);
-	} else if (btp->stale) {
-		/*
-		 * Set leaf logging boundaries to impossible state.
-		 * For the no-stale case they're set explicitly.
-		 */
-		lfloglow = be32_to_cpu(btp->count);
-		lfloghigh = -1;
-	}
-
-	/*
-	 * Find the slot that's first lower than our hash value, -1 if none.
-	 */
-	for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
-		mid = (low + high) >> 1;
-		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
-			break;
-		if (hash < args->hashval)
-			low = mid + 1;
-		else
-			high = mid - 1;
-	}
-	while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
-		mid--;
-	}
-	/*
-	 * No stale entries, will use enddup space to hold new leaf.
-	 */
-	if (!btp->stale) {
-		/*
-		 * Mark the space needed for the new leaf entry, now in use.
-		 */
-		xfs_dir2_data_use_free(args, bp, enddup,
-			(xfs_dir2_data_aoff_t)
-			((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
-			 sizeof(*blp)),
-			(xfs_dir2_data_aoff_t)sizeof(*blp),
-			&needlog, &needscan);
-		/*
-		 * Update the tail (entry count).
-		 */
-		be32_add_cpu(&btp->count, 1);
-		/*
-		 * If we now need to rebuild the bestfree map, do so.
-		 * This needs to happen before the next call to use_free.
-		 */
-		if (needscan) {
-			xfs_dir2_data_freescan(dp, hdr, &needlog);
-			needscan = 0;
-		}
-		/*
-		 * Adjust pointer to the first leaf entry, we're about to move
-		 * the table up one to open up space for the new leaf entry.
-		 * Then adjust our index to match.
-		 */
-		blp--;
-		mid++;
-		if (mid)
-			memmove(blp, &blp[1], mid * sizeof(*blp));
-		lfloglow = 0;
-		lfloghigh = mid;
-	}
-	/*
-	 * Use a stale leaf for our new entry.
-	 */
-	else {
-		for (lowstale = mid;
-		     lowstale >= 0 &&
-			blp[lowstale].address !=
-			cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-		     lowstale--)
-			continue;
-		for (highstale = mid + 1;
-		     highstale < be32_to_cpu(btp->count) &&
-			blp[highstale].address !=
-			cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
-			(lowstale < 0 || mid - lowstale > highstale - mid);
-		     highstale++)
-			continue;
-		/*
-		 * Move entries toward the low-numbered stale entry.
-		 */
-		if (lowstale >= 0 &&
-		    (highstale == be32_to_cpu(btp->count) ||
-		     mid - lowstale <= highstale - mid)) {
-			if (mid - lowstale)
-				memmove(&blp[lowstale], &blp[lowstale + 1],
-					(mid - lowstale) * sizeof(*blp));
-			lfloglow = MIN(lowstale, lfloglow);
-			lfloghigh = MAX(mid, lfloghigh);
-		}
-		/*
-		 * Move entries toward the high-numbered stale entry.
-		 */
-		else {
-			ASSERT(highstale < be32_to_cpu(btp->count));
-			mid++;
-			if (highstale - mid)
-				memmove(&blp[mid + 1], &blp[mid],
-					(highstale - mid) * sizeof(*blp));
-			lfloglow = MIN(mid, lfloglow);
-			lfloghigh = MAX(highstale, lfloghigh);
-		}
-		be32_add_cpu(&btp->stale, -1);
-	}
-	/*
-	 * Point to the new data entry.
-	 */
-	dep = (xfs_dir2_data_entry_t *)dup;
-	/*
-	 * Fill in the leaf entry.
-	 */
-	blp[mid].hashval = cpu_to_be32(args->hashval);
-	blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-				(char *)dep - (char *)hdr));
-	xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
-	/*
-	 * Mark space for the data entry used.
-	 */
-	xfs_dir2_data_use_free(args, bp, dup,
-		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
-		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
-	/*
-	 * Create the new data entry.
-	 */
-	dep->inumber = cpu_to_be64(args->inumber);
-	dep->namelen = args->namelen;
-	memcpy(dep->name, args->name, args->namelen);
-	dp->d_ops->data_put_ftype(dep, args->filetype);
-	tagp = dp->d_ops->data_entry_tag_p(dep);
-	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	/*
-	 * Clean up the bestfree array and log the header, tail, and entry.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	if (needlog)
-		xfs_dir2_data_log_header(args, bp);
-	xfs_dir2_block_log_tail(tp, bp);
-	xfs_dir2_data_log_entry(args, bp, dep);
-	xfs_dir3_data_check(dp, bp);
-	return 0;
-}
-
-/*
- * Log leaf entries from the block.
- */
-static void
-xfs_dir2_block_log_leaf(
-	xfs_trans_t		*tp,		/* transaction structure */
-	struct xfs_buf		*bp,		/* block buffer */
-	int			first,		/* index of first logged leaf */
-	int			last)		/* index of last logged leaf */
-{
-	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
-	xfs_dir2_leaf_entry_t	*blp;
-	xfs_dir2_block_tail_t	*btp;
-
-	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-	xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
-		(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
-}
-
-/*
- * Log the block tail.
- */
-static void
-xfs_dir2_block_log_tail(
-	xfs_trans_t		*tp,		/* transaction structure */
-	struct xfs_buf		*bp)		/* block buffer */
-{
-	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
-	xfs_dir2_block_tail_t	*btp;
-
-	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
-	xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
-		(uint)((char *)(btp + 1) - (char *)hdr - 1));
-}
-
-/*
- * Look up an entry in the block.  This is the external routine,
- * xfs_dir2_block_lookup_int does the real work.
- */
-int						/* error */
-xfs_dir2_block_lookup(
-	xfs_da_args_t		*args)		/* dir lookup arguments */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
-	struct xfs_buf		*bp;		/* block buffer */
-	xfs_dir2_block_tail_t	*btp;		/* block tail */
-	xfs_dir2_data_entry_t	*dep;		/* block data entry */
-	xfs_inode_t		*dp;		/* incore inode */
-	int			ent;		/* entry index */
-	int			error;		/* error return value */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-
-	trace_xfs_dir2_block_lookup(args);
-
-	/*
-	 * Get the buffer, look up the entry.
-	 * If not found (ENOENT) then return, have no buffer.
-	 */
-	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
-		return error;
-	dp = args->dp;
-	mp = dp->i_mount;
-	hdr = bp->b_addr;
-	xfs_dir3_data_check(dp, bp);
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-	/*
-	 * Get the offset from the leaf entry, to point to the data.
-	 */
-	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-			xfs_dir2_dataptr_to_off(args->geo,
-						be32_to_cpu(blp[ent].address)));
-	/*
-	 * Fill in inode number, CI name if appropriate, release the block.
-	 */
-	args->inumber = be64_to_cpu(dep->inumber);
-	args->filetype = dp->d_ops->data_get_ftype(dep);
-	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
-	xfs_trans_brelse(args->trans, bp);
-	return error;
-}
-
-/*
- * Internal block lookup routine.
- */
-static int					/* error */
-xfs_dir2_block_lookup_int(
-	xfs_da_args_t		*args,		/* dir lookup arguments */
-	struct xfs_buf		**bpp,		/* returned block buffer */
-	int			*entno)		/* returned entry number */
-{
-	xfs_dir2_dataptr_t	addr;		/* data entry address */
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
-	struct xfs_buf		*bp;		/* block buffer */
-	xfs_dir2_block_tail_t	*btp;		/* block tail */
-	xfs_dir2_data_entry_t	*dep;		/* block data entry */
-	xfs_inode_t		*dp;		/* incore inode */
-	int			error;		/* error return value */
-	xfs_dahash_t		hash;		/* found hash value */
-	int			high;		/* binary search high index */
-	int			low;		/* binary search low index */
-	int			mid;		/* binary search current idx */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	enum xfs_dacmp		cmp;		/* comparison result */
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-
-	error = xfs_dir3_block_read(tp, dp, &bp);
-	if (error)
-		return error;
-
-	hdr = bp->b_addr;
-	xfs_dir3_data_check(dp, bp);
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-	/*
-	 * Loop doing a binary search for our hash value.
-	 * Find our entry, ENOENT if it's not there.
-	 */
-	for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
-		ASSERT(low <= high);
-		mid = (low + high) >> 1;
-		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
-			break;
-		if (hash < args->hashval)
-			low = mid + 1;
-		else
-			high = mid - 1;
-		if (low > high) {
-			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-			xfs_trans_brelse(tp, bp);
-			return ENOENT;
-		}
-	}
-	/*
-	 * Back up to the first one with the right hash value.
-	 */
-	while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
-		mid--;
-	}
-	/*
-	 * Now loop forward through all the entries with the
-	 * right hash value looking for our name.
-	 */
-	do {
-		if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
-			continue;
-		/*
-		 * Get pointer to the entry from the leaf.
-		 */
-		dep = (xfs_dir2_data_entry_t *)
-			((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
-		/*
-		 * Compare name and if it's an exact match, return the index
-		 * and buffer. If it's the first case-insensitive match, store
-		 * the index and buffer and continue looking for an exact match.
-		 */
-		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
-		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-			args->cmpresult = cmp;
-			*bpp = bp;
-			*entno = mid;
-			if (cmp == XFS_CMP_EXACT)
-				return 0;
-		}
-	} while (++mid < be32_to_cpu(btp->count) &&
-			be32_to_cpu(blp[mid].hashval) == hash);
-
-	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-	/*
-	 * Here, we can only be doing a lookup (not a rename or replace).
-	 * If a case-insensitive match was found earlier, return success.
-	 */
-	if (args->cmpresult == XFS_CMP_CASE)
-		return 0;
-	/*
-	 * No match, release the buffer and return ENOENT.
-	 */
-	xfs_trans_brelse(tp, bp);
-	return ENOENT;
-}
-
-/*
- * Remove an entry from a block format directory.
- * If that makes the block small enough to fit in shortform, transform it.
- */
-int						/* error */
-xfs_dir2_block_removename(
-	xfs_da_args_t		*args)		/* directory operation args */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
-	struct xfs_buf		*bp;		/* block buffer */
-	xfs_dir2_block_tail_t	*btp;		/* block tail */
-	xfs_dir2_data_entry_t	*dep;		/* block data entry */
-	xfs_inode_t		*dp;		/* incore inode */
-	int			ent;		/* block leaf entry index */
-	int			error;		/* error return value */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log block header */
-	int			needscan;	/* need to fixup bestfree */
-	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
-	int			size;		/* shortform size */
-	xfs_trans_t		*tp;		/* transaction pointer */
-
-	trace_xfs_dir2_block_removename(args);
-
-	/*
-	 * Look up the entry in the block.  Gets the buffer and entry index.
-	 * It will always be there, the vnodeops level does a lookup first.
-	 */
-	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
-		return error;
-	}
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-	hdr = bp->b_addr;
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-	/*
-	 * Point to the data entry using the leaf entry.
-	 */
-	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-			xfs_dir2_dataptr_to_off(args->geo,
-						be32_to_cpu(blp[ent].address)));
-	/*
-	 * Mark the data entry's space free.
-	 */
-	needlog = needscan = 0;
-	xfs_dir2_data_make_free(args, bp,
-		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
-	/*
-	 * Fix up the block tail.
-	 */
-	be32_add_cpu(&btp->stale, 1);
-	xfs_dir2_block_log_tail(tp, bp);
-	/*
-	 * Remove the leaf entry by marking it stale.
-	 */
-	blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
-	/*
-	 * Fix up bestfree, log the header if necessary.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	if (needlog)
-		xfs_dir2_data_log_header(args, bp);
-	xfs_dir3_data_check(dp, bp);
-	/*
-	 * See if the size as a shortform is good enough.
-	 */
-	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
-	if (size > XFS_IFORK_DSIZE(dp))
-		return 0;
-
-	/*
-	 * If it works, do the conversion.
-	 */
-	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
-}
-
-/*
- * Replace an entry in a V2 block directory.
- * Change the inode number to the new value.
- */
-int						/* error */
-xfs_dir2_block_replace(
-	xfs_da_args_t		*args)		/* directory operation args */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
-	struct xfs_buf		*bp;		/* block buffer */
-	xfs_dir2_block_tail_t	*btp;		/* block tail */
-	xfs_dir2_data_entry_t	*dep;		/* block data entry */
-	xfs_inode_t		*dp;		/* incore inode */
-	int			ent;		/* leaf entry index */
-	int			error;		/* error return value */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-
-	trace_xfs_dir2_block_replace(args);
-
-	/*
-	 * Lookup the entry in the directory.  Get buffer and entry index.
-	 * This will always succeed since the caller has already done a lookup.
-	 */
-	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
-		return error;
-	}
-	dp = args->dp;
-	mp = dp->i_mount;
-	hdr = bp->b_addr;
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-	/*
-	 * Point to the data entry we need to change.
-	 */
-	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-			xfs_dir2_dataptr_to_off(args->geo,
-						be32_to_cpu(blp[ent].address)));
-	ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
-	/*
-	 * Change the inode number to the new value.
-	 */
-	dep->inumber = cpu_to_be64(args->inumber);
-	dp->d_ops->data_put_ftype(dep, args->filetype);
-	xfs_dir2_data_log_entry(args, bp, dep);
-	xfs_dir3_data_check(dp, bp);
-	return 0;
-}
-
-/*
- * Qsort comparison routine for the block leaf entries.
- */
-static int					/* sort order */
-xfs_dir2_block_sort(
-	const void			*a,	/* first leaf entry */
-	const void			*b)	/* second leaf entry */
-{
-	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
-	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */
-
-	la = a;
-	lb = b;
-	return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
-		(be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
-}
-
-/*
- * Convert a V2 leaf directory to a V2 block directory if possible.
- */
-int						/* error */
-xfs_dir2_leaf_to_block(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		*lbp,		/* leaf buffer */
-	struct xfs_buf		*dbp)		/* data buffer */
-{
-	__be16			*bestsp;	/* leaf bests table */
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_block_tail_t	*btp;		/* block tail */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
-	int			error;		/* error return value */
-	int			from;		/* leaf from index */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-	xfs_mount_t		*mp;		/* file system mount point */
-	int			needlog;	/* need to log data header */
-	int			needscan;	/* need to scan for bestfree */
-	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
-	int			size;		/* bytes used */
-	__be16			*tagp;		/* end of entry (tag) */
-	int			to;		/* block/leaf to index */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	trace_xfs_dir2_leaf_to_block(args);
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-	leaf = lbp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-
-	ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
-	       leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
-	/*
-	 * If there are data blocks other than the first one, take this
-	 * opportunity to remove trailing empty data blocks that may have
-	 * been left behind during no-space-reservation operations.
-	 * These will show up in the leaf bests table.
-	 */
-	while (dp->i_d.di_size > args->geo->blksize) {
-		int hdrsz;
-
-		hdrsz = dp->d_ops->data_entry_offset;
-		bestsp = xfs_dir2_leaf_bests_p(ltp);
-		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
-					    args->geo->blksize - hdrsz) {
-			if ((error =
-			    xfs_dir2_leaf_trim_data(args, lbp,
-				    (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
-				return error;
-		} else
-			return 0;
-	}
-	/*
-	 * Read the data block if we don't already have it, give up if it fails.
-	 */
-	if (!dbp) {
-		error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
-		if (error)
-			return error;
-	}
-	hdr = dbp->b_addr;
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
-
-	/*
-	 * Size of the "leaf" area in the block.
-	 */
-	size = (uint)sizeof(xfs_dir2_block_tail_t) +
-	       (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
-	/*
-	 * Look at the last data entry.
-	 */
-	tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
-	dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-	/*
-	 * If it's not free or is too short we can't do it.
-	 */
-	if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
-	    be16_to_cpu(dup->length) < size)
-		return 0;
-
-	/*
-	 * Start converting it to block form.
-	 */
-	xfs_dir3_block_init(mp, tp, dbp, dp);
-
-	needlog = 1;
-	needscan = 0;
-	/*
-	 * Use up the space at the end of the block (blp/btp).
-	 */
-	xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
-		&needlog, &needscan);
-	/*
-	 * Initialize the block tail.
-	 */
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
-	btp->stale = 0;
-	xfs_dir2_block_log_tail(tp, dbp);
-	/*
-	 * Initialize the block leaf area.  We compact out stale entries.
-	 */
-	lep = xfs_dir2_block_leaf_p(btp);
-	for (from = to = 0; from < leafhdr.count; from++) {
-		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-			continue;
-		lep[to++] = ents[from];
-	}
-	ASSERT(to == be32_to_cpu(btp->count));
-	xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
-	/*
-	 * Scan the bestfree if we need it and log the data block header.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	/*
-	 * Pitch the old leaf block.
-	 */
-	error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
-	if (error)
-		return error;
-
-	/*
-	 * Now see if the resulting block can be shrunken to shortform.
-	 */
-	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
-	if (size > XFS_IFORK_DSIZE(dp))
-		return 0;
-
-	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
-}
-
-/*
- * Convert the shortform directory to block form.
- */
-int						/* error */
-xfs_dir2_sf_to_block(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
-	struct xfs_buf		*bp;		/* block buffer */
-	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
-	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			dummy;		/* trash */
-	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
-	int			endoffset;	/* end of data objects */
-	int			error;		/* error return value */
-	int			i;		/* index */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log block header */
-	int			needscan;	/* need to scan block freespc */
-	int			newoffset;	/* offset from current entry */
-	int			offset;		/* target block offset */
-	xfs_dir2_sf_entry_t	*sfep;		/* sf entry pointer */
-	xfs_dir2_sf_hdr_t	*oldsfp;	/* old shortform header  */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform header  */
-	__be16			*tagp;		/* end of data entry */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_name		name;
-	struct xfs_ifork	*ifp;
-
-	trace_xfs_dir2_sf_to_block(args);
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-	ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
-	ASSERT(ifp->if_flags & XFS_IFINLINE);
-	/*
-	 * Bomb out if the shortform directory is way too short.
-	 */
-	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		return EIO;
-	}
-
-	oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
-
-	ASSERT(ifp->if_bytes == dp->i_d.di_size);
-	ASSERT(ifp->if_u1.if_data != NULL);
-	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
-	ASSERT(dp->i_d.di_nextents == 0);
-
-	/*
-	 * Copy the directory into a temporary buffer.
-	 * Then pitch the incore inode data so we can make extents.
-	 */
-	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
-	memcpy(sfp, oldsfp, ifp->if_bytes);
-
-	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
-	xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
-	dp->i_d.di_size = 0;
-
-	/*
-	 * Add block 0 to the inode.
-	 */
-	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
-	if (error) {
-		kmem_free(sfp);
-		return error;
-	}
-	/*
-	 * Initialize the data block, then convert it to block format.
-	 */
-	error = xfs_dir3_data_init(args, blkno, &bp);
-	if (error) {
-		kmem_free(sfp);
-		return error;
-	}
-	xfs_dir3_block_init(mp, tp, bp, dp);
-	hdr = bp->b_addr;
-
-	/*
-	 * Compute size of block "tail" area.
-	 */
-	i = (uint)sizeof(*btp) +
-	    (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
-	/*
-	 * The whole thing is initialized to free by the init routine.
-	 * Say we're using the leaf and tail area.
-	 */
-	dup = dp->d_ops->data_unused_p(hdr);
-	needlog = needscan = 0;
-	xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
-			       i, &needlog, &needscan);
-	ASSERT(needscan == 0);
-	/*
-	 * Fill in the tail.
-	 */
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	btp->count = cpu_to_be32(sfp->count + 2);	/* ., .. */
-	btp->stale = 0;
-	blp = xfs_dir2_block_leaf_p(btp);
-	endoffset = (uint)((char *)blp - (char *)hdr);
-	/*
-	 * Remove the freespace, we'll manage it.
-	 */
-	xfs_dir2_data_use_free(args, bp, dup,
-		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
-		be16_to_cpu(dup->length), &needlog, &needscan);
-	/*
-	 * Create entry for .
-	 */
-	dep = dp->d_ops->data_dot_entry_p(hdr);
-	dep->inumber = cpu_to_be64(dp->i_ino);
-	dep->namelen = 1;
-	dep->name[0] = '.';
-	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
-	tagp = dp->d_ops->data_entry_tag_p(dep);
-	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(args, bp, dep);
-	blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
-	blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-				(char *)dep - (char *)hdr));
-	/*
-	 * Create entry for ..
-	 */
-	dep = dp->d_ops->data_dotdot_entry_p(hdr);
-	dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
-	dep->namelen = 2;
-	dep->name[0] = dep->name[1] = '.';
-	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
-	tagp = dp->d_ops->data_entry_tag_p(dep);
-	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(args, bp, dep);
-	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
-	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-				(char *)dep - (char *)hdr));
-	offset = dp->d_ops->data_first_offset;
-	/*
-	 * Loop over existing entries, stuff them in.
-	 */
-	i = 0;
-	if (!sfp->count)
-		sfep = NULL;
-	else
-		sfep = xfs_dir2_sf_firstentry(sfp);
-	/*
-	 * Need to preserve the existing offset values in the sf directory.
-	 * Insert holes (unused entries) where necessary.
-	 */
-	while (offset < endoffset) {
-		/*
-		 * sfep is null when we reach the end of the list.
-		 */
-		if (sfep == NULL)
-			newoffset = endoffset;
-		else
-			newoffset = xfs_dir2_sf_get_offset(sfep);
-		/*
-		 * There should be a hole here, make one.
-		 */
-		if (offset < newoffset) {
-			dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
-			dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-			dup->length = cpu_to_be16(newoffset - offset);
-			*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
-				((char *)dup - (char *)hdr));
-			xfs_dir2_data_log_unused(args, bp, dup);
-			xfs_dir2_data_freeinsert(hdr,
-						 dp->d_ops->data_bestfree_p(hdr),
-						 dup, &dummy);
-			offset += be16_to_cpu(dup->length);
-			continue;
-		}
-		/*
-		 * Copy a real entry.
-		 */
-		dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
-		dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
-		dep->namelen = sfep->namelen;
-		dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
-		memcpy(dep->name, sfep->name, dep->namelen);
-		tagp = dp->d_ops->data_entry_tag_p(dep);
-		*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-		xfs_dir2_data_log_entry(args, bp, dep);
-		name.name = sfep->name;
-		name.len = sfep->namelen;
-		blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
-							hashname(&name));
-		blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
-						 (char *)dep - (char *)hdr));
-		offset = (int)((char *)(tagp + 1) - (char *)hdr);
-		if (++i == sfp->count)
-			sfep = NULL;
-		else
-			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-	}
-	/* Done with the temporary buffer */
-	kmem_free(sfp);
-	/*
-	 * Sort the leaf entries by hash value.
-	 */
-	xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
-	/*
-	 * Log the leaf entry area and tail.
-	 * Already logged the header in data_init, ignore needlog.
-	 */
-	ASSERT(needscan == 0);
-	xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
-	xfs_dir2_block_log_tail(tp, bp);
-	xfs_dir3_data_check(dp, bp);
-	return 0;
-}
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
deleted file mode 100644
index 8c2f6422648e..000000000000
--- a/fs/xfs/xfs_dir2_data.c
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-
-/*
- * Check the consistency of the data block.
- * The input can also be a block-format directory.
- * Return 0 is the buffer is good, otherwise an error.
- */
-int
-__xfs_dir3_data_check(
-	struct xfs_inode	*dp,		/* incore inode pointer */
-	struct xfs_buf		*bp)		/* data block's buffer */
-{
-	xfs_dir2_dataptr_t	addr;		/* addr for leaf lookup */
-	xfs_dir2_data_free_t	*bf;		/* bestfree table */
-	xfs_dir2_block_tail_t	*btp=NULL;	/* block tail */
-	int			count;		/* count of entries found */
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_data_entry_t	*dep;		/* data entry */
-	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
-	xfs_dir2_data_unused_t	*dup;		/* unused entry */
-	char			*endp;		/* end of useful data */
-	int			freeseen;	/* mask of bestfrees seen */
-	xfs_dahash_t		hash;		/* hash of current name */
-	int			i;		/* leaf index */
-	int			lastfree;	/* last entry was unused */
-	xfs_dir2_leaf_entry_t	*lep=NULL;	/* block leaf entries */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	char			*p;		/* current data position */
-	int			stale;		/* count of stale leaves */
-	struct xfs_name		name;
-	const struct xfs_dir_ops *ops;
-	struct xfs_da_geometry	*geo;
-
-	mp = bp->b_target->bt_mount;
-	geo = mp->m_dir_geo;
-
-	/*
-	 * We can be passed a null dp here from a verifier, so we need to go the
-	 * hard way to get them.
-	 */
-	ops = xfs_dir_get_ops(mp, dp);
-
-	hdr = bp->b_addr;
-	p = (char *)ops->data_entry_p(hdr);
-
-	switch (hdr->magic) {
-	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
-	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
-		btp = xfs_dir2_block_tail_p(geo, hdr);
-		lep = xfs_dir2_block_leaf_p(btp);
-		endp = (char *)lep;
-
-		/*
-		 * The number of leaf entries is limited by the size of the
-		 * block and the amount of space used by the data entries.
-		 * We don't know how much space is used by the data entries yet,
-		 * so just ensure that the count falls somewhere inside the
-		 * block right now.
-		 */
-		XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
-			((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
-		break;
-	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
-	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
-		endp = (char *)hdr + geo->blksize;
-		break;
-	default:
-		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
-	}
-
-	/*
-	 * Account for zero bestfree entries.
-	 */
-	bf = ops->data_bestfree_p(hdr);
-	count = lastfree = freeseen = 0;
-	if (!bf[0].length) {
-		XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
-		freeseen |= 1 << 0;
-	}
-	if (!bf[1].length) {
-		XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
-		freeseen |= 1 << 1;
-	}
-	if (!bf[2].length) {
-		XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
-		freeseen |= 1 << 2;
-	}
-
-	XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
-						be16_to_cpu(bf[1].length));
-	XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
-						be16_to_cpu(bf[2].length));
-	/*
-	 * Loop over the data/unused entries.
-	 */
-	while (p < endp) {
-		dup = (xfs_dir2_data_unused_t *)p;
-		/*
-		 * If it's unused, look for the space in the bestfree table.
-		 * If we find it, account for that, else make sure it
-		 * doesn't need to be there.
-		 */
-		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-			XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
-			XFS_WANT_CORRUPTED_RETURN(
-				be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
-					       (char *)dup - (char *)hdr);
-			dfp = xfs_dir2_data_freefind(hdr, bf, dup);
-			if (dfp) {
-				i = (int)(dfp - bf);
-				XFS_WANT_CORRUPTED_RETURN(
-					(freeseen & (1 << i)) == 0);
-				freeseen |= 1 << i;
-			} else {
-				XFS_WANT_CORRUPTED_RETURN(
-					be16_to_cpu(dup->length) <=
-						be16_to_cpu(bf[2].length));
-			}
-			p += be16_to_cpu(dup->length);
-			lastfree = 1;
-			continue;
-		}
-		/*
-		 * It's a real entry.  Validate the fields.
-		 * If this is a block directory then make sure it's
-		 * in the leaf section of the block.
-		 * The linear search is crude but this is DEBUG code.
-		 */
-		dep = (xfs_dir2_data_entry_t *)p;
-		XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
-		XFS_WANT_CORRUPTED_RETURN(
-			!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
-		XFS_WANT_CORRUPTED_RETURN(
-			be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
-					       (char *)dep - (char *)hdr);
-		XFS_WANT_CORRUPTED_RETURN(
-				ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
-		count++;
-		lastfree = 0;
-		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-			addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
-						(xfs_dir2_data_aoff_t)
-						((char *)dep - (char *)hdr));
-			name.name = dep->name;
-			name.len = dep->namelen;
-			hash = mp->m_dirnameops->hashname(&name);
-			for (i = 0; i < be32_to_cpu(btp->count); i++) {
-				if (be32_to_cpu(lep[i].address) == addr &&
-				    be32_to_cpu(lep[i].hashval) == hash)
-					break;
-			}
-			XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
-		}
-		p += ops->data_entsize(dep->namelen);
-	}
-	/*
-	 * Need to have seen all the entries and all the bestfree slots.
-	 */
-	XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-		for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
-			if (lep[i].address ==
-			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-				stale++;
-			if (i > 0)
-				XFS_WANT_CORRUPTED_RETURN(
-					be32_to_cpu(lep[i].hashval) >=
-						be32_to_cpu(lep[i - 1].hashval));
-		}
-		XFS_WANT_CORRUPTED_RETURN(count ==
-			be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
-		XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
-	}
-	return 0;
-}
-
-static bool
-xfs_dir3_data_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
-			return false;
-		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
-			return false;
-	} else {
-		if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
-			return false;
-	}
-	if (__xfs_dir3_data_check(NULL, bp))
-		return false;
-	return true;
-}
-
-/*
- * Readahead of the first block of the directory when it is opened is completely
- * oblivious to the format of the directory. Hence we can either get a block
- * format buffer or a data format buffer on readahead.
- */
-static void
-xfs_dir3_data_reada_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-
-	switch (hdr->magic) {
-	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
-	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
-		bp->b_ops = &xfs_dir3_block_buf_ops;
-		bp->b_ops->verify_read(bp);
-		return;
-	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
-	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
-		xfs_dir3_data_verify(bp);
-		return;
-	default:
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		break;
-	}
-}
-
-static void
-xfs_dir3_data_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
-		 xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_dir3_data_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-xfs_dir3_data_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
-
-	if (!xfs_dir3_data_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
-	.verify_read = xfs_dir3_data_read_verify,
-	.verify_write = xfs_dir3_data_write_verify,
-};
-
-static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
-	.verify_read = xfs_dir3_data_reada_verify,
-	.verify_write = xfs_dir3_data_write_verify,
-};
-
-
-int
-xfs_dir3_data_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mapped_bno,
-	struct xfs_buf		**bpp)
-{
-	int			err;
-
-	err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
-				XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
-	if (!err && tp)
-		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
-	return err;
-}
-
-int
-xfs_dir3_data_readahead(
-	struct xfs_inode	*dp,
-	xfs_dablk_t		bno,
-	xfs_daddr_t		mapped_bno)
-{
-	return xfs_da_reada_buf(dp, bno, mapped_bno,
-				XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
-}
-
-/*
- * Given a data block and an unused entry from that block,
- * return the bestfree entry if any that corresponds to it.
- */
-xfs_dir2_data_free_t *
-xfs_dir2_data_freefind(
-	struct xfs_dir2_data_hdr *hdr,		/* data block header */
-	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
-	struct xfs_dir2_data_unused *dup)	/* unused space */
-{
-	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
-	xfs_dir2_data_aoff_t	off;		/* offset value needed */
-#ifdef DEBUG
-	int			matched;	/* matched the value */
-	int			seenzero;	/* saw a 0 bestfree entry */
-#endif
-
-	off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
-
-#ifdef DEBUG
-	/*
-	 * Validate some consistency in the bestfree table.
-	 * Check order, non-overlapping entries, and if we find the
-	 * one we're looking for it has to be exact.
-	 */
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-	for (dfp = &bf[0], seenzero = matched = 0;
-	     dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
-	     dfp++) {
-		if (!dfp->offset) {
-			ASSERT(!dfp->length);
-			seenzero = 1;
-			continue;
-		}
-		ASSERT(seenzero == 0);
-		if (be16_to_cpu(dfp->offset) == off) {
-			matched = 1;
-			ASSERT(dfp->length == dup->length);
-		} else if (off < be16_to_cpu(dfp->offset))
-			ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
-		else
-			ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
-		ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
-		if (dfp > &bf[0])
-			ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
-	}
-#endif
-	/*
-	 * If this is smaller than the smallest bestfree entry,
-	 * it can't be there since they're sorted.
-	 */
-	if (be16_to_cpu(dup->length) <
-	    be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
-		return NULL;
-	/*
-	 * Look at the three bestfree entries for our guy.
-	 */
-	for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
-		if (!dfp->offset)
-			return NULL;
-		if (be16_to_cpu(dfp->offset) == off)
-			return dfp;
-	}
-	/*
-	 * Didn't find it.  This only happens if there are duplicate lengths.
-	 */
-	return NULL;
-}
-
-/*
- * Insert an unused-space entry into the bestfree table.
- */
-xfs_dir2_data_free_t *				/* entry inserted */
-xfs_dir2_data_freeinsert(
-	struct xfs_dir2_data_hdr *hdr,		/* data block pointer */
-	struct xfs_dir2_data_free *dfp,		/* bestfree table pointer */
-	struct xfs_dir2_data_unused *dup,	/* unused space */
-	int			*loghead)	/* log the data header (out) */
-{
-	xfs_dir2_data_free_t	new;		/* new bestfree entry */
-
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-	new.length = dup->length;
-	new.offset = cpu_to_be16((char *)dup - (char *)hdr);
-
-	/*
-	 * Insert at position 0, 1, or 2; or not at all.
-	 */
-	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
-		dfp[2] = dfp[1];
-		dfp[1] = dfp[0];
-		dfp[0] = new;
-		*loghead = 1;
-		return &dfp[0];
-	}
-	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
-		dfp[2] = dfp[1];
-		dfp[1] = new;
-		*loghead = 1;
-		return &dfp[1];
-	}
-	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
-		dfp[2] = new;
-		*loghead = 1;
-		return &dfp[2];
-	}
-	return NULL;
-}
-
-/*
- * Remove a bestfree entry from the table.
- */
-STATIC void
-xfs_dir2_data_freeremove(
-	struct xfs_dir2_data_hdr *hdr,		/* data block header */
-	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
-	struct xfs_dir2_data_free *dfp,		/* bestfree entry pointer */
-	int			*loghead)	/* out: log data header */
-{
-
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-	/*
-	 * It's the first entry, slide the next 2 up.
-	 */
-	if (dfp == &bf[0]) {
-		bf[0] = bf[1];
-		bf[1] = bf[2];
-	}
-	/*
-	 * It's the second entry, slide the 3rd entry up.
-	 */
-	else if (dfp == &bf[1])
-		bf[1] = bf[2];
-	/*
-	 * Must be the last entry.
-	 */
-	else
-		ASSERT(dfp == &bf[2]);
-	/*
-	 * Clear the 3rd entry, must be zero now.
-	 */
-	bf[2].length = 0;
-	bf[2].offset = 0;
-	*loghead = 1;
-}
-
-/*
- * Given a data block, reconstruct its bestfree map.
- */
-void
-xfs_dir2_data_freescan(
-	struct xfs_inode	*dp,
-	struct xfs_dir2_data_hdr *hdr,
-	int			*loghead)
-{
-	xfs_dir2_block_tail_t	*btp;		/* block tail */
-	xfs_dir2_data_entry_t	*dep;		/* active data entry */
-	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
-	struct xfs_dir2_data_free *bf;
-	char			*endp;		/* end of block's data */
-	char			*p;		/* current entry pointer */
-	struct xfs_da_geometry	*geo = dp->i_mount->m_dir_geo;
-
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-	/*
-	 * Start by clearing the table.
-	 */
-	bf = dp->d_ops->data_bestfree_p(hdr);
-	memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
-	*loghead = 1;
-	/*
-	 * Set up pointers.
-	 */
-	p = (char *)dp->d_ops->data_entry_p(hdr);
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-		btp = xfs_dir2_block_tail_p(geo, hdr);
-		endp = (char *)xfs_dir2_block_leaf_p(btp);
-	} else
-		endp = (char *)hdr + geo->blksize;
-	/*
-	 * Loop over the block's entries.
-	 */
-	while (p < endp) {
-		dup = (xfs_dir2_data_unused_t *)p;
-		/*
-		 * If it's a free entry, insert it.
-		 */
-		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-			ASSERT((char *)dup - (char *)hdr ==
-			       be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
-			xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
-			p += be16_to_cpu(dup->length);
-		}
-		/*
-		 * For active entries, check their tags and skip them.
-		 */
-		else {
-			dep = (xfs_dir2_data_entry_t *)p;
-			ASSERT((char *)dep - (char *)hdr ==
-			       be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
-			p += dp->d_ops->data_entsize(dep->namelen);
-		}
-	}
-}
-
-/*
- * Initialize a data block at the given block number in the directory.
- * Give back the buffer for the created block.
- */
-int						/* error */
-xfs_dir3_data_init(
-	xfs_da_args_t		*args,		/* directory operation args */
-	xfs_dir2_db_t		blkno,		/* logical dir block number */
-	struct xfs_buf		**bpp)		/* output block buffer */
-{
-	struct xfs_buf		*bp;		/* block buffer */
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
-	struct xfs_dir2_data_free *bf;
-	int			error;		/* error return value */
-	int			i;		/* bestfree index */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	int                     t;              /* temp */
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	/*
-	 * Get the buffer set up for the block.
-	 */
-	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
-			       -1, &bp, XFS_DATA_FORK);
-	if (error)
-		return error;
-	bp->b_ops = &xfs_dir3_data_buf_ops;
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
-
-	/*
-	 * Initialize the header.
-	 */
-	hdr = bp->b_addr;
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-		memset(hdr3, 0, sizeof(*hdr3));
-		hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
-		hdr3->blkno = cpu_to_be64(bp->b_bn);
-		hdr3->owner = cpu_to_be64(dp->i_ino);
-		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
-
-	} else
-		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
-
-	bf = dp->d_ops->data_bestfree_p(hdr);
-	bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
-	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
-		bf[i].length = 0;
-		bf[i].offset = 0;
-	}
-
-	/*
-	 * Set up an unused entry for the block's body.
-	 */
-	dup = dp->d_ops->data_unused_p(hdr);
-	dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-
-	t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
-	bf[0].length = cpu_to_be16(t);
-	dup->length = cpu_to_be16(t);
-	*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
-	/*
-	 * Log it and return it.
-	 */
-	xfs_dir2_data_log_header(args, bp);
-	xfs_dir2_data_log_unused(args, bp, dup);
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Log an active data entry from the block.
- */
-void
-xfs_dir2_data_log_entry(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp,
-	xfs_dir2_data_entry_t	*dep)		/* data entry pointer */
-{
-	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
-		(uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
-		       (char *)hdr - 1));
-}
-
-/*
- * Log a data block header.
- */
-void
-xfs_dir2_data_log_header(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp)
-{
-#ifdef DEBUG
-	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-#endif
-
-	xfs_trans_log_buf(args->trans, bp, 0,
-			  args->dp->d_ops->data_entry_offset - 1);
-}
-
-/*
- * Log a data unused entry.
- */
-void
-xfs_dir2_data_log_unused(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp,
-	xfs_dir2_data_unused_t	*dup)		/* data unused pointer */
-{
-	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
-
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-
-	/*
-	 * Log the first part of the unused entry.
-	 */
-	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
-		(uint)((char *)&dup->length + sizeof(dup->length) -
-		       1 - (char *)hdr));
-	/*
-	 * Log the end (tag) of the unused entry.
-	 */
-	xfs_trans_log_buf(args->trans, bp,
-		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
-		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
-		       sizeof(xfs_dir2_data_off_t) - 1));
-}
-
-/*
- * Make a byte range in the data block unused.
- * Its current contents are unimportant.
- */
-void
-xfs_dir2_data_make_free(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp,
-	xfs_dir2_data_aoff_t	offset,		/* starting byte offset */
-	xfs_dir2_data_aoff_t	len,		/* length in bytes */
-	int			*needlogp,	/* out: log header */
-	int			*needscanp)	/* out: regen bestfree */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* data block pointer */
-	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
-	char			*endptr;	/* end of data area */
-	int			needscan;	/* need to regen bestfree */
-	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
-	xfs_dir2_data_unused_t	*postdup;	/* unused entry after us */
-	xfs_dir2_data_unused_t	*prevdup;	/* unused entry before us */
-	struct xfs_dir2_data_free *bf;
-
-	hdr = bp->b_addr;
-
-	/*
-	 * Figure out where the end of the data area is.
-	 */
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	    hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
-		endptr = (char *)hdr + args->geo->blksize;
-	else {
-		xfs_dir2_block_tail_t	*btp;	/* block tail */
-
-		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-			hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-		btp = xfs_dir2_block_tail_p(args->geo, hdr);
-		endptr = (char *)xfs_dir2_block_leaf_p(btp);
-	}
-	/*
-	 * If this isn't the start of the block, then back up to
-	 * the previous entry and see if it's free.
-	 */
-	if (offset > args->dp->d_ops->data_entry_offset) {
-		__be16			*tagp;	/* tag just before us */
-
-		tagp = (__be16 *)((char *)hdr + offset) - 1;
-		prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-		if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
-			prevdup = NULL;
-	} else
-		prevdup = NULL;
-	/*
-	 * If this isn't the end of the block, see if the entry after
-	 * us is free.
-	 */
-	if ((char *)hdr + offset + len < endptr) {
-		postdup =
-			(xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
-		if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
-			postdup = NULL;
-	} else
-		postdup = NULL;
-	ASSERT(*needscanp == 0);
-	needscan = 0;
-	/*
-	 * Previous and following entries are both free,
-	 * merge everything into a single free entry.
-	 */
-	bf = args->dp->d_ops->data_bestfree_p(hdr);
-	if (prevdup && postdup) {
-		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
-
-		/*
-		 * See if prevdup and/or postdup are in bestfree table.
-		 */
-		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
-		dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
-		/*
-		 * We need a rescan unless there are exactly 2 free entries
-		 * namely our two.  Then we know what's happening, otherwise
-		 * since the third bestfree is there, there might be more
-		 * entries.
-		 */
-		needscan = (bf[2].length != 0);
-		/*
-		 * Fix up the new big freespace.
-		 */
-		be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
-		*xfs_dir2_data_unused_tag_p(prevdup) =
-			cpu_to_be16((char *)prevdup - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, prevdup);
-		if (!needscan) {
-			/*
-			 * Has to be the case that entries 0 and 1 are
-			 * dfp and dfp2 (don't know which is which), and
-			 * entry 2 is empty.
-			 * Remove entry 1 first then entry 0.
-			 */
-			ASSERT(dfp && dfp2);
-			if (dfp == &bf[1]) {
-				dfp = &bf[0];
-				ASSERT(dfp2 == dfp);
-				dfp2 = &bf[1];
-			}
-			xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
-			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-			/*
-			 * Now insert the new entry.
-			 */
-			dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
-						       needlogp);
-			ASSERT(dfp == &bf[0]);
-			ASSERT(dfp->length == prevdup->length);
-			ASSERT(!dfp[1].length);
-			ASSERT(!dfp[2].length);
-		}
-	}
-	/*
-	 * The entry before us is free, merge with it.
-	 */
-	else if (prevdup) {
-		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
-		be16_add_cpu(&prevdup->length, len);
-		*xfs_dir2_data_unused_tag_p(prevdup) =
-			cpu_to_be16((char *)prevdup - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, prevdup);
-		/*
-		 * If the previous entry was in the table, the new entry
-		 * is longer, so it will be in the table too.  Remove
-		 * the old one and add the new one.
-		 */
-		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-			xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
-		}
-		/*
-		 * Otherwise we need a scan if the new entry is big enough.
-		 */
-		else {
-			needscan = be16_to_cpu(prevdup->length) >
-				   be16_to_cpu(bf[2].length);
-		}
-	}
-	/*
-	 * The following entry is free, merge with it.
-	 */
-	else if (postdup) {
-		dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
-		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
-		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-		newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
-		*xfs_dir2_data_unused_tag_p(newdup) =
-			cpu_to_be16((char *)newdup - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, newdup);
-		/*
-		 * If the following entry was in the table, the new entry
-		 * is longer, so it will be in the table too.  Remove
-		 * the old one and add the new one.
-		 */
-		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-			xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
-		}
-		/*
-		 * Otherwise we need a scan if the new entry is big enough.
-		 */
-		else {
-			needscan = be16_to_cpu(newdup->length) >
-				   be16_to_cpu(bf[2].length);
-		}
-	}
-	/*
-	 * Neither neighbor is free.  Make a new entry.
-	 */
-	else {
-		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
-		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-		newdup->length = cpu_to_be16(len);
-		*xfs_dir2_data_unused_tag_p(newdup) =
-			cpu_to_be16((char *)newdup - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, newdup);
-		xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
-	}
-	*needscanp = needscan;
-}
-
-/*
- * Take a byte range out of an existing unused space and make it un-free.
- */
-void
-xfs_dir2_data_use_free(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp,
-	xfs_dir2_data_unused_t	*dup,		/* unused entry */
-	xfs_dir2_data_aoff_t	offset,		/* starting offset to use */
-	xfs_dir2_data_aoff_t	len,		/* length to use */
-	int			*needlogp,	/* out: need to log header */
-	int			*needscanp)	/* out: need regen bestfree */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
-	int			matchback;	/* matches end of freespace */
-	int			matchfront;	/* matches start of freespace */
-	int			needscan;	/* need to regen bestfree */
-	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
-	xfs_dir2_data_unused_t	*newdup2;	/* another new unused entry */
-	int			oldlen;		/* old unused entry's length */
-	struct xfs_dir2_data_free *bf;
-
-	hdr = bp->b_addr;
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-	ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
-	ASSERT(offset >= (char *)dup - (char *)hdr);
-	ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
-	ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
-	/*
-	 * Look up the entry in the bestfree table.
-	 */
-	oldlen = be16_to_cpu(dup->length);
-	bf = args->dp->d_ops->data_bestfree_p(hdr);
-	dfp = xfs_dir2_data_freefind(hdr, bf, dup);
-	ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
-	/*
-	 * Check for alignment with front and back of the entry.
-	 */
-	matchfront = (char *)dup - (char *)hdr == offset;
-	matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
-	ASSERT(*needscanp == 0);
-	needscan = 0;
-	/*
-	 * If we matched it exactly we just need to get rid of it from
-	 * the bestfree table.
-	 */
-	if (matchfront && matchback) {
-		if (dfp) {
-			needscan = (bf[2].offset != 0);
-			if (!needscan)
-				xfs_dir2_data_freeremove(hdr, bf, dfp,
-							 needlogp);
-		}
-	}
-	/*
-	 * We match the first part of the entry.
-	 * Make a new entry with the remaining freespace.
-	 */
-	else if (matchfront) {
-		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
-		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-		newdup->length = cpu_to_be16(oldlen - len);
-		*xfs_dir2_data_unused_tag_p(newdup) =
-			cpu_to_be16((char *)newdup - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, newdup);
-		/*
-		 * If it was in the table, remove it and add the new one.
-		 */
-		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
-						       needlogp);
-			ASSERT(dfp != NULL);
-			ASSERT(dfp->length == newdup->length);
-			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
-			/*
-			 * If we got inserted at the last slot,
-			 * that means we don't know if there was a better
-			 * choice for the last slot, or not.  Rescan.
-			 */
-			needscan = dfp == &bf[2];
-		}
-	}
-	/*
-	 * We match the last part of the entry.
-	 * Trim the allocated space off the tail of the entry.
-	 */
-	else if (matchback) {
-		newdup = dup;
-		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
-		*xfs_dir2_data_unused_tag_p(newdup) =
-			cpu_to_be16((char *)newdup - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, newdup);
-		/*
-		 * If it was in the table, remove it and add the new one.
-		 */
-		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
-			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
-						       needlogp);
-			ASSERT(dfp != NULL);
-			ASSERT(dfp->length == newdup->length);
-			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
-			/*
-			 * If we got inserted at the last slot,
-			 * that means we don't know if there was a better
-			 * choice for the last slot, or not.  Rescan.
-			 */
-			needscan = dfp == &bf[2];
-		}
-	}
-	/*
-	 * Poking out the middle of an entry.
-	 * Make two new entries.
-	 */
-	else {
-		newdup = dup;
-		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
-		*xfs_dir2_data_unused_tag_p(newdup) =
-			cpu_to_be16((char *)newdup - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, newdup);
-		newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
-		newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-		newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
-		*xfs_dir2_data_unused_tag_p(newdup2) =
-			cpu_to_be16((char *)newdup2 - (char *)hdr);
-		xfs_dir2_data_log_unused(args, bp, newdup2);
-		/*
-		 * If the old entry was in the table, we need to scan
-		 * if the 3rd entry was valid, since these entries
-		 * are smaller than the old one.
-		 * If we don't need to scan that means there were 1 or 2
-		 * entries in the table, and removing the old and adding
-		 * the 2 new will work.
-		 */
-		if (dfp) {
-			needscan = (bf[2].length != 0);
-			if (!needscan) {
-				xfs_dir2_data_freeremove(hdr, bf, dfp,
-							 needlogp);
-				xfs_dir2_data_freeinsert(hdr, bf, newdup,
-							 needlogp);
-				xfs_dir2_data_freeinsert(hdr, bf, newdup2,
-							 needlogp);
-			}
-		}
-	}
-	*needscanp = needscan;
-}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
deleted file mode 100644
index 78b411bfc543..000000000000
--- a/fs/xfs/xfs_dir2_leaf.c
+++ /dev/null
@@ -1,1831 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-
-/*
- * Local function declarations.
- */
-static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
-				    int *indexp, struct xfs_buf **dbpp);
-static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
-				    struct xfs_buf *bp, int first, int last);
-static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
-				   struct xfs_buf *bp);
-
-/*
- * Check the internal consistency of a leaf1 block.
- * Pop an assert if something is wrong.
- */
-#ifdef DEBUG
-#define	xfs_dir3_leaf_check(dp, bp) \
-do { \
-	if (!xfs_dir3_leaf1_check((dp), (bp))) \
-		ASSERT(0); \
-} while (0);
-
-STATIC bool
-xfs_dir3_leaf1_check(
-	struct xfs_inode	*dp,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-	if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
-		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-			return false;
-	} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
-		return false;
-
-	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
-}
-#else
-#define	xfs_dir3_leaf_check(dp, bp)
-#endif
-
-bool
-xfs_dir3_leaf_check_int(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*dp,
-	struct xfs_dir3_icleaf_hdr *hdr,
-	struct xfs_dir2_leaf	*leaf)
-{
-	struct xfs_dir2_leaf_entry *ents;
-	xfs_dir2_leaf_tail_t	*ltp;
-	int			stale;
-	int			i;
-	const struct xfs_dir_ops *ops;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-	struct xfs_da_geometry	*geo = mp->m_dir_geo;
-
-	/*
-	 * we can be passed a null dp here from a verifier, so we need to go the
-	 * hard way to get them.
-	 */
-	ops = xfs_dir_get_ops(mp, dp);
-
-	if (!hdr) {
-		ops->leaf_hdr_from_disk(&leafhdr, leaf);
-		hdr = &leafhdr;
-	}
-
-	ents = ops->leaf_ents_p(leaf);
-	ltp = xfs_dir2_leaf_tail_p(geo, leaf);
-
-	/*
-	 * XXX (dgc): This value is not restrictive enough.
-	 * Should factor in the size of the bests table as well.
-	 * We can deduce a value for that from di_size.
-	 */
-	if (hdr->count > ops->leaf_max_ents(geo))
-		return false;
-
-	/* Leaves and bests don't overlap in leaf format. */
-	if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
-	     hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
-	    (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
-		return false;
-
-	/* Check hash value order, count stale entries.  */
-	for (i = stale = 0; i < hdr->count; i++) {
-		if (i + 1 < hdr->count) {
-			if (be32_to_cpu(ents[i].hashval) >
-					be32_to_cpu(ents[i + 1].hashval))
-				return false;
-		}
-		if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-			stale++;
-	}
-	if (hdr->stale != stale)
-		return false;
-	return true;
-}
-
-/*
- * We verify the magic numbers before decoding the leaf header so that on debug
- * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
- * to incorrect magic numbers.
- */
-static bool
-xfs_dir3_leaf_verify(
-	struct xfs_buf		*bp,
-	__uint16_t		magic)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-
-	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-		__uint16_t		magic3;
-
-		magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
-							 : XFS_DIR3_LEAFN_MAGIC;
-
-		if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
-			return false;
-		if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-			return false;
-	} else {
-		if (leaf->hdr.info.magic != cpu_to_be16(magic))
-			return false;
-	}
-
-	return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
-}
-
-static void
-__read_verify(
-	struct xfs_buf  *bp,
-	__uint16_t	magic)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	     !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_dir3_leaf_verify(bp, magic))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-__write_verify(
-	struct xfs_buf  *bp,
-	__uint16_t	magic)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-	struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
-
-	if (!xfs_dir3_leaf_verify(bp, magic)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
-	struct xfs_buf	*bp)
-{
-	__read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
-	struct xfs_buf	*bp)
-{
-	__write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
-	struct xfs_buf	*bp)
-{
-	__read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
-	struct xfs_buf	*bp)
-{
-	__write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
-	.verify_read = xfs_dir3_leaf1_read_verify,
-	.verify_write = xfs_dir3_leaf1_write_verify,
-};
-
-const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
-	.verify_read = xfs_dir3_leafn_read_verify,
-	.verify_write = xfs_dir3_leafn_write_verify,
-};
-
-static int
-xfs_dir3_leaf_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		fbno,
-	xfs_daddr_t		mappedbno,
-	struct xfs_buf		**bpp)
-{
-	int			err;
-
-	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
-	if (!err && tp)
-		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
-	return err;
-}
-
-int
-xfs_dir3_leafn_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		fbno,
-	xfs_daddr_t		mappedbno,
-	struct xfs_buf		**bpp)
-{
-	int			err;
-
-	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
-	if (!err && tp)
-		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
-	return err;
-}
-
-/*
- * Initialize a new leaf block, leaf1 or leafn magic accepted.
- */
-static void
-xfs_dir3_leaf_init(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_buf		*bp,
-	xfs_ino_t		owner,
-	__uint16_t		type)
-{
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-
-	ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-
-		memset(leaf3, 0, sizeof(*leaf3));
-
-		leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
-					 ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
-					 : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
-		leaf3->info.blkno = cpu_to_be64(bp->b_bn);
-		leaf3->info.owner = cpu_to_be64(owner);
-		uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
-	} else {
-		memset(leaf, 0, sizeof(*leaf));
-		leaf->hdr.info.magic = cpu_to_be16(type);
-	}
-
-	/*
-	 * If it's a leaf-format directory initialize the tail.
-	 * Caller is responsible for initialising the bests table.
-	 */
-	if (type == XFS_DIR2_LEAF1_MAGIC) {
-		struct xfs_dir2_leaf_tail *ltp;
-
-		ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
-		ltp->bestcount = 0;
-		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
-		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
-	} else {
-		bp->b_ops = &xfs_dir3_leafn_buf_ops;
-		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
-	}
-}
-
-int
-xfs_dir3_leaf_get_buf(
-	xfs_da_args_t		*args,
-	xfs_dir2_db_t		bno,
-	struct xfs_buf		**bpp,
-	__uint16_t		magic)
-{
-	struct xfs_inode	*dp = args->dp;
-	struct xfs_trans	*tp = args->trans;
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_buf		*bp;
-	int			error;
-
-	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-	ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
-	       bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
-
-	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
-			       -1, &bp, XFS_DATA_FORK);
-	if (error)
-		return error;
-
-	xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
-	xfs_dir3_leaf_log_header(args, bp);
-	if (magic == XFS_DIR2_LEAF1_MAGIC)
-		xfs_dir3_leaf_log_tail(args, bp);
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Convert a block form directory to a leaf form directory.
- */
-int						/* error */
-xfs_dir2_block_to_leaf(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		*dbp)		/* input block's buffer */
-{
-	__be16			*bestsp;	/* leaf's bestsp entries */
-	xfs_dablk_t		blkno;		/* leaf block's bno */
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_leaf_entry_t	*blp;		/* block's leaf entries */
-	xfs_dir2_block_tail_t	*btp;		/* block's tail */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	struct xfs_buf		*lbp;		/* leaf block's buffer */
-	xfs_dir2_db_t		ldb;		/* leaf block's bno */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf's tail */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log block header */
-	int			needscan;	/* need to rescan bestfree */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir2_data_free *bf;
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	trace_xfs_dir2_block_to_leaf(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	/*
-	 * Add the leaf block to the inode.
-	 * This interface will only put blocks in the leaf/node range.
-	 * Since that's empty now, we'll get the root (block 0 in range).
-	 */
-	if ((error = xfs_da_grow_inode(args, &blkno))) {
-		return error;
-	}
-	ldb = xfs_dir2_da_to_db(args->geo, blkno);
-	ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
-	/*
-	 * Initialize the leaf block, get a buffer for it.
-	 */
-	error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
-	if (error)
-		return error;
-
-	leaf = lbp->b_addr;
-	hdr = dbp->b_addr;
-	xfs_dir3_data_check(dp, dbp);
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-	bf = dp->d_ops->data_bestfree_p(hdr);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-
-	/*
-	 * Set the counts in the leaf header.
-	 */
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	leafhdr.count = be32_to_cpu(btp->count);
-	leafhdr.stale = be32_to_cpu(btp->stale);
-	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(args, lbp);
-
-	/*
-	 * Could compact these but I think we always do the conversion
-	 * after squeezing out stale entries.
-	 */
-	memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
-	xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
-	needscan = 0;
-	needlog = 1;
-	/*
-	 * Make the space formerly occupied by the leaf entries and block
-	 * tail be free.
-	 */
-	xfs_dir2_data_make_free(args, dbp,
-		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
-		(xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
-				       (char *)blp),
-		&needlog, &needscan);
-	/*
-	 * Fix up the block header, make it a data block.
-	 */
-	dbp->b_ops = &xfs_dir3_data_buf_ops;
-	xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
-		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
-	else
-		hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
-
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	/*
-	 * Set up leaf tail and bests table.
-	 */
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-	ltp->bestcount = cpu_to_be32(1);
-	bestsp = xfs_dir2_leaf_bests_p(ltp);
-	bestsp[0] =  bf[0].length;
-	/*
-	 * Log the data header and leaf bests table.
-	 */
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	xfs_dir3_leaf_check(dp, lbp);
-	xfs_dir3_data_check(dp, dbp);
-	xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
-	return 0;
-}
-
-STATIC void
-xfs_dir3_leaf_find_stale(
-	struct xfs_dir3_icleaf_hdr *leafhdr,
-	struct xfs_dir2_leaf_entry *ents,
-	int			index,
-	int			*lowstale,
-	int			*highstale)
-{
-	/*
-	 * Find the first stale entry before our index, if any.
-	 */
-	for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
-		if (ents[*lowstale].address ==
-		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-			break;
-	}
-
-	/*
-	 * Find the first stale entry at or after our index, if any.
-	 * Stop if the result would require moving more entries than using
-	 * lowstale.
-	 */
-	for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
-		if (ents[*highstale].address ==
-		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-			break;
-		if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
-			break;
-	}
-}
-
-struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_find_entry(
-	struct xfs_dir3_icleaf_hdr *leafhdr,
-	struct xfs_dir2_leaf_entry *ents,
-	int			index,		/* leaf table position */
-	int			compact,	/* need to compact leaves */
-	int			lowstale,	/* index of prev stale leaf */
-	int			highstale,	/* index of next stale leaf */
-	int			*lfloglow,	/* low leaf logging index */
-	int			*lfloghigh)	/* high leaf logging index */
-{
-	if (!leafhdr->stale) {
-		xfs_dir2_leaf_entry_t	*lep;	/* leaf entry table pointer */
-
-		/*
-		 * Now we need to make room to insert the leaf entry.
-		 *
-		 * If there are no stale entries, just insert a hole at index.
-		 */
-		lep = &ents[index];
-		if (index < leafhdr->count)
-			memmove(lep + 1, lep,
-				(leafhdr->count - index) * sizeof(*lep));
-
-		/*
-		 * Record low and high logging indices for the leaf.
-		 */
-		*lfloglow = index;
-		*lfloghigh = leafhdr->count++;
-		return lep;
-	}
-
-	/*
-	 * There are stale entries.
-	 *
-	 * We will use one of them for the new entry.  It's probably not at
-	 * the right location, so we'll have to shift some up or down first.
-	 *
-	 * If we didn't compact before, we need to find the nearest stale
-	 * entries before and after our insertion point.
-	 */
-	if (compact == 0)
-		xfs_dir3_leaf_find_stale(leafhdr, ents, index,
-					 &lowstale, &highstale);
-
-	/*
-	 * If the low one is better, use it.
-	 */
-	if (lowstale >= 0 &&
-	    (highstale == leafhdr->count ||
-	     index - lowstale - 1 < highstale - index)) {
-		ASSERT(index - lowstale - 1 >= 0);
-		ASSERT(ents[lowstale].address ==
-		       cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
-
-		/*
-		 * Copy entries up to cover the stale entry and make room
-		 * for the new entry.
-		 */
-		if (index - lowstale - 1 > 0) {
-			memmove(&ents[lowstale], &ents[lowstale + 1],
-				(index - lowstale - 1) *
-					sizeof(xfs_dir2_leaf_entry_t));
-		}
-		*lfloglow = MIN(lowstale, *lfloglow);
-		*lfloghigh = MAX(index - 1, *lfloghigh);
-		leafhdr->stale--;
-		return &ents[index - 1];
-	}
-
-	/*
-	 * The high one is better, so use that one.
-	 */
-	ASSERT(highstale - index >= 0);
-	ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
-
-	/*
-	 * Copy entries down to cover the stale entry and make room for the
-	 * new entry.
-	 */
-	if (highstale - index > 0) {
-		memmove(&ents[index + 1], &ents[index],
-			(highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
-	}
-	*lfloglow = MIN(index, *lfloglow);
-	*lfloghigh = MAX(highstale, *lfloghigh);
-	leafhdr->stale--;
-	return &ents[index];
-}
-
-/*
- * Add an entry to a leaf form directory.
- */
-int						/* error */
-xfs_dir2_leaf_addname(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	__be16			*bestsp;	/* freespace table in leaf */
-	int			compact;	/* need to compact leaves */
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data block entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* data unused entry */
-	int			error;		/* error return value */
-	int			grown;		/* allocated new data block */
-	int			highstale;	/* index of next stale leaf */
-	int			i;		/* temporary, index */
-	int			index;		/* leaf table position */
-	struct xfs_buf		*lbp;		/* leaf's buffer */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	int			length;		/* length of new entry */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry table pointer */
-	int			lfloglow;	/* low leaf logging index */
-	int			lfloghigh;	/* high leaf logging index */
-	int			lowstale;	/* index of prev stale leaf */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needbytes;	/* leaf block bytes needed */
-	int			needlog;	/* need to log data header */
-	int			needscan;	/* need to rescan data free */
-	__be16			*tagp;		/* end of data entry */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	xfs_dir2_db_t		use_block;	/* data block number */
-	struct xfs_dir2_data_free *bf;		/* bestfree table */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	trace_xfs_dir2_leaf_addname(args);
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-
-	error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
-	if (error)
-		return error;
-
-	/*
-	 * Look up the entry by hash value and name.
-	 * We know it's not there, our caller has already done a lookup.
-	 * So the index is of the entry to insert in front of.
-	 * But if there are dup hash values the index is of the first of those.
-	 */
-	index = xfs_dir2_leaf_search_hash(args, lbp);
-	leaf = lbp->b_addr;
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	bestsp = xfs_dir2_leaf_bests_p(ltp);
-	length = dp->d_ops->data_entsize(args->namelen);
-
-	/*
-	 * See if there are any entries with the same hash value
-	 * and space in their block for the new entry.
-	 * This is good because it puts multiple same-hash value entries
-	 * in a data block, improving the lookup of those entries.
-	 */
-	for (use_block = -1, lep = &ents[index];
-	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-	     index++, lep++) {
-		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-			continue;
-		i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
-		ASSERT(i < be32_to_cpu(ltp->bestcount));
-		ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
-		if (be16_to_cpu(bestsp[i]) >= length) {
-			use_block = i;
-			break;
-		}
-	}
-	/*
-	 * Didn't find a block yet, linear search all the data blocks.
-	 */
-	if (use_block == -1) {
-		for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
-			/*
-			 * Remember a block we see that's missing.
-			 */
-			if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
-			    use_block == -1)
-				use_block = i;
-			else if (be16_to_cpu(bestsp[i]) >= length) {
-				use_block = i;
-				break;
-			}
-		}
-	}
-	/*
-	 * How many bytes do we need in the leaf block?
-	 */
-	needbytes = 0;
-	if (!leafhdr.stale)
-		needbytes += sizeof(xfs_dir2_leaf_entry_t);
-	if (use_block == -1)
-		needbytes += sizeof(xfs_dir2_data_off_t);
-
-	/*
-	 * Now kill use_block if it refers to a missing block, so we
-	 * can use it as an indication of allocation needed.
-	 */
-	if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
-		use_block = -1;
-	/*
-	 * If we don't have enough free bytes but we can make enough
-	 * by compacting out stale entries, we'll do that.
-	 */
-	if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
-	    leafhdr.stale > 1)
-		compact = 1;
-
-	/*
-	 * Otherwise if we don't have enough free bytes we need to
-	 * convert to node form.
-	 */
-	else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
-		/*
-		 * Just checking or no space reservation, give up.
-		 */
-		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
-							args->total == 0) {
-			xfs_trans_brelse(tp, lbp);
-			return ENOSPC;
-		}
-		/*
-		 * Convert to node form.
-		 */
-		error = xfs_dir2_leaf_to_node(args, lbp);
-		if (error)
-			return error;
-		/*
-		 * Then add the new entry.
-		 */
-		return xfs_dir2_node_addname(args);
-	}
-	/*
-	 * Otherwise it will fit without compaction.
-	 */
-	else
-		compact = 0;
-	/*
-	 * If just checking, then it will fit unless we needed to allocate
-	 * a new data block.
-	 */
-	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
-		xfs_trans_brelse(tp, lbp);
-		return use_block == -1 ? ENOSPC : 0;
-	}
-	/*
-	 * If no allocations are allowed, return now before we've
-	 * changed anything.
-	 */
-	if (args->total == 0 && use_block == -1) {
-		xfs_trans_brelse(tp, lbp);
-		return ENOSPC;
-	}
-	/*
-	 * Need to compact the leaf entries, removing stale ones.
-	 * Leave one stale entry behind - the one closest to our
-	 * insertion index - and we'll shift that one to our insertion
-	 * point later.
-	 */
-	if (compact) {
-		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
-			&highstale, &lfloglow, &lfloghigh);
-	}
-	/*
-	 * There are stale entries, so we'll need log-low and log-high
-	 * impossibly bad values later.
-	 */
-	else if (leafhdr.stale) {
-		lfloglow = leafhdr.count;
-		lfloghigh = -1;
-	}
-	/*
-	 * If there was no data block space found, we need to allocate
-	 * a new one.
-	 */
-	if (use_block == -1) {
-		/*
-		 * Add the new data block.
-		 */
-		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
-				&use_block))) {
-			xfs_trans_brelse(tp, lbp);
-			return error;
-		}
-		/*
-		 * Initialize the block.
-		 */
-		if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
-			xfs_trans_brelse(tp, lbp);
-			return error;
-		}
-		/*
-		 * If we're adding a new data block on the end we need to
-		 * extend the bests table.  Copy it up one entry.
-		 */
-		if (use_block >= be32_to_cpu(ltp->bestcount)) {
-			bestsp--;
-			memmove(&bestsp[0], &bestsp[1],
-				be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
-			be32_add_cpu(&ltp->bestcount, 1);
-			xfs_dir3_leaf_log_tail(args, lbp);
-			xfs_dir3_leaf_log_bests(args, lbp, 0,
-						be32_to_cpu(ltp->bestcount) - 1);
-		}
-		/*
-		 * If we're filling in a previously empty block just log it.
-		 */
-		else
-			xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		bestsp[use_block] = bf[0].length;
-		grown = 1;
-	} else {
-		/*
-		 * Already had space in some data block.
-		 * Just read that one in.
-		 */
-		error = xfs_dir3_data_read(tp, dp,
-				   xfs_dir2_db_to_da(args->geo, use_block),
-				   -1, &dbp);
-		if (error) {
-			xfs_trans_brelse(tp, lbp);
-			return error;
-		}
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		grown = 0;
-	}
-	/*
-	 * Point to the biggest freespace in our data block.
-	 */
-	dup = (xfs_dir2_data_unused_t *)
-	      ((char *)hdr + be16_to_cpu(bf[0].offset));
-	ASSERT(be16_to_cpu(dup->length) >= length);
-	needscan = needlog = 0;
-	/*
-	 * Mark the initial part of our freespace in use for the new entry.
-	 */
-	xfs_dir2_data_use_free(args, dbp, dup,
-		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
-		&needlog, &needscan);
-	/*
-	 * Initialize our new entry (at last).
-	 */
-	dep = (xfs_dir2_data_entry_t *)dup;
-	dep->inumber = cpu_to_be64(args->inumber);
-	dep->namelen = args->namelen;
-	memcpy(dep->name, args->name, dep->namelen);
-	dp->d_ops->data_put_ftype(dep, args->filetype);
-	tagp = dp->d_ops->data_entry_tag_p(dep);
-	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	/*
-	 * Need to scan fix up the bestfree table.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	/*
-	 * Need to log the data block's header.
-	 */
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	xfs_dir2_data_log_entry(args, dbp, dep);
-	/*
-	 * If the bests table needs to be changed, do it.
-	 * Log the change unless we've already done that.
-	 */
-	if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
-		bestsp[use_block] = bf[0].length;
-		if (!grown)
-			xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
-	}
-
-	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
-				       highstale, &lfloglow, &lfloghigh);
-
-	/*
-	 * Fill in the new leaf entry.
-	 */
-	lep->hashval = cpu_to_be32(args->hashval);
-	lep->address = cpu_to_be32(
-				xfs_dir2_db_off_to_dataptr(args->geo, use_block,
-				be16_to_cpu(*tagp)));
-	/*
-	 * Log the leaf fields and give up the buffers.
-	 */
-	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(args, lbp);
-	xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
-	xfs_dir3_leaf_check(dp, lbp);
-	xfs_dir3_data_check(dp, dbp);
-	return 0;
-}
-
-/*
- * Compact out any stale entries in the leaf.
- * Log the header and changed leaf entries, if any.
- */
-void
-xfs_dir3_leaf_compact(
-	xfs_da_args_t	*args,		/* operation arguments */
-	struct xfs_dir3_icleaf_hdr *leafhdr,
-	struct xfs_buf	*bp)		/* leaf buffer */
-{
-	int		from;		/* source leaf index */
-	xfs_dir2_leaf_t	*leaf;		/* leaf structure */
-	int		loglow;		/* first leaf entry to log */
-	int		to;		/* target leaf index */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_inode *dp = args->dp;
-
-	leaf = bp->b_addr;
-	if (!leafhdr->stale)
-		return;
-
-	/*
-	 * Compress out the stale entries in place.
-	 */
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
-		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-			continue;
-		/*
-		 * Only actually copy the entries that are different.
-		 */
-		if (from > to) {
-			if (loglow == -1)
-				loglow = to;
-			ents[to] = ents[from];
-		}
-		to++;
-	}
-	/*
-	 * Update and log the header, log the leaf entries.
-	 */
-	ASSERT(leafhdr->stale == from - to);
-	leafhdr->count -= leafhdr->stale;
-	leafhdr->stale = 0;
-
-	dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
-	xfs_dir3_leaf_log_header(args, bp);
-	if (loglow != -1)
-		xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
-}
-
-/*
- * Compact the leaf entries, removing stale ones.
- * Leave one stale entry behind - the one closest to our
- * insertion index - and the caller will shift that one to our insertion
- * point later.
- * Return new insertion index, where the remaining stale entry is,
- * and leaf logging indices.
- */
-void
-xfs_dir3_leaf_compact_x1(
-	struct xfs_dir3_icleaf_hdr *leafhdr,
-	struct xfs_dir2_leaf_entry *ents,
-	int		*indexp,	/* insertion index */
-	int		*lowstalep,	/* out: stale entry before us */
-	int		*highstalep,	/* out: stale entry after us */
-	int		*lowlogp,	/* out: low log index */
-	int		*highlogp)	/* out: high log index */
-{
-	int		from;		/* source copy index */
-	int		highstale;	/* stale entry at/after index */
-	int		index;		/* insertion index */
-	int		keepstale;	/* source index of kept stale */
-	int		lowstale;	/* stale entry before index */
-	int		newindex=0;	/* new insertion index */
-	int		to;		/* destination copy index */
-
-	ASSERT(leafhdr->stale > 1);
-	index = *indexp;
-
-	xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
-
-	/*
-	 * Pick the better of lowstale and highstale.
-	 */
-	if (lowstale >= 0 &&
-	    (highstale == leafhdr->count ||
-	     index - lowstale <= highstale - index))
-		keepstale = lowstale;
-	else
-		keepstale = highstale;
-	/*
-	 * Copy the entries in place, removing all the stale entries
-	 * except keepstale.
-	 */
-	for (from = to = 0; from < leafhdr->count; from++) {
-		/*
-		 * Notice the new value of index.
-		 */
-		if (index == from)
-			newindex = to;
-		if (from != keepstale &&
-		    ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
-			if (from == to)
-				*lowlogp = to;
-			continue;
-		}
-		/*
-		 * Record the new keepstale value for the insertion.
-		 */
-		if (from == keepstale)
-			lowstale = highstale = to;
-		/*
-		 * Copy only the entries that have moved.
-		 */
-		if (from > to)
-			ents[to] = ents[from];
-		to++;
-	}
-	ASSERT(from > to);
-	/*
-	 * If the insertion point was past the last entry,
-	 * set the new insertion point accordingly.
-	 */
-	if (index == from)
-		newindex = to;
-	*indexp = newindex;
-	/*
-	 * Adjust the leaf header values.
-	 */
-	leafhdr->count -= from - to;
-	leafhdr->stale = 1;
-	/*
-	 * Remember the low/high stale value only in the "right"
-	 * direction.
-	 */
-	if (lowstale >= newindex)
-		lowstale = -1;
-	else
-		highstale = leafhdr->count;
-	*highlogp = leafhdr->count - 1;
-	*lowstalep = lowstale;
-	*highstalep = highstale;
-}
-
-/*
- * Log the bests entries indicated from a leaf1 block.
- */
-static void
-xfs_dir3_leaf_log_bests(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp,		/* leaf buffer */
-	int			first,		/* first entry to log */
-	int			last)		/* last entry to log */
-{
-	__be16			*firstb;	/* pointer to first entry */
-	__be16			*lastb;		/* pointer to last entry */
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
-
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-	firstb = xfs_dir2_leaf_bests_p(ltp) + first;
-	lastb = xfs_dir2_leaf_bests_p(ltp) + last;
-	xfs_trans_log_buf(args->trans, bp,
-		(uint)((char *)firstb - (char *)leaf),
-		(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
-}
-
-/*
- * Log the leaf entries indicated from a leaf1 or leafn block.
- */
-void
-xfs_dir3_leaf_log_ents(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp,
-	int			first,
-	int			last)
-{
-	xfs_dir2_leaf_entry_t	*firstlep;	/* pointer to first entry */
-	xfs_dir2_leaf_entry_t	*lastlep;	/* pointer to last entry */
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-	struct xfs_dir2_leaf_entry *ents;
-
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
-
-	ents = args->dp->d_ops->leaf_ents_p(leaf);
-	firstlep = &ents[first];
-	lastlep = &ents[last];
-	xfs_trans_log_buf(args->trans, bp,
-		(uint)((char *)firstlep - (char *)leaf),
-		(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
-}
-
-/*
- * Log the header of the leaf1 or leafn block.
- */
-void
-xfs_dir3_leaf_log_header(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
-
-	xfs_trans_log_buf(args->trans, bp,
-			  (uint)((char *)&leaf->hdr - (char *)leaf),
-			  args->dp->d_ops->leaf_hdr_size - 1);
-}
-
-/*
- * Log the tail of the leaf1 block.
- */
-STATIC void
-xfs_dir3_leaf_log_tail(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
-
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-	xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
-		(uint)(args->geo->blksize - 1));
-}
-
-/*
- * Look up the entry referred to by args in the leaf format directory.
- * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
- * is also used by the node-format code.
- */
-int
-xfs_dir2_leaf_lookup(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data block entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	int			index;		/* found entry index */
-	struct xfs_buf		*lbp;		/* leaf buffer */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir2_leaf_entry *ents;
-
-	trace_xfs_dir2_leaf_lookup(args);
-
-	/*
-	 * Look up name in the leaf block, returning both buffers and index.
-	 */
-	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
-		return error;
-	}
-	tp = args->trans;
-	dp = args->dp;
-	xfs_dir3_leaf_check(dp, lbp);
-	leaf = lbp->b_addr;
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	/*
-	 * Get to the leaf entry and contained data entry address.
-	 */
-	lep = &ents[index];
-
-	/*
-	 * Point to the data entry.
-	 */
-	dep = (xfs_dir2_data_entry_t *)
-	      ((char *)dbp->b_addr +
-	       xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
-	/*
-	 * Return the found inode number & CI name if appropriate
-	 */
-	args->inumber = be64_to_cpu(dep->inumber);
-	args->filetype = dp->d_ops->data_get_ftype(dep);
-	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
-	xfs_trans_brelse(tp, dbp);
-	xfs_trans_brelse(tp, lbp);
-	return error;
-}
-
-/*
- * Look up name/hash in the leaf block.
- * Fill in indexp with the found index, and dbpp with the data buffer.
- * If not found dbpp will be NULL, and ENOENT comes back.
- * lbpp will always be filled in with the leaf buffer unless there's an error.
- */
-static int					/* error */
-xfs_dir2_leaf_lookup_int(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		**lbpp,		/* out: leaf buffer */
-	int			*indexp,	/* out: index in leaf block */
-	struct xfs_buf		**dbpp)		/* out: data buffer */
-{
-	xfs_dir2_db_t		curdb = -1;	/* current data block number */
-	struct xfs_buf		*dbp = NULL;	/* data buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	int			index;		/* index in leaf block */
-	struct xfs_buf		*lbp;		/* leaf buffer */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_dir2_db_t		newdb;		/* new data block number */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	xfs_dir2_db_t		cidb = -1;	/* case match data block no. */
-	enum xfs_dacmp		cmp;		/* name compare result */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-
-	error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
-	if (error)
-		return error;
-
-	*lbpp = lbp;
-	leaf = lbp->b_addr;
-	xfs_dir3_leaf_check(dp, lbp);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-	/*
-	 * Look for the first leaf entry with our hash value.
-	 */
-	index = xfs_dir2_leaf_search_hash(args, lbp);
-	/*
-	 * Loop over all the entries with the right hash value
-	 * looking to match the name.
-	 */
-	for (lep = &ents[index];
-	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-	     lep++, index++) {
-		/*
-		 * Skip over stale leaf entries.
-		 */
-		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-			continue;
-		/*
-		 * Get the new data block number.
-		 */
-		newdb = xfs_dir2_dataptr_to_db(args->geo,
-					       be32_to_cpu(lep->address));
-		/*
-		 * If it's not the same as the old data block number,
-		 * need to pitch the old one and read the new one.
-		 */
-		if (newdb != curdb) {
-			if (dbp)
-				xfs_trans_brelse(tp, dbp);
-			error = xfs_dir3_data_read(tp, dp,
-					   xfs_dir2_db_to_da(args->geo, newdb),
-					   -1, &dbp);
-			if (error) {
-				xfs_trans_brelse(tp, lbp);
-				return error;
-			}
-			curdb = newdb;
-		}
-		/*
-		 * Point to the data entry.
-		 */
-		dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
-			xfs_dir2_dataptr_to_off(args->geo,
-						be32_to_cpu(lep->address)));
-		/*
-		 * Compare name and if it's an exact match, return the index
-		 * and buffer. If it's the first case-insensitive match, store
-		 * the index and buffer and continue looking for an exact match.
-		 */
-		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
-		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-			args->cmpresult = cmp;
-			*indexp = index;
-			/* case exact match: return the current buffer. */
-			if (cmp == XFS_CMP_EXACT) {
-				*dbpp = dbp;
-				return 0;
-			}
-			cidb = curdb;
-		}
-	}
-	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-	/*
-	 * Here, we can only be doing a lookup (not a rename or remove).
-	 * If a case-insensitive match was found earlier, re-read the
-	 * appropriate data block if required and return it.
-	 */
-	if (args->cmpresult == XFS_CMP_CASE) {
-		ASSERT(cidb != -1);
-		if (cidb != curdb) {
-			xfs_trans_brelse(tp, dbp);
-			error = xfs_dir3_data_read(tp, dp,
-					   xfs_dir2_db_to_da(args->geo, cidb),
-					   -1, &dbp);
-			if (error) {
-				xfs_trans_brelse(tp, lbp);
-				return error;
-			}
-		}
-		*dbpp = dbp;
-		return 0;
-	}
-	/*
-	 * No match found, return ENOENT.
-	 */
-	ASSERT(cidb == -1);
-	if (dbp)
-		xfs_trans_brelse(tp, dbp);
-	xfs_trans_brelse(tp, lbp);
-	return ENOENT;
-}
-
-/*
- * Remove an entry from a leaf format directory.
- */
-int						/* error */
-xfs_dir2_leaf_removename(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	__be16			*bestsp;	/* leaf block best freespace */
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_db_t		db;		/* data block number */
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data entry structure */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	xfs_dir2_db_t		i;		/* temporary data block # */
-	int			index;		/* index into leaf entries */
-	struct xfs_buf		*lbp;		/* leaf buffer */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log data header */
-	int			needscan;	/* need to rescan data frees */
-	xfs_dir2_data_off_t	oldbest;	/* old value of best free */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir2_data_free *bf;		/* bestfree table */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	trace_xfs_dir2_leaf_removename(args);
-
-	/*
-	 * Lookup the leaf entry, get the leaf and data blocks read in.
-	 */
-	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
-		return error;
-	}
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-	leaf = lbp->b_addr;
-	hdr = dbp->b_addr;
-	xfs_dir3_data_check(dp, dbp);
-	bf = dp->d_ops->data_bestfree_p(hdr);
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	/*
-	 * Point to the leaf entry, use that to point to the data entry.
-	 */
-	lep = &ents[index];
-	db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
-	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-		xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
-	needscan = needlog = 0;
-	oldbest = be16_to_cpu(bf[0].length);
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-	bestsp = xfs_dir2_leaf_bests_p(ltp);
-	ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
-	/*
-	 * Mark the former data entry unused.
-	 */
-	xfs_dir2_data_make_free(args, dbp,
-		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
-	/*
-	 * We just mark the leaf entry stale by putting a null in it.
-	 */
-	leafhdr.stale++;
-	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(args, lbp);
-
-	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-	xfs_dir3_leaf_log_ents(args, lbp, index, index);
-
-	/*
-	 * Scan the freespace in the data block again if necessary,
-	 * log the data block header if necessary.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	/*
-	 * If the longest freespace in the data block has changed,
-	 * put the new value in the bests table and log that.
-	 */
-	if (be16_to_cpu(bf[0].length) != oldbest) {
-		bestsp[db] = bf[0].length;
-		xfs_dir3_leaf_log_bests(args, lbp, db, db);
-	}
-	xfs_dir3_data_check(dp, dbp);
-	/*
-	 * If the data block is now empty then get rid of the data block.
-	 */
-	if (be16_to_cpu(bf[0].length) ==
-			args->geo->blksize - dp->d_ops->data_entry_offset) {
-		ASSERT(db != args->geo->datablk);
-		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
-			/*
-			 * Nope, can't get rid of it because it caused
-			 * allocation of a bmap btree block to do so.
-			 * Just go on, returning success, leaving the
-			 * empty block in place.
-			 */
-			if (error == ENOSPC && args->total == 0)
-				error = 0;
-			xfs_dir3_leaf_check(dp, lbp);
-			return error;
-		}
-		dbp = NULL;
-		/*
-		 * If this is the last data block then compact the
-		 * bests table by getting rid of entries.
-		 */
-		if (db == be32_to_cpu(ltp->bestcount) - 1) {
-			/*
-			 * Look for the last active entry (i).
-			 */
-			for (i = db - 1; i > 0; i--) {
-				if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
-					break;
-			}
-			/*
-			 * Copy the table down so inactive entries at the
-			 * end are removed.
-			 */
-			memmove(&bestsp[db - i], bestsp,
-				(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
-			be32_add_cpu(&ltp->bestcount, -(db - i));
-			xfs_dir3_leaf_log_tail(args, lbp);
-			xfs_dir3_leaf_log_bests(args, lbp, 0,
-						be32_to_cpu(ltp->bestcount) - 1);
-		} else
-			bestsp[db] = cpu_to_be16(NULLDATAOFF);
-	}
-	/*
-	 * If the data block was not the first one, drop it.
-	 */
-	else if (db != args->geo->datablk)
-		dbp = NULL;
-
-	xfs_dir3_leaf_check(dp, lbp);
-	/*
-	 * See if we can convert to block form.
-	 */
-	return xfs_dir2_leaf_to_block(args, lbp, dbp);
-}
-
-/*
- * Replace the inode number in a leaf format directory entry.
- */
-int						/* error */
-xfs_dir2_leaf_replace(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data block entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	int			index;		/* index of leaf entry */
-	struct xfs_buf		*lbp;		/* leaf buffer */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir2_leaf_entry *ents;
-
-	trace_xfs_dir2_leaf_replace(args);
-
-	/*
-	 * Look up the entry.
-	 */
-	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
-		return error;
-	}
-	dp = args->dp;
-	leaf = lbp->b_addr;
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	/*
-	 * Point to the leaf entry, get data address from it.
-	 */
-	lep = &ents[index];
-	/*
-	 * Point to the data entry.
-	 */
-	dep = (xfs_dir2_data_entry_t *)
-	      ((char *)dbp->b_addr +
-	       xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
-	ASSERT(args->inumber != be64_to_cpu(dep->inumber));
-	/*
-	 * Put the new inode number in, log it.
-	 */
-	dep->inumber = cpu_to_be64(args->inumber);
-	dp->d_ops->data_put_ftype(dep, args->filetype);
-	tp = args->trans;
-	xfs_dir2_data_log_entry(args, dbp, dep);
-	xfs_dir3_leaf_check(dp, lbp);
-	xfs_trans_brelse(tp, lbp);
-	return 0;
-}
-
-/*
- * Return index in the leaf block (lbp) which is either the first
- * one with this hash value, or if there are none, the insert point
- * for that hash value.
- */
-int						/* index value */
-xfs_dir2_leaf_search_hash(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		*lbp)		/* leaf buffer */
-{
-	xfs_dahash_t		hash=0;		/* hash from this entry */
-	xfs_dahash_t		hashwant;	/* hash value looking for */
-	int			high;		/* high leaf index */
-	int			low;		/* low leaf index */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	int			mid=0;		/* current leaf index */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	leaf = lbp->b_addr;
-	ents = args->dp->d_ops->leaf_ents_p(leaf);
-	args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-	/*
-	 * Note, the table cannot be empty, so we have to go through the loop.
-	 * Binary search the leaf entries looking for our hash value.
-	 */
-	for (lep = ents, low = 0, high = leafhdr.count - 1,
-		hashwant = args->hashval;
-	     low <= high; ) {
-		mid = (low + high) >> 1;
-		if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
-			break;
-		if (hash < hashwant)
-			low = mid + 1;
-		else
-			high = mid - 1;
-	}
-	/*
-	 * Found one, back up through all the equal hash values.
-	 */
-	if (hash == hashwant) {
-		while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
-			mid--;
-		}
-	}
-	/*
-	 * Need to point to an entry higher than ours.
-	 */
-	else if (hash < hashwant)
-		mid++;
-	return mid;
-}
-
-/*
- * Trim off a trailing data block.  We know it's empty since the leaf
- * freespace table says so.
- */
-int						/* error */
-xfs_dir2_leaf_trim_data(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		*lbp,		/* leaf buffer */
-	xfs_dir2_db_t		db)		/* data block number */
-{
-	__be16			*bestsp;	/* leaf bests table */
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return value */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_trans_t		*tp;		/* transaction pointer */
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	/*
-	 * Read the offending data block.  We need its buffer.
-	 */
-	error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
-				   -1, &dbp);
-	if (error)
-		return error;
-
-	leaf = lbp->b_addr;
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-
-#ifdef DEBUG
-{
-	struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
-	struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
-
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
-	ASSERT(be16_to_cpu(bf[0].length) ==
-	       args->geo->blksize - dp->d_ops->data_entry_offset);
-	ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
-}
-#endif
-
-	/*
-	 * Get rid of the data block.
-	 */
-	if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
-		ASSERT(error != ENOSPC);
-		xfs_trans_brelse(tp, dbp);
-		return error;
-	}
-	/*
-	 * Eliminate the last bests entry from the table.
-	 */
-	bestsp = xfs_dir2_leaf_bests_p(ltp);
-	be32_add_cpu(&ltp->bestcount, -1);
-	memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
-	xfs_dir3_leaf_log_tail(args, lbp);
-	xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
-	return 0;
-}
-
-static inline size_t
-xfs_dir3_leaf_size(
-	struct xfs_dir3_icleaf_hdr	*hdr,
-	int				counts)
-{
-	int	entries;
-	int	hdrsize;
-
-	entries = hdr->count - hdr->stale;
-	if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
-	    hdr->magic == XFS_DIR2_LEAFN_MAGIC)
-		hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
-	else
-		hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
-
-	return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
-	               + counts * sizeof(xfs_dir2_data_off_t)
-		       + sizeof(xfs_dir2_leaf_tail_t);
-}
-
-/*
- * Convert node form directory to leaf form directory.
- * The root of the node form dir needs to already be a LEAFN block.
- * Just return if we can't do anything.
- */
-int						/* error */
-xfs_dir2_node_to_leaf(
-	xfs_da_state_t		*state)		/* directory operation state */
-{
-	xfs_da_args_t		*args;		/* operation arguments */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	struct xfs_buf		*fbp;		/* buffer for freespace block */
-	xfs_fileoff_t		fo;		/* freespace file offset */
-	xfs_dir2_free_t		*free;		/* freespace structure */
-	struct xfs_buf		*lbp;		/* buffer for leaf block */
-	xfs_dir2_leaf_tail_t	*ltp;		/* tail of leaf structure */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			rval;		/* successful free trim? */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir3_icleaf_hdr leafhdr;
-	struct xfs_dir3_icfree_hdr freehdr;
-
-	/*
-	 * There's more than a leaf level in the btree, so there must
-	 * be multiple leafn blocks.  Give up.
-	 */
-	if (state->path.active > 1)
-		return 0;
-	args = state->args;
-
-	trace_xfs_dir2_node_to_leaf(args);
-
-	mp = state->mp;
-	dp = args->dp;
-	tp = args->trans;
-	/*
-	 * Get the last offset in the file.
-	 */
-	if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
-		return error;
-	}
-	fo -= args->geo->fsbcount;
-	/*
-	 * If there are freespace blocks other than the first one,
-	 * take this opportunity to remove trailing empty freespace blocks
-	 * that may have been left behind during no-space-reservation
-	 * operations.
-	 */
-	while (fo > args->geo->freeblk) {
-		if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
-			return error;
-		}
-		if (rval)
-			fo -= args->geo->fsbcount;
-		else
-			return 0;
-	}
-	/*
-	 * Now find the block just before the freespace block.
-	 */
-	if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
-		return error;
-	}
-	/*
-	 * If it's not the single leaf block, give up.
-	 */
-	if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
-		return 0;
-	lbp = state->path.blk[0].bp;
-	leaf = lbp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
-	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
-
-	/*
-	 * Read the freespace block.
-	 */
-	error = xfs_dir2_free_read(tp, dp,  args->geo->freeblk, &fbp);
-	if (error)
-		return error;
-	free = fbp->b_addr;
-	dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-	ASSERT(!freehdr.firstdb);
-
-	/*
-	 * Now see if the leafn and free data will fit in a leaf1.
-	 * If not, release the buffer and give up.
-	 */
-	if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
-		xfs_trans_brelse(tp, fbp);
-		return 0;
-	}
-
-	/*
-	 * If the leaf has any stale entries in it, compress them out.
-	 */
-	if (leafhdr.stale)
-		xfs_dir3_leaf_compact(args, &leafhdr, lbp);
-
-	lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
-	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
-	leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
-					? XFS_DIR2_LEAF1_MAGIC
-					: XFS_DIR3_LEAF1_MAGIC;
-
-	/*
-	 * Set up the leaf tail from the freespace block.
-	 */
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-	ltp->bestcount = cpu_to_be32(freehdr.nvalid);
-
-	/*
-	 * Set up the leaf bests table.
-	 */
-	memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
-		freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
-
-	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(args, lbp);
-	xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
-	xfs_dir3_leaf_log_tail(args, lbp);
-	xfs_dir3_leaf_check(dp, lbp);
-
-	/*
-	 * Get rid of the freespace block.
-	 */
-	error = xfs_dir2_shrink_inode(args,
-			xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
-			fbp);
-	if (error) {
-		/*
-		 * This can't fail here because it can only happen when
-		 * punching out the middle of an extent, and this is an
-		 * isolated block.
-		 */
-		ASSERT(error != ENOSPC);
-		return error;
-	}
-	fbp = NULL;
-	/*
-	 * Now see if we can convert the single-leaf directory
-	 * down to a block form directory.
-	 * This routine always kills the dabuf for the leaf, so
-	 * eliminate it from the path.
-	 */
-	error = xfs_dir2_leaf_to_block(args, lbp, NULL);
-	state->path.blk[0].bp = NULL;
-	return error;
-}
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
deleted file mode 100644
index 4cf8b99d09a4..000000000000
--- a/fs/xfs/xfs_dir2_node.c
+++ /dev/null
@@ -1,2284 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-
-/*
- * Function declarations.
- */
-static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
-			      int index);
-static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
-				     xfs_da_state_blk_t *blk1,
-				     xfs_da_state_blk_t *blk2);
-static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
-				 int index, xfs_da_state_blk_t *dblk,
-				 int *rval);
-static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
-				     xfs_da_state_blk_t *fblk);
-
-/*
- * Check internal consistency of a leafn block.
- */
-#ifdef DEBUG
-#define	xfs_dir3_leaf_check(dp, bp) \
-do { \
-	if (!xfs_dir3_leafn_check((dp), (bp))) \
-		ASSERT(0); \
-} while (0);
-
-static bool
-xfs_dir3_leafn_check(
-	struct xfs_inode	*dp,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-	if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
-		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-			return false;
-	} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
-		return false;
-
-	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
-}
-#else
-#define	xfs_dir3_leaf_check(dp, bp)
-#endif
-
-static bool
-xfs_dir3_free_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dir2_free_hdr *hdr = bp->b_addr;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
-
-		if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
-			return false;
-		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
-			return false;
-	} else {
-		if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
-			return false;
-	}
-
-	/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
-
-	return true;
-}
-
-static void
-xfs_dir3_free_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_dir3_free_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-xfs_dir3_free_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
-
-	if (!xfs_dir3_free_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-	xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
-	.verify_read = xfs_dir3_free_read_verify,
-	.verify_write = xfs_dir3_free_write_verify,
-};
-
-
-static int
-__xfs_dir3_free_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		fbno,
-	xfs_daddr_t		mappedbno,
-	struct xfs_buf		**bpp)
-{
-	int			err;
-
-	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
-
-	/* try read returns without an error or *bpp if it lands in a hole */
-	if (!err && tp && *bpp)
-		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
-	return err;
-}
-
-int
-xfs_dir2_free_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		fbno,
-	struct xfs_buf		**bpp)
-{
-	return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
-}
-
-static int
-xfs_dir2_free_try_read(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*dp,
-	xfs_dablk_t		fbno,
-	struct xfs_buf		**bpp)
-{
-	return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
-}
-
-static int
-xfs_dir3_free_get_buf(
-	xfs_da_args_t		*args,
-	xfs_dir2_db_t		fbno,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_trans	*tp = args->trans;
-	struct xfs_inode	*dp = args->dp;
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_buf		*bp;
-	int			error;
-	struct xfs_dir3_icfree_hdr hdr;
-
-	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
-				   -1, &bp, XFS_DATA_FORK);
-	if (error)
-		return error;
-
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
-	bp->b_ops = &xfs_dir3_free_buf_ops;
-
-	/*
-	 * Initialize the new block to be empty, and remember
-	 * its first slot as our empty slot.
-	 */
-	memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
-	memset(&hdr, 0, sizeof(hdr));
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
-
-		hdr.magic = XFS_DIR3_FREE_MAGIC;
-
-		hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
-		hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
-		uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
-	} else
-		hdr.magic = XFS_DIR2_FREE_MAGIC;
-	dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Log entries from a freespace block.
- */
-STATIC void
-xfs_dir2_free_log_bests(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp,
-	int			first,		/* first entry to log */
-	int			last)		/* last entry to log */
-{
-	xfs_dir2_free_t		*free;		/* freespace structure */
-	__be16			*bests;
-
-	free = bp->b_addr;
-	bests = args->dp->d_ops->free_bests_p(free);
-	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
-	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
-	xfs_trans_log_buf(args->trans, bp,
-		(uint)((char *)&bests[first] - (char *)free),
-		(uint)((char *)&bests[last] - (char *)free +
-		       sizeof(bests[0]) - 1));
-}
-
-/*
- * Log header from a freespace block.
- */
-static void
-xfs_dir2_free_log_header(
-	struct xfs_da_args	*args,
-	struct xfs_buf		*bp)
-{
-#ifdef DEBUG
-	xfs_dir2_free_t		*free;		/* freespace structure */
-
-	free = bp->b_addr;
-	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
-	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
-#endif
-	xfs_trans_log_buf(args->trans, bp, 0,
-			  args->dp->d_ops->free_hdr_size - 1);
-}
-
-/*
- * Convert a leaf-format directory to a node-format directory.
- * We need to change the magic number of the leaf block, and copy
- * the freespace table out of the leaf block into its own block.
- */
-int						/* error */
-xfs_dir2_leaf_to_node(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		*lbp)		/* leaf buffer */
-{
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return value */
-	struct xfs_buf		*fbp;		/* freespace buffer */
-	xfs_dir2_db_t		fdb;		/* freespace block number */
-	xfs_dir2_free_t		*free;		/* freespace structure */
-	__be16			*from;		/* pointer to freespace entry */
-	int			i;		/* leaf freespace index */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			n;		/* count of live freespc ents */
-	xfs_dir2_data_off_t	off;		/* freespace entry value */
-	__be16			*to;		/* pointer to freespace entry */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir3_icfree_hdr freehdr;
-
-	trace_xfs_dir2_leaf_to_node(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	/*
-	 * Add a freespace block to the directory.
-	 */
-	if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
-		return error;
-	}
-	ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
-	/*
-	 * Get the buffer for the new freespace block.
-	 */
-	error = xfs_dir3_free_get_buf(args, fdb, &fbp);
-	if (error)
-		return error;
-
-	free = fbp->b_addr;
-	dp->d_ops->free_hdr_from_disk(&freehdr, free);
-	leaf = lbp->b_addr;
-	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
-	ASSERT(be32_to_cpu(ltp->bestcount) <=
-				(uint)dp->i_d.di_size / args->geo->blksize);
-
-	/*
-	 * Copy freespace entries from the leaf block to the new block.
-	 * Count active entries.
-	 */
-	from = xfs_dir2_leaf_bests_p(ltp);
-	to = dp->d_ops->free_bests_p(free);
-	for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
-		if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
-			n++;
-		*to = cpu_to_be16(off);
-	}
-
-	/*
-	 * Now initialize the freespace block header.
-	 */
-	freehdr.nused = n;
-	freehdr.nvalid = be32_to_cpu(ltp->bestcount);
-
-	dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-	xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
-	xfs_dir2_free_log_header(args, fbp);
-
-	/*
-	 * Converting the leaf to a leafnode is just a matter of changing the
-	 * magic number and the ops. Do the change directly to the buffer as
-	 * it's less work (and less code) than decoding the header to host
-	 * format and back again.
-	 */
-	if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
-		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
-	else
-		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
-	lbp->b_ops = &xfs_dir3_leafn_buf_ops;
-	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
-	xfs_dir3_leaf_log_header(args, lbp);
-	xfs_dir3_leaf_check(dp, lbp);
-	return 0;
-}
-
-/*
- * Add a leaf entry to a leaf block in a node-form directory.
- * The other work necessary is done from the caller.
- */
-static int					/* error */
-xfs_dir2_leafn_add(
-	struct xfs_buf		*bp,		/* leaf buffer */
-	xfs_da_args_t		*args,		/* operation arguments */
-	int			index)		/* insertion pt for new entry */
-{
-	int			compact;	/* compacting stale leaves */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			highstale;	/* next stale entry */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	int			lfloghigh;	/* high leaf entry logging */
-	int			lfloglow;	/* low leaf entry logging */
-	int			lowstale;	/* previous stale entry */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir3_icleaf_hdr leafhdr;
-	struct xfs_dir2_leaf_entry *ents;
-
-	trace_xfs_dir2_leafn_add(args, index);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	leaf = bp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-
-	/*
-	 * Quick check just to make sure we are not going to index
-	 * into other peoples memory
-	 */
-	if (index < 0)
-		return EFSCORRUPTED;
-
-	/*
-	 * If there are already the maximum number of leaf entries in
-	 * the block, if there are no stale entries it won't fit.
-	 * Caller will do a split.  If there are stale entries we'll do
-	 * a compact.
-	 */
-
-	if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
-		if (!leafhdr.stale)
-			return ENOSPC;
-		compact = leafhdr.stale > 1;
-	} else
-		compact = 0;
-	ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
-	ASSERT(index == leafhdr.count ||
-	       be32_to_cpu(ents[index].hashval) >= args->hashval);
-
-	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-		return 0;
-
-	/*
-	 * Compact out all but one stale leaf entry.  Leaves behind
-	 * the entry closest to index.
-	 */
-	if (compact)
-		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
-					 &highstale, &lfloglow, &lfloghigh);
-	else if (leafhdr.stale) {
-		/*
-		 * Set impossible logging indices for this case.
-		 */
-		lfloglow = leafhdr.count;
-		lfloghigh = -1;
-	}
-
-	/*
-	 * Insert the new entry, log everything.
-	 */
-	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
-				       highstale, &lfloglow, &lfloghigh);
-
-	lep->hashval = cpu_to_be32(args->hashval);
-	lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
-				args->blkno, args->index));
-
-	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(args, bp);
-	xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
-	xfs_dir3_leaf_check(dp, bp);
-	return 0;
-}
-
-#ifdef DEBUG
-static void
-xfs_dir2_free_hdr_check(
-	struct xfs_inode *dp,
-	struct xfs_buf	*bp,
-	xfs_dir2_db_t	db)
-{
-	struct xfs_dir3_icfree_hdr hdr;
-
-	dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
-
-	ASSERT((hdr.firstdb %
-		dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
-	ASSERT(hdr.firstdb <= db);
-	ASSERT(db < hdr.firstdb + hdr.nvalid);
-}
-#else
-#define xfs_dir2_free_hdr_check(dp, bp, db)
-#endif	/* DEBUG */
-
-/*
- * Return the last hash value in the leaf.
- * Stale entries are ok.
- */
-xfs_dahash_t					/* hash value */
-xfs_dir2_leafn_lasthash(
-	struct xfs_inode *dp,
-	struct xfs_buf	*bp,			/* leaf buffer */
-	int		*count)			/* count of entries in leaf */
-{
-	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-
-	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
-	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
-
-	if (count)
-		*count = leafhdr.count;
-	if (!leafhdr.count)
-		return 0;
-
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	return be32_to_cpu(ents[leafhdr.count - 1].hashval);
-}
-
-/*
- * Look up a leaf entry for space to add a name in a node-format leaf block.
- * The extrablk in state is a freespace block.
- */
-STATIC int
-xfs_dir2_leafn_lookup_for_addname(
-	struct xfs_buf		*bp,		/* leaf buffer */
-	xfs_da_args_t		*args,		/* operation arguments */
-	int			*indexp,	/* out: leaf entry index */
-	xfs_da_state_t		*state)		/* state to fill in */
-{
-	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
-	xfs_dir2_db_t		curdb = -1;	/* current data block number */
-	xfs_dir2_db_t		curfdb = -1;	/* current free block number */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return value */
-	int			fi;		/* free entry index */
-	xfs_dir2_free_t		*free = NULL;	/* free block structure */
-	int			index;		/* leaf entry index */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	int			length;		/* length of new data entry */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_dir2_db_t		newdb;		/* new data block number */
-	xfs_dir2_db_t		newfdb;		/* new free block number */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-	leaf = bp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-
-	xfs_dir3_leaf_check(dp, bp);
-	ASSERT(leafhdr.count > 0);
-
-	/*
-	 * Look up the hash value in the leaf entries.
-	 */
-	index = xfs_dir2_leaf_search_hash(args, bp);
-	/*
-	 * Do we have a buffer coming in?
-	 */
-	if (state->extravalid) {
-		/* If so, it's a free block buffer, get the block number. */
-		curbp = state->extrablk.bp;
-		curfdb = state->extrablk.blkno;
-		free = curbp->b_addr;
-		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
-		       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
-	}
-	length = dp->d_ops->data_entsize(args->namelen);
-	/*
-	 * Loop over leaf entries with the right hash value.
-	 */
-	for (lep = &ents[index];
-	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-	     lep++, index++) {
-		/*
-		 * Skip stale leaf entries.
-		 */
-		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-			continue;
-		/*
-		 * Pull the data block number from the entry.
-		 */
-		newdb = xfs_dir2_dataptr_to_db(args->geo,
-					       be32_to_cpu(lep->address));
-		/*
-		 * For addname, we're looking for a place to put the new entry.
-		 * We want to use a data block with an entry of equal
-		 * hash value to ours if there is one with room.
-		 *
-		 * If this block isn't the data block we already have
-		 * in hand, take a look at it.
-		 */
-		if (newdb != curdb) {
-			__be16 *bests;
-
-			curdb = newdb;
-			/*
-			 * Convert the data block to the free block
-			 * holding its freespace information.
-			 */
-			newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
-			/*
-			 * If it's not the one we have in hand, read it in.
-			 */
-			if (newfdb != curfdb) {
-				/*
-				 * If we had one before, drop it.
-				 */
-				if (curbp)
-					xfs_trans_brelse(tp, curbp);
-
-				error = xfs_dir2_free_read(tp, dp,
-						xfs_dir2_db_to_da(args->geo,
-								  newfdb),
-						&curbp);
-				if (error)
-					return error;
-				free = curbp->b_addr;
-
-				xfs_dir2_free_hdr_check(dp, curbp, curdb);
-			}
-			/*
-			 * Get the index for our entry.
-			 */
-			fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
-			/*
-			 * If it has room, return it.
-			 */
-			bests = dp->d_ops->free_bests_p(free);
-			if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
-				XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
-							XFS_ERRLEVEL_LOW, mp);
-				if (curfdb != newfdb)
-					xfs_trans_brelse(tp, curbp);
-				return EFSCORRUPTED;
-			}
-			curfdb = newfdb;
-			if (be16_to_cpu(bests[fi]) >= length)
-				goto out;
-		}
-	}
-	/* Didn't find any space */
-	fi = -1;
-out:
-	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-	if (curbp) {
-		/* Giving back a free block. */
-		state->extravalid = 1;
-		state->extrablk.bp = curbp;
-		state->extrablk.index = fi;
-		state->extrablk.blkno = curfdb;
-
-		/*
-		 * Important: this magic number is not in the buffer - it's for
-		 * buffer type information and therefore only the free/data type
-		 * matters here, not whether CRCs are enabled or not.
-		 */
-		state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
-	} else {
-		state->extravalid = 0;
-	}
-	/*
-	 * Return the index, that will be the insertion point.
-	 */
-	*indexp = index;
-	return ENOENT;
-}
-
-/*
- * Look up a leaf entry in a node-format leaf block.
- * The extrablk in state a data block.
- */
-STATIC int
-xfs_dir2_leafn_lookup_for_entry(
-	struct xfs_buf		*bp,		/* leaf buffer */
-	xfs_da_args_t		*args,		/* operation arguments */
-	int			*indexp,	/* out: leaf entry index */
-	xfs_da_state_t		*state)		/* state to fill in */
-{
-	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
-	xfs_dir2_db_t		curdb = -1;	/* current data block number */
-	xfs_dir2_data_entry_t	*dep;		/* data block entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return value */
-	int			index;		/* leaf entry index */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_dir2_db_t		newdb;		/* new data block number */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	enum xfs_dacmp		cmp;		/* comparison result */
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_dir3_icleaf_hdr leafhdr;
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-	leaf = bp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-
-	xfs_dir3_leaf_check(dp, bp);
-	ASSERT(leafhdr.count > 0);
-
-	/*
-	 * Look up the hash value in the leaf entries.
-	 */
-	index = xfs_dir2_leaf_search_hash(args, bp);
-	/*
-	 * Do we have a buffer coming in?
-	 */
-	if (state->extravalid) {
-		curbp = state->extrablk.bp;
-		curdb = state->extrablk.blkno;
-	}
-	/*
-	 * Loop over leaf entries with the right hash value.
-	 */
-	for (lep = &ents[index];
-	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
-	     lep++, index++) {
-		/*
-		 * Skip stale leaf entries.
-		 */
-		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
-			continue;
-		/*
-		 * Pull the data block number from the entry.
-		 */
-		newdb = xfs_dir2_dataptr_to_db(args->geo,
-					       be32_to_cpu(lep->address));
-		/*
-		 * Not adding a new entry, so we really want to find
-		 * the name given to us.
-		 *
-		 * If it's a different data block, go get it.
-		 */
-		if (newdb != curdb) {
-			/*
-			 * If we had a block before that we aren't saving
-			 * for a CI name, drop it
-			 */
-			if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
-						curdb != state->extrablk.blkno))
-				xfs_trans_brelse(tp, curbp);
-			/*
-			 * If needing the block that is saved with a CI match,
-			 * use it otherwise read in the new data block.
-			 */
-			if (args->cmpresult != XFS_CMP_DIFFERENT &&
-					newdb == state->extrablk.blkno) {
-				ASSERT(state->extravalid);
-				curbp = state->extrablk.bp;
-			} else {
-				error = xfs_dir3_data_read(tp, dp,
-						xfs_dir2_db_to_da(args->geo,
-								  newdb),
-						-1, &curbp);
-				if (error)
-					return error;
-			}
-			xfs_dir3_data_check(dp, curbp);
-			curdb = newdb;
-		}
-		/*
-		 * Point to the data entry.
-		 */
-		dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
-			xfs_dir2_dataptr_to_off(args->geo,
-						be32_to_cpu(lep->address)));
-		/*
-		 * Compare the entry and if it's an exact match, return
-		 * EEXIST immediately. If it's the first case-insensitive
-		 * match, store the block & inode number and continue looking.
-		 */
-		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
-		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-			/* If there is a CI match block, drop it */
-			if (args->cmpresult != XFS_CMP_DIFFERENT &&
-						curdb != state->extrablk.blkno)
-				xfs_trans_brelse(tp, state->extrablk.bp);
-			args->cmpresult = cmp;
-			args->inumber = be64_to_cpu(dep->inumber);
-			args->filetype = dp->d_ops->data_get_ftype(dep);
-			*indexp = index;
-			state->extravalid = 1;
-			state->extrablk.bp = curbp;
-			state->extrablk.blkno = curdb;
-			state->extrablk.index = (int)((char *)dep -
-							(char *)curbp->b_addr);
-			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-			curbp->b_ops = &xfs_dir3_data_buf_ops;
-			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
-			if (cmp == XFS_CMP_EXACT)
-				return EEXIST;
-		}
-	}
-	ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
-	if (curbp) {
-		if (args->cmpresult == XFS_CMP_DIFFERENT) {
-			/* Giving back last used data block. */
-			state->extravalid = 1;
-			state->extrablk.bp = curbp;
-			state->extrablk.index = -1;
-			state->extrablk.blkno = curdb;
-			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-			curbp->b_ops = &xfs_dir3_data_buf_ops;
-			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
-		} else {
-			/* If the curbp is not the CI match block, drop it */
-			if (state->extrablk.bp != curbp)
-				xfs_trans_brelse(tp, curbp);
-		}
-	} else {
-		state->extravalid = 0;
-	}
-	*indexp = index;
-	return ENOENT;
-}
-
-/*
- * Look up a leaf entry in a node-format leaf block.
- * If this is an addname then the extrablk in state is a freespace block,
- * otherwise it's a data block.
- */
-int
-xfs_dir2_leafn_lookup_int(
-	struct xfs_buf		*bp,		/* leaf buffer */
-	xfs_da_args_t		*args,		/* operation arguments */
-	int			*indexp,	/* out: leaf entry index */
-	xfs_da_state_t		*state)		/* state to fill in */
-{
-	if (args->op_flags & XFS_DA_OP_ADDNAME)
-		return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
-							state);
-	return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
-}
-
-/*
- * Move count leaf entries from source to destination leaf.
- * Log entries and headers.  Stale entries are preserved.
- */
-static void
-xfs_dir3_leafn_moveents(
-	xfs_da_args_t			*args,	/* operation arguments */
-	struct xfs_buf			*bp_s,	/* source */
-	struct xfs_dir3_icleaf_hdr	*shdr,
-	struct xfs_dir2_leaf_entry	*sents,
-	int				start_s,/* source leaf index */
-	struct xfs_buf			*bp_d,	/* destination */
-	struct xfs_dir3_icleaf_hdr	*dhdr,
-	struct xfs_dir2_leaf_entry	*dents,
-	int				start_d,/* destination leaf index */
-	int				count)	/* count of leaves to copy */
-{
-	int				stale;	/* count stale leaves copied */
-
-	trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
-
-	/*
-	 * Silently return if nothing to do.
-	 */
-	if (count == 0)
-		return;
-
-	/*
-	 * If the destination index is not the end of the current
-	 * destination leaf entries, open up a hole in the destination
-	 * to hold the new entries.
-	 */
-	if (start_d < dhdr->count) {
-		memmove(&dents[start_d + count], &dents[start_d],
-			(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
-		xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
-				       count + dhdr->count - 1);
-	}
-	/*
-	 * If the source has stale leaves, count the ones in the copy range
-	 * so we can update the header correctly.
-	 */
-	if (shdr->stale) {
-		int	i;			/* temp leaf index */
-
-		for (i = start_s, stale = 0; i < start_s + count; i++) {
-			if (sents[i].address ==
-					cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-				stale++;
-		}
-	} else
-		stale = 0;
-	/*
-	 * Copy the leaf entries from source to destination.
-	 */
-	memcpy(&dents[start_d], &sents[start_s],
-		count * sizeof(xfs_dir2_leaf_entry_t));
-	xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
-
-	/*
-	 * If there are source entries after the ones we copied,
-	 * delete the ones we copied by sliding the next ones down.
-	 */
-	if (start_s + count < shdr->count) {
-		memmove(&sents[start_s], &sents[start_s + count],
-			count * sizeof(xfs_dir2_leaf_entry_t));
-		xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
-	}
-
-	/*
-	 * Update the headers and log them.
-	 */
-	shdr->count -= count;
-	shdr->stale -= stale;
-	dhdr->count += count;
-	dhdr->stale += stale;
-}
-
-/*
- * Determine the sort order of two leaf blocks.
- * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
- */
-int						/* sort order */
-xfs_dir2_leafn_order(
-	struct xfs_inode	*dp,
-	struct xfs_buf		*leaf1_bp,		/* leaf1 buffer */
-	struct xfs_buf		*leaf2_bp)		/* leaf2 buffer */
-{
-	struct xfs_dir2_leaf	*leaf1 = leaf1_bp->b_addr;
-	struct xfs_dir2_leaf	*leaf2 = leaf2_bp->b_addr;
-	struct xfs_dir2_leaf_entry *ents1;
-	struct xfs_dir2_leaf_entry *ents2;
-	struct xfs_dir3_icleaf_hdr hdr1;
-	struct xfs_dir3_icleaf_hdr hdr2;
-
-	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
-	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
-	ents1 = dp->d_ops->leaf_ents_p(leaf1);
-	ents2 = dp->d_ops->leaf_ents_p(leaf2);
-
-	if (hdr1.count > 0 && hdr2.count > 0 &&
-	    (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
-	     be32_to_cpu(ents2[hdr2.count - 1].hashval) <
-				be32_to_cpu(ents1[hdr1.count - 1].hashval)))
-		return 1;
-	return 0;
-}
-
-/*
- * Rebalance leaf entries between two leaf blocks.
- * This is actually only called when the second block is new,
- * though the code deals with the general case.
- * A new entry will be inserted in one of the blocks, and that
- * entry is taken into account when balancing.
- */
-static void
-xfs_dir2_leafn_rebalance(
-	xfs_da_state_t		*state,		/* btree cursor */
-	xfs_da_state_blk_t	*blk1,		/* first btree block */
-	xfs_da_state_blk_t	*blk2)		/* second btree block */
-{
-	xfs_da_args_t		*args;		/* operation arguments */
-	int			count;		/* count (& direction) leaves */
-	int			isleft;		/* new goes in left leaf */
-	xfs_dir2_leaf_t		*leaf1;		/* first leaf structure */
-	xfs_dir2_leaf_t		*leaf2;		/* second leaf structure */
-	int			mid;		/* midpoint leaf index */
-#if defined(DEBUG) || defined(XFS_WARN)
-	int			oldstale;	/* old count of stale leaves */
-#endif
-	int			oldsum;		/* old total leaf count */
-	int			swap;		/* swapped leaf blocks */
-	struct xfs_dir2_leaf_entry *ents1;
-	struct xfs_dir2_leaf_entry *ents2;
-	struct xfs_dir3_icleaf_hdr hdr1;
-	struct xfs_dir3_icleaf_hdr hdr2;
-	struct xfs_inode	*dp = state->args->dp;
-
-	args = state->args;
-	/*
-	 * If the block order is wrong, swap the arguments.
-	 */
-	if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
-		xfs_da_state_blk_t	*tmp;	/* temp for block swap */
-
-		tmp = blk1;
-		blk1 = blk2;
-		blk2 = tmp;
-	}
-	leaf1 = blk1->bp->b_addr;
-	leaf2 = blk2->bp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
-	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
-	ents1 = dp->d_ops->leaf_ents_p(leaf1);
-	ents2 = dp->d_ops->leaf_ents_p(leaf2);
-
-	oldsum = hdr1.count + hdr2.count;
-#if defined(DEBUG) || defined(XFS_WARN)
-	oldstale = hdr1.stale + hdr2.stale;
-#endif
-	mid = oldsum >> 1;
-
-	/*
-	 * If the old leaf count was odd then the new one will be even,
-	 * so we need to divide the new count evenly.
-	 */
-	if (oldsum & 1) {
-		xfs_dahash_t	midhash;	/* middle entry hash value */
-
-		if (mid >= hdr1.count)
-			midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
-		else
-			midhash = be32_to_cpu(ents1[mid].hashval);
-		isleft = args->hashval <= midhash;
-	}
-	/*
-	 * If the old count is even then the new count is odd, so there's
-	 * no preferred side for the new entry.
-	 * Pick the left one.
-	 */
-	else
-		isleft = 1;
-	/*
-	 * Calculate moved entry count.  Positive means left-to-right,
-	 * negative means right-to-left.  Then move the entries.
-	 */
-	count = hdr1.count - mid + (isleft == 0);
-	if (count > 0)
-		xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
-					hdr1.count - count, blk2->bp,
-					&hdr2, ents2, 0, count);
-	else if (count < 0)
-		xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
-					blk1->bp, &hdr1, ents1,
-					hdr1.count, count);
-
-	ASSERT(hdr1.count + hdr2.count == oldsum);
-	ASSERT(hdr1.stale + hdr2.stale == oldstale);
-
-	/* log the changes made when moving the entries */
-	dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
-	dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
-	xfs_dir3_leaf_log_header(args, blk1->bp);
-	xfs_dir3_leaf_log_header(args, blk2->bp);
-
-	xfs_dir3_leaf_check(dp, blk1->bp);
-	xfs_dir3_leaf_check(dp, blk2->bp);
-
-	/*
-	 * Mark whether we're inserting into the old or new leaf.
-	 */
-	if (hdr1.count < hdr2.count)
-		state->inleaf = swap;
-	else if (hdr1.count > hdr2.count)
-		state->inleaf = !swap;
-	else
-		state->inleaf = swap ^ (blk1->index <= hdr1.count);
-	/*
-	 * Adjust the expected index for insertion.
-	 */
-	if (!state->inleaf)
-		blk2->index = blk1->index - hdr1.count;
-
-	/*
-	 * Finally sanity check just to make sure we are not returning a
-	 * negative index
-	 */
-	if (blk2->index < 0) {
-		state->inleaf = 1;
-		blk2->index = 0;
-		xfs_alert(dp->i_mount,
-	"%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
-			__func__, blk1->index);
-	}
-}
-
-static int
-xfs_dir3_data_block_free(
-	xfs_da_args_t		*args,
-	struct xfs_dir2_data_hdr *hdr,
-	struct xfs_dir2_free	*free,
-	xfs_dir2_db_t		fdb,
-	int			findex,
-	struct xfs_buf		*fbp,
-	int			longest)
-{
-	int			logfree = 0;
-	__be16			*bests;
-	struct xfs_dir3_icfree_hdr freehdr;
-	struct xfs_inode	*dp = args->dp;
-
-	dp->d_ops->free_hdr_from_disk(&freehdr, free);
-	bests = dp->d_ops->free_bests_p(free);
-	if (hdr) {
-		/*
-		 * Data block is not empty, just set the free entry to the new
-		 * value.
-		 */
-		bests[findex] = cpu_to_be16(longest);
-		xfs_dir2_free_log_bests(args, fbp, findex, findex);
-		return 0;
-	}
-
-	/* One less used entry in the free table. */
-	freehdr.nused--;
-
-	/*
-	 * If this was the last entry in the table, we can trim the table size
-	 * back.  There might be other entries at the end referring to
-	 * non-existent data blocks, get those too.
-	 */
-	if (findex == freehdr.nvalid - 1) {
-		int	i;		/* free entry index */
-
-		for (i = findex - 1; i >= 0; i--) {
-			if (bests[i] != cpu_to_be16(NULLDATAOFF))
-				break;
-		}
-		freehdr.nvalid = i + 1;
-		logfree = 0;
-	} else {
-		/* Not the last entry, just punch it out.  */
-		bests[findex] = cpu_to_be16(NULLDATAOFF);
-		logfree = 1;
-	}
-
-	dp->d_ops->free_hdr_to_disk(free, &freehdr);
-	xfs_dir2_free_log_header(args, fbp);
-
-	/*
-	 * If there are no useful entries left in the block, get rid of the
-	 * block if we can.
-	 */
-	if (!freehdr.nused) {
-		int error;
-
-		error = xfs_dir2_shrink_inode(args, fdb, fbp);
-		if (error == 0) {
-			fbp = NULL;
-			logfree = 0;
-		} else if (error != ENOSPC || args->total != 0)
-			return error;
-		/*
-		 * It's possible to get ENOSPC if there is no
-		 * space reservation.  In this case some one
-		 * else will eventually get rid of this block.
-		 */
-	}
-
-	/* Log the free entry that changed, unless we got rid of it.  */
-	if (logfree)
-		xfs_dir2_free_log_bests(args, fbp, findex, findex);
-	return 0;
-}
-
-/*
- * Remove an entry from a node directory.
- * This removes the leaf entry and the data entry,
- * and updates the free block if necessary.
- */
-static int					/* error */
-xfs_dir2_leafn_remove(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		*bp,		/* leaf buffer */
-	int			index,		/* leaf entry index */
-	xfs_da_state_blk_t	*dblk,		/* data block */
-	int			*rval)		/* resulting block needs join */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_db_t		db;		/* data block number */
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data block entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
-	int			longest;	/* longest data free entry */
-	int			off;		/* data block entry offset */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log data header */
-	int			needscan;	/* need to rescan data frees */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir2_data_free *bf;		/* bestfree table */
-	struct xfs_dir3_icleaf_hdr leafhdr;
-	struct xfs_dir2_leaf_entry *ents;
-
-	trace_xfs_dir2_leafn_remove(args, index);
-
-	dp = args->dp;
-	tp = args->trans;
-	mp = dp->i_mount;
-	leaf = bp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-
-	/*
-	 * Point to the entry we're removing.
-	 */
-	lep = &ents[index];
-
-	/*
-	 * Extract the data block and offset from the entry.
-	 */
-	db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
-	ASSERT(dblk->blkno == db);
-	off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
-	ASSERT(dblk->index == off);
-
-	/*
-	 * Kill the leaf entry by marking it stale.
-	 * Log the leaf block changes.
-	 */
-	leafhdr.stale++;
-	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(args, bp);
-
-	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-	xfs_dir3_leaf_log_ents(args, bp, index, index);
-
-	/*
-	 * Make the data entry free.  Keep track of the longest freespace
-	 * in the data block in case it changes.
-	 */
-	dbp = dblk->bp;
-	hdr = dbp->b_addr;
-	dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
-	bf = dp->d_ops->data_bestfree_p(hdr);
-	longest = be16_to_cpu(bf[0].length);
-	needlog = needscan = 0;
-	xfs_dir2_data_make_free(args, dbp, off,
-		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
-	/*
-	 * Rescan the data block freespaces for bestfree.
-	 * Log the data block header if needed.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	xfs_dir3_data_check(dp, dbp);
-	/*
-	 * If the longest data block freespace changes, need to update
-	 * the corresponding freeblock entry.
-	 */
-	if (longest < be16_to_cpu(bf[0].length)) {
-		int		error;		/* error return value */
-		struct xfs_buf	*fbp;		/* freeblock buffer */
-		xfs_dir2_db_t	fdb;		/* freeblock block number */
-		int		findex;		/* index in freeblock entries */
-		xfs_dir2_free_t	*free;		/* freeblock structure */
-
-		/*
-		 * Convert the data block number to a free block,
-		 * read in the free block.
-		 */
-		fdb = dp->d_ops->db_to_fdb(args->geo, db);
-		error = xfs_dir2_free_read(tp, dp,
-					   xfs_dir2_db_to_da(args->geo, fdb),
-					   &fbp);
-		if (error)
-			return error;
-		free = fbp->b_addr;
-#ifdef DEBUG
-	{
-		struct xfs_dir3_icfree_hdr freehdr;
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
-			(fdb - xfs_dir2_byte_to_db(args->geo,
-						   XFS_DIR2_FREE_OFFSET)));
-	}
-#endif
-		/*
-		 * Calculate which entry we need to fix.
-		 */
-		findex = dp->d_ops->db_to_fdindex(args->geo, db);
-		longest = be16_to_cpu(bf[0].length);
-		/*
-		 * If the data block is now empty we can get rid of it
-		 * (usually).
-		 */
-		if (longest == args->geo->blksize -
-			       dp->d_ops->data_entry_offset) {
-			/*
-			 * Try to punch out the data block.
-			 */
-			error = xfs_dir2_shrink_inode(args, db, dbp);
-			if (error == 0) {
-				dblk->bp = NULL;
-				hdr = NULL;
-			}
-			/*
-			 * We can get ENOSPC if there's no space reservation.
-			 * In this case just drop the buffer and some one else
-			 * will eventually get rid of the empty block.
-			 */
-			else if (!(error == ENOSPC && args->total == 0))
-				return error;
-		}
-		/*
-		 * If we got rid of the data block, we can eliminate that entry
-		 * in the free block.
-		 */
-		error = xfs_dir3_data_block_free(args, hdr, free,
-						 fdb, findex, fbp, longest);
-		if (error)
-			return error;
-	}
-
-	xfs_dir3_leaf_check(dp, bp);
-	/*
-	 * Return indication of whether this leaf block is empty enough
-	 * to justify trying to join it with a neighbor.
-	 */
-	*rval = (dp->d_ops->leaf_hdr_size +
-		 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
-		args->geo->magicpct;
-	return 0;
-}
-
-/*
- * Split the leaf entries in the old block into old and new blocks.
- */
-int						/* error */
-xfs_dir2_leafn_split(
-	xfs_da_state_t		*state,		/* btree cursor */
-	xfs_da_state_blk_t	*oldblk,	/* original block */
-	xfs_da_state_blk_t	*newblk)	/* newly created block */
-{
-	xfs_da_args_t		*args;		/* operation arguments */
-	xfs_dablk_t		blkno;		/* new leaf block number */
-	int			error;		/* error return value */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	struct xfs_inode	*dp;
-
-	/*
-	 * Allocate space for a new leaf node.
-	 */
-	args = state->args;
-	dp = args->dp;
-	mp = dp->i_mount;
-	ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
-	error = xfs_da_grow_inode(args, &blkno);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Initialize the new leaf block.
-	 */
-	error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
-				      &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
-	if (error)
-		return error;
-
-	newblk->blkno = blkno;
-	newblk->magic = XFS_DIR2_LEAFN_MAGIC;
-	/*
-	 * Rebalance the entries across the two leaves, link the new
-	 * block into the leaves.
-	 */
-	xfs_dir2_leafn_rebalance(state, oldblk, newblk);
-	error = xfs_da3_blk_link(state, oldblk, newblk);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Insert the new entry in the correct block.
-	 */
-	if (state->inleaf)
-		error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
-	else
-		error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
-	/*
-	 * Update last hashval in each block since we added the name.
-	 */
-	oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
-	newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
-	xfs_dir3_leaf_check(dp, oldblk->bp);
-	xfs_dir3_leaf_check(dp, newblk->bp);
-	return error;
-}
-
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- */
-int						/* error */
-xfs_dir2_leafn_toosmall(
-	xfs_da_state_t		*state,		/* btree cursor */
-	int			*action)	/* resulting action to take */
-{
-	xfs_da_state_blk_t	*blk;		/* leaf block */
-	xfs_dablk_t		blkno;		/* leaf block number */
-	struct xfs_buf		*bp;		/* leaf buffer */
-	int			bytes;		/* bytes in use */
-	int			count;		/* leaf live entry count */
-	int			error;		/* error return value */
-	int			forward;	/* sibling block direction */
-	int			i;		/* sibling counter */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	int			rval;		/* result from path_shift */
-	struct xfs_dir3_icleaf_hdr leafhdr;
-	struct xfs_dir2_leaf_entry *ents;
-	struct xfs_inode	*dp = state->args->dp;
-
-	/*
-	 * Check for the degenerate case of the block being over 50% full.
-	 * If so, it's not worth even looking to see if we might be able
-	 * to coalesce with a sibling.
-	 */
-	blk = &state->path.blk[state->path.active - 1];
-	leaf = blk->bp->b_addr;
-	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = dp->d_ops->leaf_ents_p(leaf);
-	xfs_dir3_leaf_check(dp, blk->bp);
-
-	count = leafhdr.count - leafhdr.stale;
-	bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
-	if (bytes > (state->args->geo->blksize >> 1)) {
-		/*
-		 * Blk over 50%, don't try to join.
-		 */
-		*action = 0;
-		return 0;
-	}
-	/*
-	 * Check for the degenerate case of the block being empty.
-	 * If the block is empty, we'll simply delete it, no need to
-	 * coalesce it with a sibling block.  We choose (arbitrarily)
-	 * to merge with the forward block unless it is NULL.
-	 */
-	if (count == 0) {
-		/*
-		 * Make altpath point to the block we want to keep and
-		 * path point to the block we want to drop (this one).
-		 */
-		forward = (leafhdr.forw != 0);
-		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
-			&rval);
-		if (error)
-			return error;
-		*action = rval ? 2 : 0;
-		return 0;
-	}
-	/*
-	 * Examine each sibling block to see if we can coalesce with
-	 * at least 25% free space to spare.  We need to figure out
-	 * whether to merge with the forward or the backward block.
-	 * We prefer coalescing with the lower numbered sibling so as
-	 * to shrink a directory over time.
-	 */
-	forward = leafhdr.forw < leafhdr.back;
-	for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
-		struct xfs_dir3_icleaf_hdr hdr2;
-
-		blkno = forward ? leafhdr.forw : leafhdr.back;
-		if (blkno == 0)
-			continue;
-		/*
-		 * Read the sibling leaf block.
-		 */
-		error = xfs_dir3_leafn_read(state->args->trans, dp,
-					    blkno, -1, &bp);
-		if (error)
-			return error;
-
-		/*
-		 * Count bytes in the two blocks combined.
-		 */
-		count = leafhdr.count - leafhdr.stale;
-		bytes = state->args->geo->blksize -
-			(state->args->geo->blksize >> 2);
-
-		leaf = bp->b_addr;
-		dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
-		ents = dp->d_ops->leaf_ents_p(leaf);
-		count += hdr2.count - hdr2.stale;
-		bytes -= count * sizeof(ents[0]);
-
-		/*
-		 * Fits with at least 25% to spare.
-		 */
-		if (bytes >= 0)
-			break;
-		xfs_trans_brelse(state->args->trans, bp);
-	}
-	/*
-	 * Didn't like either block, give up.
-	 */
-	if (i >= 2) {
-		*action = 0;
-		return 0;
-	}
-
-	/*
-	 * Make altpath point to the block we want to keep (the lower
-	 * numbered block) and path point to the block we want to drop.
-	 */
-	memcpy(&state->altpath, &state->path, sizeof(state->path));
-	if (blkno < blk->blkno)
-		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
-			&rval);
-	else
-		error = xfs_da3_path_shift(state, &state->path, forward, 0,
-			&rval);
-	if (error) {
-		return error;
-	}
-	*action = rval ? 0 : 1;
-	return 0;
-}
-
-/*
- * Move all the leaf entries from drop_blk to save_blk.
- * This is done as part of a join operation.
- */
-void
-xfs_dir2_leafn_unbalance(
-	xfs_da_state_t		*state,		/* cursor */
-	xfs_da_state_blk_t	*drop_blk,	/* dead block */
-	xfs_da_state_blk_t	*save_blk)	/* surviving block */
-{
-	xfs_da_args_t		*args;		/* operation arguments */
-	xfs_dir2_leaf_t		*drop_leaf;	/* dead leaf structure */
-	xfs_dir2_leaf_t		*save_leaf;	/* surviving leaf structure */
-	struct xfs_dir3_icleaf_hdr savehdr;
-	struct xfs_dir3_icleaf_hdr drophdr;
-	struct xfs_dir2_leaf_entry *sents;
-	struct xfs_dir2_leaf_entry *dents;
-	struct xfs_inode	*dp = state->args->dp;
-
-	args = state->args;
-	ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
-	ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
-	drop_leaf = drop_blk->bp->b_addr;
-	save_leaf = save_blk->bp->b_addr;
-
-	dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
-	dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
-	sents = dp->d_ops->leaf_ents_p(save_leaf);
-	dents = dp->d_ops->leaf_ents_p(drop_leaf);
-
-	/*
-	 * If there are any stale leaf entries, take this opportunity
-	 * to purge them.
-	 */
-	if (drophdr.stale)
-		xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
-	if (savehdr.stale)
-		xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
-
-	/*
-	 * Move the entries from drop to the appropriate end of save.
-	 */
-	drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
-	if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
-		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
-					save_blk->bp, &savehdr, sents, 0,
-					drophdr.count);
-	else
-		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
-					save_blk->bp, &savehdr, sents,
-					savehdr.count, drophdr.count);
-	save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
-
-	/* log the changes made when moving the entries */
-	dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
-	dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
-	xfs_dir3_leaf_log_header(args, save_blk->bp);
-	xfs_dir3_leaf_log_header(args, drop_blk->bp);
-
-	xfs_dir3_leaf_check(dp, save_blk->bp);
-	xfs_dir3_leaf_check(dp, drop_blk->bp);
-}
-
-/*
- * Top-level node form directory addname routine.
- */
-int						/* error */
-xfs_dir2_node_addname(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	xfs_da_state_blk_t	*blk;		/* leaf block for insert */
-	int			error;		/* error return value */
-	int			rval;		/* sub-return value */
-	xfs_da_state_t		*state;		/* btree cursor */
-
-	trace_xfs_dir2_node_addname(args);
-
-	/*
-	 * Allocate and initialize the state (btree cursor).
-	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	/*
-	 * Look up the name.  We're not supposed to find it, but
-	 * this gives us the insertion point.
-	 */
-	error = xfs_da3_node_lookup_int(state, &rval);
-	if (error)
-		rval = error;
-	if (rval != ENOENT) {
-		goto done;
-	}
-	/*
-	 * Add the data entry to a data block.
-	 * Extravalid is set to a freeblock found by lookup.
-	 */
-	rval = xfs_dir2_node_addname_int(args,
-		state->extravalid ? &state->extrablk : NULL);
-	if (rval) {
-		goto done;
-	}
-	blk = &state->path.blk[state->path.active - 1];
-	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
-	/*
-	 * Add the new leaf entry.
-	 */
-	rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
-	if (rval == 0) {
-		/*
-		 * It worked, fix the hash values up the btree.
-		 */
-		if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
-			xfs_da3_fixhashpath(state, &state->path);
-	} else {
-		/*
-		 * It didn't work, we need to split the leaf block.
-		 */
-		if (args->total == 0) {
-			ASSERT(rval == ENOSPC);
-			goto done;
-		}
-		/*
-		 * Split the leaf block and insert the new entry.
-		 */
-		rval = xfs_da3_split(state);
-	}
-done:
-	xfs_da_state_free(state);
-	return rval;
-}
-
-/*
- * Add the data entry for a node-format directory name addition.
- * The leaf entry is added in xfs_dir2_leafn_add.
- * We may enter with a freespace block that the lookup found.
- */
-static int					/* error */
-xfs_dir2_node_addname_int(
-	xfs_da_args_t		*args,		/* operation arguments */
-	xfs_da_state_blk_t	*fblk)		/* optional freespace block */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_db_t		dbno;		/* data block number */
-	struct xfs_buf		*dbp;		/* data block buffer */
-	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* data unused entry pointer */
-	int			error;		/* error return value */
-	xfs_dir2_db_t		fbno;		/* freespace block number */
-	struct xfs_buf		*fbp;		/* freespace buffer */
-	int			findex;		/* freespace entry index */
-	xfs_dir2_free_t		*free=NULL;	/* freespace block structure */
-	xfs_dir2_db_t		ifbno;		/* initial freespace block no */
-	xfs_dir2_db_t		lastfbno=0;	/* highest freespace block no */
-	int			length;		/* length of the new entry */
-	int			logfree;	/* need to log free entry */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			needlog;	/* need to log data header */
-	int			needscan;	/* need to rescan data frees */
-	__be16			*tagp;		/* data entry tag pointer */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	__be16			*bests;
-	struct xfs_dir3_icfree_hdr freehdr;
-	struct xfs_dir2_data_free *bf;
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	length = dp->d_ops->data_entsize(args->namelen);
-	/*
-	 * If we came in with a freespace block that means that lookup
-	 * found an entry with our hash value.  This is the freespace
-	 * block for that data entry.
-	 */
-	if (fblk) {
-		fbp = fblk->bp;
-		/*
-		 * Remember initial freespace block number.
-		 */
-		ifbno = fblk->blkno;
-		free = fbp->b_addr;
-		findex = fblk->index;
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-		/*
-		 * This means the free entry showed that the data block had
-		 * space for our entry, so we remembered it.
-		 * Use that data block.
-		 */
-		if (findex >= 0) {
-			ASSERT(findex < freehdr.nvalid);
-			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
-			ASSERT(be16_to_cpu(bests[findex]) >= length);
-			dbno = freehdr.firstdb + findex;
-		} else {
-			/*
-			 * The data block looked at didn't have enough room.
-			 * We'll start at the beginning of the freespace entries.
-			 */
-			dbno = -1;
-			findex = 0;
-		}
-	} else {
-		/*
-		 * Didn't come in with a freespace block, so no data block.
-		 */
-		ifbno = dbno = -1;
-		fbp = NULL;
-		findex = 0;
-	}
-
-	/*
-	 * If we don't have a data block yet, we're going to scan the
-	 * freespace blocks looking for one.  Figure out what the
-	 * highest freespace block number is.
-	 */
-	if (dbno == -1) {
-		xfs_fileoff_t	fo;		/* freespace block number */
-
-		if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
-			return error;
-		lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
-		fbno = ifbno;
-	}
-	/*
-	 * While we haven't identified a data block, search the freeblock
-	 * data for a good data block.  If we find a null freeblock entry,
-	 * indicating a hole in the data blocks, remember that.
-	 */
-	while (dbno == -1) {
-		/*
-		 * If we don't have a freeblock in hand, get the next one.
-		 */
-		if (fbp == NULL) {
-			/*
-			 * Happens the first time through unless lookup gave
-			 * us a freespace block to start with.
-			 */
-			if (++fbno == 0)
-				fbno = xfs_dir2_byte_to_db(args->geo,
-							XFS_DIR2_FREE_OFFSET);
-			/*
-			 * If it's ifbno we already looked at it.
-			 */
-			if (fbno == ifbno)
-				fbno++;
-			/*
-			 * If it's off the end we're done.
-			 */
-			if (fbno >= lastfbno)
-				break;
-			/*
-			 * Read the block.  There can be holes in the
-			 * freespace blocks, so this might not succeed.
-			 * This should be really rare, so there's no reason
-			 * to avoid it.
-			 */
-			error = xfs_dir2_free_try_read(tp, dp,
-					xfs_dir2_db_to_da(args->geo, fbno),
-					&fbp);
-			if (error)
-				return error;
-			if (!fbp)
-				continue;
-			free = fbp->b_addr;
-			findex = 0;
-		}
-		/*
-		 * Look at the current free entry.  Is it good enough?
-		 *
-		 * The bests initialisation should be where the bufer is read in
-		 * the above branch. But gcc is too stupid to realise that bests
-		 * and the freehdr are actually initialised if they are placed
-		 * there, so we have to do it here to avoid warnings. Blech.
-		 */
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
-		    be16_to_cpu(bests[findex]) >= length)
-			dbno = freehdr.firstdb + findex;
-		else {
-			/*
-			 * Are we done with the freeblock?
-			 */
-			if (++findex == freehdr.nvalid) {
-				/*
-				 * Drop the block.
-				 */
-				xfs_trans_brelse(tp, fbp);
-				fbp = NULL;
-				if (fblk && fblk->bp)
-					fblk->bp = NULL;
-			}
-		}
-	}
-	/*
-	 * If we don't have a data block, we need to allocate one and make
-	 * the freespace entries refer to it.
-	 */
-	if (unlikely(dbno == -1)) {
-		/*
-		 * Not allowed to allocate, return failure.
-		 */
-		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-			return ENOSPC;
-
-		/*
-		 * Allocate and initialize the new data block.
-		 */
-		if (unlikely((error = xfs_dir2_grow_inode(args,
-							 XFS_DIR2_DATA_SPACE,
-							 &dbno)) ||
-		    (error = xfs_dir3_data_init(args, dbno, &dbp))))
-			return error;
-
-		/*
-		 * If (somehow) we have a freespace block, get rid of it.
-		 */
-		if (fbp)
-			xfs_trans_brelse(tp, fbp);
-		if (fblk && fblk->bp)
-			fblk->bp = NULL;
-
-		/*
-		 * Get the freespace block corresponding to the data block
-		 * that was just allocated.
-		 */
-		fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
-		error = xfs_dir2_free_try_read(tp, dp,
-				       xfs_dir2_db_to_da(args->geo, fbno),
-				       &fbp);
-		if (error)
-			return error;
-
-		/*
-		 * If there wasn't a freespace block, the read will
-		 * return a NULL fbp.  Allocate and initialize a new one.
-		 */
-		if (!fbp) {
-			error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
-						    &fbno);
-			if (error)
-				return error;
-
-			if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
-				xfs_alert(mp,
-			"%s: dir ino %llu needed freesp block %lld for\n"
-			"  data block %lld, got %lld ifbno %llu lastfbno %d",
-					__func__, (unsigned long long)dp->i_ino,
-					(long long)dp->d_ops->db_to_fdb(
-								args->geo, dbno),
-					(long long)dbno, (long long)fbno,
-					(unsigned long long)ifbno, lastfbno);
-				if (fblk) {
-					xfs_alert(mp,
-				" fblk 0x%p blkno %llu index %d magic 0x%x",
-						fblk,
-						(unsigned long long)fblk->blkno,
-						fblk->index,
-						fblk->magic);
-				} else {
-					xfs_alert(mp, " ... fblk is NULL");
-				}
-				XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
-						 XFS_ERRLEVEL_LOW, mp);
-				return EFSCORRUPTED;
-			}
-
-			/*
-			 * Get a buffer for the new block.
-			 */
-			error = xfs_dir3_free_get_buf(args, fbno, &fbp);
-			if (error)
-				return error;
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-			/*
-			 * Remember the first slot as our empty slot.
-			 */
-			freehdr.firstdb =
-				(fbno - xfs_dir2_byte_to_db(args->geo,
-							XFS_DIR2_FREE_OFFSET)) *
-					dp->d_ops->free_max_bests(args->geo);
-		} else {
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		}
-
-		/*
-		 * Set the freespace block index from the data block number.
-		 */
-		findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
-		/*
-		 * If it's after the end of the current entries in the
-		 * freespace block, extend that table.
-		 */
-		if (findex >= freehdr.nvalid) {
-			ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
-			freehdr.nvalid = findex + 1;
-			/*
-			 * Tag new entry so nused will go up.
-			 */
-			bests[findex] = cpu_to_be16(NULLDATAOFF);
-		}
-		/*
-		 * If this entry was for an empty data block
-		 * (this should always be true) then update the header.
-		 */
-		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
-			freehdr.nused++;
-			dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-			xfs_dir2_free_log_header(args, fbp);
-		}
-		/*
-		 * Update the real value in the table.
-		 * We haven't allocated the data entry yet so this will
-		 * change again.
-		 */
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		bests[findex] = bf[0].length;
-		logfree = 1;
-	}
-	/*
-	 * We had a data block so we don't have to make a new one.
-	 */
-	else {
-		/*
-		 * If just checking, we succeeded.
-		 */
-		if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-			return 0;
-
-		/*
-		 * Read the data block in.
-		 */
-		error = xfs_dir3_data_read(tp, dp,
-					   xfs_dir2_db_to_da(args->geo, dbno),
-					   -1, &dbp);
-		if (error)
-			return error;
-		hdr = dbp->b_addr;
-		bf = dp->d_ops->data_bestfree_p(hdr);
-		logfree = 0;
-	}
-	ASSERT(be16_to_cpu(bf[0].length) >= length);
-	/*
-	 * Point to the existing unused space.
-	 */
-	dup = (xfs_dir2_data_unused_t *)
-	      ((char *)hdr + be16_to_cpu(bf[0].offset));
-	needscan = needlog = 0;
-	/*
-	 * Mark the first part of the unused space, inuse for us.
-	 */
-	xfs_dir2_data_use_free(args, dbp, dup,
-		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
-		&needlog, &needscan);
-	/*
-	 * Fill in the new entry and log it.
-	 */
-	dep = (xfs_dir2_data_entry_t *)dup;
-	dep->inumber = cpu_to_be64(args->inumber);
-	dep->namelen = args->namelen;
-	memcpy(dep->name, args->name, dep->namelen);
-	dp->d_ops->data_put_ftype(dep, args->filetype);
-	tagp = dp->d_ops->data_entry_tag_p(dep);
-	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(args, dbp, dep);
-	/*
-	 * Rescan the block for bestfree if needed.
-	 */
-	if (needscan)
-		xfs_dir2_data_freescan(dp, hdr, &needlog);
-	/*
-	 * Log the data block header if needed.
-	 */
-	if (needlog)
-		xfs_dir2_data_log_header(args, dbp);
-	/*
-	 * If the freespace entry is now wrong, update it.
-	 */
-	bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
-	if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
-		bests[findex] = bf[0].length;
-		logfree = 1;
-	}
-	/*
-	 * Log the freespace entry if needed.
-	 */
-	if (logfree)
-		xfs_dir2_free_log_bests(args, fbp, findex, findex);
-	/*
-	 * Return the data block and offset in args, then drop the data block.
-	 */
-	args->blkno = (xfs_dablk_t)dbno;
-	args->index = be16_to_cpu(*tagp);
-	return 0;
-}
-
-/*
- * Lookup an entry in a node-format directory.
- * All the real work happens in xfs_da3_node_lookup_int.
- * The only real output is the inode number of the entry.
- */
-int						/* error */
-xfs_dir2_node_lookup(
-	xfs_da_args_t	*args)			/* operation arguments */
-{
-	int		error;			/* error return value */
-	int		i;			/* btree level */
-	int		rval;			/* operation return value */
-	xfs_da_state_t	*state;			/* btree cursor */
-
-	trace_xfs_dir2_node_lookup(args);
-
-	/*
-	 * Allocate and initialize the btree cursor.
-	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	/*
-	 * Fill in the path to the entry in the cursor.
-	 */
-	error = xfs_da3_node_lookup_int(state, &rval);
-	if (error)
-		rval = error;
-	else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
-		/* If a CI match, dup the actual name and return EEXIST */
-		xfs_dir2_data_entry_t	*dep;
-
-		dep = (xfs_dir2_data_entry_t *)
-			((char *)state->extrablk.bp->b_addr +
-						 state->extrablk.index);
-		rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
-	}
-	/*
-	 * Release the btree blocks and leaf block.
-	 */
-	for (i = 0; i < state->path.active; i++) {
-		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-	/*
-	 * Release the data block if we have it.
-	 */
-	if (state->extravalid && state->extrablk.bp) {
-		xfs_trans_brelse(args->trans, state->extrablk.bp);
-		state->extrablk.bp = NULL;
-	}
-	xfs_da_state_free(state);
-	return rval;
-}
-
-/*
- * Remove an entry from a node-format directory.
- */
-int						/* error */
-xfs_dir2_node_removename(
-	struct xfs_da_args	*args)		/* operation arguments */
-{
-	struct xfs_da_state_blk	*blk;		/* leaf block */
-	int			error;		/* error return value */
-	int			rval;		/* operation return value */
-	struct xfs_da_state	*state;		/* btree cursor */
-
-	trace_xfs_dir2_node_removename(args);
-
-	/*
-	 * Allocate and initialize the btree cursor.
-	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-
-	/* Look up the entry we're deleting, set up the cursor. */
-	error = xfs_da3_node_lookup_int(state, &rval);
-	if (error)
-		goto out_free;
-
-	/* Didn't find it, upper layer screwed up. */
-	if (rval != EEXIST) {
-		error = rval;
-		goto out_free;
-	}
-
-	blk = &state->path.blk[state->path.active - 1];
-	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
-	ASSERT(state->extravalid);
-	/*
-	 * Remove the leaf and data entries.
-	 * Extrablk refers to the data block.
-	 */
-	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
-		&state->extrablk, &rval);
-	if (error)
-		goto out_free;
-	/*
-	 * Fix the hash values up the btree.
-	 */
-	xfs_da3_fixhashpath(state, &state->path);
-	/*
-	 * If we need to join leaf blocks, do it.
-	 */
-	if (rval && state->path.active > 1)
-		error = xfs_da3_join(state);
-	/*
-	 * If no errors so far, try conversion to leaf format.
-	 */
-	if (!error)
-		error = xfs_dir2_node_to_leaf(state);
-out_free:
-	xfs_da_state_free(state);
-	return error;
-}
-
-/*
- * Replace an entry's inode number in a node-format directory.
- */
-int						/* error */
-xfs_dir2_node_replace(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	xfs_da_state_blk_t	*blk;		/* leaf block */
-	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
-	xfs_dir2_data_entry_t	*dep;		/* data entry changed */
-	int			error;		/* error return value */
-	int			i;		/* btree level */
-	xfs_ino_t		inum;		/* new inode number */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry being changed */
-	int			rval;		/* internal return value */
-	xfs_da_state_t		*state;		/* btree cursor */
-
-	trace_xfs_dir2_node_replace(args);
-
-	/*
-	 * Allocate and initialize the btree cursor.
-	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	inum = args->inumber;
-	/*
-	 * Lookup the entry to change in the btree.
-	 */
-	error = xfs_da3_node_lookup_int(state, &rval);
-	if (error) {
-		rval = error;
-	}
-	/*
-	 * It should be found, since the vnodeops layer has looked it up
-	 * and locked it.  But paranoia is good.
-	 */
-	if (rval == EEXIST) {
-		struct xfs_dir2_leaf_entry *ents;
-		/*
-		 * Find the leaf entry.
-		 */
-		blk = &state->path.blk[state->path.active - 1];
-		ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
-		leaf = blk->bp->b_addr;
-		ents = args->dp->d_ops->leaf_ents_p(leaf);
-		lep = &ents[blk->index];
-		ASSERT(state->extravalid);
-		/*
-		 * Point to the data entry.
-		 */
-		hdr = state->extrablk.bp->b_addr;
-		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-		       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
-		dep = (xfs_dir2_data_entry_t *)
-		      ((char *)hdr +
-		       xfs_dir2_dataptr_to_off(args->geo,
-					       be32_to_cpu(lep->address)));
-		ASSERT(inum != be64_to_cpu(dep->inumber));
-		/*
-		 * Fill in the new inode number and log the entry.
-		 */
-		dep->inumber = cpu_to_be64(inum);
-		args->dp->d_ops->data_put_ftype(dep, args->filetype);
-		xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
-		rval = 0;
-	}
-	/*
-	 * Didn't find it, and we're holding a data block.  Drop it.
-	 */
-	else if (state->extravalid) {
-		xfs_trans_brelse(args->trans, state->extrablk.bp);
-		state->extrablk.bp = NULL;
-	}
-	/*
-	 * Release all the buffers in the cursor.
-	 */
-	for (i = 0; i < state->path.active; i++) {
-		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-	xfs_da_state_free(state);
-	return rval;
-}
-
-/*
- * Trim off a trailing empty freespace block.
- * Return (in rvalp) 1 if we did it, 0 if not.
- */
-int						/* error */
-xfs_dir2_node_trim_free(
-	xfs_da_args_t		*args,		/* operation arguments */
-	xfs_fileoff_t		fo,		/* free block number */
-	int			*rvalp)		/* out: did something */
-{
-	struct xfs_buf		*bp;		/* freespace buffer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	xfs_dir2_free_t		*free;		/* freespace structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	struct xfs_dir3_icfree_hdr freehdr;
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	tp = args->trans;
-	/*
-	 * Read the freespace block.
-	 */
-	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
-	if (error)
-		return error;
-	/*
-	 * There can be holes in freespace.  If fo is a hole, there's
-	 * nothing to do.
-	 */
-	if (!bp)
-		return 0;
-	free = bp->b_addr;
-	dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-	/*
-	 * If there are used entries, there's nothing to do.
-	 */
-	if (freehdr.nused > 0) {
-		xfs_trans_brelse(tp, bp);
-		*rvalp = 0;
-		return 0;
-	}
-	/*
-	 * Blow the block away.
-	 */
-	error = xfs_dir2_shrink_inode(args,
-			xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
-	if (error) {
-		/*
-		 * Can't fail with ENOSPC since that only happens with no
-		 * space reservation, when breaking up an extent into two
-		 * pieces.  This is the last block of an extent.
-		 */
-		ASSERT(error != ENOSPC);
-		xfs_trans_brelse(tp, bp);
-		return error;
-	}
-	/*
-	 * Return that we succeeded.
-	 */
-	*rvalp = 1;
-	return 0;
-}
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
deleted file mode 100644
index 27ce0794d196..000000000000
--- a/fs/xfs/xfs_dir2_priv.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR2_PRIV_H__
-#define __XFS_DIR2_PRIV_H__
-
-struct dir_context;
-
-/*
- * Directory offset/block conversion functions.
- *
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-/*
- * Convert dataptr to byte in file space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
-{
-	return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr.  It had better be aligned.
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
-{
-	return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
-	return (xfs_dir2_db_t)(by >> geo->blklog);
-}
-
-/*
- * Convert dataptr to a block number
- */
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
-{
-	return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
-	return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
-{
-	return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
-			xfs_dir2_data_aoff_t o)
-{
-	return ((xfs_dir2_off_t)db << geo->blklog) + o;
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
-	return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
-{
-	return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
-			   xfs_dir2_data_aoff_t o)
-{
-	return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
-{
-	return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
-{
-	return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
-}
-
-/*
- * Directory tail pointer accessor functions. Based on block geometry.
- */
-static inline struct xfs_dir2_block_tail *
-xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
-{
-	return ((struct xfs_dir2_block_tail *)
-		((char *)hdr + geo->blksize)) - 1;
-}
-
-static inline struct xfs_dir2_leaf_tail *
-xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
-{
-	return (struct xfs_dir2_leaf_tail *)
-		((char *)lp + geo->blksize -
-		  sizeof(struct xfs_dir2_leaf_tail));
-}
-
-/* xfs_dir2.c */
-extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
-				xfs_dir2_db_t *dbp);
-extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
-				const unsigned char *name, int len);
-
-#define S_SHIFT 12
-extern const unsigned char xfs_mode_to_ftype[];
-
-extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
-					__uint8_t filetype);
-
-
-/* xfs_dir2_block.c */
-extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
-			       struct xfs_buf **bpp);
-extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_block_removename(struct xfs_da_args *args);
-extern int xfs_dir2_block_replace(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
-		struct xfs_buf *lbp, struct xfs_buf *dbp);
-
-/* xfs_dir2_data.c */
-#ifdef DEBUG
-#define	xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp);
-#else
-#define	xfs_dir3_data_check(dp,bp)
-#endif
-
-extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
-extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
-		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
-		xfs_daddr_t mapped_bno);
-
-extern struct xfs_dir2_data_free *
-xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
-		struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
-		int *loghead);
-extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
-		struct xfs_buf **bpp);
-
-/* xfs_dir2_leaf.c */
-extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
-		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
-extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
-		struct xfs_buf *dbp);
-extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
-extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
-		struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
-extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
-		struct xfs_dir2_leaf_entry *ents, int *indexp,
-		int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
-		struct xfs_buf **bpp, __uint16_t magic);
-extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
-		struct xfs_buf *bp, int first, int last);
-extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
-		struct xfs_buf *bp);
-extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
-		struct xfs_buf *lbp);
-extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
-		struct xfs_buf *lbp, xfs_dir2_db_t db);
-extern struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
-		struct xfs_dir2_leaf_entry *ents, int index, int compact,
-		int lowstale, int highstale, int *lfloglow, int *lfloghigh);
-extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
-
-extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
-		struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
-
-/* xfs_dir2_node.c */
-extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
-		struct xfs_buf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
-		struct xfs_buf *bp, int *count);
-extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
-		struct xfs_da_args *args, int *indexp,
-		struct xfs_da_state *state);
-extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
-		struct xfs_buf *leaf2_bp);
-extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
-	struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
-extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
-extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
-		struct xfs_da_state_blk *drop_blk,
-		struct xfs_da_state_blk *save_blk);
-extern int xfs_dir2_node_addname(struct xfs_da_args *args);
-extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_node_removename(struct xfs_da_args *args);
-extern int xfs_dir2_node_replace(struct xfs_da_args *args);
-extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
-		int *rvalp);
-extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
-		xfs_dablk_t fbno, struct xfs_buf **bpp);
-
-/* xfs_dir2_sf.c */
-extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
-		struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
-extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
-		int size, xfs_dir2_sf_hdr_t *sfhp);
-extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
-extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
-extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-
-/* xfs_dir2_readdir.c */
-extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
-		       size_t bufsize);
-
-#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
deleted file mode 100644
index ab3563b87995..000000000000
--- a/fs/xfs/xfs_dir2_sf.c
+++ /dev/null
@@ -1,1184 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_error.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_trace.h"
-#include "xfs_dinode.h"
-
-/*
- * Prototypes for internal functions.
- */
-static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
-				     xfs_dir2_sf_entry_t *sfep,
-				     xfs_dir2_data_aoff_t offset,
-				     int new_isize);
-static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
-				     int new_isize);
-static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
-				    xfs_dir2_sf_entry_t **sfepp,
-				    xfs_dir2_data_aoff_t *offsetp);
-#ifdef DEBUG
-static void xfs_dir2_sf_check(xfs_da_args_t *args);
-#else
-#define	xfs_dir2_sf_check(args)
-#endif /* DEBUG */
-#if XFS_BIG_INUMS
-static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
-static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
-#endif /* XFS_BIG_INUMS */
-
-/*
- * Given a block directory (dp/block), calculate its size as a shortform (sf)
- * directory and a header for the sf directory, if it will fit it the
- * space currently present in the inode.  If it won't fit, the output
- * size is too big (but not accurate).
- */
-int						/* size for sf form */
-xfs_dir2_block_sfsize(
-	xfs_inode_t		*dp,		/* incore inode pointer */
-	xfs_dir2_data_hdr_t	*hdr,		/* block directory data */
-	xfs_dir2_sf_hdr_t	*sfhp)		/* output: header for sf form */
-{
-	xfs_dir2_dataptr_t	addr;		/* data entry address */
-	xfs_dir2_leaf_entry_t	*blp;		/* leaf area of the block */
-	xfs_dir2_block_tail_t	*btp;		/* tail area of the block */
-	int			count;		/* shortform entry count */
-	xfs_dir2_data_entry_t	*dep;		/* data entry in the block */
-	int			i;		/* block entry index */
-	int			i8count;	/* count of big-inode entries */
-	int			isdot;		/* entry is "." */
-	int			isdotdot;	/* entry is ".." */
-	xfs_mount_t		*mp;		/* mount structure pointer */
-	int			namelen;	/* total name bytes */
-	xfs_ino_t		parent = 0;	/* parent inode number */
-	int			size=0;		/* total computed size */
-	int			has_ftype;
-	struct xfs_da_geometry	*geo;
-
-	mp = dp->i_mount;
-	geo = mp->m_dir_geo;
-
-	/*
-	 * if there is a filetype field, add the extra byte to the namelen
-	 * for each entry that we see.
-	 */
-	has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
-
-	count = i8count = namelen = 0;
-	btp = xfs_dir2_block_tail_p(geo, hdr);
-	blp = xfs_dir2_block_leaf_p(btp);
-
-	/*
-	 * Iterate over the block's data entries by using the leaf pointers.
-	 */
-	for (i = 0; i < be32_to_cpu(btp->count); i++) {
-		if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
-			continue;
-		/*
-		 * Calculate the pointer to the entry at hand.
-		 */
-		dep = (xfs_dir2_data_entry_t *)((char *)hdr +
-				xfs_dir2_dataptr_to_off(geo, addr));
-		/*
-		 * Detect . and .., so we can special-case them.
-		 * . is not included in sf directories.
-		 * .. is included by just the parent inode number.
-		 */
-		isdot = dep->namelen == 1 && dep->name[0] == '.';
-		isdotdot =
-			dep->namelen == 2 &&
-			dep->name[0] == '.' && dep->name[1] == '.';
-#if XFS_BIG_INUMS
-		if (!isdot)
-			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
-#endif
-		/* take into account the file type field */
-		if (!isdot && !isdotdot) {
-			count++;
-			namelen += dep->namelen + has_ftype;
-		} else if (isdotdot)
-			parent = be64_to_cpu(dep->inumber);
-		/*
-		 * Calculate the new size, see if we should give up yet.
-		 */
-		size = xfs_dir2_sf_hdr_size(i8count) +		/* header */
-		       count +					/* namelen */
-		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
-		       namelen +				/* name */
-		       (i8count ?				/* inumber */
-				(uint)sizeof(xfs_dir2_ino8_t) * count :
-				(uint)sizeof(xfs_dir2_ino4_t) * count);
-		if (size > XFS_IFORK_DSIZE(dp))
-			return size;		/* size value is a failure */
-	}
-	/*
-	 * Create the output header, if it worked.
-	 */
-	sfhp->count = count;
-	sfhp->i8count = i8count;
-	dp->d_ops->sf_put_parent_ino(sfhp, parent);
-	return size;
-}
-
-/*
- * Convert a block format directory to shortform.
- * Caller has already checked that it will fit, and built us a header.
- */
-int						/* error */
-xfs_dir2_block_to_sf(
-	xfs_da_args_t		*args,		/* operation arguments */
-	struct xfs_buf		*bp,
-	int			size,		/* shortform directory size */
-	xfs_dir2_sf_hdr_t	*sfhp)		/* shortform directory hdr */
-{
-	xfs_dir2_data_hdr_t	*hdr;		/* block header */
-	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
-	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_data_unused_t	*dup;		/* unused data pointer */
-	char			*endptr;	/* end of data entries */
-	int			error;		/* error return value */
-	int			logflags;	/* inode logging flags */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	char			*ptr;		/* current data pointer */
-	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform directory header */
-	xfs_dir2_sf_hdr_t	*dst;		/* temporary data buffer */
-
-	trace_xfs_dir2_block_to_sf(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-
-	/*
-	 * allocate a temporary destination buffer the size of the inode
-	 * to format the data into. Once we have formatted the data, we
-	 * can free the block and copy the formatted data into the inode literal
-	 * area.
-	 */
-	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
-	hdr = bp->b_addr;
-
-	/*
-	 * Copy the header into the newly allocate local space.
-	 */
-	sfp = (xfs_dir2_sf_hdr_t *)dst;
-	memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
-
-	/*
-	 * Set up to loop over the block's entries.
-	 */
-	btp = xfs_dir2_block_tail_p(args->geo, hdr);
-	ptr = (char *)dp->d_ops->data_entry_p(hdr);
-	endptr = (char *)xfs_dir2_block_leaf_p(btp);
-	sfep = xfs_dir2_sf_firstentry(sfp);
-	/*
-	 * Loop over the active and unused entries.
-	 * Stop when we reach the leaf/tail portion of the block.
-	 */
-	while (ptr < endptr) {
-		/*
-		 * If it's unused, just skip over it.
-		 */
-		dup = (xfs_dir2_data_unused_t *)ptr;
-		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-			ptr += be16_to_cpu(dup->length);
-			continue;
-		}
-		dep = (xfs_dir2_data_entry_t *)ptr;
-		/*
-		 * Skip .
-		 */
-		if (dep->namelen == 1 && dep->name[0] == '.')
-			ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
-		/*
-		 * Skip .., but make sure the inode number is right.
-		 */
-		else if (dep->namelen == 2 &&
-			 dep->name[0] == '.' && dep->name[1] == '.')
-			ASSERT(be64_to_cpu(dep->inumber) ==
-			       dp->d_ops->sf_get_parent_ino(sfp));
-		/*
-		 * Normal entry, copy it into shortform.
-		 */
-		else {
-			sfep->namelen = dep->namelen;
-			xfs_dir2_sf_put_offset(sfep,
-				(xfs_dir2_data_aoff_t)
-				((char *)dep - (char *)hdr));
-			memcpy(sfep->name, dep->name, dep->namelen);
-			dp->d_ops->sf_put_ino(sfp, sfep,
-					      be64_to_cpu(dep->inumber));
-			dp->d_ops->sf_put_ftype(sfep,
-					dp->d_ops->data_get_ftype(dep));
-
-			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-		}
-		ptr += dp->d_ops->data_entsize(dep->namelen);
-	}
-	ASSERT((char *)sfep - (char *)sfp == size);
-
-	/* now we are done with the block, we can shrink the inode */
-	logflags = XFS_ILOG_CORE;
-	error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
-	if (error) {
-		ASSERT(error != ENOSPC);
-		goto out;
-	}
-
-	/*
-	 * The buffer is now unconditionally gone, whether
-	 * xfs_dir2_shrink_inode worked or not.
-	 *
-	 * Convert the inode to local format and copy the data in.
-	 */
-	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-	dp->i_df.if_flags |= XFS_IFINLINE;
-	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-	ASSERT(dp->i_df.if_bytes == 0);
-	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-
-	logflags |= XFS_ILOG_DDATA;
-	memcpy(dp->i_df.if_u1.if_data, dst, size);
-	dp->i_d.di_size = size;
-	xfs_dir2_sf_check(args);
-out:
-	xfs_trans_log_inode(args->trans, dp, logflags);
-	kmem_free(dst);
-	return error;
-}
-
-/*
- * Add a name to a shortform directory.
- * There are two algorithms, "easy" and "hard" which we decide on
- * before changing anything.
- * Convert to block form if necessary, if the new entry won't fit.
- */
-int						/* error */
-xfs_dir2_sf_addname(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return value */
-	int			incr_isize;	/* total change in size */
-	int			new_isize;	/* di_size after adding name */
-	int			objchange;	/* changing to 8-byte inodes */
-	xfs_dir2_data_aoff_t	offset = 0;	/* offset for new entry */
-	int			pick;		/* which algorithm to use */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-	xfs_dir2_sf_entry_t	*sfep = NULL;	/* shortform entry */
-
-	trace_xfs_dir2_sf_addname(args);
-
-	ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Make sure the shortform value has some of its header.
-	 */
-	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return EIO;
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-	/*
-	 * Compute entry (and change in) size.
-	 */
-	incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
-	objchange = 0;
-#if XFS_BIG_INUMS
-	/*
-	 * Do we have to change to 8 byte inodes?
-	 */
-	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
-		/*
-		 * Yes, adjust the inode size.  old count + (parent + new)
-		 */
-		incr_isize +=
-			(sfp->count + 2) *
-			((uint)sizeof(xfs_dir2_ino8_t) -
-			 (uint)sizeof(xfs_dir2_ino4_t));
-		objchange = 1;
-	}
-#endif
-	new_isize = (int)dp->i_d.di_size + incr_isize;
-	/*
-	 * Won't fit as shortform any more (due to size),
-	 * or the pick routine says it won't (due to offset values).
-	 */
-	if (new_isize > XFS_IFORK_DSIZE(dp) ||
-	    (pick =
-	     xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
-		/*
-		 * Just checking or no space reservation, it doesn't fit.
-		 */
-		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-			return ENOSPC;
-		/*
-		 * Convert to block form then add the name.
-		 */
-		error = xfs_dir2_sf_to_block(args);
-		if (error)
-			return error;
-		return xfs_dir2_block_addname(args);
-	}
-	/*
-	 * Just checking, it fits.
-	 */
-	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-		return 0;
-	/*
-	 * Do it the easy way - just add it at the end.
-	 */
-	if (pick == 1)
-		xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
-	/*
-	 * Do it the hard way - look for a place to insert the new entry.
-	 * Convert to 8 byte inode numbers first if necessary.
-	 */
-	else {
-		ASSERT(pick == 2);
-#if XFS_BIG_INUMS
-		if (objchange)
-			xfs_dir2_sf_toino8(args);
-#endif
-		xfs_dir2_sf_addname_hard(args, objchange, new_isize);
-	}
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-	return 0;
-}
-
-/*
- * Add the new entry the "easy" way.
- * This is copying the old directory and adding the new entry at the end.
- * Since it's sorted by "offset" we need room after the last offset
- * that's already there, and then room to convert to a block directory.
- * This is already checked by the pick routine.
- */
-static void
-xfs_dir2_sf_addname_easy(
-	xfs_da_args_t		*args,		/* operation arguments */
-	xfs_dir2_sf_entry_t	*sfep,		/* pointer to new entry */
-	xfs_dir2_data_aoff_t	offset,		/* offset to use for new ent */
-	int			new_isize)	/* new directory size */
-{
-	int			byteoff;	/* byte offset in sf dir */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-
-	dp = args->dp;
-
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	byteoff = (int)((char *)sfep - (char *)sfp);
-	/*
-	 * Grow the in-inode space.
-	 */
-	xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
-			  XFS_DATA_FORK);
-	/*
-	 * Need to set up again due to realloc of the inode data.
-	 */
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
-	/*
-	 * Fill in the new entry.
-	 */
-	sfep->namelen = args->namelen;
-	xfs_dir2_sf_put_offset(sfep, offset);
-	memcpy(sfep->name, args->name, sfep->namelen);
-	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
-	dp->d_ops->sf_put_ftype(sfep, args->filetype);
-
-	/*
-	 * Update the header and inode.
-	 */
-	sfp->count++;
-#if XFS_BIG_INUMS
-	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
-		sfp->i8count++;
-#endif
-	dp->i_d.di_size = new_isize;
-	xfs_dir2_sf_check(args);
-}
-
-/*
- * Add the new entry the "hard" way.
- * The caller has already converted to 8 byte inode numbers if necessary,
- * in which case we need to leave the i8count at 1.
- * Find a hole that the new entry will fit into, and copy
- * the first part of the entries, the new entry, and the last part of
- * the entries.
- */
-/* ARGSUSED */
-static void
-xfs_dir2_sf_addname_hard(
-	xfs_da_args_t		*args,		/* operation arguments */
-	int			objchange,	/* changing inode number size */
-	int			new_isize)	/* new directory size */
-{
-	int			add_datasize;	/* data size need for new ent */
-	char			*buf;		/* buffer for old */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			eof;		/* reached end of old dir */
-	int			nbytes;		/* temp for byte copies */
-	xfs_dir2_data_aoff_t	new_offset;	/* next offset value */
-	xfs_dir2_data_aoff_t	offset;		/* current offset value */
-	int			old_isize;	/* previous di_size */
-	xfs_dir2_sf_entry_t	*oldsfep;	/* entry in original dir */
-	xfs_dir2_sf_hdr_t	*oldsfp;	/* original shortform dir */
-	xfs_dir2_sf_entry_t	*sfep;		/* entry in new dir */
-	xfs_dir2_sf_hdr_t	*sfp;		/* new shortform dir */
-	struct xfs_mount	*mp;
-
-	/*
-	 * Copy the old directory to the stack buffer.
-	 */
-	dp = args->dp;
-	mp = dp->i_mount;
-
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	old_isize = (int)dp->i_d.di_size;
-	buf = kmem_alloc(old_isize, KM_SLEEP);
-	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
-	memcpy(oldsfp, sfp, old_isize);
-	/*
-	 * Loop over the old directory finding the place we're going
-	 * to insert the new entry.
-	 * If it's going to end up at the end then oldsfep will point there.
-	 */
-	for (offset = dp->d_ops->data_first_offset,
-	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
-	      add_datasize = dp->d_ops->data_entsize(args->namelen),
-	      eof = (char *)oldsfep == &buf[old_isize];
-	     !eof;
-	     offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
-	      oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
-	      eof = (char *)oldsfep == &buf[old_isize]) {
-		new_offset = xfs_dir2_sf_get_offset(oldsfep);
-		if (offset + add_datasize <= new_offset)
-			break;
-	}
-	/*
-	 * Get rid of the old directory, then allocate space for
-	 * the new one.  We do this so xfs_idata_realloc won't copy
-	 * the data.
-	 */
-	xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
-	xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
-	/*
-	 * Reset the pointer since the buffer was reallocated.
-	 */
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	/*
-	 * Copy the first part of the directory, including the header.
-	 */
-	nbytes = (int)((char *)oldsfep - (char *)oldsfp);
-	memcpy(sfp, oldsfp, nbytes);
-	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
-	/*
-	 * Fill in the new entry, and update the header counts.
-	 */
-	sfep->namelen = args->namelen;
-	xfs_dir2_sf_put_offset(sfep, offset);
-	memcpy(sfep->name, args->name, sfep->namelen);
-	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
-	dp->d_ops->sf_put_ftype(sfep, args->filetype);
-	sfp->count++;
-#if XFS_BIG_INUMS
-	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
-		sfp->i8count++;
-#endif
-	/*
-	 * If there's more left to copy, do that.
-	 */
-	if (!eof) {
-		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-		memcpy(sfep, oldsfep, old_isize - nbytes);
-	}
-	kmem_free(buf);
-	dp->i_d.di_size = new_isize;
-	xfs_dir2_sf_check(args);
-}
-
-/*
- * Decide if the new entry will fit at all.
- * If it will fit, pick between adding the new entry to the end (easy)
- * or somewhere else (hard).
- * Return 0 (won't fit), 1 (easy), 2 (hard).
- */
-/*ARGSUSED*/
-static int					/* pick result */
-xfs_dir2_sf_addname_pick(
-	xfs_da_args_t		*args,		/* operation arguments */
-	int			objchange,	/* inode # size changes */
-	xfs_dir2_sf_entry_t	**sfepp,	/* out(1): new entry ptr */
-	xfs_dir2_data_aoff_t	*offsetp)	/* out(1): new offset */
-{
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			holefit;	/* found hole it will fit in */
-	int			i;		/* entry number */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_dir2_data_aoff_t	offset;		/* data block offset */
-	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-	int			size;		/* entry's data size */
-	int			used;		/* data bytes used */
-
-	dp = args->dp;
-	mp = dp->i_mount;
-
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	size = dp->d_ops->data_entsize(args->namelen);
-	offset = dp->d_ops->data_first_offset;
-	sfep = xfs_dir2_sf_firstentry(sfp);
-	holefit = 0;
-	/*
-	 * Loop over sf entries.
-	 * Keep track of data offset and whether we've seen a place
-	 * to insert the new entry.
-	 */
-	for (i = 0; i < sfp->count; i++) {
-		if (!holefit)
-			holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
-		offset = xfs_dir2_sf_get_offset(sfep) +
-			 dp->d_ops->data_entsize(sfep->namelen);
-		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
-	}
-	/*
-	 * Calculate data bytes used excluding the new entry, if this
-	 * was a data block (block form directory).
-	 */
-	used = offset +
-	       (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-	       (uint)sizeof(xfs_dir2_block_tail_t);
-	/*
-	 * If it won't fit in a block form then we can't insert it,
-	 * we'll go back, convert to block, then try the insert and convert
-	 * to leaf.
-	 */
-	if (used + (holefit ? 0 : size) > args->geo->blksize)
-		return 0;
-	/*
-	 * If changing the inode number size, do it the hard way.
-	 */
-#if XFS_BIG_INUMS
-	if (objchange) {
-		return 2;
-	}
-#else
-	ASSERT(objchange == 0);
-#endif
-	/*
-	 * If it won't fit at the end then do it the hard way (use the hole).
-	 */
-	if (used + size > args->geo->blksize)
-		return 2;
-	/*
-	 * Do it the easy way.
-	 */
-	*sfepp = sfep;
-	*offsetp = offset;
-	return 1;
-}
-
-#ifdef DEBUG
-/*
- * Check consistency of shortform directory, assert if bad.
- */
-static void
-xfs_dir2_sf_check(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			i;		/* entry number */
-	int			i8count;	/* number of big inode#s */
-	xfs_ino_t		ino;		/* entry inode number */
-	int			offset;		/* data offset */
-	xfs_dir2_sf_entry_t	*sfep;		/* shortform dir entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-	struct xfs_mount	*mp;
-
-	dp = args->dp;
-	mp = dp->i_mount;
-
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	offset = dp->d_ops->data_first_offset;
-	ino = dp->d_ops->sf_get_parent_ino(sfp);
-	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
-
-	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-	     i < sfp->count;
-	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-		ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
-		ino = dp->d_ops->sf_get_ino(sfp, sfep);
-		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
-		offset =
-			xfs_dir2_sf_get_offset(sfep) +
-			dp->d_ops->data_entsize(sfep->namelen);
-		ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
-	}
-	ASSERT(i8count == sfp->i8count);
-	ASSERT(XFS_BIG_INUMS || i8count == 0);
-	ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
-	ASSERT(offset +
-	       (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-	       (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
-}
-#endif	/* DEBUG */
-
-/*
- * Create a new (shortform) directory.
- */
-int					/* error, always 0 */
-xfs_dir2_sf_create(
-	xfs_da_args_t	*args,		/* operation arguments */
-	xfs_ino_t	pino)		/* parent inode number */
-{
-	xfs_inode_t	*dp;		/* incore directory inode */
-	int		i8count;	/* parent inode is an 8-byte number */
-	xfs_dir2_sf_hdr_t *sfp;		/* shortform structure */
-	int		size;		/* directory size */
-
-	trace_xfs_dir2_sf_create(args);
-
-	dp = args->dp;
-
-	ASSERT(dp != NULL);
-	ASSERT(dp->i_d.di_size == 0);
-	/*
-	 * If it's currently a zero-length extent file,
-	 * convert it to local format.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
-		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
-		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
-		dp->i_df.if_flags |= XFS_IFINLINE;
-	}
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	ASSERT(dp->i_df.if_bytes == 0);
-	i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
-	size = xfs_dir2_sf_hdr_size(i8count);
-	/*
-	 * Make a buffer for the data.
-	 */
-	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-	/*
-	 * Fill in the header,
-	 */
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	sfp->i8count = i8count;
-	/*
-	 * Now can put in the inode number, since i8count is set.
-	 */
-	dp->d_ops->sf_put_parent_ino(sfp, pino);
-	sfp->count = 0;
-	dp->i_d.di_size = size;
-	xfs_dir2_sf_check(args);
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-	return 0;
-}
-
-/*
- * Lookup an entry in a shortform directory.
- * Returns EEXIST if found, ENOENT if not found.
- */
-int						/* error */
-xfs_dir2_sf_lookup(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			i;		/* entry index */
-	int			error;
-	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-	enum xfs_dacmp		cmp;		/* comparison result */
-	xfs_dir2_sf_entry_t	*ci_sfep;	/* case-insens. entry */
-
-	trace_xfs_dir2_sf_lookup(args);
-
-	xfs_dir2_sf_check(args);
-	dp = args->dp;
-
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Bail out if the directory is way too short.
-	 */
-	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return EIO;
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-	/*
-	 * Special case for .
-	 */
-	if (args->namelen == 1 && args->name[0] == '.') {
-		args->inumber = dp->i_ino;
-		args->cmpresult = XFS_CMP_EXACT;
-		args->filetype = XFS_DIR3_FT_DIR;
-		return EEXIST;
-	}
-	/*
-	 * Special case for ..
-	 */
-	if (args->namelen == 2 &&
-	    args->name[0] == '.' && args->name[1] == '.') {
-		args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
-		args->cmpresult = XFS_CMP_EXACT;
-		args->filetype = XFS_DIR3_FT_DIR;
-		return EEXIST;
-	}
-	/*
-	 * Loop over all the entries trying to match ours.
-	 */
-	ci_sfep = NULL;
-	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-		/*
-		 * Compare name and if it's an exact match, return the inode
-		 * number. If it's the first case-insensitive match, store the
-		 * inode number and continue looking for an exact match.
-		 */
-		cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
-								sfep->namelen);
-		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
-			args->cmpresult = cmp;
-			args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
-			args->filetype = dp->d_ops->sf_get_ftype(sfep);
-			if (cmp == XFS_CMP_EXACT)
-				return EEXIST;
-			ci_sfep = sfep;
-		}
-	}
-	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-	/*
-	 * Here, we can only be doing a lookup (not a rename or replace).
-	 * If a case-insensitive match was not found, return ENOENT.
-	 */
-	if (!ci_sfep)
-		return ENOENT;
-	/* otherwise process the CI match as required by the caller */
-	error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
-	return error;
-}
-
-/*
- * Remove an entry from a shortform directory.
- */
-int						/* error */
-xfs_dir2_sf_removename(
-	xfs_da_args_t		*args)
-{
-	int			byteoff;	/* offset of removed entry */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			entsize;	/* this entry's size */
-	int			i;		/* shortform entry index */
-	int			newsize;	/* new inode size */
-	int			oldsize;	/* old inode size */
-	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-
-	trace_xfs_dir2_sf_removename(args);
-
-	dp = args->dp;
-
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	oldsize = (int)dp->i_d.di_size;
-	/*
-	 * Bail out if the directory is way too short.
-	 */
-	if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return EIO;
-	}
-	ASSERT(dp->i_df.if_bytes == oldsize);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
-	/*
-	 * Loop over the old directory entries.
-	 * Find the one we're deleting.
-	 */
-	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-		if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
-								XFS_CMP_EXACT) {
-			ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
-			       args->inumber);
-			break;
-		}
-	}
-	/*
-	 * Didn't find it.
-	 */
-	if (i == sfp->count)
-		return ENOENT;
-	/*
-	 * Calculate sizes.
-	 */
-	byteoff = (int)((char *)sfep - (char *)sfp);
-	entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
-	newsize = oldsize - entsize;
-	/*
-	 * Copy the part if any after the removed entry, sliding it down.
-	 */
-	if (byteoff + entsize < oldsize)
-		memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
-			oldsize - (byteoff + entsize));
-	/*
-	 * Fix up the header and file size.
-	 */
-	sfp->count--;
-	dp->i_d.di_size = newsize;
-	/*
-	 * Reallocate, making it smaller.
-	 */
-	xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-#if XFS_BIG_INUMS
-	/*
-	 * Are we changing inode number size?
-	 */
-	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
-		if (sfp->i8count == 1)
-			xfs_dir2_sf_toino4(args);
-		else
-			sfp->i8count--;
-	}
-#endif
-	xfs_dir2_sf_check(args);
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-	return 0;
-}
-
-/*
- * Replace the inode number of an entry in a shortform directory.
- */
-int						/* error */
-xfs_dir2_sf_replace(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			i;		/* entry index */
-#if XFS_BIG_INUMS || defined(DEBUG)
-	xfs_ino_t		ino=0;		/* entry old inode number */
-#endif
-#if XFS_BIG_INUMS
-	int			i8elevated;	/* sf_toino8 set i8count=1 */
-#endif
-	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-
-	trace_xfs_dir2_sf_replace(args);
-
-	dp = args->dp;
-
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Bail out if the shortform directory is way too small.
-	 */
-	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return EIO;
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-#if XFS_BIG_INUMS
-	/*
-	 * New inode number is large, and need to convert to 8-byte inodes.
-	 */
-	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
-		int	error;			/* error return value */
-		int	newsize;		/* new inode size */
-
-		newsize =
-			dp->i_df.if_bytes +
-			(sfp->count + 1) *
-			((uint)sizeof(xfs_dir2_ino8_t) -
-			 (uint)sizeof(xfs_dir2_ino4_t));
-		/*
-		 * Won't fit as shortform, convert to block then do replace.
-		 */
-		if (newsize > XFS_IFORK_DSIZE(dp)) {
-			error = xfs_dir2_sf_to_block(args);
-			if (error) {
-				return error;
-			}
-			return xfs_dir2_block_replace(args);
-		}
-		/*
-		 * Still fits, convert to 8-byte now.
-		 */
-		xfs_dir2_sf_toino8(args);
-		i8elevated = 1;
-		sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	} else
-		i8elevated = 0;
-#endif
-	ASSERT(args->namelen != 1 || args->name[0] != '.');
-	/*
-	 * Replace ..'s entry.
-	 */
-	if (args->namelen == 2 &&
-	    args->name[0] == '.' && args->name[1] == '.') {
-#if XFS_BIG_INUMS || defined(DEBUG)
-		ino = dp->d_ops->sf_get_parent_ino(sfp);
-		ASSERT(args->inumber != ino);
-#endif
-		dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
-	}
-	/*
-	 * Normal entry, look for the name.
-	 */
-	else {
-		for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-		     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
-			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
-								XFS_CMP_EXACT) {
-#if XFS_BIG_INUMS || defined(DEBUG)
-				ino = dp->d_ops->sf_get_ino(sfp, sfep);
-				ASSERT(args->inumber != ino);
-#endif
-				dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
-				dp->d_ops->sf_put_ftype(sfep, args->filetype);
-				break;
-			}
-		}
-		/*
-		 * Didn't find it.
-		 */
-		if (i == sfp->count) {
-			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-#if XFS_BIG_INUMS
-			if (i8elevated)
-				xfs_dir2_sf_toino4(args);
-#endif
-			return ENOENT;
-		}
-	}
-#if XFS_BIG_INUMS
-	/*
-	 * See if the old number was large, the new number is small.
-	 */
-	if (ino > XFS_DIR2_MAX_SHORT_INUM &&
-	    args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
-		/*
-		 * And the old count was one, so need to convert to small.
-		 */
-		if (sfp->i8count == 1)
-			xfs_dir2_sf_toino4(args);
-		else
-			sfp->i8count--;
-	}
-	/*
-	 * See if the old number was small, the new number is large.
-	 */
-	if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
-	    args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
-		/*
-		 * add to the i8count unless we just converted to 8-byte
-		 * inodes (which does an implied i8count = 1)
-		 */
-		ASSERT(sfp->i8count != 0);
-		if (!i8elevated)
-			sfp->i8count++;
-	}
-#endif
-	xfs_dir2_sf_check(args);
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-	return 0;
-}
-
-#if XFS_BIG_INUMS
-/*
- * Convert from 8-byte inode numbers to 4-byte inode numbers.
- * The last 8-byte inode number is gone, but the count is still 1.
- */
-static void
-xfs_dir2_sf_toino4(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	char			*buf;		/* old dir's buffer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			i;		/* entry index */
-	int			newsize;	/* new inode size */
-	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
-	xfs_dir2_sf_hdr_t	*oldsfp;	/* old sf directory */
-	int			oldsize;	/* old inode size */
-	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
-	struct xfs_mount	*mp;
-
-	trace_xfs_dir2_sf_toino4(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-
-	/*
-	 * Copy the old directory to the buffer.
-	 * Then nuke it from the inode, and add the new buffer to the inode.
-	 * Don't want xfs_idata_realloc copying the data here.
-	 */
-	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, KM_SLEEP);
-	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	ASSERT(oldsfp->i8count == 1);
-	memcpy(buf, oldsfp, oldsize);
-	/*
-	 * Compute the new inode size.
-	 */
-	newsize =
-		oldsize -
-		(oldsfp->count + 1) *
-		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
-	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
-	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
-	/*
-	 * Reset our pointers, the data has moved.
-	 */
-	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	/*
-	 * Fill in the new header.
-	 */
-	sfp->count = oldsfp->count;
-	sfp->i8count = 0;
-	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
-	/*
-	 * Copy the entries field by field.
-	 */
-	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
-		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
-	     i < sfp->count;
-	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
-		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
-		sfep->namelen = oldsfep->namelen;
-		sfep->offset = oldsfep->offset;
-		memcpy(sfep->name, oldsfep->name, sfep->namelen);
-		dp->d_ops->sf_put_ino(sfp, sfep,
-				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
-		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
-	}
-	/*
-	 * Clean up the inode.
-	 */
-	kmem_free(buf);
-	dp->i_d.di_size = newsize;
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-}
-
-/*
- * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
- * The new entry w/ an 8-byte inode number is not there yet; we leave with
- * i8count set to 1, but no corresponding 8-byte entry.
- */
-static void
-xfs_dir2_sf_toino8(
-	xfs_da_args_t		*args)		/* operation arguments */
-{
-	char			*buf;		/* old dir's buffer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			i;		/* entry index */
-	int			newsize;	/* new inode size */
-	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
-	xfs_dir2_sf_hdr_t	*oldsfp;	/* old sf directory */
-	int			oldsize;	/* old inode size */
-	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
-	struct xfs_mount	*mp;
-
-	trace_xfs_dir2_sf_toino8(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-
-	/*
-	 * Copy the old directory to the buffer.
-	 * Then nuke it from the inode, and add the new buffer to the inode.
-	 * Don't want xfs_idata_realloc copying the data here.
-	 */
-	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, KM_SLEEP);
-	oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	ASSERT(oldsfp->i8count == 0);
-	memcpy(buf, oldsfp, oldsize);
-	/*
-	 * Compute the new inode size (nb: entry count + 1 for parent)
-	 */
-	newsize =
-		oldsize +
-		(oldsfp->count + 1) *
-		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
-	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
-	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
-	/*
-	 * Reset our pointers, the data has moved.
-	 */
-	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	/*
-	 * Fill in the new header.
-	 */
-	sfp->count = oldsfp->count;
-	sfp->i8count = 1;
-	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
-	/*
-	 * Copy the entries field by field.
-	 */
-	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
-		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
-	     i < sfp->count;
-	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
-		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
-		sfep->namelen = oldsfep->namelen;
-		sfep->offset = oldsfep->offset;
-		memcpy(sfep->name, oldsfep->name, sfep->namelen);
-		dp->d_ops->sf_put_ino(sfp, sfep,
-				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
-		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
-	}
-	/*
-	 * Clean up the inode.
-	 */
-	kmem_free(buf);
-	dp->i_d.di_size = newsize;
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-}
-#endif	/* XFS_BIG_INUMS */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
deleted file mode 100644
index c2ac0c611ad8..000000000000
--- a/fs/xfs/xfs_dquot_buf.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_quota.h"
-#include "xfs_trans.h"
-#include "xfs_qm.h"
-#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_trace.h"
-
-int
-xfs_calc_dquots_per_chunk(
-	unsigned int		nbblks)	/* basic block units */
-{
-	unsigned int	ndquots;
-
-	ASSERT(nbblks > 0);
-	ndquots = BBTOB(nbblks);
-	do_div(ndquots, sizeof(xfs_dqblk_t));
-
-	return ndquots;
-}
-
-/*
- * Do some primitive error checking on ondisk dquot data structures.
- */
-int
-xfs_dqcheck(
-	struct xfs_mount *mp,
-	xfs_disk_dquot_t *ddq,
-	xfs_dqid_t	 id,
-	uint		 type,	  /* used only when IO_dorepair is true */
-	uint		 flags,
-	char		 *str)
-{
-	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
-	int		errs = 0;
-
-	/*
-	 * We can encounter an uninitialized dquot buffer for 2 reasons:
-	 * 1. If we crash while deleting the quotainode(s), and those blks got
-	 *    used for user data. This is because we take the path of regular
-	 *    file deletion; however, the size field of quotainodes is never
-	 *    updated, so all the tricks that we play in itruncate_finish
-	 *    don't quite matter.
-	 *
-	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
-	 *    But the allocation will be replayed so we'll end up with an
-	 *    uninitialized quota block.
-	 *
-	 * This is all fine; things are still consistent, and we haven't lost
-	 * any quota information. Just don't complain about bad dquot blks.
-	 */
-	if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
-			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
-		errs++;
-	}
-	if (ddq->d_version != XFS_DQUOT_VERSION) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
-			str, id, ddq->d_version, XFS_DQUOT_VERSION);
-		errs++;
-	}
-
-	if (ddq->d_flags != XFS_DQ_USER &&
-	    ddq->d_flags != XFS_DQ_PROJ &&
-	    ddq->d_flags != XFS_DQ_GROUP) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
-			str, id, ddq->d_flags);
-		errs++;
-	}
-
-	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : ondisk-dquot 0x%p, ID mismatch: "
-			"0x%x expected, found id 0x%x",
-			str, ddq, id, be32_to_cpu(ddq->d_id));
-		errs++;
-	}
-
-	if (!errs && ddq->d_id) {
-		if (ddq->d_blk_softlimit &&
-		    be64_to_cpu(ddq->d_bcount) >
-				be64_to_cpu(ddq->d_blk_softlimit)) {
-			if (!ddq->d_btimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					xfs_alert(mp,
-			"%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-		if (ddq->d_ino_softlimit &&
-		    be64_to_cpu(ddq->d_icount) >
-				be64_to_cpu(ddq->d_ino_softlimit)) {
-			if (!ddq->d_itimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					xfs_alert(mp,
-			"%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-		if (ddq->d_rtb_softlimit &&
-		    be64_to_cpu(ddq->d_rtbcount) >
-				be64_to_cpu(ddq->d_rtb_softlimit)) {
-			if (!ddq->d_rtbtimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					xfs_alert(mp,
-			"%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-	}
-
-	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
-		return errs;
-
-	if (flags & XFS_QMOPT_DOWARN)
-		xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
-
-	/*
-	 * Typically, a repair is only requested by quotacheck.
-	 */
-	ASSERT(id != -1);
-	ASSERT(flags & XFS_QMOPT_DQREPAIR);
-	memset(d, 0, sizeof(xfs_dqblk_t));
-
-	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
-	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-	d->dd_diskdq.d_flags = type;
-	d->dd_diskdq.d_id = cpu_to_be32(id);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
-		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
-				 XFS_DQUOT_CRC_OFF);
-	}
-
-	return errs;
-}
-
-STATIC bool
-xfs_dquot_buf_verify_crc(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
-	int			ndquots;
-	int			i;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return true;
-
-	/*
-	 * if we are in log recovery, the quota subsystem has not been
-	 * initialised so we have no quotainfo structure. In that case, we need
-	 * to manually calculate the number of dquots in the buffer.
-	 */
-	if (mp->m_quotainfo)
-		ndquots = mp->m_quotainfo->qi_dqperchunk;
-	else
-		ndquots = xfs_calc_dquots_per_chunk(
-					XFS_BB_TO_FSB(mp, bp->b_length));
-
-	for (i = 0; i < ndquots; i++, d++) {
-		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
-				 XFS_DQUOT_CRC_OFF))
-			return false;
-		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
-			return false;
-	}
-	return true;
-}
-
-STATIC bool
-xfs_dquot_buf_verify(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
-	xfs_dqid_t		id = 0;
-	int			ndquots;
-	int			i;
-
-	/*
-	 * if we are in log recovery, the quota subsystem has not been
-	 * initialised so we have no quotainfo structure. In that case, we need
-	 * to manually calculate the number of dquots in the buffer.
-	 */
-	if (mp->m_quotainfo)
-		ndquots = mp->m_quotainfo->qi_dqperchunk;
-	else
-		ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
-
-	/*
-	 * On the first read of the buffer, verify that each dquot is valid.
-	 * We don't know what the id of the dquot is supposed to be, just that
-	 * they should be increasing monotonically within the buffer. If the
-	 * first id is corrupt, then it will fail on the second dquot in the
-	 * buffer so corruptions could point to the wrong dquot in this case.
-	 */
-	for (i = 0; i < ndquots; i++) {
-		struct xfs_disk_dquot	*ddq;
-		int			error;
-
-		ddq = &d[i].dd_diskdq;
-
-		if (i == 0)
-			id = be32_to_cpu(ddq->d_id);
-
-		error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
-				       "xfs_dquot_buf_verify");
-		if (error)
-			return false;
-	}
-	return true;
-}
-
-static void
-xfs_dquot_buf_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (!xfs_dquot_buf_verify_crc(mp, bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_dquot_buf_verify(mp, bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-/*
- * we don't calculate the CRC here as that is done when the dquot is flushed to
- * the buffer after the update is done. This ensures that the dquot in the
- * buffer always has an up-to-date CRC value.
- */
-static void
-xfs_dquot_buf_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (!xfs_dquot_buf_verify(mp, bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-}
-
-const struct xfs_buf_ops xfs_dquot_buf_ops = {
-	.verify_read = xfs_dquot_buf_read_verify,
-	.verify_write = xfs_dquot_buf_write_verify,
-};
-
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
deleted file mode 100644
index 16fb63a9bc5e..000000000000
--- a/fs/xfs/xfs_ialloc.c
+++ /dev/null
@@ -1,2189 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_bmap.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_icreate_item.h"
-#include "xfs_icache.h"
-#include "xfs_dinode.h"
-#include "xfs_trace.h"
-
-
-/*
- * Allocation group level functions.
- */
-static inline int
-xfs_ialloc_cluster_alignment(
-	xfs_alloc_arg_t	*args)
-{
-	if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
-	    args->mp->m_sb.sb_inoalignmt >=
-	     XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
-		return args->mp->m_sb.sb_inoalignmt;
-	return 1;
-}
-
-/*
- * Lookup a record by ino in the btree given by cur.
- */
-int					/* error */
-xfs_inobt_lookup(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	xfs_lookup_t		dir,	/* <=, >=, == */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = 0;
-	cur->bc_rec.i.ir_free = 0;
-	return xfs_btree_lookup(cur, dir, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given.
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-STATIC int				/* error */
-xfs_inobt_update(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_inobt_rec_incore_t	*irec)	/* btree record */
-{
-	union xfs_btree_rec	rec;
-
-	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
-	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
-	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
-	return xfs_btree_update(cur, &rec);
-}
-
-/*
- * Get the data from the pointed-to record.
- */
-int					/* error */
-xfs_inobt_get_rec(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_inobt_rec_incore_t	*irec,	/* btree record */
-	int			*stat)	/* output: success/failure */
-{
-	union xfs_btree_rec	*rec;
-	int			error;
-
-	error = xfs_btree_get_rec(cur, &rec, stat);
-	if (!error && *stat == 1) {
-		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
-		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
-		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
-	}
-	return error;
-}
-
-/*
- * Insert a single inobt record. Cursor must already point to desired location.
- */
-STATIC int
-xfs_inobt_insert_rec(
-	struct xfs_btree_cur	*cur,
-	__int32_t		freecount,
-	xfs_inofree_t		free,
-	int			*stat)
-{
-	cur->bc_rec.i.ir_freecount = freecount;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_insert(cur, stat);
-}
-
-/*
- * Insert records describing a newly allocated inode chunk into the inobt.
- */
-STATIC int
-xfs_inobt_insert(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	xfs_agino_t		newino,
-	xfs_agino_t		newlen,
-	xfs_btnum_t		btnum)
-{
-	struct xfs_btree_cur	*cur;
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
-	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
-	xfs_agino_t		thisino;
-	int			i;
-	int			error;
-
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
-
-	for (thisino = newino;
-	     thisino < newino + newlen;
-	     thisino += XFS_INODES_PER_CHUNK) {
-		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
-		if (error) {
-			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-			return error;
-		}
-		ASSERT(i == 0);
-
-		error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
-					     XFS_INOBT_ALL_FREE, &i);
-		if (error) {
-			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-			return error;
-		}
-		ASSERT(i == 1);
-	}
-
-	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-
-	return 0;
-}
-
-/*
- * Verify that the number of free inodes in the AGI is correct.
- */
-#ifdef DEBUG
-STATIC int
-xfs_check_agi_freecount(
-	struct xfs_btree_cur	*cur,
-	struct xfs_agi		*agi)
-{
-	if (cur->bc_nlevels == 1) {
-		xfs_inobt_rec_incore_t rec;
-		int		freecount = 0;
-		int		error;
-		int		i;
-
-		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
-		if (error)
-			return error;
-
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				return error;
-
-			if (i) {
-				freecount += rec.ir_freecount;
-				error = xfs_btree_increment(cur, 0, &i);
-				if (error)
-					return error;
-			}
-		} while (i == 1);
-
-		if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
-			ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
-	}
-	return 0;
-}
-#else
-#define xfs_check_agi_freecount(cur, agi)	0
-#endif
-
-/*
- * Initialise a new set of inodes. When called without a transaction context
- * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
- * than logging them (which in a transaction context puts them into the AIL
- * for writeback rather than the xfsbufd queue).
- */
-int
-xfs_ialloc_inode_init(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct list_head	*buffer_list,
-	xfs_agnumber_t		agno,
-	xfs_agblock_t		agbno,
-	xfs_agblock_t		length,
-	unsigned int		gen)
-{
-	struct xfs_buf		*fbuf;
-	struct xfs_dinode	*free;
-	int			nbufs, blks_per_cluster, inodes_per_cluster;
-	int			version;
-	int			i, j;
-	xfs_daddr_t		d;
-	xfs_ino_t		ino = 0;
-
-	/*
-	 * Loop over the new block(s), filling in the inodes.  For small block
-	 * sizes, manipulate the inodes in buffers  which are multiples of the
-	 * blocks size.
-	 */
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
-	nbufs = length / blks_per_cluster;
-
-	/*
-	 * Figure out what version number to use in the inodes we create.  If
-	 * the superblock version has caught up to the one that supports the new
-	 * inode format, then use the new inode version.  Otherwise use the old
-	 * version so that old kernels will continue to be able to use the file
-	 * system.
-	 *
-	 * For v3 inodes, we also need to write the inode number into the inode,
-	 * so calculate the first inode number of the chunk here as
-	 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
-	 * across multiple filesystem blocks (such as a cluster) and so cannot
-	 * be used in the cluster buffer loop below.
-	 *
-	 * Further, because we are writing the inode directly into the buffer
-	 * and calculating a CRC on the entire inode, we have ot log the entire
-	 * inode so that the entire range the CRC covers is present in the log.
-	 * That means for v3 inode we log the entire buffer rather than just the
-	 * inode cores.
-	 */
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		version = 3;
-		ino = XFS_AGINO_TO_INO(mp, agno,
-				       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
-
-		/*
-		 * log the initialisation that is about to take place as an
-		 * logical operation. This means the transaction does not
-		 * need to log the physical changes to the inode buffers as log
-		 * recovery will know what initialisation is actually needed.
-		 * Hence we only need to log the buffers as "ordered" buffers so
-		 * they track in the AIL as if they were physically logged.
-		 */
-		if (tp)
-			xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
-					mp->m_sb.sb_inodesize, length, gen);
-	} else
-		version = 2;
-
-	for (j = 0; j < nbufs; j++) {
-		/*
-		 * Get the block.
-		 */
-		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
-		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					 mp->m_bsize * blks_per_cluster,
-					 XBF_UNMAPPED);
-		if (!fbuf)
-			return ENOMEM;
-
-		/* Initialize the inode buffers and log them appropriately. */
-		fbuf->b_ops = &xfs_inode_buf_ops;
-		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
-		for (i = 0; i < inodes_per_cluster; i++) {
-			int	ioffset = i << mp->m_sb.sb_inodelog;
-			uint	isize = xfs_dinode_size(version);
-
-			free = xfs_make_iptr(mp, fbuf, i);
-			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
-			free->di_version = version;
-			free->di_gen = cpu_to_be32(gen);
-			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
-
-			if (version == 3) {
-				free->di_ino = cpu_to_be64(ino);
-				ino++;
-				uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
-				xfs_dinode_calc_crc(mp, free);
-			} else if (tp) {
-				/* just log the inode core */
-				xfs_trans_log_buf(tp, fbuf, ioffset,
-						  ioffset + isize - 1);
-			}
-		}
-
-		if (tp) {
-			/*
-			 * Mark the buffer as an inode allocation buffer so it
-			 * sticks in AIL at the point of this allocation
-			 * transaction. This ensures the they are on disk before
-			 * the tail of the log can be moved past this
-			 * transaction (i.e. by preventing relogging from moving
-			 * it forward in the log).
-			 */
-			xfs_trans_inode_alloc_buf(tp, fbuf);
-			if (version == 3) {
-				/*
-				 * Mark the buffer as ordered so that they are
-				 * not physically logged in the transaction but
-				 * still tracked in the AIL as part of the
-				 * transaction and pin the log appropriately.
-				 */
-				xfs_trans_ordered_buf(tp, fbuf);
-				xfs_trans_log_buf(tp, fbuf, 0,
-						  BBTOB(fbuf->b_length) - 1);
-			}
-		} else {
-			fbuf->b_flags |= XBF_DONE;
-			xfs_buf_delwri_queue(fbuf, buffer_list);
-			xfs_buf_relse(fbuf);
-		}
-	}
-	return 0;
-}
-
-/*
- * Allocate new inodes in the allocation group specified by agbp.
- * Return 0 for success, else error code.
- */
-STATIC int				/* error code or 0 */
-xfs_ialloc_ag_alloc(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_buf_t	*agbp,		/* alloc group buffer */
-	int		*alloc)
-{
-	xfs_agi_t	*agi;		/* allocation group header */
-	xfs_alloc_arg_t	args;		/* allocation argument structure */
-	xfs_agnumber_t	agno;
-	int		error;
-	xfs_agino_t	newino;		/* new first inode's number */
-	xfs_agino_t	newlen;		/* new number of inodes */
-	int		isaligned = 0;	/* inode allocation at stripe unit */
-					/* boundary */
-	struct xfs_perag *pag;
-
-	memset(&args, 0, sizeof(args));
-	args.tp = tp;
-	args.mp = tp->t_mountp;
-
-	/*
-	 * Locking will ensure that we don't have two callers in here
-	 * at one time.
-	 */
-	newlen = args.mp->m_ialloc_inos;
-	if (args.mp->m_maxicount &&
-	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
-		return ENOSPC;
-	args.minlen = args.maxlen = args.mp->m_ialloc_blks;
-	/*
-	 * First try to allocate inodes contiguous with the last-allocated
-	 * chunk of inodes.  If the filesystem is striped, this will fill
-	 * an entire stripe unit with inodes.
-	 */
-	agi = XFS_BUF_TO_AGI(agbp);
-	newino = be32_to_cpu(agi->agi_newino);
-	agno = be32_to_cpu(agi->agi_seqno);
-	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
-		     args.mp->m_ialloc_blks;
-	if (likely(newino != NULLAGINO &&
-		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
-		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
-		args.type = XFS_ALLOCTYPE_THIS_BNO;
-		args.prod = 1;
-
-		/*
-		 * We need to take into account alignment here to ensure that
-		 * we don't modify the free list if we fail to have an exact
-		 * block. If we don't have an exact match, and every oher
-		 * attempt allocation attempt fails, we'll end up cancelling
-		 * a dirty transaction and shutting down.
-		 *
-		 * For an exact allocation, alignment must be 1,
-		 * however we need to take cluster alignment into account when
-		 * fixing up the freelist. Use the minalignslop field to
-		 * indicate that extra blocks might be required for alignment,
-		 * but not to use them in the actual exact allocation.
-		 */
-		args.alignment = 1;
-		args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
-
-		/* Allow space for the inode btree to split. */
-		args.minleft = args.mp->m_in_maxlevels - 1;
-		if ((error = xfs_alloc_vextent(&args)))
-			return error;
-
-		/*
-		 * This request might have dirtied the transaction if the AG can
-		 * satisfy the request, but the exact block was not available.
-		 * If the allocation did fail, subsequent requests will relax
-		 * the exact agbno requirement and increase the alignment
-		 * instead. It is critical that the total size of the request
-		 * (len + alignment + slop) does not increase from this point
-		 * on, so reset minalignslop to ensure it is not included in
-		 * subsequent requests.
-		 */
-		args.minalignslop = 0;
-	} else
-		args.fsbno = NULLFSBLOCK;
-
-	if (unlikely(args.fsbno == NULLFSBLOCK)) {
-		/*
-		 * Set the alignment for the allocation.
-		 * If stripe alignment is turned on then align at stripe unit
-		 * boundary.
-		 * If the cluster size is smaller than a filesystem block
-		 * then we're doing I/O for inodes in filesystem block size
-		 * pieces, so don't need alignment anyway.
-		 */
-		isaligned = 0;
-		if (args.mp->m_sinoalign) {
-			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
-			args.alignment = args.mp->m_dalign;
-			isaligned = 1;
-		} else
-			args.alignment = xfs_ialloc_cluster_alignment(&args);
-		/*
-		 * Need to figure out where to allocate the inode blocks.
-		 * Ideally they should be spaced out through the a.g.
-		 * For now, just allocate blocks up front.
-		 */
-		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
-		/*
-		 * Allocate a fixed-size extent of inodes.
-		 */
-		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		args.prod = 1;
-		/*
-		 * Allow space for the inode btree to split.
-		 */
-		args.minleft = args.mp->m_in_maxlevels - 1;
-		if ((error = xfs_alloc_vextent(&args)))
-			return error;
-	}
-
-	/*
-	 * If stripe alignment is turned on, then try again with cluster
-	 * alignment.
-	 */
-	if (isaligned && args.fsbno == NULLFSBLOCK) {
-		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
-		args.alignment = xfs_ialloc_cluster_alignment(&args);
-		if ((error = xfs_alloc_vextent(&args)))
-			return error;
-	}
-
-	if (args.fsbno == NULLFSBLOCK) {
-		*alloc = 0;
-		return 0;
-	}
-	ASSERT(args.len == args.minlen);
-
-	/*
-	 * Stamp and write the inode buffers.
-	 *
-	 * Seed the new inode cluster with a random generation number. This
-	 * prevents short-term reuse of generation numbers if a chunk is
-	 * freed and then immediately reallocated. We use random numbers
-	 * rather than a linear progression to prevent the next generation
-	 * number from being easily guessable.
-	 */
-	error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
-			args.len, prandom_u32());
-
-	if (error)
-		return error;
-	/*
-	 * Convert the results.
-	 */
-	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
-	be32_add_cpu(&agi->agi_count, newlen);
-	be32_add_cpu(&agi->agi_freecount, newlen);
-	pag = xfs_perag_get(args.mp, agno);
-	pag->pagi_freecount += newlen;
-	xfs_perag_put(pag);
-	agi->agi_newino = cpu_to_be32(newino);
-
-	/*
-	 * Insert records describing the new inode chunk into the btrees.
-	 */
-	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-				 XFS_BTNUM_INO);
-	if (error)
-		return error;
-
-	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-					 XFS_BTNUM_FINO);
-		if (error)
-			return error;
-	}
-	/*
-	 * Log allocation group header fields
-	 */
-	xfs_ialloc_log_agi(tp, agbp,
-		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
-	/*
-	 * Modify/log superblock values for inode count and inode free count.
-	 */
-	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
-	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
-	*alloc = 1;
-	return 0;
-}
-
-STATIC xfs_agnumber_t
-xfs_ialloc_next_ag(
-	xfs_mount_t	*mp)
-{
-	xfs_agnumber_t	agno;
-
-	spin_lock(&mp->m_agirotor_lock);
-	agno = mp->m_agirotor;
-	if (++mp->m_agirotor >= mp->m_maxagi)
-		mp->m_agirotor = 0;
-	spin_unlock(&mp->m_agirotor_lock);
-
-	return agno;
-}
-
-/*
- * Select an allocation group to look for a free inode in, based on the parent
- * inode and the mode.  Return the allocation group buffer.
- */
-STATIC xfs_agnumber_t
-xfs_ialloc_ag_select(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_ino_t	parent,		/* parent directory inode number */
-	umode_t		mode,		/* bits set to indicate file type */
-	int		okalloc)	/* ok to allocate more space */
-{
-	xfs_agnumber_t	agcount;	/* number of ag's in the filesystem */
-	xfs_agnumber_t	agno;		/* current ag number */
-	int		flags;		/* alloc buffer locking flags */
-	xfs_extlen_t	ineed;		/* blocks needed for inode allocation */
-	xfs_extlen_t	longest = 0;	/* longest extent available */
-	xfs_mount_t	*mp;		/* mount point structure */
-	int		needspace;	/* file mode implies space allocated */
-	xfs_perag_t	*pag;		/* per allocation group data */
-	xfs_agnumber_t	pagno;		/* parent (starting) ag number */
-	int		error;
-
-	/*
-	 * Files of these types need at least one block if length > 0
-	 * (and they won't fit in the inode, but that's hard to figure out).
-	 */
-	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
-	mp = tp->t_mountp;
-	agcount = mp->m_maxagi;
-	if (S_ISDIR(mode))
-		pagno = xfs_ialloc_next_ag(mp);
-	else {
-		pagno = XFS_INO_TO_AGNO(mp, parent);
-		if (pagno >= agcount)
-			pagno = 0;
-	}
-
-	ASSERT(pagno < agcount);
-
-	/*
-	 * Loop through allocation groups, looking for one with a little
-	 * free space in it.  Note we don't look for free inodes, exactly.
-	 * Instead, we include whether there is a need to allocate inodes
-	 * to mean that blocks must be allocated for them,
-	 * if none are currently free.
-	 */
-	agno = pagno;
-	flags = XFS_ALLOC_FLAG_TRYLOCK;
-	for (;;) {
-		pag = xfs_perag_get(mp, agno);
-		if (!pag->pagi_inodeok) {
-			xfs_ialloc_next_ag(mp);
-			goto nextag;
-		}
-
-		if (!pag->pagi_init) {
-			error = xfs_ialloc_pagi_init(mp, tp, agno);
-			if (error)
-				goto nextag;
-		}
-
-		if (pag->pagi_freecount) {
-			xfs_perag_put(pag);
-			return agno;
-		}
-
-		if (!okalloc)
-			goto nextag;
-
-		if (!pag->pagf_init) {
-			error = xfs_alloc_pagf_init(mp, tp, agno, flags);
-			if (error)
-				goto nextag;
-		}
-
-		/*
-		 * Is there enough free space for the file plus a block of
-		 * inodes? (if we need to allocate some)?
-		 */
-		ineed = mp->m_ialloc_blks;
-		longest = pag->pagf_longest;
-		if (!longest)
-			longest = pag->pagf_flcount > 0;
-
-		if (pag->pagf_freeblks >= needspace + ineed &&
-		    longest >= ineed) {
-			xfs_perag_put(pag);
-			return agno;
-		}
-nextag:
-		xfs_perag_put(pag);
-		/*
-		 * No point in iterating over the rest, if we're shutting
-		 * down.
-		 */
-		if (XFS_FORCED_SHUTDOWN(mp))
-			return NULLAGNUMBER;
-		agno++;
-		if (agno >= agcount)
-			agno = 0;
-		if (agno == pagno) {
-			if (flags == 0)
-				return NULLAGNUMBER;
-			flags = 0;
-		}
-	}
-}
-
-/*
- * Try to retrieve the next record to the left/right from the current one.
- */
-STATIC int
-xfs_ialloc_next_rec(
-	struct xfs_btree_cur	*cur,
-	xfs_inobt_rec_incore_t	*rec,
-	int			*done,
-	int			left)
-{
-	int                     error;
-	int			i;
-
-	if (left)
-		error = xfs_btree_decrement(cur, 0, &i);
-	else
-		error = xfs_btree_increment(cur, 0, &i);
-
-	if (error)
-		return error;
-	*done = !i;
-	if (i) {
-		error = xfs_inobt_get_rec(cur, rec, &i);
-		if (error)
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	}
-
-	return 0;
-}
-
-STATIC int
-xfs_ialloc_get_rec(
-	struct xfs_btree_cur	*cur,
-	xfs_agino_t		agino,
-	xfs_inobt_rec_incore_t	*rec,
-	int			*done)
-{
-	int                     error;
-	int			i;
-
-	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
-	if (error)
-		return error;
-	*done = !i;
-	if (i) {
-		error = xfs_inobt_get_rec(cur, rec, &i);
-		if (error)
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-	}
-
-	return 0;
-}
-
-/*
- * Allocate an inode using the inobt-only algorithm.
- */
-STATIC int
-xfs_dialloc_ag_inobt(
-	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	xfs_ino_t		parent,
-	xfs_ino_t		*inop)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
-	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
-	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
-	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
-	struct xfs_perag	*pag;
-	struct xfs_btree_cur	*cur, *tcur;
-	struct xfs_inobt_rec_incore rec, trec;
-	xfs_ino_t		ino;
-	int			error;
-	int			offset;
-	int			i, j;
-
-	pag = xfs_perag_get(mp, agno);
-
-	ASSERT(pag->pagi_init);
-	ASSERT(pag->pagi_inodeok);
-	ASSERT(pag->pagi_freecount > 0);
-
- restart_pagno:
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-	/*
-	 * If pagino is 0 (this is the root inode allocation) use newino.
-	 * This must work because we've just allocated some.
-	 */
-	if (!pagino)
-		pagino = be32_to_cpu(agi->agi_newino);
-
-	error = xfs_check_agi_freecount(cur, agi);
-	if (error)
-		goto error0;
-
-	/*
-	 * If in the same AG as the parent, try to get near the parent.
-	 */
-	if (pagno == agno) {
-		int		doneleft;	/* done, to the left */
-		int		doneright;	/* done, to the right */
-		int		searchdistance = 10;
-
-		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
-		if (error)
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		error = xfs_inobt_get_rec(cur, &rec, &j);
-		if (error)
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
-
-		if (rec.ir_freecount > 0) {
-			/*
-			 * Found a free inode in the same chunk
-			 * as the parent, done.
-			 */
-			goto alloc_inode;
-		}
-
-
-		/*
-		 * In the same AG as parent, but parent's chunk is full.
-		 */
-
-		/* duplicate the cursor, search left & right simultaneously */
-		error = xfs_btree_dup_cursor(cur, &tcur);
-		if (error)
-			goto error0;
-
-		/*
-		 * Skip to last blocks looked up if same parent inode.
-		 */
-		if (pagino != NULLAGINO &&
-		    pag->pagl_pagino == pagino &&
-		    pag->pagl_leftrec != NULLAGINO &&
-		    pag->pagl_rightrec != NULLAGINO) {
-			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
-						   &trec, &doneleft);
-			if (error)
-				goto error1;
-
-			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
-						   &rec, &doneright);
-			if (error)
-				goto error1;
-		} else {
-			/* search left with tcur, back up 1 record */
-			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
-			if (error)
-				goto error1;
-
-			/* search right with cur, go forward 1 record. */
-			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
-			if (error)
-				goto error1;
-		}
-
-		/*
-		 * Loop until we find an inode chunk with a free inode.
-		 */
-		while (!doneleft || !doneright) {
-			int	useleft;  /* using left inode chunk this time */
-
-			if (!--searchdistance) {
-				/*
-				 * Not in range - save last search
-				 * location and allocate a new inode
-				 */
-				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-				pag->pagl_leftrec = trec.ir_startino;
-				pag->pagl_rightrec = rec.ir_startino;
-				pag->pagl_pagino = pagino;
-				goto newino;
-			}
-
-			/* figure out the closer block if both are valid. */
-			if (!doneleft && !doneright) {
-				useleft = pagino -
-				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
-				  rec.ir_startino - pagino;
-			} else {
-				useleft = !doneleft;
-			}
-
-			/* free inodes to the left? */
-			if (useleft && trec.ir_freecount) {
-				rec = trec;
-				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-				cur = tcur;
-
-				pag->pagl_leftrec = trec.ir_startino;
-				pag->pagl_rightrec = rec.ir_startino;
-				pag->pagl_pagino = pagino;
-				goto alloc_inode;
-			}
-
-			/* free inodes to the right? */
-			if (!useleft && rec.ir_freecount) {
-				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-
-				pag->pagl_leftrec = trec.ir_startino;
-				pag->pagl_rightrec = rec.ir_startino;
-				pag->pagl_pagino = pagino;
-				goto alloc_inode;
-			}
-
-			/* get next record to check */
-			if (useleft) {
-				error = xfs_ialloc_next_rec(tcur, &trec,
-								 &doneleft, 1);
-			} else {
-				error = xfs_ialloc_next_rec(cur, &rec,
-								 &doneright, 0);
-			}
-			if (error)
-				goto error1;
-		}
-
-		/*
-		 * We've reached the end of the btree. because
-		 * we are only searching a small chunk of the
-		 * btree each search, there is obviously free
-		 * inodes closer to the parent inode than we
-		 * are now. restart the search again.
-		 */
-		pag->pagl_pagino = NULLAGINO;
-		pag->pagl_leftrec = NULLAGINO;
-		pag->pagl_rightrec = NULLAGINO;
-		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-		goto restart_pagno;
-	}
-
-	/*
-	 * In a different AG from the parent.
-	 * See if the most recently allocated block has any free.
-	 */
-newino:
-	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
-		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
-					 XFS_LOOKUP_EQ, &i);
-		if (error)
-			goto error0;
-
-		if (i == 1) {
-			error = xfs_inobt_get_rec(cur, &rec, &j);
-			if (error)
-				goto error0;
-
-			if (j == 1 && rec.ir_freecount > 0) {
-				/*
-				 * The last chunk allocated in the group
-				 * still has a free inode.
-				 */
-				goto alloc_inode;
-			}
-		}
-	}
-
-	/*
-	 * None left in the last group, search the whole AG
-	 */
-	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
-	if (error)
-		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-	for (;;) {
-		error = xfs_inobt_get_rec(cur, &rec, &i);
-		if (error)
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if (rec.ir_freecount > 0)
-			break;
-		error = xfs_btree_increment(cur, 0, &i);
-		if (error)
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	}
-
-alloc_inode:
-	offset = xfs_lowbit64(rec.ir_free);
-	ASSERT(offset >= 0);
-	ASSERT(offset < XFS_INODES_PER_CHUNK);
-	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
-				   XFS_INODES_PER_CHUNK) == 0);
-	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
-	rec.ir_free &= ~XFS_INOBT_MASK(offset);
-	rec.ir_freecount--;
-	error = xfs_inobt_update(cur, &rec);
-	if (error)
-		goto error0;
-	be32_add_cpu(&agi->agi_freecount, -1);
-	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-	pag->pagi_freecount--;
-
-	error = xfs_check_agi_freecount(cur, agi);
-	if (error)
-		goto error0;
-
-	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
-	xfs_perag_put(pag);
-	*inop = ino;
-	return 0;
-error1:
-	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-error0:
-	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-	xfs_perag_put(pag);
-	return error;
-}
-
-/*
- * Use the free inode btree to allocate an inode based on distance from the
- * parent. Note that the provided cursor may be deleted and replaced.
- */
-STATIC int
-xfs_dialloc_ag_finobt_near(
-	xfs_agino_t			pagino,
-	struct xfs_btree_cur		**ocur,
-	struct xfs_inobt_rec_incore	*rec)
-{
-	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */
-	struct xfs_btree_cur		*rcur;	/* right search cursor */
-	struct xfs_inobt_rec_incore	rrec;
-	int				error;
-	int				i, j;
-
-	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
-	if (error)
-		return error;
-
-	if (i == 1) {
-		error = xfs_inobt_get_rec(lcur, rec, &i);
-		if (error)
-			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-		/*
-		 * See if we've landed in the parent inode record. The finobt
-		 * only tracks chunks with at least one free inode, so record
-		 * existence is enough.
-		 */
-		if (pagino >= rec->ir_startino &&
-		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
-			return 0;
-	}
-
-	error = xfs_btree_dup_cursor(lcur, &rcur);
-	if (error)
-		return error;
-
-	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
-	if (error)
-		goto error_rcur;
-	if (j == 1) {
-		error = xfs_inobt_get_rec(rcur, &rrec, &j);
-		if (error)
-			goto error_rcur;
-		XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
-	}
-
-	XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
-	if (i == 1 && j == 1) {
-		/*
-		 * Both the left and right records are valid. Choose the closer
-		 * inode chunk to the target.
-		 */
-		if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
-		    (rrec.ir_startino - pagino)) {
-			*rec = rrec;
-			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
-			*ocur = rcur;
-		} else {
-			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
-		}
-	} else if (j == 1) {
-		/* only the right record is valid */
-		*rec = rrec;
-		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
-		*ocur = rcur;
-	} else if (i == 1) {
-		/* only the left record is valid */
-		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
-	}
-
-	return 0;
-
-error_rcur:
-	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Use the free inode btree to find a free inode based on a newino hint. If
- * the hint is NULL, find the first free inode in the AG.
- */
-STATIC int
-xfs_dialloc_ag_finobt_newino(
-	struct xfs_agi			*agi,
-	struct xfs_btree_cur		*cur,
-	struct xfs_inobt_rec_incore	*rec)
-{
-	int error;
-	int i;
-
-	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
-		error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
-					 &i);
-		if (error)
-			return error;
-		if (i == 1) {
-			error = xfs_inobt_get_rec(cur, rec, &i);
-			if (error)
-				return error;
-			XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-			return 0;
-		}
-	}
-
-	/*
-	 * Find the first inode available in the AG.
-	 */
-	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
-	if (error)
-		return error;
-	XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-	error = xfs_inobt_get_rec(cur, rec, &i);
-	if (error)
-		return error;
-	XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-	return 0;
-}
-
-/*
- * Update the inobt based on a modification made to the finobt. Also ensure that
- * the records from both trees are equivalent post-modification.
- */
-STATIC int
-xfs_dialloc_ag_update_inobt(
-	struct xfs_btree_cur		*cur,	/* inobt cursor */
-	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
-	int				offset) /* inode offset */
-{
-	struct xfs_inobt_rec_incore	rec;
-	int				error;
-	int				i;
-
-	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
-	if (error)
-		return error;
-	XFS_WANT_CORRUPTED_RETURN(i == 1);
-
-	error = xfs_inobt_get_rec(cur, &rec, &i);
-	if (error)
-		return error;
-	XFS_WANT_CORRUPTED_RETURN(i == 1);
-	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
-				   XFS_INODES_PER_CHUNK) == 0);
-
-	rec.ir_free &= ~XFS_INOBT_MASK(offset);
-	rec.ir_freecount--;
-
-	XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
-				  (rec.ir_freecount == frec->ir_freecount));
-
-	error = xfs_inobt_update(cur, &rec);
-	if (error)
-		return error;
-
-	return 0;
-}
-
-/*
- * Allocate an inode using the free inode btree, if available. Otherwise, fall
- * back to the inobt search algorithm.
- *
- * The caller selected an AG for us, and made sure that free inodes are
- * available.
- */
-STATIC int
-xfs_dialloc_ag(
-	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	xfs_ino_t		parent,
-	xfs_ino_t		*inop)
-{
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
-	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
-	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
-	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
-	struct xfs_perag		*pag;
-	struct xfs_btree_cur		*cur;	/* finobt cursor */
-	struct xfs_btree_cur		*icur;	/* inobt cursor */
-	struct xfs_inobt_rec_incore	rec;
-	xfs_ino_t			ino;
-	int				error;
-	int				offset;
-	int				i;
-
-	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
-		return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
-
-	pag = xfs_perag_get(mp, agno);
-
-	/*
-	 * If pagino is 0 (this is the root inode allocation) use newino.
-	 * This must work because we've just allocated some.
-	 */
-	if (!pagino)
-		pagino = be32_to_cpu(agi->agi_newino);
-
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
-
-	error = xfs_check_agi_freecount(cur, agi);
-	if (error)
-		goto error_cur;
-
-	/*
-	 * The search algorithm depends on whether we're in the same AG as the
-	 * parent. If so, find the closest available inode to the parent. If
-	 * not, consider the agi hint or find the first free inode in the AG.
-	 */
-	if (agno == pagno)
-		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
-	else
-		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
-	if (error)
-		goto error_cur;
-
-	offset = xfs_lowbit64(rec.ir_free);
-	ASSERT(offset >= 0);
-	ASSERT(offset < XFS_INODES_PER_CHUNK);
-	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
-				   XFS_INODES_PER_CHUNK) == 0);
-	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
-
-	/*
-	 * Modify or remove the finobt record.
-	 */
-	rec.ir_free &= ~XFS_INOBT_MASK(offset);
-	rec.ir_freecount--;
-	if (rec.ir_freecount)
-		error = xfs_inobt_update(cur, &rec);
-	else
-		error = xfs_btree_delete(cur, &i);
-	if (error)
-		goto error_cur;
-
-	/*
-	 * The finobt has now been updated appropriately. We haven't updated the
-	 * agi and superblock yet, so we can create an inobt cursor and validate
-	 * the original freecount. If all is well, make the equivalent update to
-	 * the inobt using the finobt record and offset information.
-	 */
-	icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-
-	error = xfs_check_agi_freecount(icur, agi);
-	if (error)
-		goto error_icur;
-
-	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
-	if (error)
-		goto error_icur;
-
-	/*
-	 * Both trees have now been updated. We must update the perag and
-	 * superblock before we can check the freecount for each btree.
-	 */
-	be32_add_cpu(&agi->agi_freecount, -1);
-	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-	pag->pagi_freecount--;
-
-	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
-
-	error = xfs_check_agi_freecount(icur, agi);
-	if (error)
-		goto error_icur;
-	error = xfs_check_agi_freecount(cur, agi);
-	if (error)
-		goto error_icur;
-
-	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
-	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-	xfs_perag_put(pag);
-	*inop = ino;
-	return 0;
-
-error_icur:
-	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
-error_cur:
-	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-	xfs_perag_put(pag);
-	return error;
-}
-
-/*
- * Allocate an inode on disk.
- *
- * Mode is used to tell whether the new inode will need space, and whether it
- * is a directory.
- *
- * This function is designed to be called twice if it has to do an allocation
- * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
- * If an inode is available without having to performn an allocation, an inode
- * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
- * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
- * The caller should then commit the current transaction, allocate a
- * new transaction, and call xfs_dialloc() again, passing in the previous value
- * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
- * buffer is locked across the two calls, the second call is guaranteed to have
- * a free inode available.
- *
- * Once we successfully pick an inode its number is returned and the on-disk
- * data structures are updated.  The inode itself is not read in, since doing so
- * would break ordering constraints with xfs_reclaim.
- */
-int
-xfs_dialloc(
-	struct xfs_trans	*tp,
-	xfs_ino_t		parent,
-	umode_t			mode,
-	int			okalloc,
-	struct xfs_buf		**IO_agbp,
-	xfs_ino_t		*inop)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_buf		*agbp;
-	xfs_agnumber_t		agno;
-	int			error;
-	int			ialloced;
-	int			noroom = 0;
-	xfs_agnumber_t		start_agno;
-	struct xfs_perag	*pag;
-
-	if (*IO_agbp) {
-		/*
-		 * If the caller passes in a pointer to the AGI buffer,
-		 * continue where we left off before.  In this case, we
-		 * know that the allocation group has free inodes.
-		 */
-		agbp = *IO_agbp;
-		goto out_alloc;
-	}
-
-	/*
-	 * We do not have an agbp, so select an initial allocation
-	 * group for inode allocation.
-	 */
-	start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
-	if (start_agno == NULLAGNUMBER) {
-		*inop = NULLFSINO;
-		return 0;
-	}
-
-	/*
-	 * If we have already hit the ceiling of inode blocks then clear
-	 * okalloc so we scan all available agi structures for a free
-	 * inode.
-	 */
-	if (mp->m_maxicount &&
-	    mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
-		noroom = 1;
-		okalloc = 0;
-	}
-
-	/*
-	 * Loop until we find an allocation group that either has free inodes
-	 * or in which we can allocate some inodes.  Iterate through the
-	 * allocation groups upward, wrapping at the end.
-	 */
-	agno = start_agno;
-	for (;;) {
-		pag = xfs_perag_get(mp, agno);
-		if (!pag->pagi_inodeok) {
-			xfs_ialloc_next_ag(mp);
-			goto nextag;
-		}
-
-		if (!pag->pagi_init) {
-			error = xfs_ialloc_pagi_init(mp, tp, agno);
-			if (error)
-				goto out_error;
-		}
-
-		/*
-		 * Do a first racy fast path check if this AG is usable.
-		 */
-		if (!pag->pagi_freecount && !okalloc)
-			goto nextag;
-
-		/*
-		 * Then read in the AGI buffer and recheck with the AGI buffer
-		 * lock held.
-		 */
-		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-		if (error)
-			goto out_error;
-
-		if (pag->pagi_freecount) {
-			xfs_perag_put(pag);
-			goto out_alloc;
-		}
-
-		if (!okalloc)
-			goto nextag_relse_buffer;
-
-
-		error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
-		if (error) {
-			xfs_trans_brelse(tp, agbp);
-
-			if (error != ENOSPC)
-				goto out_error;
-
-			xfs_perag_put(pag);
-			*inop = NULLFSINO;
-			return 0;
-		}
-
-		if (ialloced) {
-			/*
-			 * We successfully allocated some inodes, return
-			 * the current context to the caller so that it
-			 * can commit the current transaction and call
-			 * us again where we left off.
-			 */
-			ASSERT(pag->pagi_freecount > 0);
-			xfs_perag_put(pag);
-
-			*IO_agbp = agbp;
-			*inop = NULLFSINO;
-			return 0;
-		}
-
-nextag_relse_buffer:
-		xfs_trans_brelse(tp, agbp);
-nextag:
-		xfs_perag_put(pag);
-		if (++agno == mp->m_sb.sb_agcount)
-			agno = 0;
-		if (agno == start_agno) {
-			*inop = NULLFSINO;
-			return noroom ? ENOSPC : 0;
-		}
-	}
-
-out_alloc:
-	*IO_agbp = NULL;
-	return xfs_dialloc_ag(tp, agbp, parent, inop);
-out_error:
-	xfs_perag_put(pag);
-	return error;
-}
-
-STATIC int
-xfs_difree_inobt(
-	struct xfs_mount		*mp,
-	struct xfs_trans		*tp,
-	struct xfs_buf			*agbp,
-	xfs_agino_t			agino,
-	struct xfs_bmap_free		*flist,
-	int				*deleted,
-	xfs_ino_t			*first_ino,
-	struct xfs_inobt_rec_incore	*orec)
-{
-	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
-	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
-	struct xfs_perag		*pag;
-	struct xfs_btree_cur		*cur;
-	struct xfs_inobt_rec_incore	rec;
-	int				ilen;
-	int				error;
-	int				i;
-	int				off;
-
-	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
-	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
-
-	/*
-	 * Initialize the cursor.
-	 */
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-
-	error = xfs_check_agi_freecount(cur, agi);
-	if (error)
-		goto error0;
-
-	/*
-	 * Look for the entry describing this inode.
-	 */
-	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
-		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
-			__func__, error);
-		goto error0;
-	}
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	error = xfs_inobt_get_rec(cur, &rec, &i);
-	if (error) {
-		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
-			__func__, error);
-		goto error0;
-	}
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	/*
-	 * Get the offset in the inode chunk.
-	 */
-	off = agino - rec.ir_startino;
-	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
-	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
-	/*
-	 * Mark the inode free & increment the count.
-	 */
-	rec.ir_free |= XFS_INOBT_MASK(off);
-	rec.ir_freecount++;
-
-	/*
-	 * When an inode cluster is free, it becomes eligible for removal
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
-	    (rec.ir_freecount == mp->m_ialloc_inos)) {
-
-		*deleted = 1;
-		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
-
-		/*
-		 * Remove the inode cluster from the AGI B+Tree, adjust the
-		 * AGI and Superblock inode counts, and mark the disk space
-		 * to be freed when the transaction is committed.
-		 */
-		ilen = mp->m_ialloc_inos;
-		be32_add_cpu(&agi->agi_count, -ilen);
-		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
-		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
-		pag = xfs_perag_get(mp, agno);
-		pag->pagi_freecount -= ilen - 1;
-		xfs_perag_put(pag);
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
-
-		if ((error = xfs_btree_delete(cur, &i))) {
-			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
-				__func__, error);
-			goto error0;
-		}
-
-		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-				  XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
-				  mp->m_ialloc_blks, flist, mp);
-	} else {
-		*deleted = 0;
-
-		error = xfs_inobt_update(cur, &rec);
-		if (error) {
-			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
-				__func__, error);
-			goto error0;
-		}
-
-		/* 
-		 * Change the inode free counts and log the ag/sb changes.
-		 */
-		be32_add_cpu(&agi->agi_freecount, 1);
-		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-		pag = xfs_perag_get(mp, agno);
-		pag->pagi_freecount++;
-		xfs_perag_put(pag);
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
-	}
-
-	error = xfs_check_agi_freecount(cur, agi);
-	if (error)
-		goto error0;
-
-	*orec = rec;
-	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-	return 0;
-
-error0:
-	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Free an inode in the free inode btree.
- */
-STATIC int
-xfs_difree_finobt(
-	struct xfs_mount		*mp,
-	struct xfs_trans		*tp,
-	struct xfs_buf			*agbp,
-	xfs_agino_t			agino,
-	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */
-{
-	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
-	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
-	struct xfs_btree_cur		*cur;
-	struct xfs_inobt_rec_incore	rec;
-	int				offset = agino - ibtrec->ir_startino;
-	int				error;
-	int				i;
-
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
-
-	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
-	if (error)
-		goto error;
-	if (i == 0) {
-		/*
-		 * If the record does not exist in the finobt, we must have just
-		 * freed an inode in a previously fully allocated chunk. If not,
-		 * something is out of sync.
-		 */
-		XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
-
-		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
-					     ibtrec->ir_free, &i);
-		if (error)
-			goto error;
-		ASSERT(i == 1);
-
-		goto out;
-	}
-
-	/*
-	 * Read and update the existing record. We could just copy the ibtrec
-	 * across here, but that would defeat the purpose of having redundant
-	 * metadata. By making the modifications independently, we can catch
-	 * corruptions that we wouldn't see if we just copied from one record
-	 * to another.
-	 */
-	error = xfs_inobt_get_rec(cur, &rec, &i);
-	if (error)
-		goto error;
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error);
-
-	rec.ir_free |= XFS_INOBT_MASK(offset);
-	rec.ir_freecount++;
-
-	XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
-				(rec.ir_freecount == ibtrec->ir_freecount),
-				error);
-
-	/*
-	 * The content of inobt records should always match between the inobt
-	 * and finobt. The lifecycle of records in the finobt is different from
-	 * the inobt in that the finobt only tracks records with at least one
-	 * free inode. Hence, if all of the inodes are free and we aren't
-	 * keeping inode chunks permanently on disk, remove the record.
-	 * Otherwise, update the record with the new information.
-	 */
-	if (rec.ir_freecount == mp->m_ialloc_inos &&
-	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
-		error = xfs_btree_delete(cur, &i);
-		if (error)
-			goto error;
-		ASSERT(i == 1);
-	} else {
-		error = xfs_inobt_update(cur, &rec);
-		if (error)
-			goto error;
-	}
-
-out:
-	error = xfs_check_agi_freecount(cur, agi);
-	if (error)
-		goto error;
-
-	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-	return 0;
-
-error:
-	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-	return error;
-}
-
-/*
- * Free disk inode.  Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int
-xfs_difree(
-	struct xfs_trans	*tp,		/* transaction pointer */
-	xfs_ino_t		inode,		/* inode to be freed */
-	struct xfs_bmap_free	*flist,		/* extents to free */
-	int			*deleted,/* set if inode cluster was deleted */
-	xfs_ino_t		*first_ino)/* first inode in deleted cluster */
-{
-	/* REFERENCED */
-	xfs_agblock_t		agbno;	/* block number containing inode */
-	struct xfs_buf		*agbp;	/* buffer for allocation group header */
-	xfs_agino_t		agino;	/* allocation group inode number */
-	xfs_agnumber_t		agno;	/* allocation group number */
-	int			error;	/* error return value */
-	struct xfs_mount	*mp;	/* mount structure for filesystem */
-	struct xfs_inobt_rec_incore rec;/* btree record */
-
-	mp = tp->t_mountp;
-
-	/*
-	 * Break up inode number into its components.
-	 */
-	agno = XFS_INO_TO_AGNO(mp, inode);
-	if (agno >= mp->m_sb.sb_agcount)  {
-		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
-			__func__, agno, mp->m_sb.sb_agcount);
-		ASSERT(0);
-		return EINVAL;
-	}
-	agino = XFS_INO_TO_AGINO(mp, inode);
-	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
-		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
-			__func__, (unsigned long long)inode,
-			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
-		ASSERT(0);
-		return EINVAL;
-	}
-	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-	if (agbno >= mp->m_sb.sb_agblocks)  {
-		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
-			__func__, agbno, mp->m_sb.sb_agblocks);
-		ASSERT(0);
-		return EINVAL;
-	}
-	/*
-	 * Get the allocation group header.
-	 */
-	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-	if (error) {
-		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
-			__func__, error);
-		return error;
-	}
-
-	/*
-	 * Fix up the inode allocation btree.
-	 */
-	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
-				 &rec);
-	if (error)
-		goto error0;
-
-	/*
-	 * Fix up the free inode btree.
-	 */
-	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
-		error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
-		if (error)
-			goto error0;
-	}
-
-	return 0;
-
-error0:
-	return error;
-}
-
-STATIC int
-xfs_imap_lookup(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
-	xfs_agino_t		agino,
-	xfs_agblock_t		agbno,
-	xfs_agblock_t		*chunk_agbno,
-	xfs_agblock_t		*offset_agbno,
-	int			flags)
-{
-	struct xfs_inobt_rec_incore rec;
-	struct xfs_btree_cur	*cur;
-	struct xfs_buf		*agbp;
-	int			error;
-	int			i;
-
-	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-	if (error) {
-		xfs_alert(mp,
-			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
-			__func__, error, agno);
-		return error;
-	}
-
-	/*
-	 * Lookup the inode record for the given agino. If the record cannot be
-	 * found, then it's an invalid inode number and we should abort. Once
-	 * we have a record, we need to ensure it contains the inode number
-	 * we are looking up.
-	 */
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
-	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-	if (!error) {
-		if (i)
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-		if (!error && i == 0)
-			error = EINVAL;
-	}
-
-	xfs_trans_brelse(tp, agbp);
-	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-	if (error)
-		return error;
-
-	/* check that the returned record contains the required inode */
-	if (rec.ir_startino > agino ||
-	    rec.ir_startino + mp->m_ialloc_inos <= agino)
-		return EINVAL;
-
-	/* for untrusted inodes check it is allocated first */
-	if ((flags & XFS_IGET_UNTRUSTED) &&
-	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
-		return EINVAL;
-
-	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
-	*offset_agbno = agbno - *chunk_agbno;
-	return 0;
-}
-
-/*
- * Return the location of the inode in imap, for mapping it into a buffer.
- */
-int
-xfs_imap(
-	xfs_mount_t	 *mp,	/* file system mount structure */
-	xfs_trans_t	 *tp,	/* transaction pointer */
-	xfs_ino_t	ino,	/* inode to locate */
-	struct xfs_imap	*imap,	/* location map structure */
-	uint		flags)	/* flags for inode btree lookup */
-{
-	xfs_agblock_t	agbno;	/* block number of inode in the alloc group */
-	xfs_agino_t	agino;	/* inode number within alloc group */
-	xfs_agnumber_t	agno;	/* allocation group number */
-	int		blks_per_cluster; /* num blocks per inode cluster */
-	xfs_agblock_t	chunk_agbno;	/* first block in inode chunk */
-	xfs_agblock_t	cluster_agbno;	/* first block in inode cluster */
-	int		error;	/* error code */
-	int		offset;	/* index of inode in its buffer */
-	xfs_agblock_t	offset_agbno;	/* blks from chunk start to inode */
-
-	ASSERT(ino != NULLFSINO);
-
-	/*
-	 * Split up the inode number into its parts.
-	 */
-	agno = XFS_INO_TO_AGNO(mp, ino);
-	agino = XFS_INO_TO_AGINO(mp, ino);
-	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
-	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
-#ifdef DEBUG
-		/*
-		 * Don't output diagnostic information for untrusted inodes
-		 * as they can be invalid without implying corruption.
-		 */
-		if (flags & XFS_IGET_UNTRUSTED)
-			return EINVAL;
-		if (agno >= mp->m_sb.sb_agcount) {
-			xfs_alert(mp,
-				"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
-				__func__, agno, mp->m_sb.sb_agcount);
-		}
-		if (agbno >= mp->m_sb.sb_agblocks) {
-			xfs_alert(mp,
-		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
-				__func__, (unsigned long long)agbno,
-				(unsigned long)mp->m_sb.sb_agblocks);
-		}
-		if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
-			xfs_alert(mp,
-		"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
-				__func__, ino,
-				XFS_AGINO_TO_INO(mp, agno, agino));
-		}
-		xfs_stack_trace();
-#endif /* DEBUG */
-		return EINVAL;
-	}
-
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-
-	/*
-	 * For bulkstat and handle lookups, we have an untrusted inode number
-	 * that we have to verify is valid. We cannot do this just by reading
-	 * the inode buffer as it may have been unlinked and removed leaving
-	 * inodes in stale state on disk. Hence we have to do a btree lookup
-	 * in all cases where an untrusted inode number is passed.
-	 */
-	if (flags & XFS_IGET_UNTRUSTED) {
-		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
-					&chunk_agbno, &offset_agbno, flags);
-		if (error)
-			return error;
-		goto out_map;
-	}
-
-	/*
-	 * If the inode cluster size is the same as the blocksize or
-	 * smaller we get to the buffer by simple arithmetics.
-	 */
-	if (blks_per_cluster == 1) {
-		offset = XFS_INO_TO_OFFSET(mp, ino);
-		ASSERT(offset < mp->m_sb.sb_inopblock);
-
-		imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
-		imap->im_len = XFS_FSB_TO_BB(mp, 1);
-		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
-		return 0;
-	}
-
-	/*
-	 * If the inode chunks are aligned then use simple maths to
-	 * find the location. Otherwise we have to do a btree
-	 * lookup to find the location.
-	 */
-	if (mp->m_inoalign_mask) {
-		offset_agbno = agbno & mp->m_inoalign_mask;
-		chunk_agbno = agbno - offset_agbno;
-	} else {
-		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
-					&chunk_agbno, &offset_agbno, flags);
-		if (error)
-			return error;
-	}
-
-out_map:
-	ASSERT(agbno >= chunk_agbno);
-	cluster_agbno = chunk_agbno +
-		((offset_agbno / blks_per_cluster) * blks_per_cluster);
-	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
-		XFS_INO_TO_OFFSET(mp, ino);
-
-	imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
-	imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
-	imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
-
-	/*
-	 * If the inode number maps to a block outside the bounds
-	 * of the file system then return NULL rather than calling
-	 * read_buf and panicing when we get an error from the
-	 * driver.
-	 */
-	if ((imap->im_blkno + imap->im_len) >
-	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-		xfs_alert(mp,
-	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
-			__func__, (unsigned long long) imap->im_blkno,
-			(unsigned long long) imap->im_len,
-			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-		return EINVAL;
-	}
-	return 0;
-}
-
-/*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
-	xfs_mount_t	*mp)		/* file system mount structure */
-{
-	int		level;
-	uint		maxblocks;
-	uint		maxleafents;
-	int		minleafrecs;
-	int		minnoderecs;
-
-	maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
-		XFS_INODES_PER_CHUNK_LOG;
-	minleafrecs = mp->m_alloc_mnr[0];
-	minnoderecs = mp->m_alloc_mnr[1];
-	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-	for (level = 1; maxblocks > 1; level++)
-		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-	mp->m_in_maxlevels = level;
-}
-
-/*
- * Log specified fields for the ag hdr (inode section). The growth of the agi
- * structure over time requires that we interpret the buffer as two logical
- * regions delineated by the end of the unlinked list. This is due to the size
- * of the hash table and its location in the middle of the agi.
- *
- * For example, a request to log a field before agi_unlinked and a field after
- * agi_unlinked could cause us to log the entire hash table and use an excessive
- * amount of log space. To avoid this behavior, log the region up through
- * agi_unlinked in one call and the region after agi_unlinked through the end of
- * the structure in another.
- */
-void
-xfs_ialloc_log_agi(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_buf_t	*bp,		/* allocation group header buffer */
-	int		fields)		/* bitmask of fields to log */
-{
-	int			first;		/* first byte number */
-	int			last;		/* last byte number */
-	static const short	offsets[] = {	/* field starting offsets */
-					/* keep in sync with bit definitions */
-		offsetof(xfs_agi_t, agi_magicnum),
-		offsetof(xfs_agi_t, agi_versionnum),
-		offsetof(xfs_agi_t, agi_seqno),
-		offsetof(xfs_agi_t, agi_length),
-		offsetof(xfs_agi_t, agi_count),
-		offsetof(xfs_agi_t, agi_root),
-		offsetof(xfs_agi_t, agi_level),
-		offsetof(xfs_agi_t, agi_freecount),
-		offsetof(xfs_agi_t, agi_newino),
-		offsetof(xfs_agi_t, agi_dirino),
-		offsetof(xfs_agi_t, agi_unlinked),
-		offsetof(xfs_agi_t, agi_free_root),
-		offsetof(xfs_agi_t, agi_free_level),
-		sizeof(xfs_agi_t)
-	};
-#ifdef DEBUG
-	xfs_agi_t		*agi;	/* allocation group header */
-
-	agi = XFS_BUF_TO_AGI(bp);
-	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
-#endif
-
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
-
-	/*
-	 * Compute byte offsets for the first and last fields in the first
-	 * region and log the agi buffer. This only logs up through
-	 * agi_unlinked.
-	 */
-	if (fields & XFS_AGI_ALL_BITS_R1) {
-		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
-				  &first, &last);
-		xfs_trans_log_buf(tp, bp, first, last);
-	}
-
-	/*
-	 * Mask off the bits in the first region and calculate the first and
-	 * last field offsets for any bits in the second region.
-	 */
-	fields &= ~XFS_AGI_ALL_BITS_R1;
-	if (fields) {
-		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
-				  &first, &last);
-		xfs_trans_log_buf(tp, bp, first, last);
-	}
-}
-
-#ifdef DEBUG
-STATIC void
-xfs_check_agi_unlinked(
-	struct xfs_agi		*agi)
-{
-	int			i;
-
-	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
-		ASSERT(agi->agi_unlinked[i]);
-}
-#else
-#define xfs_check_agi_unlinked(agi)
-#endif
-
-static bool
-xfs_agi_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
-			return false;
-	/*
-	 * Validate the magic number of the agi block.
-	 */
-	if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
-		return false;
-	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
-		return false;
-
-	/*
-	 * during growfs operations, the perag is not fully initialised,
-	 * so we can't use it for any useful checking. growfs ensures we can't
-	 * use it by using uncached buffers that don't have the perag attached
-	 * so we can detect and avoid this problem.
-	 */
-	if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
-		return false;
-
-	xfs_check_agi_unlinked(agi);
-	return true;
-}
-
-static void
-xfs_agi_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
-				XFS_ERRTAG_IALLOC_READ_AGI,
-				XFS_RANDOM_IALLOC_READ_AGI))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-xfs_agi_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-
-	if (!xfs_agi_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (bip)
-		XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_agi_buf_ops = {
-	.verify_read = xfs_agi_read_verify,
-	.verify_write = xfs_agi_write_verify,
-};
-
-/*
- * Read in the allocation group header (inode allocation section)
- */
-int
-xfs_read_agi(
-	struct xfs_mount	*mp,	/* file system mount structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	struct xfs_buf		**bpp)	/* allocation group hdr buf */
-{
-	int			error;
-
-	trace_xfs_read_agi(mp, agno);
-
-	ASSERT(agno != NULLAGNUMBER);
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
-	if (error)
-		return error;
-
-	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-	return 0;
-}
-
-int
-xfs_ialloc_read_agi(
-	struct xfs_mount	*mp,	/* file system mount structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	struct xfs_buf		**bpp)	/* allocation group hdr buf */
-{
-	struct xfs_agi		*agi;	/* allocation group header */
-	struct xfs_perag	*pag;	/* per allocation group data */
-	int			error;
-
-	trace_xfs_ialloc_read_agi(mp, agno);
-
-	error = xfs_read_agi(mp, tp, agno, bpp);
-	if (error)
-		return error;
-
-	agi = XFS_BUF_TO_AGI(*bpp);
-	pag = xfs_perag_get(mp, agno);
-	if (!pag->pagi_init) {
-		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
-		pag->pagi_count = be32_to_cpu(agi->agi_count);
-		pag->pagi_init = 1;
-	}
-
-	/*
-	 * It's possible for these to be out of sync if
-	 * we are in the middle of a forced shutdown.
-	 */
-	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
-		XFS_FORCED_SHUTDOWN(mp));
-	xfs_perag_put(pag);
-	return 0;
-}
-
-/*
- * Read in the agi to initialise the per-ag data in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno)		/* allocation group number */
-{
-	xfs_buf_t	*bp = NULL;
-	int		error;
-
-	error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
-	if (error)
-		return error;
-	if (bp)
-		xfs_trans_brelse(tp, bp);
-	return 0;
-}
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
deleted file mode 100644
index 726f83a681a5..000000000000
--- a/fs/xfs/xfs_ialloc_btree.c
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-
-
-STATIC int
-xfs_inobt_get_minrecs(
-	struct xfs_btree_cur	*cur,
-	int			level)
-{
-	return cur->bc_mp->m_inobt_mnr[level != 0];
-}
-
-STATIC struct xfs_btree_cur *
-xfs_inobt_dup_cursor(
-	struct xfs_btree_cur	*cur)
-{
-	return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.a.agbp, cur->bc_private.a.agno,
-			cur->bc_btnum);
-}
-
-STATIC void
-xfs_inobt_set_root(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*nptr,
-	int			inc)	/* level change */
-{
-	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
-
-	agi->agi_root = nptr->s;
-	be32_add_cpu(&agi->agi_level, inc);
-	xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
-}
-
-STATIC void
-xfs_finobt_set_root(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*nptr,
-	int			inc)	/* level change */
-{
-	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
-
-	agi->agi_free_root = nptr->s;
-	be32_add_cpu(&agi->agi_free_level, inc);
-	xfs_ialloc_log_agi(cur->bc_tp, agbp,
-			   XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
-}
-
-STATIC int
-xfs_inobt_alloc_block(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*start,
-	union xfs_btree_ptr	*new,
-	int			*stat)
-{
-	xfs_alloc_arg_t		args;		/* block allocation args */
-	int			error;		/* error return value */
-	xfs_agblock_t		sbno = be32_to_cpu(start->s);
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-
-	memset(&args, 0, sizeof(args));
-	args.tp = cur->bc_tp;
-	args.mp = cur->bc_mp;
-	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
-	args.minlen = 1;
-	args.maxlen = 1;
-	args.prod = 1;
-	args.type = XFS_ALLOCTYPE_NEAR_BNO;
-
-	error = xfs_alloc_vextent(&args);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-	if (args.fsbno == NULLFSBLOCK) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-		*stat = 0;
-		return 0;
-	}
-	ASSERT(args.len == 1);
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-
-	new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
-	*stat = 1;
-	return 0;
-}
-
-STATIC int
-xfs_inobt_free_block(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp)
-{
-	xfs_fsblock_t		fsbno;
-	int			error;
-
-	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
-	error = xfs_free_extent(cur->bc_tp, fsbno, 1);
-	if (error)
-		return error;
-
-	xfs_trans_binval(cur->bc_tp, bp);
-	return error;
-}
-
-STATIC int
-xfs_inobt_get_maxrecs(
-	struct xfs_btree_cur	*cur,
-	int			level)
-{
-	return cur->bc_mp->m_inobt_mxr[level != 0];
-}
-
-STATIC void
-xfs_inobt_init_key_from_rec(
-	union xfs_btree_key	*key,
-	union xfs_btree_rec	*rec)
-{
-	key->inobt.ir_startino = rec->inobt.ir_startino;
-}
-
-STATIC void
-xfs_inobt_init_rec_from_key(
-	union xfs_btree_key	*key,
-	union xfs_btree_rec	*rec)
-{
-	rec->inobt.ir_startino = key->inobt.ir_startino;
-}
-
-STATIC void
-xfs_inobt_init_rec_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*rec)
-{
-	rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-	rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
-	rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
-}
-
-/*
- * initial value of ptr for lookup
- */
-STATIC void
-xfs_inobt_init_ptr_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-
-	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
-
-	ptr->s = agi->agi_root;
-}
-
-STATIC void
-xfs_finobt_init_ptr_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-
-	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
-	ptr->s = agi->agi_free_root;
-}
-
-STATIC __int64_t
-xfs_inobt_key_diff(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*key)
-{
-	return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
-			  cur->bc_rec.i.ir_startino;
-}
-
-static int
-xfs_inobt_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_perag	*pag = bp->b_pag;
-	unsigned int		level;
-
-	/*
-	 * During growfs operations, we can't verify the exact owner as the
-	 * perag is not fully initialised and hence not attached to the buffer.
-	 *
-	 * Similarly, during log recovery we will have a perag structure
-	 * attached, but the agi information will not yet have been initialised
-	 * from the on disk AGI. We don't currently use any of this information,
-	 * but beware of the landmine (i.e. need to check pag->pagi_init) if we
-	 * ever do.
-	 */
-	switch (block->bb_magic) {
-	case cpu_to_be32(XFS_IBT_CRC_MAGIC):
-	case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			return false;
-		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
-			return false;
-		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-			return false;
-		if (pag &&
-		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
-			return false;
-		/* fall through */
-	case cpu_to_be32(XFS_IBT_MAGIC):
-	case cpu_to_be32(XFS_FIBT_MAGIC):
-		break;
-	default:
-		return 0;
-	}
-
-	/* numrecs and level verification */
-	level = be16_to_cpu(block->bb_level);
-	if (level >= mp->m_in_maxlevels)
-		return false;
-	if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
-		return false;
-
-	/* sibling pointer verification */
-	if (!block->bb_u.s.bb_leftsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
-	if (!block->bb_u.s.bb_rightsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
-
-	return true;
-}
-
-static void
-xfs_inobt_read_verify(
-	struct xfs_buf	*bp)
-{
-	if (!xfs_btree_sblock_verify_crc(bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_inobt_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_verifier_error(bp);
-	}
-}
-
-static void
-xfs_inobt_write_verify(
-	struct xfs_buf	*bp)
-{
-	if (!xfs_inobt_verify(bp)) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-	xfs_btree_sblock_calc_crc(bp);
-
-}
-
-const struct xfs_buf_ops xfs_inobt_buf_ops = {
-	.verify_read = xfs_inobt_read_verify,
-	.verify_write = xfs_inobt_write_verify,
-};
-
-#if defined(DEBUG) || defined(XFS_WARN)
-STATIC int
-xfs_inobt_keys_inorder(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_key	*k1,
-	union xfs_btree_key	*k2)
-{
-	return be32_to_cpu(k1->inobt.ir_startino) <
-		be32_to_cpu(k2->inobt.ir_startino);
-}
-
-STATIC int
-xfs_inobt_recs_inorder(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_rec	*r1,
-	union xfs_btree_rec	*r2)
-{
-	return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
-		be32_to_cpu(r2->inobt.ir_startino);
-}
-#endif	/* DEBUG */
-
-static const struct xfs_btree_ops xfs_inobt_ops = {
-	.rec_len		= sizeof(xfs_inobt_rec_t),
-	.key_len		= sizeof(xfs_inobt_key_t),
-
-	.dup_cursor		= xfs_inobt_dup_cursor,
-	.set_root		= xfs_inobt_set_root,
-	.alloc_block		= xfs_inobt_alloc_block,
-	.free_block		= xfs_inobt_free_block,
-	.get_minrecs		= xfs_inobt_get_minrecs,
-	.get_maxrecs		= xfs_inobt_get_maxrecs,
-	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
-	.init_rec_from_key	= xfs_inobt_init_rec_from_key,
-	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
-	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
-	.key_diff		= xfs_inobt_key_diff,
-	.buf_ops		= &xfs_inobt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-	.keys_inorder		= xfs_inobt_keys_inorder,
-	.recs_inorder		= xfs_inobt_recs_inorder,
-#endif
-};
-
-static const struct xfs_btree_ops xfs_finobt_ops = {
-	.rec_len		= sizeof(xfs_inobt_rec_t),
-	.key_len		= sizeof(xfs_inobt_key_t),
-
-	.dup_cursor		= xfs_inobt_dup_cursor,
-	.set_root		= xfs_finobt_set_root,
-	.alloc_block		= xfs_inobt_alloc_block,
-	.free_block		= xfs_inobt_free_block,
-	.get_minrecs		= xfs_inobt_get_minrecs,
-	.get_maxrecs		= xfs_inobt_get_maxrecs,
-	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
-	.init_rec_from_key	= xfs_inobt_init_rec_from_key,
-	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
-	.init_ptr_from_cur	= xfs_finobt_init_ptr_from_cur,
-	.key_diff		= xfs_inobt_key_diff,
-	.buf_ops		= &xfs_inobt_buf_ops,
-#if defined(DEBUG) || defined(XFS_WARN)
-	.keys_inorder		= xfs_inobt_keys_inorder,
-	.recs_inorder		= xfs_inobt_recs_inorder,
-#endif
-};
-
-/*
- * Allocate a new inode btree cursor.
- */
-struct xfs_btree_cur *				/* new inode btree cursor */
-xfs_inobt_init_cursor(
-	struct xfs_mount	*mp,		/* file system mount point */
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_buf		*agbp,		/* buffer for agi structure */
-	xfs_agnumber_t		agno,		/* allocation group number */
-	xfs_btnum_t		btnum)		/* ialloc or free ino btree */
-{
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
-	struct xfs_btree_cur	*cur;
-
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-
-	cur->bc_tp = tp;
-	cur->bc_mp = mp;
-	cur->bc_btnum = btnum;
-	if (btnum == XFS_BTNUM_INO) {
-		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
-		cur->bc_ops = &xfs_inobt_ops;
-	} else {
-		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
-		cur->bc_ops = &xfs_finobt_ops;
-	}
-
-	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
-	cur->bc_private.a.agbp = agbp;
-	cur->bc_private.a.agno = agno;
-
-	return cur;
-}
-
-/*
- * Calculate number of records in an inobt btree block.
- */
-int
-xfs_inobt_maxrecs(
-	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
-{
-	blocklen -= XFS_INOBT_BLOCK_LEN(mp);
-
-	if (leaf)
-		return blocklen / sizeof(xfs_inobt_rec_t);
-	return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
-}
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
deleted file mode 100644
index 1e5366d7745e..000000000000
--- a/fs/xfs/xfs_inode_buf.c
+++ /dev/null
@@ -1,479 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_icache.h"
-#include "xfs_trans.h"
-#include "xfs_ialloc.h"
-#include "xfs_dinode.h"
-
-/*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
- */
-#if defined(DEBUG)
-void
-xfs_inobp_check(
-	xfs_mount_t	*mp,
-	xfs_buf_t	*bp)
-{
-	int		i;
-	int		j;
-	xfs_dinode_t	*dip;
-
-	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-
-	for (i = 0; i < j; i++) {
-		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-					i * mp->m_sb.sb_inodesize);
-		if (!dip->di_next_unlinked)  {
-			xfs_alert(mp,
-	"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
-				i, (long long)bp->b_bn);
-		}
-	}
-}
-#endif
-
-/*
- * If we are doing readahead on an inode buffer, we might be in log recovery
- * reading an inode allocation buffer that hasn't yet been replayed, and hence
- * has not had the inode cores stamped into it. Hence for readahead, the buffer
- * may be potentially invalid.
- *
- * If the readahead buffer is invalid, we don't want to mark it with an error,
- * but we do want to clear the DONE status of the buffer so that a followup read
- * will re-read it from disk. This will ensure that we don't get an unnecessary
- * warnings during log recovery and we don't get unnecssary panics on debug
- * kernels.
- */
-static void
-xfs_inode_buf_verify(
-	struct xfs_buf	*bp,
-	bool		readahead)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	int		i;
-	int		ni;
-
-	/*
-	 * Validate the magic number and version of every inode in the buffer
-	 */
-	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
-	for (i = 0; i < ni; i++) {
-		int		di_ok;
-		xfs_dinode_t	*dip;
-
-		dip = (struct xfs_dinode *)xfs_buf_offset(bp,
-					(i << mp->m_sb.sb_inodelog));
-		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
-			    XFS_DINODE_GOOD_VERSION(dip->di_version);
-		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-						XFS_ERRTAG_ITOBP_INOTOBP,
-						XFS_RANDOM_ITOBP_INOTOBP))) {
-			if (readahead) {
-				bp->b_flags &= ~XBF_DONE;
-				return;
-			}
-
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
-			xfs_verifier_error(bp);
-#ifdef DEBUG
-			xfs_alert(mp,
-				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
-				(unsigned long long)bp->b_bn, i,
-				be16_to_cpu(dip->di_magic));
-#endif
-		}
-	}
-	xfs_inobp_check(mp, bp);
-}
-
-
-static void
-xfs_inode_buf_read_verify(
-	struct xfs_buf	*bp)
-{
-	xfs_inode_buf_verify(bp, false);
-}
-
-static void
-xfs_inode_buf_readahead_verify(
-	struct xfs_buf	*bp)
-{
-	xfs_inode_buf_verify(bp, true);
-}
-
-static void
-xfs_inode_buf_write_verify(
-	struct xfs_buf	*bp)
-{
-	xfs_inode_buf_verify(bp, false);
-}
-
-const struct xfs_buf_ops xfs_inode_buf_ops = {
-	.verify_read = xfs_inode_buf_read_verify,
-	.verify_write = xfs_inode_buf_write_verify,
-};
-
-const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
-	.verify_read = xfs_inode_buf_readahead_verify,
-	.verify_write = xfs_inode_buf_write_verify,
-};
-
-
-/*
- * This routine is called to map an inode to the buffer containing the on-disk
- * version of the inode.  It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
- * pointer to the on-disk inode within that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and dipp are
- * undefined.
- */
-int
-xfs_imap_to_bp(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_imap		*imap,
-	struct xfs_dinode       **dipp,
-	struct xfs_buf		**bpp,
-	uint			buf_flags,
-	uint			iget_flags)
-{
-	struct xfs_buf		*bp;
-	int			error;
-
-	buf_flags |= XBF_UNMAPPED;
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-				   (int)imap->im_len, buf_flags, &bp,
-				   &xfs_inode_buf_ops);
-	if (error) {
-		if (error == EAGAIN) {
-			ASSERT(buf_flags & XBF_TRYLOCK);
-			return error;
-		}
-
-		if (error == EFSCORRUPTED &&
-		    (iget_flags & XFS_IGET_UNTRUSTED))
-			return EINVAL;
-
-		xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
-			__func__, error);
-		return error;
-	}
-
-	*bpp = bp;
-	*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
-	return 0;
-}
-
-void
-xfs_dinode_from_disk(
-	xfs_icdinode_t		*to,
-	xfs_dinode_t		*from)
-{
-	to->di_magic = be16_to_cpu(from->di_magic);
-	to->di_mode = be16_to_cpu(from->di_mode);
-	to->di_version = from ->di_version;
-	to->di_format = from->di_format;
-	to->di_onlink = be16_to_cpu(from->di_onlink);
-	to->di_uid = be32_to_cpu(from->di_uid);
-	to->di_gid = be32_to_cpu(from->di_gid);
-	to->di_nlink = be32_to_cpu(from->di_nlink);
-	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
-	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
-	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-	to->di_flushiter = be16_to_cpu(from->di_flushiter);
-	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
-	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
-	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
-	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
-	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
-	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
-	to->di_size = be64_to_cpu(from->di_size);
-	to->di_nblocks = be64_to_cpu(from->di_nblocks);
-	to->di_extsize = be32_to_cpu(from->di_extsize);
-	to->di_nextents = be32_to_cpu(from->di_nextents);
-	to->di_anextents = be16_to_cpu(from->di_anextents);
-	to->di_forkoff = from->di_forkoff;
-	to->di_aformat	= from->di_aformat;
-	to->di_dmevmask	= be32_to_cpu(from->di_dmevmask);
-	to->di_dmstate	= be16_to_cpu(from->di_dmstate);
-	to->di_flags	= be16_to_cpu(from->di_flags);
-	to->di_gen	= be32_to_cpu(from->di_gen);
-
-	if (to->di_version == 3) {
-		to->di_changecount = be64_to_cpu(from->di_changecount);
-		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
-		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
-		to->di_flags2 = be64_to_cpu(from->di_flags2);
-		to->di_ino = be64_to_cpu(from->di_ino);
-		to->di_lsn = be64_to_cpu(from->di_lsn);
-		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-		uuid_copy(&to->di_uuid, &from->di_uuid);
-	}
-}
-
-void
-xfs_dinode_to_disk(
-	xfs_dinode_t		*to,
-	xfs_icdinode_t		*from)
-{
-	to->di_magic = cpu_to_be16(from->di_magic);
-	to->di_mode = cpu_to_be16(from->di_mode);
-	to->di_version = from ->di_version;
-	to->di_format = from->di_format;
-	to->di_onlink = cpu_to_be16(from->di_onlink);
-	to->di_uid = cpu_to_be32(from->di_uid);
-	to->di_gid = cpu_to_be32(from->di_gid);
-	to->di_nlink = cpu_to_be32(from->di_nlink);
-	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
-	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
-	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
-	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
-	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
-	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
-	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
-	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-	to->di_size = cpu_to_be64(from->di_size);
-	to->di_nblocks = cpu_to_be64(from->di_nblocks);
-	to->di_extsize = cpu_to_be32(from->di_extsize);
-	to->di_nextents = cpu_to_be32(from->di_nextents);
-	to->di_anextents = cpu_to_be16(from->di_anextents);
-	to->di_forkoff = from->di_forkoff;
-	to->di_aformat = from->di_aformat;
-	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
-	to->di_dmstate = cpu_to_be16(from->di_dmstate);
-	to->di_flags = cpu_to_be16(from->di_flags);
-	to->di_gen = cpu_to_be32(from->di_gen);
-
-	if (from->di_version == 3) {
-		to->di_changecount = cpu_to_be64(from->di_changecount);
-		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
-		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
-		to->di_flags2 = cpu_to_be64(from->di_flags2);
-		to->di_ino = cpu_to_be64(from->di_ino);
-		to->di_lsn = cpu_to_be64(from->di_lsn);
-		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-		uuid_copy(&to->di_uuid, &from->di_uuid);
-		to->di_flushiter = 0;
-	} else {
-		to->di_flushiter = cpu_to_be16(from->di_flushiter);
-	}
-}
-
-static bool
-xfs_dinode_verify(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*ip,
-	struct xfs_dinode	*dip)
-{
-	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
-		return false;
-
-	/* only version 3 or greater inodes are extensively verified here */
-	if (dip->di_version < 3)
-		return true;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return false;
-	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
-			      XFS_DINODE_CRC_OFF))
-		return false;
-	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
-		return false;
-	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
-		return false;
-	return true;
-}
-
-void
-xfs_dinode_calc_crc(
-	struct xfs_mount	*mp,
-	struct xfs_dinode	*dip)
-{
-	__uint32_t		crc;
-
-	if (dip->di_version < 3)
-		return;
-
-	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
-			      XFS_DINODE_CRC_OFF);
-	dip->di_crc = xfs_end_cksum(crc);
-}
-
-/*
- * Read the disk inode attributes into the in-core inode structure.
- *
- * For version 5 superblocks, if we are initialising a new inode and we are not
- * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
- * inode core with a random generation number. If we are keeping inodes around,
- * we need to read the inode cluster to get the existing generation number off
- * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
- * format) then log recovery is dependent on the di_flushiter field being
- * initialised from the current on-disk value and hence we must also read the
- * inode off disk.
- */
-int
-xfs_iread(
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	uint		iget_flags)
-{
-	xfs_buf_t	*bp;
-	xfs_dinode_t	*dip;
-	int		error;
-
-	/*
-	 * Fill in the location information in the in-core inode.
-	 */
-	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
-	if (error)
-		return error;
-
-	/* shortcut IO on inode allocation if possible */
-	if ((iget_flags & XFS_IGET_CREATE) &&
-	    xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
-		/* initialise the on-disk inode core */
-		memset(&ip->i_d, 0, sizeof(ip->i_d));
-		ip->i_d.di_magic = XFS_DINODE_MAGIC;
-		ip->i_d.di_gen = prandom_u32();
-		if (xfs_sb_version_hascrc(&mp->m_sb)) {
-			ip->i_d.di_version = 3;
-			ip->i_d.di_ino = ip->i_ino;
-			uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
-		} else
-			ip->i_d.di_version = 2;
-		return 0;
-	}
-
-	/*
-	 * Get pointers to the on-disk inode and the buffer containing it.
-	 */
-	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
-	if (error)
-		return error;
-
-	/* even unallocated inodes are verified */
-	if (!xfs_dinode_verify(mp, ip, dip)) {
-		xfs_alert(mp, "%s: validation failed for inode %lld failed",
-				__func__, ip->i_ino);
-
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
-		error = EFSCORRUPTED;
-		goto out_brelse;
-	}
-
-	/*
-	 * If the on-disk inode is already linked to a directory
-	 * entry, copy all of the inode into the in-core inode.
-	 * xfs_iformat_fork() handles copying in the inode format
-	 * specific information.
-	 * Otherwise, just get the truly permanent information.
-	 */
-	if (dip->di_mode) {
-		xfs_dinode_from_disk(&ip->i_d, dip);
-		error = xfs_iformat_fork(ip, dip);
-		if (error)  {
-#ifdef DEBUG
-			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
-				__func__, error);
-#endif /* DEBUG */
-			goto out_brelse;
-		}
-	} else {
-		/*
-		 * Partial initialisation of the in-core inode. Just the bits
-		 * that xfs_ialloc won't overwrite or relies on being correct.
-		 */
-		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
-		ip->i_d.di_version = dip->di_version;
-		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
-		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
-
-		if (dip->di_version == 3) {
-			ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
-			uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
-		}
-
-		/*
-		 * Make sure to pull in the mode here as well in
-		 * case the inode is released without being used.
-		 * This ensures that xfs_inactive() will see that
-		 * the inode is already free and not try to mess
-		 * with the uninitialized part of it.
-		 */
-		ip->i_d.di_mode = 0;
-	}
-
-	/*
-	 * Automatically convert version 1 inode formats in memory to version 2
-	 * inode format. If the inode is modified, it will get logged and
-	 * rewritten as a version 2 inode. We can do this because we set the
-	 * superblock feature bit for v2 inodes unconditionally during mount
-	 * and it means the reast of the code can assume the inode version is 2
-	 * or higher.
-	 */
-	if (ip->i_d.di_version == 1) {
-		ip->i_d.di_version = 2;
-		memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
-		ip->i_d.di_nlink = ip->i_d.di_onlink;
-		ip->i_d.di_onlink = 0;
-		xfs_set_projid(ip, 0);
-	}
-
-	ip->i_delayed_blks = 0;
-
-	/*
-	 * Mark the buffer containing the inode as something to keep
-	 * around for a while.  This helps to keep recently accessed
-	 * meta-data in-core longer.
-	 */
-	xfs_buf_set_ref(bp, XFS_INO_REF);
-
-	/*
-	 * Use xfs_trans_brelse() to release the buffer containing the on-disk
-	 * inode, because it was acquired with xfs_trans_read_buf() in
-	 * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
-	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
-	 * will only release the buffer if it is not dirty within the
-	 * transaction.  It will be OK to release the buffer in this case,
-	 * because inodes on disk are never destroyed and we will be locking the
-	 * new in-core inode before putting it in the cache where other
-	 * processes can find it.  Thus we don't have to worry about the inode
-	 * being changed just because we released the buffer.
-	 */
- out_brelse:
-	xfs_trans_brelse(tp, bp);
-	return error;
-}
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
deleted file mode 100644
index 2a124e97f082..000000000000
--- a/fs/xfs/xfs_inode_fork.c
+++ /dev/null
@@ -1,1906 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <linux/log2.h>
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-
-kmem_zone_t *xfs_ifork_zone;
-
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-
-#ifdef DEBUG
-/*
- * Make sure that the extents in the given memory buffer
- * are valid.
- */
-void
-xfs_validate_extents(
-	xfs_ifork_t		*ifp,
-	int			nrecs,
-	xfs_exntfmt_t		fmt)
-{
-	xfs_bmbt_irec_t		irec;
-	xfs_bmbt_rec_host_t	rec;
-	int			i;
-
-	for (i = 0; i < nrecs; i++) {
-		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-		rec.l0 = get_unaligned(&ep->l0);
-		rec.l1 = get_unaligned(&ep->l1);
-		xfs_bmbt_get_all(&rec, &irec);
-		if (fmt == XFS_EXTFMT_NOSTATE)
-			ASSERT(irec.br_state == XFS_EXT_NORM);
-	}
-}
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
-
-
-/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode.  For fifos, devs, and sockets
- * this means set if_rdev to the proper value.  For files, directories,
- * and symlinks this means to bring in the in-line data or extent
- * pointers.  For a file in B-tree format, only the root is immediately
- * brought in-core.  The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
- */
-int
-xfs_iformat_fork(
-	xfs_inode_t		*ip,
-	xfs_dinode_t		*dip)
-{
-	xfs_attr_shortform_t	*atp;
-	int			size;
-	int			error = 0;
-	xfs_fsize_t             di_size;
-
-	if (unlikely(be32_to_cpu(dip->di_nextents) +
-		     be16_to_cpu(dip->di_anextents) >
-		     be64_to_cpu(dip->di_nblocks))) {
-		xfs_warn(ip->i_mount,
-			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
-			(unsigned long long)ip->i_ino,
-			(int)(be32_to_cpu(dip->di_nextents) +
-			      be16_to_cpu(dip->di_anextents)),
-			(unsigned long long)
-				be64_to_cpu(dip->di_nblocks));
-		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
-				     ip->i_mount, dip);
-		return EFSCORRUPTED;
-	}
-
-	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
-		xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
-			(unsigned long long)ip->i_ino,
-			dip->di_forkoff);
-		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
-				     ip->i_mount, dip);
-		return EFSCORRUPTED;
-	}
-
-	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
-		     !ip->i_mount->m_rtdev_targp)) {
-		xfs_warn(ip->i_mount,
-			"corrupt dinode %Lu, has realtime flag set.",
-			ip->i_ino);
-		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
-				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
-		return EFSCORRUPTED;
-	}
-
-	switch (ip->i_d.di_mode & S_IFMT) {
-	case S_IFIFO:
-	case S_IFCHR:
-	case S_IFBLK:
-	case S_IFSOCK:
-		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
-			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
-					      ip->i_mount, dip);
-			return EFSCORRUPTED;
-		}
-		ip->i_d.di_size = 0;
-		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
-		break;
-
-	case S_IFREG:
-	case S_IFLNK:
-	case S_IFDIR:
-		switch (dip->di_format) {
-		case XFS_DINODE_FMT_LOCAL:
-			/*
-			 * no local regular files yet
-			 */
-			if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
-				xfs_warn(ip->i_mount,
-			"corrupt inode %Lu (local format for regular file).",
-					(unsigned long long) ip->i_ino);
-				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
-						     XFS_ERRLEVEL_LOW,
-						     ip->i_mount, dip);
-				return EFSCORRUPTED;
-			}
-
-			di_size = be64_to_cpu(dip->di_size);
-			if (unlikely(di_size < 0 ||
-				     di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
-				xfs_warn(ip->i_mount,
-			"corrupt inode %Lu (bad size %Ld for local inode).",
-					(unsigned long long) ip->i_ino,
-					(long long) di_size);
-				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
-						     XFS_ERRLEVEL_LOW,
-						     ip->i_mount, dip);
-				return EFSCORRUPTED;
-			}
-
-			size = (int)di_size;
-			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
-			break;
-		case XFS_DINODE_FMT_EXTENTS:
-			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
-			break;
-		case XFS_DINODE_FMT_BTREE:
-			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
-			break;
-		default:
-			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
-					 ip->i_mount);
-			return EFSCORRUPTED;
-		}
-		break;
-
-	default:
-		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
-		return EFSCORRUPTED;
-	}
-	if (error) {
-		return error;
-	}
-	if (!XFS_DFORK_Q(dip))
-		return 0;
-
-	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-
-	switch (dip->di_aformat) {
-	case XFS_DINODE_FMT_LOCAL:
-		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
-		size = be16_to_cpu(atp->hdr.totsize);
-
-		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
-			xfs_warn(ip->i_mount,
-				"corrupt inode %Lu (bad attr fork size %Ld).",
-				(unsigned long long) ip->i_ino,
-				(long long) size);
-			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
-					     XFS_ERRLEVEL_LOW,
-					     ip->i_mount, dip);
-			return EFSCORRUPTED;
-		}
-
-		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
-		break;
-	case XFS_DINODE_FMT_EXTENTS:
-		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
-		break;
-	case XFS_DINODE_FMT_BTREE:
-		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
-		break;
-	default:
-		error = EFSCORRUPTED;
-		break;
-	}
-	if (error) {
-		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-		ip->i_afp = NULL;
-		xfs_idestroy_fork(ip, XFS_DATA_FORK);
-	}
-	return error;
-}
-
-/*
- * The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there.  Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
- */
-STATIC int
-xfs_iformat_local(
-	xfs_inode_t	*ip,
-	xfs_dinode_t	*dip,
-	int		whichfork,
-	int		size)
-{
-	xfs_ifork_t	*ifp;
-	int		real_size;
-
-	/*
-	 * If the size is unreasonable, then something
-	 * is wrong and we just bail out rather than crash in
-	 * kmem_alloc() or memcpy() below.
-	 */
-	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-		xfs_warn(ip->i_mount,
-	"corrupt inode %Lu (bad size %d for local fork, size = %d).",
-			(unsigned long long) ip->i_ino, size,
-			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
-		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
-				     ip->i_mount, dip);
-		return EFSCORRUPTED;
-	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	real_size = 0;
-	if (size == 0)
-		ifp->if_u1.if_data = NULL;
-	else if (size <= sizeof(ifp->if_u2.if_inline_data))
-		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-	else {
-		real_size = roundup(size, 4);
-		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-	}
-	ifp->if_bytes = size;
-	ifp->if_real_bytes = real_size;
-	if (size)
-		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-	ifp->if_flags &= ~XFS_IFEXTENTS;
-	ifp->if_flags |= XFS_IFINLINE;
-	return 0;
-}
-
-/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it.  Either way, set if_extents
- * to point at the extents.
- */
-STATIC int
-xfs_iformat_extents(
-	xfs_inode_t	*ip,
-	xfs_dinode_t	*dip,
-	int		whichfork)
-{
-	xfs_bmbt_rec_t	*dp;
-	xfs_ifork_t	*ifp;
-	int		nex;
-	int		size;
-	int		i;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
-	size = nex * (uint)sizeof(xfs_bmbt_rec_t);
-
-	/*
-	 * If the number of extents is unreasonable, then something
-	 * is wrong and we just bail out rather than crash in
-	 * kmem_alloc() or memcpy() below.
-	 */
-	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
-			(unsigned long long) ip->i_ino, nex);
-		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
-				     ip->i_mount, dip);
-		return EFSCORRUPTED;
-	}
-
-	ifp->if_real_bytes = 0;
-	if (nex == 0)
-		ifp->if_u1.if_extents = NULL;
-	else if (nex <= XFS_INLINE_EXTS)
-		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-	else
-		xfs_iext_add(ifp, 0, nex);
-
-	ifp->if_bytes = size;
-	if (size) {
-		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
-		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
-		for (i = 0; i < nex; i++, dp++) {
-			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-			ep->l0 = get_unaligned_be64(&dp->l0);
-			ep->l1 = get_unaligned_be64(&dp->l1);
-		}
-		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
-		if (whichfork != XFS_DATA_FORK ||
-			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
-				if (unlikely(xfs_check_nostate_extents(
-				    ifp, 0, nex))) {
-					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
-							 XFS_ERRLEVEL_LOW,
-							 ip->i_mount);
-					return EFSCORRUPTED;
-				}
-	}
-	ifp->if_flags |= XFS_IFEXTENTS;
-	return 0;
-}
-
-/*
- * The file has too many extents to fit into
- * the inode, so they are in B-tree format.
- * Allocate a buffer for the root of the B-tree
- * and copy the root into it.  The i_extents
- * field will remain NULL until all of the
- * extents are read in (when they are needed).
- */
-STATIC int
-xfs_iformat_btree(
-	xfs_inode_t		*ip,
-	xfs_dinode_t		*dip,
-	int			whichfork)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_bmdr_block_t	*dfp;
-	xfs_ifork_t		*ifp;
-	/* REFERENCED */
-	int			nrecs;
-	int			size;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
-	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
-	nrecs = be16_to_cpu(dfp->bb_numrecs);
-
-	/*
-	 * blow out if -- fork has less extents than can fit in
-	 * fork (fork shouldn't be a btree format), root btree
-	 * block has more records than can fit into the fork,
-	 * or the number of extents is greater than the number of
-	 * blocks.
-	 */
-	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
-					XFS_IFORK_MAXEXT(ip, whichfork) ||
-		     XFS_BMDR_SPACE_CALC(nrecs) >
-					XFS_DFORK_SIZE(dip, mp, whichfork) ||
-		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
-		xfs_warn(mp, "corrupt inode %Lu (btree).",
-					(unsigned long long) ip->i_ino);
-		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
-					 mp, dip);
-		return EFSCORRUPTED;
-	}
-
-	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
-	ASSERT(ifp->if_broot != NULL);
-	/*
-	 * Copy and convert from the on-disk structure
-	 * to the in-memory structure.
-	 */
-	xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-			 ifp->if_broot, size);
-	ifp->if_flags &= ~XFS_IFEXTENTS;
-	ifp->if_flags |= XFS_IFBROOT;
-
-	return 0;
-}
-
-/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	int		whichfork)
-{
-	int		error;
-	xfs_ifork_t	*ifp;
-	xfs_extnum_t	nextents;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
-		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
-				 ip->i_mount);
-		return EFSCORRUPTED;
-	}
-	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-
-	/*
-	 * We know that the size is valid (it's checked in iformat_btree)
-	 */
-	ifp->if_bytes = ifp->if_real_bytes = 0;
-	ifp->if_flags |= XFS_IFEXTENTS;
-	xfs_iext_add(ifp, 0, nextents);
-	error = xfs_bmap_read_extents(tp, ip, whichfork);
-	if (error) {
-		xfs_iext_destroy(ifp);
-		ifp->if_flags &= ~XFS_IFEXTENTS;
-		return error;
-	}
-	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
-	return 0;
-}
-/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff.  Move the records
- * and pointers in if_broot to fit the new size.  When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller.  When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root.  If the if_broot is currently NULL, then
- * if we are adding records, one will be allocated.  The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- *	 requested for the if_broot array.
- */
-void
-xfs_iroot_realloc(
-	xfs_inode_t		*ip,
-	int			rec_diff,
-	int			whichfork)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	int			cur_max;
-	xfs_ifork_t		*ifp;
-	struct xfs_btree_block	*new_broot;
-	int			new_max;
-	size_t			new_size;
-	char			*np;
-	char			*op;
-
-	/*
-	 * Handle the degenerate case quietly.
-	 */
-	if (rec_diff == 0) {
-		return;
-	}
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (rec_diff > 0) {
-		/*
-		 * If there wasn't any memory allocated before, just
-		 * allocate it now and get out.
-		 */
-		if (ifp->if_broot_bytes == 0) {
-			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
-			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-			ifp->if_broot_bytes = (int)new_size;
-			return;
-		}
-
-		/*
-		 * If there is already an existing if_broot, then we need
-		 * to realloc() it and shift the pointers to their new
-		 * location.  The records don't change location because
-		 * they are kept butted up against the btree block header.
-		 */
-		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-		new_max = cur_max + rec_diff;
-		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-				XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
-				KM_SLEEP | KM_NOFS);
-		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-						     ifp->if_broot_bytes);
-		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-						     (int)new_size);
-		ifp->if_broot_bytes = (int)new_size;
-		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-			XFS_IFORK_SIZE(ip, whichfork));
-		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
-		return;
-	}
-
-	/*
-	 * rec_diff is less than 0.  In this case, we are shrinking the
-	 * if_broot buffer.  It must already exist.  If we go to zero
-	 * records, just get rid of the root and clear the status bit.
-	 */
-	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-	new_max = cur_max + rec_diff;
-	ASSERT(new_max >= 0);
-	if (new_max > 0)
-		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-	else
-		new_size = 0;
-	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-		/*
-		 * First copy over the btree block header.
-		 */
-		memcpy(new_broot, ifp->if_broot,
-			XFS_BMBT_BLOCK_LEN(ip->i_mount));
-	} else {
-		new_broot = NULL;
-		ifp->if_flags &= ~XFS_IFBROOT;
-	}
-
-	/*
-	 * Only copy the records and pointers if there are any.
-	 */
-	if (new_max > 0) {
-		/*
-		 * First copy the records.
-		 */
-		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
-		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
-
-		/*
-		 * Then copy the pointers.
-		 */
-		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-						     ifp->if_broot_bytes);
-		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
-						     (int)new_size);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
-	}
-	kmem_free(ifp->if_broot);
-	ifp->if_broot = new_broot;
-	ifp->if_broot_bytes = (int)new_size;
-	if (ifp->if_broot)
-		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-			XFS_IFORK_SIZE(ip, whichfork));
-	return;
-}
-
-
-/*
- * This is called when the amount of space needed for if_data
- * is increased or decreased.  The change in size is indicated by
- * the number of bytes that need to be added or deleted in the
- * byte_diff parameter.
- *
- * If the amount of space needed has decreased below the size of the
- * inline buffer, then switch to using the inline buffer.  Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
- * to what is needed.
- *
- * ip -- the inode whose if_data area is changing
- * byte_diff -- the change in the number of bytes, positive or negative,
- *	 requested for the if_data array.
- */
-void
-xfs_idata_realloc(
-	xfs_inode_t	*ip,
-	int		byte_diff,
-	int		whichfork)
-{
-	xfs_ifork_t	*ifp;
-	int		new_size;
-	int		real_size;
-
-	if (byte_diff == 0) {
-		return;
-	}
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	new_size = (int)ifp->if_bytes + byte_diff;
-	ASSERT(new_size >= 0);
-
-	if (new_size == 0) {
-		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-			kmem_free(ifp->if_u1.if_data);
-		}
-		ifp->if_u1.if_data = NULL;
-		real_size = 0;
-	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
-		/*
-		 * If the valid extents/data can fit in if_inline_ext/data,
-		 * copy them from the malloc'd vector and free it.
-		 */
-		if (ifp->if_u1.if_data == NULL) {
-			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-			ASSERT(ifp->if_real_bytes != 0);
-			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
-			      new_size);
-			kmem_free(ifp->if_u1.if_data);
-			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-		}
-		real_size = 0;
-	} else {
-		/*
-		 * Stuck with malloc/realloc.
-		 * For inline data, the underlying buffer must be
-		 * a multiple of 4 bytes in size so that it can be
-		 * logged and stay on word boundaries.  We enforce
-		 * that here.
-		 */
-		real_size = roundup(new_size, 4);
-		if (ifp->if_u1.if_data == NULL) {
-			ASSERT(ifp->if_real_bytes == 0);
-			ifp->if_u1.if_data = kmem_alloc(real_size,
-							KM_SLEEP | KM_NOFS);
-		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-			/*
-			 * Only do the realloc if the underlying size
-			 * is really changing.
-			 */
-			if (ifp->if_real_bytes != real_size) {
-				ifp->if_u1.if_data =
-					kmem_realloc(ifp->if_u1.if_data,
-							real_size,
-							ifp->if_real_bytes,
-							KM_SLEEP | KM_NOFS);
-			}
-		} else {
-			ASSERT(ifp->if_real_bytes == 0);
-			ifp->if_u1.if_data = kmem_alloc(real_size,
-							KM_SLEEP | KM_NOFS);
-			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
-				ifp->if_bytes);
-		}
-	}
-	ifp->if_real_bytes = real_size;
-	ifp->if_bytes = new_size;
-	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-}
-
-void
-xfs_idestroy_fork(
-	xfs_inode_t	*ip,
-	int		whichfork)
-{
-	xfs_ifork_t	*ifp;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (ifp->if_broot != NULL) {
-		kmem_free(ifp->if_broot);
-		ifp->if_broot = NULL;
-	}
-
-	/*
-	 * If the format is local, then we can't have an extents
-	 * array so just look for an inline data array.  If we're
-	 * not local then we may or may not have an extents list,
-	 * so check and free it up if we do.
-	 */
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
-		    (ifp->if_u1.if_data != NULL)) {
-			ASSERT(ifp->if_real_bytes != 0);
-			kmem_free(ifp->if_u1.if_data);
-			ifp->if_u1.if_data = NULL;
-			ifp->if_real_bytes = 0;
-		}
-	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
-		   ((ifp->if_flags & XFS_IFEXTIREC) ||
-		    ((ifp->if_u1.if_extents != NULL) &&
-		     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
-		ASSERT(ifp->if_real_bytes != 0);
-		xfs_iext_destroy(ifp);
-	}
-	ASSERT(ifp->if_u1.if_extents == NULL ||
-	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
-	ASSERT(ifp->if_real_bytes == 0);
-	if (whichfork == XFS_ATTR_FORK) {
-		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-		ip->i_afp = NULL;
-	}
-}
-
-/*
- * Convert in-core extents to on-disk form
- *
- * For either the data or attr fork in extent format, we need to endian convert
- * the in-core extent as we place them into the on-disk inode.
- *
- * In the case of the data fork, the in-core and on-disk fork sizes can be
- * different due to delayed allocation extents. We only copy on-disk extents
- * here, so callers must always use the physical fork size to determine the
- * size of the buffer passed to this routine.  We will return the size actually
- * used.
- */
-int
-xfs_iextents_copy(
-	xfs_inode_t		*ip,
-	xfs_bmbt_rec_t		*dp,
-	int			whichfork)
-{
-	int			copied;
-	int			i;
-	xfs_ifork_t		*ifp;
-	int			nrecs;
-	xfs_fsblock_t		start_block;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(ifp->if_bytes > 0);
-
-	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
-	ASSERT(nrecs > 0);
-
-	/*
-	 * There are some delayed allocation extents in the
-	 * inode, so copy the extents one at a time and skip
-	 * the delayed ones.  There must be at least one
-	 * non-delayed extent.
-	 */
-	copied = 0;
-	for (i = 0; i < nrecs; i++) {
-		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-		start_block = xfs_bmbt_get_startblock(ep);
-		if (isnullstartblock(start_block)) {
-			/*
-			 * It's a delayed allocation extent, so skip it.
-			 */
-			continue;
-		}
-
-		/* Translate to on disk format */
-		put_unaligned_be64(ep->l0, &dp->l0);
-		put_unaligned_be64(ep->l1, &dp->l1);
-		dp++;
-		copied++;
-	}
-	ASSERT(copied != 0);
-	xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
-
-	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
-}
-
-/*
- * Each of the following cases stores data into the same region
- * of the on-disk inode, so only one of them can be valid at
- * any given time. While it is possible to have conflicting formats
- * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
- * in EXTENTS format, this can only happen when the fork has
- * changed formats after being modified but before being flushed.
- * In these cases, the format always takes precedence, because the
- * format indicates the current state of the fork.
- */
-void
-xfs_iflush_fork(
-	xfs_inode_t		*ip,
-	xfs_dinode_t		*dip,
-	xfs_inode_log_item_t	*iip,
-	int			whichfork)
-{
-	char			*cp;
-	xfs_ifork_t		*ifp;
-	xfs_mount_t		*mp;
-	static const short	brootflag[2] =
-		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
-	static const short	dataflag[2] =
-		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
-	static const short	extflag[2] =
-		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
-
-	if (!iip)
-		return;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	/*
-	 * This can happen if we gave up in iformat in an error path,
-	 * for the attribute fork.
-	 */
-	if (!ifp) {
-		ASSERT(whichfork == XFS_ATTR_FORK);
-		return;
-	}
-	cp = XFS_DFORK_PTR(dip, whichfork);
-	mp = ip->i_mount;
-	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
-	case XFS_DINODE_FMT_LOCAL:
-		if ((iip->ili_fields & dataflag[whichfork]) &&
-		    (ifp->if_bytes > 0)) {
-			ASSERT(ifp->if_u1.if_data != NULL);
-			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
-		}
-		break;
-
-	case XFS_DINODE_FMT_EXTENTS:
-		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
-		       !(iip->ili_fields & extflag[whichfork]));
-		if ((iip->ili_fields & extflag[whichfork]) &&
-		    (ifp->if_bytes > 0)) {
-			ASSERT(xfs_iext_get_ext(ifp, 0));
-			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
-			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
-				whichfork);
-		}
-		break;
-
-	case XFS_DINODE_FMT_BTREE:
-		if ((iip->ili_fields & brootflag[whichfork]) &&
-		    (ifp->if_broot_bytes > 0)) {
-			ASSERT(ifp->if_broot != NULL);
-			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-			        XFS_IFORK_SIZE(ip, whichfork));
-			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
-				(xfs_bmdr_block_t *)cp,
-				XFS_DFORK_SIZE(dip, mp, whichfork));
-		}
-		break;
-
-	case XFS_DINODE_FMT_DEV:
-		if (iip->ili_fields & XFS_ILOG_DEV) {
-			ASSERT(whichfork == XFS_DATA_FORK);
-			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
-		}
-		break;
-
-	case XFS_DINODE_FMT_UUID:
-		if (iip->ili_fields & XFS_ILOG_UUID) {
-			ASSERT(whichfork == XFS_DATA_FORK);
-			memcpy(XFS_DFORK_DPTR(dip),
-			       &ip->i_df.if_u2.if_uuid,
-			       sizeof(uuid_t));
-		}
-		break;
-
-	default:
-		ASSERT(0);
-		break;
-	}
-}
-
-/*
- * Return a pointer to the extent record at file index idx.
- */
-xfs_bmbt_rec_host_t *
-xfs_iext_get_ext(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_extnum_t	idx)		/* index of target extent */
-{
-	ASSERT(idx >= 0);
-	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-
-	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
-		return ifp->if_u1.if_ext_irec->er_extbuf;
-	} else if (ifp->if_flags & XFS_IFEXTIREC) {
-		xfs_ext_irec_t	*erp;		/* irec pointer */
-		int		erp_idx = 0;	/* irec index */
-		xfs_extnum_t	page_idx = idx;	/* ext index in target list */
-
-		erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
-		return &erp->er_extbuf[page_idx];
-	} else if (ifp->if_bytes) {
-		return &ifp->if_u1.if_extents[idx];
-	} else {
-		return NULL;
-	}
-}
-
-/*
- * Insert new item(s) into the extent records for incore inode
- * fork 'ifp'.  'count' new items are inserted at index 'idx'.
- */
-void
-xfs_iext_insert(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* starting index of new items */
-	xfs_extnum_t	count,		/* number of inserted items */
-	xfs_bmbt_irec_t	*new,		/* items to insert */
-	int		state)		/* type of extent conversion */
-{
-	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-	xfs_extnum_t	i;		/* extent record index */
-
-	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
-
-	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-	xfs_iext_add(ifp, idx, count);
-	for (i = idx; i < idx + count; i++, new++)
-		xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be increased. The ext_diff parameter stores the
- * number of new extents being added and the idx parameter contains
- * the extent index where the new extents will be added. If the new
- * extents are being appended, then we just need to (re)allocate and
- * initialize the space. Otherwise, if the new extents are being
- * inserted into the middle of the existing entries, a bit more work
- * is required to make room for the new extents to be inserted. The
- * caller is responsible for filling in the new extent entries upon
- * return.
- */
-void
-xfs_iext_add(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_extnum_t	idx,		/* index to begin adding exts */
-	int		ext_diff)	/* number of extents to add */
-{
-	int		byte_diff;	/* new bytes being added */
-	int		new_size;	/* size of extents after adding */
-	xfs_extnum_t	nextents;	/* number of extents in file */
-
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	ASSERT((idx >= 0) && (idx <= nextents));
-	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
-	new_size = ifp->if_bytes + byte_diff;
-	/*
-	 * If the new number of extents (nextents + ext_diff)
-	 * fits inside the inode, then continue to use the inline
-	 * extent buffer.
-	 */
-	if (nextents + ext_diff <= XFS_INLINE_EXTS) {
-		if (idx < nextents) {
-			memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
-				&ifp->if_u2.if_inline_ext[idx],
-				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
-			memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
-		}
-		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-		ifp->if_real_bytes = 0;
-	}
-	/*
-	 * Otherwise use a linear (direct) extent list.
-	 * If the extents are currently inside the inode,
-	 * xfs_iext_realloc_direct will switch us from
-	 * inline to direct extent allocation mode.
-	 */
-	else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
-		xfs_iext_realloc_direct(ifp, new_size);
-		if (idx < nextents) {
-			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
-				&ifp->if_u1.if_extents[idx],
-				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
-			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
-		}
-	}
-	/* Indirection array */
-	else {
-		xfs_ext_irec_t	*erp;
-		int		erp_idx = 0;
-		int		page_idx = idx;
-
-		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
-		if (ifp->if_flags & XFS_IFEXTIREC) {
-			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
-		} else {
-			xfs_iext_irec_init(ifp);
-			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-			erp = ifp->if_u1.if_ext_irec;
-		}
-		/* Extents fit in target extent page */
-		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
-			if (page_idx < erp->er_extcount) {
-				memmove(&erp->er_extbuf[page_idx + ext_diff],
-					&erp->er_extbuf[page_idx],
-					(erp->er_extcount - page_idx) *
-					sizeof(xfs_bmbt_rec_t));
-				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
-			}
-			erp->er_extcount += ext_diff;
-			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-		}
-		/* Insert a new extent page */
-		else if (erp) {
-			xfs_iext_add_indirect_multi(ifp,
-				erp_idx, page_idx, ext_diff);
-		}
-		/*
-		 * If extent(s) are being appended to the last page in
-		 * the indirection array and the new extent(s) don't fit
-		 * in the page, then erp is NULL and erp_idx is set to
-		 * the next index needed in the indirection array.
-		 */
-		else {
-			uint	count = ext_diff;
-
-			while (count) {
-				erp = xfs_iext_irec_new(ifp, erp_idx);
-				erp->er_extcount = min(count, XFS_LINEAR_EXTS);
-				count -= erp->er_extcount;
-				if (count)
-					erp_idx++;
-			}
-		}
-	}
-	ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being added to the indirection
- * array and the new extents do not fit in the target extent list. The
- * erp_idx parameter contains the irec index for the target extent list
- * in the indirection array, and the idx parameter contains the extent
- * index within the list. The number of extents being added is stored
- * in the count parameter.
- *
- *    |-------|   |-------|
- *    |       |   |       |    idx - number of extents before idx
- *    |  idx  |   | count |
- *    |       |   |       |    count - number of extents being inserted at idx
- *    |-------|   |-------|
- *    | count |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_add_indirect_multi(
-	xfs_ifork_t	*ifp,			/* inode fork pointer */
-	int		erp_idx,		/* target extent irec index */
-	xfs_extnum_t	idx,			/* index within target list */
-	int		count)			/* new extents being added */
-{
-	int		byte_diff;		/* new bytes being added */
-	xfs_ext_irec_t	*erp;			/* pointer to irec entry */
-	xfs_extnum_t	ext_diff;		/* number of extents to add */
-	xfs_extnum_t	ext_cnt;		/* new extents still needed */
-	xfs_extnum_t	nex2;			/* extents after idx + count */
-	xfs_bmbt_rec_t	*nex2_ep = NULL;	/* temp list for nex2 extents */
-	int		nlists;			/* number of irec's (lists) */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	erp = &ifp->if_u1.if_ext_irec[erp_idx];
-	nex2 = erp->er_extcount - idx;
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-	/*
-	 * Save second part of target extent list
-	 * (all extents past */
-	if (nex2) {
-		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
-		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
-		erp->er_extcount -= nex2;
-		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
-		memset(&erp->er_extbuf[idx], 0, byte_diff);
-	}
-
-	/*
-	 * Add the new extents to the end of the target
-	 * list, then allocate new irec record(s) and
-	 * extent buffer(s) as needed to store the rest
-	 * of the new extents.
-	 */
-	ext_cnt = count;
-	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
-	if (ext_diff) {
-		erp->er_extcount += ext_diff;
-		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-		ext_cnt -= ext_diff;
-	}
-	while (ext_cnt) {
-		erp_idx++;
-		erp = xfs_iext_irec_new(ifp, erp_idx);
-		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
-		erp->er_extcount = ext_diff;
-		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-		ext_cnt -= ext_diff;
-	}
-
-	/* Add nex2 extents back to indirection array */
-	if (nex2) {
-		xfs_extnum_t	ext_avail;
-		int		i;
-
-		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
-		i = 0;
-		/*
-		 * If nex2 extents fit in the current page, append
-		 * nex2_ep after the new extents.
-		 */
-		if (nex2 <= ext_avail) {
-			i = erp->er_extcount;
-		}
-		/*
-		 * Otherwise, check if space is available in the
-		 * next page.
-		 */
-		else if ((erp_idx < nlists - 1) &&
-			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
-			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
-			erp_idx++;
-			erp++;
-			/* Create a hole for nex2 extents */
-			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
-				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
-		}
-		/*
-		 * Final choice, create a new extent page for
-		 * nex2 extents.
-		 */
-		else {
-			erp_idx++;
-			erp = xfs_iext_irec_new(ifp, erp_idx);
-		}
-		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
-		kmem_free(nex2_ep);
-		erp->er_extcount += nex2;
-		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
-	}
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be decreased. The ext_diff parameter stores the
- * number of extents to be removed and the idx parameter contains
- * the extent index where the extents will be removed from.
- *
- * If the amount of space needed has decreased below the linear
- * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
- * extent array.  Otherwise, use kmem_realloc() to adjust the
- * size to what is needed.
- */
-void
-xfs_iext_remove(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	idx,		/* index to begin removing exts */
-	int		ext_diff,	/* number of extents to remove */
-	int		state)		/* type of extent conversion */
-{
-	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-	xfs_extnum_t	nextents;	/* number of extents in file */
-	int		new_size;	/* size of extents after removal */
-
-	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
-
-	ASSERT(ext_diff > 0);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
-
-	if (new_size == 0) {
-		xfs_iext_destroy(ifp);
-	} else if (ifp->if_flags & XFS_IFEXTIREC) {
-		xfs_iext_remove_indirect(ifp, idx, ext_diff);
-	} else if (ifp->if_real_bytes) {
-		xfs_iext_remove_direct(ifp, idx, ext_diff);
-	} else {
-		xfs_iext_remove_inline(ifp, idx, ext_diff);
-	}
-	ifp->if_bytes = new_size;
-}
-
-/*
- * This removes ext_diff extents from the inline buffer, beginning
- * at extent index idx.
- */
-void
-xfs_iext_remove_inline(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_extnum_t	idx,		/* index to begin removing exts */
-	int		ext_diff)	/* number of extents to remove */
-{
-	int		nextents;	/* number of extents in file */
-
-	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-	ASSERT(idx < XFS_INLINE_EXTS);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	ASSERT(((nextents - ext_diff) > 0) &&
-		(nextents - ext_diff) < XFS_INLINE_EXTS);
-
-	if (idx + ext_diff < nextents) {
-		memmove(&ifp->if_u2.if_inline_ext[idx],
-			&ifp->if_u2.if_inline_ext[idx + ext_diff],
-			(nextents - (idx + ext_diff)) *
-			 sizeof(xfs_bmbt_rec_t));
-		memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
-			0, ext_diff * sizeof(xfs_bmbt_rec_t));
-	} else {
-		memset(&ifp->if_u2.if_inline_ext[idx], 0,
-			ext_diff * sizeof(xfs_bmbt_rec_t));
-	}
-}
-
-/*
- * This removes ext_diff extents from a linear (direct) extent list,
- * beginning at extent index idx. If the extents are being removed
- * from the end of the list (ie. truncate) then we just need to re-
- * allocate the list to remove the extra space. Otherwise, if the
- * extents are being removed from the middle of the existing extent
- * entries, then we first need to move the extent records beginning
- * at idx + ext_diff up in the list to overwrite the records being
- * removed, then remove the extra space via kmem_realloc.
- */
-void
-xfs_iext_remove_direct(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_extnum_t	idx,		/* index to begin removing exts */
-	int		ext_diff)	/* number of extents to remove */
-{
-	xfs_extnum_t	nextents;	/* number of extents in file */
-	int		new_size;	/* size of extents after removal */
-
-	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-	new_size = ifp->if_bytes -
-		(ext_diff * sizeof(xfs_bmbt_rec_t));
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-	if (new_size == 0) {
-		xfs_iext_destroy(ifp);
-		return;
-	}
-	/* Move extents up in the list (if needed) */
-	if (idx + ext_diff < nextents) {
-		memmove(&ifp->if_u1.if_extents[idx],
-			&ifp->if_u1.if_extents[idx + ext_diff],
-			(nextents - (idx + ext_diff)) *
-			 sizeof(xfs_bmbt_rec_t));
-	}
-	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
-		0, ext_diff * sizeof(xfs_bmbt_rec_t));
-	/*
-	 * Reallocate the direct extent list. If the extents
-	 * will fit inside the inode then xfs_iext_realloc_direct
-	 * will switch from direct to inline extent allocation
-	 * mode for us.
-	 */
-	xfs_iext_realloc_direct(ifp, new_size);
-	ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being removed from the
- * indirection array and the extents being removed span multiple extent
- * buffers. The idx parameter contains the file extent index where we
- * want to begin removing extents, and the count parameter contains
- * how many extents need to be removed.
- *
- *    |-------|   |-------|
- *    | nex1  |   |       |    nex1 - number of extents before idx
- *    |-------|   | count |
- *    |       |   |       |    count - number of extents being removed at idx
- *    | count |   |-------|
- *    |       |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_remove_indirect(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_extnum_t	idx,		/* index to begin removing extents */
-	int		count)		/* number of extents to remove */
-{
-	xfs_ext_irec_t	*erp;		/* indirection array pointer */
-	int		erp_idx = 0;	/* indirection array index */
-	xfs_extnum_t	ext_cnt;	/* extents left to remove */
-	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
-	xfs_extnum_t	nex1;		/* number of extents before idx */
-	xfs_extnum_t	nex2;		/* extents after idx + count */
-	int		page_idx = idx;	/* index in target extent list */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
-	ASSERT(erp != NULL);
-	nex1 = page_idx;
-	ext_cnt = count;
-	while (ext_cnt) {
-		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
-		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
-		/*
-		 * Check for deletion of entire list;
-		 * xfs_iext_irec_remove() updates extent offsets.
-		 */
-		if (ext_diff == erp->er_extcount) {
-			xfs_iext_irec_remove(ifp, erp_idx);
-			ext_cnt -= ext_diff;
-			nex1 = 0;
-			if (ext_cnt) {
-				ASSERT(erp_idx < ifp->if_real_bytes /
-					XFS_IEXT_BUFSZ);
-				erp = &ifp->if_u1.if_ext_irec[erp_idx];
-				nex1 = 0;
-				continue;
-			} else {
-				break;
-			}
-		}
-		/* Move extents up (if needed) */
-		if (nex2) {
-			memmove(&erp->er_extbuf[nex1],
-				&erp->er_extbuf[nex1 + ext_diff],
-				nex2 * sizeof(xfs_bmbt_rec_t));
-		}
-		/* Zero out rest of page */
-		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
-			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
-		/* Update remaining counters */
-		erp->er_extcount -= ext_diff;
-		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
-		ext_cnt -= ext_diff;
-		nex1 = 0;
-		erp_idx++;
-		erp++;
-	}
-	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
-	xfs_iext_irec_compact(ifp);
-}
-
-/*
- * Create, destroy, or resize a linear (direct) block of extents.
- */
-void
-xfs_iext_realloc_direct(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	int		new_size)	/* new size of extents after adding */
-{
-	int		rnew_size;	/* real new size of extents */
-
-	rnew_size = new_size;
-
-	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
-		((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
-		 (new_size != ifp->if_real_bytes)));
-
-	/* Free extent records */
-	if (new_size == 0) {
-		xfs_iext_destroy(ifp);
-	}
-	/* Resize direct extent list and zero any new bytes */
-	else if (ifp->if_real_bytes) {
-		/* Check if extents will fit inside the inode */
-		if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
-			xfs_iext_direct_to_inline(ifp, new_size /
-				(uint)sizeof(xfs_bmbt_rec_t));
-			ifp->if_bytes = new_size;
-			return;
-		}
-		if (!is_power_of_2(new_size)){
-			rnew_size = roundup_pow_of_two(new_size);
-		}
-		if (rnew_size != ifp->if_real_bytes) {
-			ifp->if_u1.if_extents =
-				kmem_realloc(ifp->if_u1.if_extents,
-						rnew_size,
-						ifp->if_real_bytes, KM_NOFS);
-		}
-		if (rnew_size > ifp->if_real_bytes) {
-			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
-				(uint)sizeof(xfs_bmbt_rec_t)], 0,
-				rnew_size - ifp->if_real_bytes);
-		}
-	}
-	/* Switch from the inline extent buffer to a direct extent list */
-	else {
-		if (!is_power_of_2(new_size)) {
-			rnew_size = roundup_pow_of_two(new_size);
-		}
-		xfs_iext_inline_to_direct(ifp, rnew_size);
-	}
-	ifp->if_real_bytes = rnew_size;
-	ifp->if_bytes = new_size;
-}
-
-/*
- * Switch from linear (direct) extent records to inline buffer.
- */
-void
-xfs_iext_direct_to_inline(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_extnum_t	nextents)	/* number of extents in file */
-{
-	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-	ASSERT(nextents <= XFS_INLINE_EXTS);
-	/*
-	 * The inline buffer was zeroed when we switched
-	 * from inline to direct extent allocation mode,
-	 * so we don't need to clear it here.
-	 */
-	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
-		nextents * sizeof(xfs_bmbt_rec_t));
-	kmem_free(ifp->if_u1.if_extents);
-	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-	ifp->if_real_bytes = 0;
-}
-
-/*
- * Switch from inline buffer to linear (direct) extent records.
- * new_size should already be rounded up to the next power of 2
- * by the caller (when appropriate), so use new_size as it is.
- * However, since new_size may be rounded up, we can't update
- * if_bytes here. It is the caller's responsibility to update
- * if_bytes upon return.
- */
-void
-xfs_iext_inline_to_direct(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	int		new_size)	/* number of extents in file */
-{
-	ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
-	memset(ifp->if_u1.if_extents, 0, new_size);
-	if (ifp->if_bytes) {
-		memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
-			ifp->if_bytes);
-		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-			sizeof(xfs_bmbt_rec_t));
-	}
-	ifp->if_real_bytes = new_size;
-}
-
-/*
- * Resize an extent indirection array to new_size bytes.
- */
-STATIC void
-xfs_iext_realloc_indirect(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	int		new_size)	/* new indirection array size */
-{
-	int		nlists;		/* number of irec's (ex lists) */
-	int		size;		/* current indirection array size */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	size = nlists * sizeof(xfs_ext_irec_t);
-	ASSERT(ifp->if_real_bytes);
-	ASSERT((new_size >= 0) && (new_size != size));
-	if (new_size == 0) {
-		xfs_iext_destroy(ifp);
-	} else {
-		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
-			kmem_realloc(ifp->if_u1.if_ext_irec,
-				new_size, size, KM_NOFS);
-	}
-}
-
-/*
- * Switch from indirection array to linear (direct) extent allocations.
- */
-STATIC void
-xfs_iext_indirect_to_direct(
-	 xfs_ifork_t	*ifp)		/* inode fork pointer */
-{
-	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
-	xfs_extnum_t	nextents;	/* number of extents in file */
-	int		size;		/* size of file extents */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	ASSERT(nextents <= XFS_LINEAR_EXTS);
-	size = nextents * sizeof(xfs_bmbt_rec_t);
-
-	xfs_iext_irec_compact_pages(ifp);
-	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
-
-	ep = ifp->if_u1.if_ext_irec->er_extbuf;
-	kmem_free(ifp->if_u1.if_ext_irec);
-	ifp->if_flags &= ~XFS_IFEXTIREC;
-	ifp->if_u1.if_extents = ep;
-	ifp->if_bytes = size;
-	if (nextents < XFS_LINEAR_EXTS) {
-		xfs_iext_realloc_direct(ifp, size);
-	}
-}
-
-/*
- * Free incore file extents.
- */
-void
-xfs_iext_destroy(
-	xfs_ifork_t	*ifp)		/* inode fork pointer */
-{
-	if (ifp->if_flags & XFS_IFEXTIREC) {
-		int	erp_idx;
-		int	nlists;
-
-		nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-		for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-			xfs_iext_irec_remove(ifp, erp_idx);
-		}
-		ifp->if_flags &= ~XFS_IFEXTIREC;
-	} else if (ifp->if_real_bytes) {
-		kmem_free(ifp->if_u1.if_extents);
-	} else if (ifp->if_bytes) {
-		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-			sizeof(xfs_bmbt_rec_t));
-	}
-	ifp->if_u1.if_extents = NULL;
-	ifp->if_real_bytes = 0;
-	ifp->if_bytes = 0;
-}
-
-/*
- * Return a pointer to the extent record for file system block bno.
- */
-xfs_bmbt_rec_host_t *			/* pointer to found extent record */
-xfs_iext_bno_to_ext(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_fileoff_t	bno,		/* block number to search for */
-	xfs_extnum_t	*idxp)		/* index of target extent */
-{
-	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
-	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
-	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
-	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
-	int		high;		/* upper boundary in search */
-	xfs_extnum_t	idx = 0;	/* index of target extent */
-	int		low;		/* lower boundary in search */
-	xfs_extnum_t	nextents;	/* number of file extents */
-	xfs_fileoff_t	startoff = 0;	/* start offset of extent */
-
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	if (nextents == 0) {
-		*idxp = 0;
-		return NULL;
-	}
-	low = 0;
-	if (ifp->if_flags & XFS_IFEXTIREC) {
-		/* Find target extent list */
-		int	erp_idx = 0;
-		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
-		base = erp->er_extbuf;
-		high = erp->er_extcount - 1;
-	} else {
-		base = ifp->if_u1.if_extents;
-		high = nextents - 1;
-	}
-	/* Binary search extent records */
-	while (low <= high) {
-		idx = (low + high) >> 1;
-		ep = base + idx;
-		startoff = xfs_bmbt_get_startoff(ep);
-		blockcount = xfs_bmbt_get_blockcount(ep);
-		if (bno < startoff) {
-			high = idx - 1;
-		} else if (bno >= startoff + blockcount) {
-			low = idx + 1;
-		} else {
-			/* Convert back to file-based extent index */
-			if (ifp->if_flags & XFS_IFEXTIREC) {
-				idx += erp->er_extoff;
-			}
-			*idxp = idx;
-			return ep;
-		}
-	}
-	/* Convert back to file-based extent index */
-	if (ifp->if_flags & XFS_IFEXTIREC) {
-		idx += erp->er_extoff;
-	}
-	if (bno >= startoff + blockcount) {
-		if (++idx == nextents) {
-			ep = NULL;
-		} else {
-			ep = xfs_iext_get_ext(ifp, idx);
-		}
-	}
-	*idxp = idx;
-	return ep;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record for filesystem block bno. Store the index of the
- * target irec in *erp_idxp.
- */
-xfs_ext_irec_t *			/* pointer to found extent record */
-xfs_iext_bno_to_irec(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_fileoff_t	bno,		/* block number to search for */
-	int		*erp_idxp)	/* irec index of target ext list */
-{
-	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
-	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
-	int		erp_idx;	/* indirection array index */
-	int		nlists;		/* number of extent irec's (lists) */
-	int		high;		/* binary search upper limit */
-	int		low;		/* binary search lower limit */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	erp_idx = 0;
-	low = 0;
-	high = nlists - 1;
-	while (low <= high) {
-		erp_idx = (low + high) >> 1;
-		erp = &ifp->if_u1.if_ext_irec[erp_idx];
-		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
-		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
-			high = erp_idx - 1;
-		} else if (erp_next && bno >=
-			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
-			low = erp_idx + 1;
-		} else {
-			break;
-		}
-	}
-	*erp_idxp = erp_idx;
-	return erp;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record at file extent index *idxp. Store the index of the
- * target irec in *erp_idxp and store the page index of the target
- * extent record in *idxp.
- */
-xfs_ext_irec_t *
-xfs_iext_idx_to_irec(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
-	int		*erp_idxp,	/* pointer to target irec */
-	int		realloc)	/* new bytes were just added */
-{
-	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
-	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
-	int		erp_idx;	/* indirection array index */
-	int		nlists;		/* number of irec's (ex lists) */
-	int		high;		/* binary search upper limit */
-	int		low;		/* binary search lower limit */
-	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	ASSERT(page_idx >= 0);
-	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
-
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	erp_idx = 0;
-	low = 0;
-	high = nlists - 1;
-
-	/* Binary search extent irec's */
-	while (low <= high) {
-		erp_idx = (low + high) >> 1;
-		erp = &ifp->if_u1.if_ext_irec[erp_idx];
-		prev = erp_idx > 0 ? erp - 1 : NULL;
-		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
-		     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
-			high = erp_idx - 1;
-		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
-			   (page_idx == erp->er_extoff + erp->er_extcount &&
-			    !realloc)) {
-			low = erp_idx + 1;
-		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
-			   erp->er_extcount == XFS_LINEAR_EXTS) {
-			ASSERT(realloc);
-			page_idx = 0;
-			erp_idx++;
-			erp = erp_idx < nlists ? erp + 1 : NULL;
-			break;
-		} else {
-			page_idx -= erp->er_extoff;
-			break;
-		}
-	}
-	*idxp = page_idx;
-	*erp_idxp = erp_idx;
-	return erp;
-}
-
-/*
- * Allocate and initialize an indirection array once the space needed
- * for incore extents increases above XFS_IEXT_BUFSZ.
- */
-void
-xfs_iext_irec_init(
-	xfs_ifork_t	*ifp)		/* inode fork pointer */
-{
-	xfs_ext_irec_t	*erp;		/* indirection array pointer */
-	xfs_extnum_t	nextents;	/* number of extents in file */
-
-	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	ASSERT(nextents <= XFS_LINEAR_EXTS);
-
-	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-
-	if (nextents == 0) {
-		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-	} else if (!ifp->if_real_bytes) {
-		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
-	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
-		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
-	}
-	erp->er_extbuf = ifp->if_u1.if_extents;
-	erp->er_extcount = nextents;
-	erp->er_extoff = 0;
-
-	ifp->if_flags |= XFS_IFEXTIREC;
-	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
-	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
-	ifp->if_u1.if_ext_irec = erp;
-
-	return;
-}
-
-/*
- * Allocate and initialize a new entry in the indirection array.
- */
-xfs_ext_irec_t *
-xfs_iext_irec_new(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	int		erp_idx)	/* index for new irec */
-{
-	xfs_ext_irec_t	*erp;		/* indirection array pointer */
-	int		i;		/* loop counter */
-	int		nlists;		/* number of irec's (ex lists) */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-	/* Resize indirection array */
-	xfs_iext_realloc_indirect(ifp, ++nlists *
-				  sizeof(xfs_ext_irec_t));
-	/*
-	 * Move records down in the array so the
-	 * new page can use erp_idx.
-	 */
-	erp = ifp->if_u1.if_ext_irec;
-	for (i = nlists - 1; i > erp_idx; i--) {
-		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
-	}
-	ASSERT(i == erp_idx);
-
-	/* Initialize new extent record */
-	erp = ifp->if_u1.if_ext_irec;
-	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
-	erp[erp_idx].er_extcount = 0;
-	erp[erp_idx].er_extoff = erp_idx > 0 ?
-		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
-	return (&erp[erp_idx]);
-}
-
-/*
- * Remove a record from the indirection array.
- */
-void
-xfs_iext_irec_remove(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	int		erp_idx)	/* irec index to remove */
-{
-	xfs_ext_irec_t	*erp;		/* indirection array pointer */
-	int		i;		/* loop counter */
-	int		nlists;		/* number of irec's (ex lists) */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	erp = &ifp->if_u1.if_ext_irec[erp_idx];
-	if (erp->er_extbuf) {
-		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
-			-erp->er_extcount);
-		kmem_free(erp->er_extbuf);
-	}
-	/* Compact extent records */
-	erp = ifp->if_u1.if_ext_irec;
-	for (i = erp_idx; i < nlists - 1; i++) {
-		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
-	}
-	/*
-	 * Manually free the last extent record from the indirection
-	 * array.  A call to xfs_iext_realloc_indirect() with a size
-	 * of zero would result in a call to xfs_iext_destroy() which
-	 * would in turn call this function again, creating a nasty
-	 * infinite loop.
-	 */
-	if (--nlists) {
-		xfs_iext_realloc_indirect(ifp,
-			nlists * sizeof(xfs_ext_irec_t));
-	} else {
-		kmem_free(ifp->if_u1.if_ext_irec);
-	}
-	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-}
-
-/*
- * This is called to clean up large amounts of unused memory allocated
- * by the indirection array.  Before compacting anything though, verify
- * that the indirection array is still needed and switch back to the
- * linear extent list (or even the inline buffer) if possible.  The
- * compaction policy is as follows:
- *
- *    Full Compaction: Extents fit into a single page (or inline buffer)
- * Partial Compaction: Extents occupy less than 50% of allocated space
- *      No Compaction: Extents occupy at least 50% of allocated space
- */
-void
-xfs_iext_irec_compact(
-	xfs_ifork_t	*ifp)		/* inode fork pointer */
-{
-	xfs_extnum_t	nextents;	/* number of extents in file */
-	int		nlists;		/* number of irec's (ex lists) */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-	if (nextents == 0) {
-		xfs_iext_destroy(ifp);
-	} else if (nextents <= XFS_INLINE_EXTS) {
-		xfs_iext_indirect_to_direct(ifp);
-		xfs_iext_direct_to_inline(ifp, nextents);
-	} else if (nextents <= XFS_LINEAR_EXTS) {
-		xfs_iext_indirect_to_direct(ifp);
-	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
-		xfs_iext_irec_compact_pages(ifp);
-	}
-}
-
-/*
- * Combine extents from neighboring extent pages.
- */
-void
-xfs_iext_irec_compact_pages(
-	xfs_ifork_t	*ifp)		/* inode fork pointer */
-{
-	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
-	int		erp_idx = 0;	/* indirection array index */
-	int		nlists;		/* number of irec's (ex lists) */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	while (erp_idx < nlists - 1) {
-		erp = &ifp->if_u1.if_ext_irec[erp_idx];
-		erp_next = erp + 1;
-		if (erp_next->er_extcount <=
-		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
-			memcpy(&erp->er_extbuf[erp->er_extcount],
-				erp_next->er_extbuf, erp_next->er_extcount *
-				sizeof(xfs_bmbt_rec_t));
-			erp->er_extcount += erp_next->er_extcount;
-			/*
-			 * Free page before removing extent record
-			 * so er_extoffs don't get modified in
-			 * xfs_iext_irec_remove.
-			 */
-			kmem_free(erp_next->er_extbuf);
-			erp_next->er_extbuf = NULL;
-			xfs_iext_irec_remove(ifp, erp_idx + 1);
-			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-		} else {
-			erp_idx++;
-		}
-	}
-}
-
-/*
- * This is called to update the er_extoff field in the indirection
- * array when extents have been added or removed from one of the
- * extent lists. erp_idx contains the irec index to begin updating
- * at and ext_diff contains the number of extents that were added
- * or removed.
- */
-void
-xfs_iext_irec_update_extoffs(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	int		erp_idx,	/* irec index to update */
-	int		ext_diff)	/* number of new extents */
-{
-	int		i;		/* loop counter */
-	int		nlists;		/* number of irec's (ex lists */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	for (i = erp_idx; i < nlists; i++) {
-		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
-	}
-}
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
deleted file mode 100644
index ee7e0e80246b..000000000000
--- a/fs/xfs/xfs_log_rlimit.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2013 Jie Liu.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
-#include "xfs_sb.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_trans_space.h"
-#include "xfs_inode.h"
-#include "xfs_da_btree.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_bmap_btree.h"
-
-/*
- * Calculate the maximum length in bytes that would be required for a local
- * attribute value as large attributes out of line are not logged.
- */
-STATIC int
-xfs_log_calc_max_attrsetm_res(
-	struct xfs_mount	*mp)
-{
-	int			size;
-	int			nblks;
-
-	size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
-	       MAXNAMELEN - 1;
-	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
-	nblks += XFS_B_TO_FSB(mp, size);
-	nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
-
-	return  M_RES(mp)->tr_attrsetm.tr_logres +
-		M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
-}
-
-/*
- * Iterate over the log space reservation table to figure out and return
- * the maximum one in terms of the pre-calculated values which were done
- * at mount time.
- */
-STATIC void
-xfs_log_get_max_trans_res(
-	struct xfs_mount	*mp,
-	struct xfs_trans_res	*max_resp)
-{
-	struct xfs_trans_res	*resp;
-	struct xfs_trans_res	*end_resp;
-	int			log_space = 0;
-	int			attr_space;
-
-	attr_space = xfs_log_calc_max_attrsetm_res(mp);
-
-	resp = (struct xfs_trans_res *)M_RES(mp);
-	end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
-	for (; resp < end_resp; resp++) {
-		int		tmp = resp->tr_logcount > 1 ?
-				      resp->tr_logres * resp->tr_logcount :
-				      resp->tr_logres;
-		if (log_space < tmp) {
-			log_space = tmp;
-			*max_resp = *resp;		/* struct copy */
-		}
-	}
-
-	if (attr_space > log_space) {
-		*max_resp = M_RES(mp)->tr_attrsetm;	/* struct copy */
-		max_resp->tr_logres = attr_space;
-	}
-}
-
-/*
- * Calculate the minimum valid log size for the given superblock configuration.
- * Used to calculate the minimum log size at mkfs time, and to determine if
- * the log is large enough or not at mount time. Returns the minimum size in
- * filesystem block size units.
- */
-int
-xfs_log_calc_minimum_size(
-	struct xfs_mount	*mp)
-{
-	struct xfs_trans_res	tres = {0};
-	int			max_logres;
-	int			min_logblks = 0;
-	int			lsunit = 0;
-
-	xfs_log_get_max_trans_res(mp, &tres);
-
-	max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
-	if (tres.tr_logcount > 1)
-		max_logres *= tres.tr_logcount;
-
-	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
-		lsunit = BTOBB(mp->m_sb.sb_logsunit);
-
-	/*
-	 * Two factors should be taken into account for calculating the minimum
-	 * log space.
-	 * 1) The fundamental limitation is that no single transaction can be
-	 *    larger than half size of the log.
-	 *
-	 *    From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
-	 *    define, which is set to 3. That means we can definitely fit
-	 *    maximally sized 2 transactions in the log. We'll use this same
-	 *    value here.
-	 *
-	 * 2) If the lsunit option is specified, a transaction requires 2 LSU
-	 *    for the reservation because there are two log writes that can
-	 *    require padding - the transaction data and the commit record which
-	 *    are written separately and both can require padding to the LSU.
-	 *    Consider that we can have an active CIL reservation holding 2*LSU,
-	 *    but the CIL is not over a push threshold, in this case, if we
-	 *    don't have enough log space for at one new transaction, which
-	 *    includes another 2*LSU in the reservation, we will run into dead
-	 *    loop situation in log space grant procedure. i.e.
-	 *    xlog_grant_head_wait().
-	 *
-	 *    Hence the log size needs to be able to contain two maximally sized
-	 *    and padded transactions, which is (2 * (2 * LSU + maxlres)).
-	 *
-	 * Also, the log size should be a multiple of the log stripe unit, round
-	 * it up to lsunit boundary if lsunit is specified.
-	 */
-	if (lsunit) {
-		min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
-			      2 * lsunit;
-	} else
-		min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
-	min_logblks *= XFS_MIN_LOG_FACTOR;
-
-	return XFS_BB_TO_FSB(mp, min_logblks);
-}
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c
deleted file mode 100644
index f4dd697cac08..000000000000
--- a/fs/xfs/xfs_rtbitmap.c
+++ /dev/null
@@ -1,973 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
-#include "xfs_trans.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
-#include "xfs_icache.h"
-#include "xfs_dinode.h"
-#include "xfs_rtalloc.h"
-
-
-/*
- * Realtime allocator bitmap functions shared with userspace.
- */
-
-/*
- * Get a buffer for the bitmap or summary file block specified.
- * The buffer is returned read and locked.
- */
-int
-xfs_rtbuf_get(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	block,		/* block number in bitmap or summary */
-	int		issum,		/* is summary not bitmap */
-	xfs_buf_t	**bpp)		/* output: buffer for the block */
-{
-	xfs_buf_t	*bp;		/* block buffer, result */
-	xfs_inode_t	*ip;		/* bitmap or summary inode */
-	xfs_bmbt_irec_t	map;
-	int		nmap = 1;
-	int		error;		/* error value */
-
-	ip = issum ? mp->m_rsumip : mp->m_rbmip;
-
-	error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
-	if (error)
-		return error;
-
-	ASSERT(map.br_startblock != NULLFSBLOCK);
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
-				   mp->m_bsize, 0, &bp, NULL);
-	if (error)
-		return error;
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Searching backward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-int
-xfs_rtfind_back(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to look at */
-	xfs_rtblock_t	limit,		/* last block to look at */
-	xfs_rtblock_t	*rtblock)	/* out: start block found */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtblock_t	firstbit;	/* first useful bit in the word */
-	xfs_rtblock_t	i;		/* current bit number rel. to start */
-	xfs_rtblock_t	len;		/* length of inspected area */
-	xfs_rtword_t	mask;		/* mask of relevant bits for value */
-	xfs_rtword_t	want;		/* mask for "good" values */
-	xfs_rtword_t	wdiff;		/* difference from wanted value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute and read in starting bitmap block for starting block.
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
-	}
-	bufp = bp->b_addr;
-	/*
-	 * Get the first word's index & point to it.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
-	len = start - limit + 1;
-	/*
-	 * Compute match value, based on the bit at start: if 1 (free)
-	 * then all-ones, else all-zeroes.
-	 */
-	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
-	/*
-	 * If the starting position is not word-aligned, deal with the
-	 * partial word.
-	 */
-	if (bit < XFS_NBWORD - 1) {
-		/*
-		 * Calculate first (leftmost) bit number to look at,
-		 * and mask for all the relevant bits in this word.
-		 */
-		firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
-		mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
-			firstbit;
-		/*
-		 * Calculate the difference between the value there
-		 * and what we're looking for.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different.  Mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i = bit - XFS_RTHIBIT(wdiff);
-			*rtblock = start - i + 1;
-			return 0;
-		}
-		i = bit - firstbit + 1;
-		/*
-		 * Go on to previous block if that's where the previous word is
-		 * and we need the previous word.
-		 */
-		if (--word == -1 && i < len) {
-			/*
-			 * If done with this block, get the previous one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			bufp = bp->b_addr;
-			word = XFS_BLOCKWMASK(mp);
-			b = &bufp[word];
-		} else {
-			/*
-			 * Go on to the previous word in the buffer.
-			 */
-			b--;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the previous one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = *b ^ want)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
-			*rtblock = start - i + 1;
-			return 0;
-		}
-		i += XFS_NBWORD;
-		/*
-		 * Go on to previous block if that's where the previous word is
-		 * and we need the previous word.
-		 */
-		if (--word == -1 && i < len) {
-			/*
-			 * If done with this block, get the previous one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			bufp = bp->b_addr;
-			word = XFS_BLOCKWMASK(mp);
-			b = &bufp[word];
-		} else {
-			/*
-			 * Go on to the previous word in the buffer.
-			 */
-			b--;
-		}
-	}
-	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if (len - i) {
-		/*
-		 * Calculate first (leftmost) bit number to look at,
-		 * and mask for all the relevant bits in this word.
-		 */
-		firstbit = XFS_NBWORD - (len - i);
-		mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
-			*rtblock = start - i + 1;
-			return 0;
-		} else
-			i = len;
-	}
-	/*
-	 * No match, return that we scanned the whole area.
-	 */
-	xfs_trans_brelse(tp, bp);
-	*rtblock = start - i + 1;
-	return 0;
-}
-
-/*
- * Searching forward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-int
-xfs_rtfind_forw(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to look at */
-	xfs_rtblock_t	limit,		/* last block to look at */
-	xfs_rtblock_t	*rtblock)	/* out: start block found */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtblock_t	i;		/* current bit number rel. to start */
-	xfs_rtblock_t	lastbit;	/* last useful bit in the word */
-	xfs_rtblock_t	len;		/* length of inspected area */
-	xfs_rtword_t	mask;		/* mask of relevant bits for value */
-	xfs_rtword_t	want;		/* mask for "good" values */
-	xfs_rtword_t	wdiff;		/* difference from wanted value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute and read in starting bitmap block for starting block.
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
-	}
-	bufp = bp->b_addr;
-	/*
-	 * Get the first word's index & point to it.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
-	len = limit - start + 1;
-	/*
-	 * Compute match value, based on the bit at start: if 1 (free)
-	 * then all-ones, else all-zeroes.
-	 */
-	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
-	/*
-	 * If the starting position is not word-aligned, deal with the
-	 * partial word.
-	 */
-	if (bit) {
-		/*
-		 * Calculate last (rightmost) bit number to look at,
-		 * and mask for all the relevant bits in this word.
-		 */
-		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-		/*
-		 * Calculate the difference between the value there
-		 * and what we're looking for.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different.  Mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i = XFS_RTLOBIT(wdiff) - bit;
-			*rtblock = start + i - 1;
-			return 0;
-		}
-		i = lastbit - bit;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the previous one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the previous word in the buffer.
-			 */
-			b++;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the next one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = *b ^ want)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*rtblock = start + i - 1;
-			return 0;
-		}
-		i += XFS_NBWORD;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the next one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer.
-			 */
-			b++;
-		}
-	}
-	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if ((lastbit = len - i)) {
-		/*
-		 * Calculate mask for all the relevant bits in this word.
-		 */
-		mask = ((xfs_rtword_t)1 << lastbit) - 1;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*rtblock = start + i - 1;
-			return 0;
-		} else
-			i = len;
-	}
-	/*
-	 * No match, return that we scanned the whole area.
-	 */
-	xfs_trans_brelse(tp, bp);
-	*rtblock = start + i - 1;
-	return 0;
-}
-
-/*
- * Read and modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-int
-xfs_rtmodify_summary(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	int		log,		/* log2 of extent size */
-	xfs_rtblock_t	bbno,		/* bitmap block number */
-	int		delta,		/* change to make to summary info */
-	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
-{
-	xfs_buf_t	*bp;		/* buffer for the summary block */
-	int		error;		/* error value */
-	xfs_fsblock_t	sb;		/* summary fsblock */
-	int		so;		/* index into the summary file */
-	xfs_suminfo_t	*sp;		/* pointer to returned data */
-
-	/*
-	 * Compute entry number in the summary file.
-	 */
-	so = XFS_SUMOFFS(mp, log, bbno);
-	/*
-	 * Compute the block number in the summary file.
-	 */
-	sb = XFS_SUMOFFSTOBLOCK(mp, so);
-	/*
-	 * If we have an old buffer, and the block number matches, use that.
-	 */
-	if (rbpp && *rbpp && *rsb == sb)
-		bp = *rbpp;
-	/*
-	 * Otherwise we have to get the buffer.
-	 */
-	else {
-		/*
-		 * If there was an old one, get rid of it first.
-		 */
-		if (rbpp && *rbpp)
-			xfs_trans_brelse(tp, *rbpp);
-		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
-		if (error) {
-			return error;
-		}
-		/*
-		 * Remember this buffer and block for the next call.
-		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
-		}
-	}
-	/*
-	 * Point to the summary information, modify and log it.
-	 */
-	sp = XFS_SUMPTR(mp, bp, so);
-	*sp += delta;
-	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
-		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
-	return 0;
-}
-
-/*
- * Set the given range of bitmap bits to the given value.
- * Do whatever I/O and logging is required.
- */
-int
-xfs_rtmodify_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to modify */
-	xfs_extlen_t	len,		/* length of extent to modify */
-	int		val)		/* 1 for free, 0 for allocated */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtword_t	*first;		/* first used word in the buffer */
-	int		i;		/* current bit number rel. to start */
-	int		lastbit;	/* last useful bit in word */
-	xfs_rtword_t	mask;		/* mask o frelevant bits for value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute starting bitmap block number.
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	/*
-	 * Read the bitmap block, and point to its data.
-	 */
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
-	}
-	bufp = bp->b_addr;
-	/*
-	 * Compute the starting word's address, and starting bit.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	first = b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
-	/*
-	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
-	 */
-	val = -val;
-	/*
-	 * If not starting on a word boundary, deal with the first
-	 * (partial) word.
-	 */
-	if (bit) {
-		/*
-		 * Compute first bit not changed and mask of relevant bits.
-		 */
-		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-		/*
-		 * Set/clear the active bits.
-		 */
-		if (val)
-			*b |= mask;
-		else
-			*b &= ~mask;
-		i = lastbit - bit;
-		/*
-		 * Go on to the next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * Log the changed part of this block.
-			 * Get the next one.
-			 */
-			xfs_trans_log_buf(tp, bp,
-				(uint)((char *)first - (char *)bufp),
-				(uint)((char *)b - (char *)bufp));
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			first = b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer
-			 */
-			b++;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the next one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Set the word value correctly.
-		 */
-		*b = val;
-		i += XFS_NBWORD;
-		/*
-		 * Go on to the next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * Log the changed part of this block.
-			 * Get the next one.
-			 */
-			xfs_trans_log_buf(tp, bp,
-				(uint)((char *)first - (char *)bufp),
-				(uint)((char *)b - (char *)bufp));
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			first = b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer
-			 */
-			b++;
-		}
-	}
-	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if ((lastbit = len - i)) {
-		/*
-		 * Compute a mask of relevant bits.
-		 */
-		bit = 0;
-		mask = ((xfs_rtword_t)1 << lastbit) - 1;
-		/*
-		 * Set/clear the active bits.
-		 */
-		if (val)
-			*b |= mask;
-		else
-			*b &= ~mask;
-		b++;
-	}
-	/*
-	 * Log any remaining changed bytes.
-	 */
-	if (b > first)
-		xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
-			(uint)((char *)b - (char *)bufp - 1));
-	return 0;
-}
-
-/*
- * Mark an extent specified by start and len freed.
- * Updates all the summary information as well as the bitmap.
- */
-int
-xfs_rtfree_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to free */
-	xfs_extlen_t	len,		/* length to free */
-	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
-{
-	xfs_rtblock_t	end;		/* end of the freed extent */
-	int		error;		/* error value */
-	xfs_rtblock_t	postblock;	/* first block freed > end */
-	xfs_rtblock_t	preblock;	/* first block freed < start */
-
-	end = start + len - 1;
-	/*
-	 * Modify the bitmap to mark this extent freed.
-	 */
-	error = xfs_rtmodify_range(mp, tp, start, len, 1);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Assume we're freeing out of the middle of an allocated extent.
-	 * We need to find the beginning and end of the extent so we can
-	 * properly update the summary.
-	 */
-	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Find the next allocated block (end of allocated extent).
-	 */
-	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
-		&postblock);
-	if (error)
-		return error;
-	/*
-	 * If there are blocks not being freed at the front of the
-	 * old extent, add summary data for them to be allocated.
-	 */
-	if (preblock < start) {
-		error = xfs_rtmodify_summary(mp, tp,
-			XFS_RTBLOCKLOG(start - preblock),
-			XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
-		if (error) {
-			return error;
-		}
-	}
-	/*
-	 * If there are blocks not being freed at the end of the
-	 * old extent, add summary data for them to be allocated.
-	 */
-	if (postblock > end) {
-		error = xfs_rtmodify_summary(mp, tp,
-			XFS_RTBLOCKLOG(postblock - end),
-			XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
-		if (error) {
-			return error;
-		}
-	}
-	/*
-	 * Increment the summary information corresponding to the entire
-	 * (new) free extent.
-	 */
-	error = xfs_rtmodify_summary(mp, tp,
-		XFS_RTBLOCKLOG(postblock + 1 - preblock),
-		XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
-	return error;
-}
-
-/*
- * Check that the given range is either all allocated (val = 0) or
- * all free (val = 1).
- */
-int
-xfs_rtcheck_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block number of extent */
-	xfs_extlen_t	len,		/* length of extent */
-	int		val,		/* 1 for free, 0 for allocated */
-	xfs_rtblock_t	*new,		/* out: first block not matching */
-	int		*stat)		/* out: 1 for matches, 0 for not */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtblock_t	i;		/* current bit number rel. to start */
-	xfs_rtblock_t	lastbit;	/* last useful bit in word */
-	xfs_rtword_t	mask;		/* mask of relevant bits for value */
-	xfs_rtword_t	wdiff;		/* difference from wanted value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute starting bitmap block number
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	/*
-	 * Read the bitmap block.
-	 */
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
-	}
-	bufp = bp->b_addr;
-	/*
-	 * Compute the starting word's address, and starting bit.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
-	/*
-	 * 0 (allocated) => all zero's; 1 (free) => all one's.
-	 */
-	val = -val;
-	/*
-	 * If not starting on a word boundary, deal with the first
-	 * (partial) word.
-	 */
-	if (bit) {
-		/*
-		 * Compute first bit not examined.
-		 */
-		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-		/*
-		 * Mask of relevant bits.
-		 */
-		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ val) & mask)) {
-			/*
-			 * Different, compute first wrong bit and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i = XFS_RTLOBIT(wdiff) - bit;
-			*new = start + i;
-			*stat = 0;
-			return 0;
-		}
-		i = lastbit - bit;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the next one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer.
-			 */
-			b++;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the next one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = *b ^ val)) {
-			/*
-			 * Different, compute first wrong bit and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*new = start + i;
-			*stat = 0;
-			return 0;
-		}
-		i += XFS_NBWORD;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the next one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer.
-			 */
-			b++;
-		}
-	}
-	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if ((lastbit = len - i)) {
-		/*
-		 * Mask of relevant bits.
-		 */
-		mask = ((xfs_rtword_t)1 << lastbit) - 1;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ val) & mask)) {
-			/*
-			 * Different, compute first wrong bit and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*new = start + i;
-			*stat = 0;
-			return 0;
-		} else
-			i = len;
-	}
-	/*
-	 * Successful, return.
-	 */
-	xfs_trans_brelse(tp, bp);
-	*new = start + i;
-	*stat = 1;
-	return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that the given extent (block range) is allocated already.
- */
-STATIC int				/* error */
-xfs_rtcheck_alloc_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	bno,		/* starting block number of extent */
-	xfs_extlen_t	len)		/* length of extent */
-{
-	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
-	int		stat;
-	int		error;
-
-	error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
-	if (error)
-		return error;
-	ASSERT(stat);
-	return 0;
-}
-#else
-#define xfs_rtcheck_alloc_range(m,t,b,l)	(0)
-#endif
-/*
- * Free an extent in the realtime subvolume.  Length is expressed in
- * realtime extents, as is the block number.
- */
-int					/* error */
-xfs_rtfree_extent(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	bno,		/* starting block number to free */
-	xfs_extlen_t	len)		/* length of extent freed */
-{
-	int		error;		/* error value */
-	xfs_mount_t	*mp;		/* file system mount structure */
-	xfs_fsblock_t	sb;		/* summary file block number */
-	xfs_buf_t	*sumbp = NULL;	/* summary file block buffer */
-
-	mp = tp->t_mountp;
-
-	ASSERT(mp->m_rbmip->i_itemp != NULL);
-	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
-
-	error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
-	if (error)
-		return error;
-
-	/*
-	 * Free the range of realtime blocks.
-	 */
-	error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Mark more blocks free in the superblock.
-	 */
-	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
-	/*
-	 * If we've now freed all the blocks, reset the file sequence
-	 * number to 0.
-	 */
-	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
-	    mp->m_sb.sb_rextents) {
-		if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
-			mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
-		*(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
-		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
-	}
-	return 0;
-}
-
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
deleted file mode 100644
index 23c2f2577c8d..000000000000
--- a/fs/xfs/xfs_symlink_remote.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * Copyright (c) 2012-2013 Red Hat, Inc.
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_shared.h"
-#include "xfs_trans_resv.h"
-#include "xfs_ag.h"
-#include "xfs_sb.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_symlink.h"
-#include "xfs_cksum.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
-
-
-/*
- * Each contiguous block has a header, so it is not just a simple pathlen
- * to FSB conversion.
- */
-int
-xfs_symlink_blocks(
-	struct xfs_mount *mp,
-	int		pathlen)
-{
-	int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-
-	return (pathlen + buflen - 1) / buflen;
-}
-
-int
-xfs_symlink_hdr_set(
-	struct xfs_mount	*mp,
-	xfs_ino_t		ino,
-	uint32_t		offset,
-	uint32_t		size,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return 0;
-
-	dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
-	dsl->sl_offset = cpu_to_be32(offset);
-	dsl->sl_bytes = cpu_to_be32(size);
-	uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
-	dsl->sl_owner = cpu_to_be64(ino);
-	dsl->sl_blkno = cpu_to_be64(bp->b_bn);
-	bp->b_ops = &xfs_symlink_buf_ops;
-
-	return sizeof(struct xfs_dsymlink_hdr);
-}
-
-/*
- * Checking of the symlink header is split into two parts. the verifier does
- * CRC, location and bounds checking, the unpacking function checks the path
- * parameters and owner.
- */
-bool
-xfs_symlink_hdr_ok(
-	xfs_ino_t		ino,
-	uint32_t		offset,
-	uint32_t		size,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-	if (offset != be32_to_cpu(dsl->sl_offset))
-		return false;
-	if (size != be32_to_cpu(dsl->sl_bytes))
-		return false;
-	if (ino != be64_to_cpu(dsl->sl_owner))
-		return false;
-
-	/* ok */
-	return true;
-}
-
-static bool
-xfs_symlink_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return false;
-	if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
-		return false;
-	if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
-		return false;
-	if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
-		return false;
-	if (be32_to_cpu(dsl->sl_offset) +
-				be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
-		return false;
-	if (dsl->sl_owner == 0)
-		return false;
-
-	return true;
-}
-
-static void
-xfs_symlink_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-
-	/* no verification of non-crc buffers */
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
-	else if (!xfs_symlink_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-
-	if (bp->b_error)
-		xfs_verifier_error(bp);
-}
-
-static void
-xfs_symlink_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_buf_log_item	*bip = bp->b_fspriv;
-
-	/* no verification of non-crc buffers */
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return;
-
-	if (!xfs_symlink_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		xfs_verifier_error(bp);
-		return;
-	}
-
-	if (bip) {
-		struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-		dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-	}
-	xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
-}
-
-const struct xfs_buf_ops xfs_symlink_buf_ops = {
-	.verify_read = xfs_symlink_read_verify,
-	.verify_write = xfs_symlink_write_verify,
-};
-
-void
-xfs_symlink_local_to_remote(
-	struct xfs_trans	*tp,
-	struct xfs_buf		*bp,
-	struct xfs_inode	*ip,
-	struct xfs_ifork	*ifp)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	char			*buf;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
-		bp->b_ops = NULL;
-		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
-		return;
-	}
-
-	/*
-	 * As this symlink fits in an inode literal area, it must also fit in
-	 * the smallest buffer the filesystem supports.
-	 */
-	ASSERT(BBTOB(bp->b_length) >=
-			ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
-
-	bp->b_ops = &xfs_symlink_buf_ops;
-
-	buf = bp->b_addr;
-	buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
-	memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
-}
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
deleted file mode 100644
index f2bda7c76b8a..000000000000
--- a/fs/xfs/xfs_trans_resv.c
+++ /dev/null
@@ -1,894 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * Copyright (C) 2010 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_trans.h"
-#include "xfs_qm.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-
-/*
- * A buffer has a format structure overhead in the log in addition
- * to the data, so we need to take this into account when reserving
- * space in a transaction for a buffer.  Round the space required up
- * to a multiple of 128 bytes so that we don't change the historical
- * reservation that has been used for this overhead.
- */
-STATIC uint
-xfs_buf_log_overhead(void)
-{
-	return round_up(sizeof(struct xlog_op_header) +
-			sizeof(struct xfs_buf_log_format), 128);
-}
-
-/*
- * Calculate out transaction log reservation per item in bytes.
- *
- * The nbufs argument is used to indicate the number of items that
- * will be changed in a transaction.  size is used to tell how many
- * bytes should be reserved per item.
- */
-STATIC uint
-xfs_calc_buf_res(
-	uint		nbufs,
-	uint		size)
-{
-	return nbufs * (size + xfs_buf_log_overhead());
-}
-
-/*
- * Logging inodes is really tricksy. They are logged in memory format,
- * which means that what we write into the log doesn't directly translate into
- * the amount of space they use on disk.
- *
- * Case in point - btree format forks in memory format use more space than the
- * on-disk format. In memory, the buffer contains a normal btree block header so
- * the btree code can treat it as though it is just another generic buffer.
- * However, when we write it to the inode fork, we don't write all of this
- * header as it isn't needed. e.g. the root is only ever in the inode, so
- * there's no need for sibling pointers which would waste 16 bytes of space.
- *
- * Hence when we have an inode with a maximally sized btree format fork, then
- * amount of information we actually log is greater than the size of the inode
- * on disk. Hence we need an inode reservation function that calculates all this
- * correctly. So, we log:
- *
- * - 4 log op headers for object
- *	- for the ilf, the inode core and 2 forks
- * - inode log format object
- * - the inode core
- * - two inode forks containing bmap btree root blocks.
- *	- the btree data contained by both forks will fit into the inode size,
- *	  hence when combined with the inode core above, we have a total of the
- *	  actual inode size.
- *	- the BMBT headers need to be accounted separately, as they are
- *	  additional to the records and pointers that fit inside the inode
- *	  forks.
- */
-STATIC uint
-xfs_calc_inode_res(
-	struct xfs_mount	*mp,
-	uint			ninodes)
-{
-	return ninodes *
-		(4 * sizeof(struct xlog_op_header) +
-		 sizeof(struct xfs_inode_log_format) +
-		 mp->m_sb.sb_inodesize +
-		 2 * XFS_BMBT_BLOCK_LEN(mp));
-}
-
-/*
- * The free inode btree is a conditional feature and the log reservation
- * requirements differ slightly from that of the traditional inode allocation
- * btree. The finobt tracks records for inode chunks with at least one free
- * inode. A record can be removed from the tree for an inode allocation
- * or free and thus the finobt reservation is unconditional across:
- *
- * 	- inode allocation
- * 	- inode free
- * 	- inode chunk allocation
- *
- * The 'modify' param indicates to include the record modification scenario. The
- * 'alloc' param indicates to include the reservation for free space btree
- * modifications on behalf of finobt modifications. This is required only for
- * transactions that do not already account for free space btree modifications.
- *
- * the free inode btree: max depth * block size
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- * the free inode btree entry: block size
- */
-STATIC uint
-xfs_calc_finobt_res(
-	struct xfs_mount 	*mp,
-	int			alloc,
-	int			modify)
-{
-	uint res;
-
-	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
-		return 0;
-
-	res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
-	if (alloc)
-		res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 
-					XFS_FSB_TO_B(mp, 1));
-	if (modify)
-		res += (uint)XFS_FSB_TO_B(mp, 1);
-
-	return res;
-}
-
-/*
- * Various log reservation values.
- *
- * These are based on the size of the file system block because that is what
- * most transactions manipulate.  Each adds in an additional 128 bytes per
- * item logged to try to account for the overhead of the transaction mechanism.
- *
- * Note:  Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
- * This is because the number in the worst case is quite high and quite
- * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
- * extents in only a single AG at a time.  This will require changes to the
- * EFI code as well, however, so that the EFI for the extents not freed is
- * logged again in each transaction.  See SGI PV #261917.
- *
- * Reservation functions here avoid a huge stack in xfs_trans_init due to
- * register overflow from temporaries in the calculations.
- */
-
-
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
- *    the inode getting the new extents: inode size
- *    the inode's bmap btree: max depth * block size
- *    the agfs of the ags from which the extents are allocated: 2 * sector
- *    the superblock free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- *    the agfs of the ags containing the blocks: 2 * sector size
- *    the agfls of the ags containing the blocks: 2 * sector size
- *    the super block free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_write_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		MAX((xfs_calc_inode_res(mp, 1) +
-		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-				      XFS_FSB_TO_B(mp, 1)) +
-		     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-				      XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * In truncating a file we free up to two extents at once.  We can modify:
- *    the inode being truncated: inode size
- *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *		4 exts * 2 trees * (2 * max depth - 1) * block size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_itruncate_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		MAX((xfs_calc_inode_res(mp, 1) +
-		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
-				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-				      XFS_FSB_TO_B(mp, 1)) +
-		    xfs_calc_buf_res(5, 0) +
-		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				     XFS_FSB_TO_B(mp, 1)) +
-		    xfs_calc_buf_res(2 + mp->m_ialloc_blks +
-				     mp->m_in_maxlevels, 0)));
-}
-
-/*
- * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
- *    the two directory btrees: 2 * (max depth + v2) * dir block size
- *    the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- *	of bmap blocks) giving:
- *    the agf for the ags in which the blocks live: 3 * sector size
- *    the agfl for the ags in which the blocks live: 3 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_rename_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		MAX((xfs_calc_inode_res(mp, 4) +
-		     xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
-				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
-				      XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For removing an inode from unlinked list at first, we can modify:
- *    the agi hash list and counters: sector size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- */
-STATIC uint
-xfs_calc_iunlink_remove_reservation(
-	struct xfs_mount        *mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-	       max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
-}
-
-/*
- * For creating a link to an inode:
- *    the parent directory inode: inode size
- *    the linked inode: inode size
- *    the directory btree could split: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- *    the agf for the ag in which the blocks live: sector size
- *    the agfl for the ag in which the blocks live: sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_link_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		xfs_calc_iunlink_remove_reservation(mp) +
-		MAX((xfs_calc_inode_res(mp, 2) +
-		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				      XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For adding an inode to unlinked list we can modify:
- *    the agi hash list: sector size
- *    the unlinked inode: inode size
- */
-STATIC uint
-xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-		xfs_calc_inode_res(mp, 1);
-}
-
-/*
- * For removing a directory entry we can modify:
- *    the parent directory inode: inode size
- *    the removed inode: inode size
- *    the directory btree could join: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_remove_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		xfs_calc_iunlink_add_reservation(mp) +
-		MAX((xfs_calc_inode_res(mp, 1) +
-		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-				      XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For create, break it in to the two cases that the transaction
- * covers. We start with the modify case - allocation done by modification
- * of the state of existing inodes - and the allocation case.
- */
-
-/*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- *    the finobt (record modification and allocation btrees)
- */
-STATIC uint
-xfs_calc_create_resv_modify(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_inode_res(mp, 2) +
-		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-		(uint)XFS_FSB_TO_B(mp, 1) +
-		xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_finobt_res(mp, 1, 1);
-}
-
-/*
- * For create we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_create_resv_alloc(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-		mp->m_sb.sb_sectsize +
-		xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				 XFS_FSB_TO_B(mp, 1));
-}
-
-STATIC uint
-__xfs_calc_create_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		MAX(xfs_calc_create_resv_alloc(mp),
-		    xfs_calc_create_resv_modify(mp));
-}
-
-/*
- * For icreate we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- *    the finobt (record insertion)
- */
-STATIC uint
-xfs_calc_icreate_resv_alloc(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-		mp->m_sb.sb_sectsize +
-		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				 XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_finobt_res(mp, 0, 0);
-}
-
-STATIC uint
-xfs_calc_icreate_reservation(xfs_mount_t *mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		MAX(xfs_calc_icreate_resv_alloc(mp),
-		    xfs_calc_create_resv_modify(mp));
-}
-
-STATIC uint
-xfs_calc_create_reservation(
-	struct xfs_mount	*mp)
-{
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		return xfs_calc_icreate_reservation(mp);
-	return __xfs_calc_create_reservation(mp);
-
-}
-
-STATIC uint
-xfs_calc_create_tmpfile_reservation(
-	struct xfs_mount        *mp)
-{
-	uint	res = XFS_DQUOT_LOGRES(mp);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		res += xfs_calc_icreate_resv_alloc(mp);
-	else
-		res += xfs_calc_create_resv_alloc(mp);
-
-	return res + xfs_calc_iunlink_add_reservation(mp);
-}
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-STATIC uint
-xfs_calc_mkdir_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_create_reservation(mp);
-}
-
-
-/*
- * Making a new symplink is the same as creating a new file, but
- * with the added blocks for remote symlink data which can be up to 1kB in
- * length (MAXPATHLEN).
- */
-STATIC uint
-xfs_calc_symlink_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_create_reservation(mp) +
-	       xfs_calc_buf_res(1, MAXPATHLEN);
-}
-
-/*
- * In freeing an inode we can modify:
- *    the inode being freed: inode size
- *    the super block free inode counter: sector size
- *    the agi hash list and counters: sector size
- *    the inode btree entry: block size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- *    the finobt (record insertion, removal or modification)
- */
-STATIC uint
-xfs_calc_ifree_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		xfs_calc_inode_res(mp, 1) +
-		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_iunlink_remove_reservation(mp) +
-		xfs_calc_buf_res(1, 0) +
-		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
-				 mp->m_in_maxlevels, 0) +
-		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				 XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_finobt_res(mp, 0, 1);
-}
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-STATIC uint
-xfs_calc_ichange_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		xfs_calc_inode_res(mp, 1) +
-		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-
-}
-
-/*
- * Growing the data section of the filesystem.
- *	superblock
- *	agi and agf
- *	allocation btrees
- */
-STATIC uint
-xfs_calc_growdata_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				 XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- *	superblock: sector size
- *	agf of the ag from which the extent is allocated: sector size
- *	bmap btree for bitmap/summary inode: max depth * blocksize
- *	bitmap/summary inode: inode size
- *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-STATIC uint
-xfs_calc_growrtalloc_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-				 XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_inode_res(mp, 1) +
-		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				 XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- *	one bitmap/summary block: blocksize
- */
-STATIC uint
-xfs_calc_growrtzero_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- *	superblock: sector size
- *	bitmap inode: inode size
- *	summary inode: inode size
- *	one bitmap block: blocksize
- *	summary blocks: new summary size
- */
-STATIC uint
-xfs_calc_growrtfree_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-		xfs_calc_inode_res(mp, 2) +
-		xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
-		xfs_calc_buf_res(1, mp->m_rsumsize);
-}
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- *	inode
- */
-STATIC uint
-xfs_calc_swrite_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_inode_res(mp, 1);
-}
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- *	inode
- */
-STATIC uint
-xfs_calc_writeid_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_inode_res(mp, 1);
-}
-
-/*
- * Converting the inode from non-attributed to attributed.
- *	the inode being converted: inode size
- *	agf block and superblock (for block allocation)
- *	the new block (directory sized)
- *	bmap blocks for the new directory block
- *	allocation btrees
- */
-STATIC uint
-xfs_calc_addafork_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		xfs_calc_inode_res(mp, 1) +
-		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-		xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
-		xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
-				 XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-				 XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Removing the attribute fork of a file
- *    the inode being truncated: inode size
- *    the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *		4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrinval_reservation(
-	struct xfs_mount	*mp)
-{
-	return MAX((xfs_calc_inode_res(mp, 1) +
-		    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-				     XFS_FSB_TO_B(mp, 1))),
-		   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-				     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Setting an attribute at mount time.
- *	the inode getting the attribute
- *	the superblock for allocations
- *	the agfs extents are allocated from
- *	the attribute btree * max depth
- *	the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime(see
- * below).
- */
-STATIC uint
-xfs_calc_attrsetm_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		xfs_calc_inode_res(mp, 1) +
-		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-		xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Setting an attribute at runtime, transaction space unit per block.
- * 	the superblock for allocations: sector size
- *	the inode bmap btree could join or split: max depth * block size
- * Since the runtime attribute transaction space is dependent on the total
- * blocks needed for the 1st bmap, here we calculate out the space unit for
- * one block so that the caller could figure out the total space according
- * to the attibute extent length in blocks by:
- *	ext * M_RES(mp)->tr_attrsetrt.tr_logres
- */
-STATIC uint
-xfs_calc_attrsetrt_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-				 XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Removing an attribute.
- *    the inode: inode size
- *    the attribute btree could join: max depth * block size
- *    the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrrm_reservation(
-	struct xfs_mount	*mp)
-{
-	return XFS_DQUOT_LOGRES(mp) +
-		MAX((xfs_calc_inode_res(mp, 1) +
-		     xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
-				      XFS_FSB_TO_B(mp, 1)) +
-		     (uint)XFS_FSB_TO_B(mp,
-					XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
-		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
-		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-				      XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-STATIC uint
-xfs_calc_clear_agi_bucket_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Clearing the quotaflags in the superblock.
- *	the super block for changing quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_sbchange_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Adjusting quota limits.
- *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
- */
-STATIC uint
-xfs_calc_qm_setqlim_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
-}
-
-/*
- * Allocating quota on disk if needed.
- *	the write transaction log space for quota file extent allocation
- *	the unit of quota allocation: one system block size
- */
-STATIC uint
-xfs_calc_qm_dqalloc_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_write_reservation(mp) +
-		xfs_calc_buf_res(1,
-			XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
-}
-
-/*
- * Turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- *    the superblock for the quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_quotaoff_reservation(
-	struct xfs_mount	*mp)
-{
-	return sizeof(struct xfs_qoff_logitem) * 2 +
-		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * End of turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- */
-STATIC uint
-xfs_calc_qm_quotaoff_end_reservation(
-	struct xfs_mount	*mp)
-{
-	return sizeof(struct xfs_qoff_logitem) * 2;
-}
-
-/*
- * Syncing the incore super block changes to disk.
- *     the super block to reflect the changes: sector size
- */
-STATIC uint
-xfs_calc_sb_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-void
-xfs_trans_resv_calc(
-	struct xfs_mount	*mp,
-	struct xfs_trans_resv	*resp)
-{
-	/*
-	 * The following transactions are logged in physical format and
-	 * require a permanent reservation on space.
-	 */
-	resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
-	resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
-	resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
-	resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
-	resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
-	resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
-	resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
-	resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
-	resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
-	resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
-	resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
-	resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
-	resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
-	resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
-	resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_create_tmpfile.tr_logres =
-			xfs_calc_create_tmpfile_reservation(mp);
-	resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
-	resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
-	resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
-	resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
-	resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
-	resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
-	resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
-	resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
-	resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
-	resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
-	resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-	resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
-	resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
-	resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
-	resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
-	resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
-	resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
-	resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
-
-	/*
-	 * The following transactions are logged in logical format with
-	 * a default log count.
-	 */
-	resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
-	resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-	resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
-	resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-	resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
-	resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-	resp->tr_qm_equotaoff.tr_logres =
-		xfs_calc_qm_quotaoff_end_reservation(mp);
-	resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-	resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
-	resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
-	/* The following transaction are logged in logical format */
-	resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
-	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
-	resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
-	resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
-	resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
-	resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
-	resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
-	resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
-}
-- 
cgit v1.2.3-59-g8ed1b