From d0a58e833931234c44e515b5b8bede32bd4e6eed Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 6 Apr 2016 07:05:41 +1000
Subject: xfs: disallow rw remount on fs with unknown ro-compat features

Today, a kernel which refuses to mount a filesystem read-write
due to unknown ro-compat features can still transition to read-write
via the remount path.  The old kernel is most likely none the wiser,
because it's unaware of the new feature, and isn't using it.  However,
writing to the filesystem may well corrupt metadata related to that
new feature, and moving to a newer kernel which understand the feature
will have problems.

Right now the only ro-compat feature we have is the free inode btree,
which showed up in v3.16.  It would be good to push this back to
all the active stable kernels, I think, so that if anyone is using
newer mkfs (which enables the finobt feature) with older kernel
releases, they'll be protected.

Cc: <stable@vger.kernel.org> # 3.10.x-
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_super.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d760934109b5..ca058a153c15 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1276,6 +1276,16 @@ xfs_fs_remount(
 			return -EINVAL;
 		}
 
+		if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+		    xfs_sb_has_ro_compat_feature(sbp,
+					XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+			xfs_warn(mp,
+"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+				(sbp->sb_features_ro_compat &
+					XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+			return -EINVAL;
+		}
+
 		mp->m_flags &= ~XFS_MOUNT_RDONLY;
 
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From ad747e3b299671e1a53db74963cc6c5f6cdb9f6d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 6 Apr 2016 07:06:20 +1000
Subject: xfs: Don't wrap growfs AGFL indexes

Commit 96f859d ("libxfs: pack the agfl header structure so
XFS_AGFL_SIZE is correct") allowed the freelist to use the empty
slot at the end of the freelist on 64 bit systems that was not
being used due to sizeof() rounding up the structure size.

This has caused versions of xfs_repair prior to 4.5.0 (which also
has the fix) to report this as a corruption once the filesystem has
been grown. Older kernels can also have problems (seen from a whacky
container/vm management environment) mounting filesystems grown on a
system with a newer kernel than the vm/container it is deployed on.

To avoid this problem, change the initial free list indexes not to
wrap across the end of the AGFL, hence avoiding the initialisation
of agf_fllast to the last index in the AGFL.

cc: <stable@vger.kernel.org> # 4.4-4.5
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_fsops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ee3aaa0a5317..ca0d3eb44925 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -243,8 +243,8 @@ xfs_growfs_data_private(
 		agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
 		agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
 		agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
-		agf->agf_flfirst = 0;
-		agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+		agf->agf_flfirst = cpu_to_be32(1);
+		agf->agf_fllast = 0;
 		agf->agf_flcount = 0;
 		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
 		agf->agf_freeblks = cpu_to_be32(tmpsize);
-- 
cgit v1.2.3-59-g8ed1b


From ce5c767db079649db88a9f189798839f9c544981 Mon Sep 17 00:00:00 2001
From: Eryu Guan <guaneryu@gmail.com>
Date: Wed, 6 Apr 2016 07:19:40 +1000
Subject: xfs: add missing break in xfs_parseargs()

Commit 2e74af0e1189 ("xfs: convert mount option parsing to tokens")
missed a 'break;' in xfs_parseargs() which causes mount to fail with
"-o pqnoenforce" option when mounting a v4 filesystem. xfs/050
catches this failure:

XFS (vda6): Super block does not support project and group quota together

Fixes: 2e74af0e1189 ("xfs: convert mount option parsing to tokens")
Signed-off-by: Eryu Guan <guaneryu@gmail.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ca058a153c15..a9ea12d46b74 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -350,6 +350,7 @@ xfs_parseargs(
 		case Opt_pqnoenforce:
 			mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
 			mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+			break;
 		case Opt_gquota:
 		case Opt_grpquota:
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
-- 
cgit v1.2.3-59-g8ed1b


From 143f4aede7fb25b9198b15660d6f9830936394a8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 07:41:43 +1000
Subject: xfs: factor out a helper to initialize a local format inode fork

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_dir2_sf.c    |  9 +++-----
 fs/xfs/libxfs/xfs_inode_fork.c | 48 ++++++++++++++++++++++++++----------------
 fs/xfs/libxfs/xfs_inode_fork.h |  1 +
 fs/xfs/xfs_symlink.c           | 12 ++---------
 4 files changed, 36 insertions(+), 34 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 974d62e677f4..e5bb9cc3b243 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -257,15 +257,12 @@ xfs_dir2_block_to_sf(
 	 *
 	 * Convert the inode to local format and copy the data in.
 	 */
-	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-	dp->i_df.if_flags |= XFS_IFINLINE;
-	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
 	ASSERT(dp->i_df.if_bytes == 0);
-	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
+	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+	dp->i_d.di_size = size;
 
 	logflags |= XFS_ILOG_DDATA;
-	memcpy(dp->i_df.if_u1.if_data, dst, size);
-	dp->i_d.di_size = size;
 	xfs_dir2_sf_check(args);
 out:
 	xfs_trans_log_inode(args->trans, dp, logflags);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 11faf7df14c8..86a97f8a9de3 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -231,6 +231,34 @@ xfs_iformat_fork(
 	return error;
 }
 
+void
+xfs_init_local_fork(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	const void		*data,
+	int			size)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			real_size = 0;
+
+	if (size == 0)
+		ifp->if_u1.if_data = NULL;
+	else if (size <= sizeof(ifp->if_u2.if_inline_data))
+		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+	else {
+		real_size = roundup(size, 4);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+	}
+
+	if (size)
+		memcpy(ifp->if_u1.if_data, data, size);
+
+	ifp->if_bytes = size;
+	ifp->if_real_bytes = real_size;
+	ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+	ifp->if_flags |= XFS_IFINLINE;
+}
+
 /*
  * The file is in-lined in the on-disk inode.
  * If it fits into if_inline_data, then copy
@@ -248,8 +276,6 @@ xfs_iformat_local(
 	int		whichfork,
 	int		size)
 {
-	xfs_ifork_t	*ifp;
-	int		real_size;
 
 	/*
 	 * If the size is unreasonable, then something
@@ -265,22 +291,8 @@ xfs_iformat_local(
 				     ip->i_mount, dip);
 		return -EFSCORRUPTED;
 	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	real_size = 0;
-	if (size == 0)
-		ifp->if_u1.if_data = NULL;
-	else if (size <= sizeof(ifp->if_u2.if_inline_data))
-		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-	else {
-		real_size = roundup(size, 4);
-		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-	}
-	ifp->if_bytes = size;
-	ifp->if_real_bytes = real_size;
-	if (size)
-		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-	ifp->if_flags &= ~XFS_IFEXTENTS;
-	ifp->if_flags |= XFS_IFINLINE;
+
+	xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
 	return 0;
 }
 
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe..f95e072ae646 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -134,6 +134,7 @@ void		xfs_iroot_realloc(struct xfs_inode *, int, int);
 int		xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int		xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
 				  int);
+void		xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
 
 struct xfs_bmbt_rec_host *
 		xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index b44284c1adda..b69f4a770fc9 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -302,19 +302,11 @@ xfs_symlink(
 	 * If the symlink will fit into the inode, write it inline.
 	 */
 	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
-		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
-		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
-		ip->i_d.di_size = pathlen;
-
-		/*
-		 * The inode was initially created in extent format.
-		 */
-		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
-		ip->i_df.if_flags |= XFS_IFINLINE;
+		xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
 
+		ip->i_d.di_size = pathlen;
 		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
 	} else {
 		int	offset;
 
-- 
cgit v1.2.3-59-g8ed1b


From 2b3d1d41b4d96c3b074096ae57b27cd191969643 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 07:48:27 +1000
Subject: xfs: set up inode operation vectors later

In the next patch we'll set up different inode operations for inline vs
out of line symlinks, for that we need to make sure the flags are already
set up properly.

[dchinner: added xfs_setup_iops() call to xfs_rename_alloc_whiteout()]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c |  1 +
 fs/xfs/xfs_inode.h |  5 ++++-
 fs/xfs/xfs_iops.c  | 59 ++++++++++++++++++++++++++++++++++--------------------
 3 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96f606deee31..4fcd6a722213 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2855,6 +2855,7 @@ xfs_rename_alloc_whiteout(
 	 * and flag it as linkable.
 	 */
 	drop_nlink(VFS_I(tmpfile));
+	xfs_setup_iops(tmpfile);
 	xfs_finish_inode_setup(tmpfile);
 	VFS_I(tmpfile)->i_state |= I_LINKABLE;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 43e1d51b15eb..e52d7c7aeb5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -440,6 +440,9 @@ loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
 
 
 /* from xfs_iops.c */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+
 /*
  * When setting up a newly allocated inode, we need to call
  * xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -447,7 +450,6 @@ loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
  * before we've completed instantiation. Otherwise we can do it
  * the moment the inode lookup is complete.
  */
-extern void xfs_setup_inode(struct xfs_inode *ip);
 static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 {
 	xfs_iflags_clear(ip, XFS_INEW);
@@ -458,6 +460,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
 {
 	xfs_setup_inode(ip);
+	xfs_setup_iops(ip);
 	xfs_finish_inode_setup(ip);
 }
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fb7dc61f4a29..f08d91c51b7f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -181,6 +181,8 @@ xfs_generic_create(
 	}
 #endif
 
+	xfs_setup_iops(ip);
+
 	if (tmpfile)
 		d_tmpfile(dentry, inode);
 	else
@@ -368,6 +370,8 @@ xfs_vn_symlink(
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
+	xfs_setup_iops(cip);
+
 	d_instantiate(dentry, inode);
 	xfs_finish_inode_setup(cip);
 	return 0;
@@ -1193,7 +1197,7 @@ xfs_diflags_to_iflags(
 }
 
 /*
- * Initialize the Linux inode and set up the operation vectors.
+ * Initialize the Linux inode.
  *
  * When reading existing inodes from disk this is called directly from xfs_iget,
  * when creating a new inode it is called from xfs_ialloc after setting up the
@@ -1232,8 +1236,38 @@ xfs_setup_inode(
 	i_size_write(inode, ip->i_d.di_size);
 	xfs_diflags_to_iflags(inode, ip);
 
-	ip->d_ops = ip->i_mount->m_nondir_inode_ops;
-	lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+	if (S_ISDIR(inode->i_mode)) {
+		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+		ip->d_ops = ip->i_mount->m_dir_inode_ops;
+	} else {
+		ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+	}
+
+	/*
+	 * Ensure all page cache allocations are done from GFP_NOFS context to
+	 * prevent direct reclaim recursion back into the filesystem and blowing
+	 * stacks or deadlocking.
+	 */
+	gfp_mask = mapping_gfp_mask(inode->i_mapping);
+	mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
+
+	/*
+	 * If there is no attribute fork no ACL can exist on this inode,
+	 * and it can't have any file capabilities attached to it either.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		inode_has_no_xattr(inode);
+		cache_no_acl(inode);
+	}
+}
+
+void
+xfs_setup_iops(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = &ip->i_vnode;
+
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_op = &xfs_inode_operations;
@@ -1241,13 +1275,11 @@ xfs_setup_inode(
 		inode->i_mapping->a_ops = &xfs_address_space_operations;
 		break;
 	case S_IFDIR:
-		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
 		if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
 			inode->i_op = &xfs_dir_ci_inode_operations;
 		else
 			inode->i_op = &xfs_dir_inode_operations;
 		inode->i_fop = &xfs_dir_file_operations;
-		ip->d_ops = ip->i_mount->m_dir_inode_ops;
 		break;
 	case S_IFLNK:
 		inode->i_op = &xfs_symlink_inode_operations;
@@ -1259,21 +1291,4 @@ xfs_setup_inode(
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	}
-
-	/*
-	 * Ensure all page cache allocations are done from GFP_NOFS context to
-	 * prevent direct reclaim recursion back into the filesystem and blowing
-	 * stacks or deadlocking.
-	 */
-	gfp_mask = mapping_gfp_mask(inode->i_mapping);
-	mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
-
-	/*
-	 * If there is no attribute fork no ACL can exist on this inode,
-	 * and it can't have any file capabilities attached to it either.
-	 */
-	if (!XFS_IFORK_Q(ip)) {
-		inode_has_no_xattr(inode);
-		cache_no_acl(inode);
-	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From bfe8804d908a791b16e3686c101f0d7eca9fb5b9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 07:50:54 +1000
Subject: xfs: use ->readlink to implement the readlink_by_handle ioctl

Also drop the now unused readlink_copy export.

[dchinner: use d_inode(dentry) rather than dentry->d_inode]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/namei.c         |  1 -
 fs/xfs/xfs_ioctl.c | 18 ++----------------
 2 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/namei.c b/fs/namei.c
index 794f81dce766..cdd041985929 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4515,7 +4515,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link)
 out:
 	return len;
 }
-EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcb6c19ce3ea..5414bcaf2e73 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -277,7 +277,6 @@ xfs_readlink_by_handle(
 {
 	struct dentry		*dentry;
 	__u32			olen;
-	void			*link;
 	int			error;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -288,7 +287,7 @@ xfs_readlink_by_handle(
 		return PTR_ERR(dentry);
 
 	/* Restrict this handle operation to symlinks only. */
-	if (!d_is_symlink(dentry)) {
+	if (!d_inode(dentry)->i_op->readlink) {
 		error = -EINVAL;
 		goto out_dput;
 	}
@@ -298,21 +297,8 @@ xfs_readlink_by_handle(
 		goto out_dput;
 	}
 
-	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-	if (!link) {
-		error = -ENOMEM;
-		goto out_dput;
-	}
-
-	error = xfs_readlink(XFS_I(d_inode(dentry)), link);
-	if (error)
-		goto out_kfree;
-	error = readlink_copy(hreq->ohandle, olen, link);
-	if (error)
-		goto out_kfree;
+	error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen);
 
- out_kfree:
-	kfree(link);
  out_dput:
 	dput(dentry);
 	return error;
-- 
cgit v1.2.3-59-g8ed1b


From 30ee052e12b97c190b27fe6f20e3ac3047df7b5c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 07:53:29 +1000
Subject: xfs: optimize inline symlinks

By overallocating the in-core inode fork data buffer and zero
terminating the link target in xfs_init_local_fork we can avoid
the memory allocation in ->follow_link.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_inode_fork.c | 22 ++++++++++++++++++----
 fs/xfs/xfs_inode_item.c        |  4 ++--
 fs/xfs/xfs_iops.c              | 29 ++++++++++++++++++++++++++---
 fs/xfs/xfs_symlink.c           |  9 +++------
 4 files changed, 49 insertions(+), 15 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 86a97f8a9de3..4fbe2263c1fc 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -239,19 +239,33 @@ xfs_init_local_fork(
 	int			size)
 {
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
-	int			real_size = 0;
+	int			mem_size = size, real_size = 0;
+	bool			zero_terminate;
+
+	/*
+	 * If we are using the local fork to store a symlink body we need to
+	 * zero-terminate it so that we can pass it back to the VFS directly.
+	 * Overallocate the in-memory fork by one for that and add a zero
+	 * to terminate it below.
+	 */
+	zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+	if (zero_terminate)
+		mem_size++;
 
 	if (size == 0)
 		ifp->if_u1.if_data = NULL;
-	else if (size <= sizeof(ifp->if_u2.if_inline_data))
+	else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
 		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 	else {
-		real_size = roundup(size, 4);
+		real_size = roundup(mem_size, 4);
 		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
 	}
 
-	if (size)
+	if (size) {
 		memcpy(ifp->if_u1.if_data, data, size);
+		if (zero_terminate)
+			ifp->if_u1.if_data[size] = '\0';
+	}
 
 	ifp->if_bytes = size;
 	ifp->if_real_bytes = real_size;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c48b5b18d771..37e23c7a5684 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork(
 			 */
 			data_bytes = roundup(ip->i_df.if_bytes, 4);
 			ASSERT(ip->i_df.if_real_bytes == 0 ||
-			       ip->i_df.if_real_bytes == data_bytes);
+			       ip->i_df.if_real_bytes >= data_bytes);
 			ASSERT(ip->i_df.if_u1.if_data != NULL);
 			ASSERT(ip->i_d.di_size > 0);
 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
@@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork(
 			 */
 			data_bytes = roundup(ip->i_afp->if_bytes, 4);
 			ASSERT(ip->i_afp->if_real_bytes == 0 ||
-			       ip->i_afp->if_real_bytes == data_bytes);
+			       ip->i_afp->if_real_bytes >= data_bytes);
 			ASSERT(ip->i_afp->if_u1.if_data != NULL);
 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
 					ip->i_afp->if_u1.if_data,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f08d91c51b7f..aee06d9a7b6c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -446,6 +446,16 @@ xfs_vn_get_link(
 	return ERR_PTR(error);
 }
 
+STATIC const char *
+xfs_vn_get_link_inline(
+	struct dentry		*dentry,
+	struct inode		*inode,
+	struct delayed_call	*done)
+{
+	ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+	return XFS_I(inode)->i_df.if_u1.if_data;
+}
+
 STATIC int
 xfs_vn_getattr(
 	struct vfsmount		*mnt,
@@ -1171,6 +1181,18 @@ static const struct inode_operations xfs_symlink_inode_operations = {
 	.update_time		= xfs_vn_update_time,
 };
 
+static const struct inode_operations xfs_inline_symlink_inode_operations = {
+	.readlink		= generic_readlink,
+	.get_link		= xfs_vn_get_link_inline,
+	.getattr		= xfs_vn_getattr,
+	.setattr		= xfs_vn_setattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
+	.listxattr		= xfs_vn_listxattr,
+	.update_time		= xfs_vn_update_time,
+};
+
 STATIC void
 xfs_diflags_to_iflags(
 	struct inode		*inode,
@@ -1282,9 +1304,10 @@ xfs_setup_iops(
 		inode->i_fop = &xfs_dir_file_operations;
 		break;
 	case S_IFLNK:
-		inode->i_op = &xfs_symlink_inode_operations;
-		if (!(ip->i_df.if_flags & XFS_IFINLINE))
-			inode->i_mapping->a_ops = &xfs_address_space_operations;
+		if (ip->i_df.if_flags & XFS_IFINLINE)
+			inode->i_op = &xfs_inline_symlink_inode_operations;
+		else
+			inode->i_op = &xfs_symlink_inode_operations;
 		break;
 	default:
 		inode->i_op = &xfs_inode_operations;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index b69f4a770fc9..5961c1e880c2 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -131,6 +131,8 @@ xfs_readlink(
 
 	trace_xfs_readlink(ip);
 
+	ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE));
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
@@ -150,12 +152,7 @@ xfs_readlink(
 	}
 
 
-	if (ip->i_df.if_flags & XFS_IFINLINE) {
-		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
-		link[pathlen] = '\0';
-	} else {
-		error = xfs_readlink_bmap(ip, link);
-	}
+	error = xfs_readlink_bmap(ip, link);
 
  out:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-- 
cgit v1.2.3-59-g8ed1b


From 2a6fba6d2311151598abaa1e7c9abd5f8d024a43 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Wed, 6 Apr 2016 07:57:18 +1000
Subject: xfs: only return -errno or success from attr ->put_listent

Today, the put_listent formatters return either 1 or 0; if
they return 1, some callers treat this as an error and return
it up the stack, despite "1" not being a valid (negative)
error code.

The intent seems to be that if the input buffer is full,
we set seen_enough or set count = -1, and return 1;
but some callers check the return before checking the
seen_enough or count fields of the context.

Fix this by only returning non-zero for actual errors
encountered, and rely on the caller to first check the
return value, then check the values in the context to
decide what to do.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_attr.h      |  1 +
 fs/xfs/xfs_attr_list.c |  8 +++-----
 fs/xfs/xfs_xattr.c     | 14 ++++++++++----
 3 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index dd4824589470..234331227c0c 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -112,6 +112,7 @@ typedef struct attrlist_cursor_kern {
  *========================================================================*/
 
 
+/* Return 0 on success, or -errno; other state communicated via *context */
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
 			      unsigned char *, int, int, unsigned char *);
 
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 4fa14820e2e2..c8be331a3196 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -108,16 +108,14 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 					   (int)sfe->namelen,
 					   (int)sfe->valuelen,
 					   &sfe->nameval[sfe->namelen]);
-
+			if (error)
+				return error;
 			/*
 			 * Either search callback finished early or
 			 * didn't fit it all in the buffer after all.
 			 */
 			if (context->seen_enough)
 				break;
-
-			if (error)
-				return error;
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 		}
 		trace_xfs_attr_list_sf_all(context);
@@ -581,7 +579,7 @@ xfs_attr_put_listent(
 		trace_xfs_attr_list_full(context);
 		alist->al_more = 1;
 		context->seen_enough = 1;
-		return 1;
+		return 0;
 	}
 
 	aep = (attrlist_ent_t *)&context->alist[context->firstu];
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 110f1d7d86b0..f2201299311f 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -146,7 +146,7 @@ __xfs_xattr_put_listent(
 	arraytop = context->count + prefix_len + namelen + 1;
 	if (arraytop > context->firstu) {
 		context->count = -1;	/* insufficient space */
-		return 1;
+		return 0;
 	}
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, prefix, prefix_len);
@@ -221,11 +221,15 @@ xfs_xattr_put_listent(
 }
 
 ssize_t
-xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+xfs_vn_listxattr(
+	struct dentry	*dentry,
+	char		*data,
+	size_t		size)
 {
 	struct xfs_attr_list_context context;
 	struct attrlist_cursor_kern cursor = { 0 };
-	struct inode		*inode = d_inode(dentry);
+	struct inode	*inode = d_inode(dentry);
+	int		error;
 
 	/*
 	 * First read the regular on-disk attributes.
@@ -239,7 +243,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
 	context.firstu = context.bufsize;
 	context.put_listent = xfs_xattr_put_listent;
 
-	xfs_attr_list_int(&context);
+	error = xfs_attr_list_int(&context);
+	if (error)
+		return error;
 	if (context.count < 0)
 		return -ERANGE;
 
-- 
cgit v1.2.3-59-g8ed1b


From e5bd12bfea60af455f4cbad494e4ac1082e3abd6 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Wed, 6 Apr 2016 07:57:32 +1000
Subject: xfs: don't pass value into attr ->put_listent

The value is not used; only names and value lengths are
returned.  Remove the argument.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_attr.h      |  2 +-
 fs/xfs/xfs_attr_list.c | 18 ++++++------------
 fs/xfs/xfs_xattr.c     |  3 +--
 3 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 234331227c0c..dab4f41de278 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -114,7 +114,7 @@ typedef struct attrlist_cursor_kern {
 
 /* Return 0 on success, or -errno; other state communicated via *context */
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-			      unsigned char *, int, int, unsigned char *);
+			      unsigned char *, int, int);
 
 typedef struct xfs_attr_list_context {
 	struct xfs_inode		*dp;		/* inode */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index c8be331a3196..d30dbfae05fe 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -106,8 +106,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 					   sfe->flags,
 					   sfe->nameval,
 					   (int)sfe->namelen,
-					   (int)sfe->valuelen,
-					   &sfe->nameval[sfe->namelen]);
+					   (int)sfe->valuelen);
 			if (error)
 				return error;
 			/*
@@ -198,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 					sbp->flags,
 					sbp->name,
 					sbp->namelen,
-					sbp->valuelen,
-					&sbp->name[sbp->namelen]);
+					sbp->valuelen);
 		if (error) {
 			kmem_free(sbuf);
 			return error;
@@ -430,8 +428,7 @@ xfs_attr3_leaf_list_int(
 						entry->flags,
 						name_loc->nameval,
 						(int)name_loc->namelen,
-						be16_to_cpu(name_loc->valuelen),
-						&name_loc->nameval[name_loc->namelen]);
+						be16_to_cpu(name_loc->valuelen));
 			if (retval)
 				return retval;
 		} else {
@@ -459,16 +456,14 @@ xfs_attr3_leaf_list_int(
 							entry->flags,
 							name_rmt->name,
 							(int)name_rmt->namelen,
-							valuelen,
-							args.value);
+							valuelen);
 				kmem_free(args.value);
 			} else {
 				retval = context->put_listent(context,
 						entry->flags,
 						name_rmt->name,
 						(int)name_rmt->namelen,
-						valuelen,
-						NULL);
+						valuelen);
 			}
 			if (retval)
 				return retval;
@@ -549,8 +544,7 @@ xfs_attr_put_listent(
 	int		flags,
 	unsigned char	*name,
 	int		namelen,
-	int		valuelen,
-	unsigned char	*value)
+	int		valuelen)
 {
 	struct attrlist *alist = (struct attrlist *)context->alist;
 	attrlist_ent_t *aep;
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index f2201299311f..7fdcf3327652 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -166,8 +166,7 @@ xfs_xattr_put_listent(
 	int		flags,
 	unsigned char	*name,
 	int		namelen,
-	int		valuelen,
-	unsigned char	*value)
+	int		valuelen)
 {
 	char *prefix;
 	int prefix_len;
-- 
cgit v1.2.3-59-g8ed1b


From 7af5ad28a603f2d1ef4c579b8ab0a9d4767a348e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Wed, 6 Apr 2016 07:57:45 +1000
Subject: xfs: remove put_value from attr ->put_listent context

The put_value context member is never set; remove it
and the conditional test in xfs_attr3_leaf_list_int().

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_attr.h      |  1 -
 fs/xfs/xfs_attr_list.c | 31 +++----------------------------
 2 files changed, 3 insertions(+), 29 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index dab4f41de278..e3da5d448bcf 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -127,7 +127,6 @@ typedef struct xfs_attr_list_context {
 	int				firstu;		/* first used byte in buffer */
 	int				flags;		/* from VOP call */
 	int				resynch;	/* T/F: resynch with cursor */
-	int				put_value;	/* T/F: need value for listent */
 	put_listent_func_t		put_listent;	/* list output fmt function */
 	int				index;		/* index into output buffer */
 } xfs_attr_list_context_t;
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index d30dbfae05fe..cbf4f5d072f6 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -429,45 +429,20 @@ xfs_attr3_leaf_list_int(
 						name_loc->nameval,
 						(int)name_loc->namelen,
 						be16_to_cpu(name_loc->valuelen));
-			if (retval)
-				return retval;
 		} else {
 			xfs_attr_leaf_name_remote_t *name_rmt =
 				xfs_attr3_leaf_name_remote(leaf, i);
 
 			int valuelen = be32_to_cpu(name_rmt->valuelen);
 
-			if (context->put_value) {
-				xfs_da_args_t args;
-
-				memset((char *)&args, 0, sizeof(args));
-				args.geo = context->dp->i_mount->m_attr_geo;
-				args.dp = context->dp;
-				args.whichfork = XFS_ATTR_FORK;
-				args.valuelen = valuelen;
-				args.rmtvaluelen = valuelen;
-				args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
-				args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-				args.rmtblkcnt = xfs_attr3_rmt_blocks(
-							args.dp->i_mount, valuelen);
-				retval = xfs_attr_rmtval_get(&args);
-				if (!retval)
-					retval = context->put_listent(context,
-							entry->flags,
-							name_rmt->name,
-							(int)name_rmt->namelen,
-							valuelen);
-				kmem_free(args.value);
-			} else {
-				retval = context->put_listent(context,
+			retval = context->put_listent(context,
 						entry->flags,
 						name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen);
-			}
-			if (retval)
-				return retval;
 		}
+		if (retval)
+			break;
 		if (context->seen_enough)
 			break;
 		cursor->offset++;
-- 
cgit v1.2.3-59-g8ed1b


From 3ab3ffcaca99e0b77480d77bd393fc227b09069f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Wed, 6 Apr 2016 07:57:47 +1000
Subject: xfs: collapse cases in xfs_attr3_leaf_list_int

Consolidate the 2 calls to ->put_listent in
xfs_attr3_leaf_list_int(), by setting up name, namelen, and
valuelen for the local vs remote cases, then call ->put_listent
and do the error handling all in one spot.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/xfs/xfs_attr_list.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index cbf4f5d072f6..d25f26b22ac9 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -412,6 +412,9 @@ xfs_attr3_leaf_list_int(
 	 */
 	retval = 0;
 	for (; i < ichdr.count; entry++, i++) {
+		char *name;
+		int namelen, valuelen;
+
 		if (be32_to_cpu(entry->hashval) != cursor->hashval) {
 			cursor->hashval = be32_to_cpu(entry->hashval);
 			cursor->offset = 0;
@@ -421,26 +424,23 @@ xfs_attr3_leaf_list_int(
 			continue;		/* skip incomplete entries */
 
 		if (entry->flags & XFS_ATTR_LOCAL) {
-			xfs_attr_leaf_name_local_t *name_loc =
-				xfs_attr3_leaf_name_local(leaf, i);
-
-			retval = context->put_listent(context,
-						entry->flags,
-						name_loc->nameval,
-						(int)name_loc->namelen,
-						be16_to_cpu(name_loc->valuelen));
-		} else {
-			xfs_attr_leaf_name_remote_t *name_rmt =
-				xfs_attr3_leaf_name_remote(leaf, i);
+			xfs_attr_leaf_name_local_t *name_loc;
 
-			int valuelen = be32_to_cpu(name_rmt->valuelen);
+			name_loc = xfs_attr3_leaf_name_local(leaf, i);
+			name = name_loc->nameval;
+			namelen = name_loc->namelen;
+			valuelen = be16_to_cpu(name_loc->valuelen);
+		} else {
+			xfs_attr_leaf_name_remote_t *name_rmt;
 
-			retval = context->put_listent(context,
-						entry->flags,
-						name_rmt->name,
-						(int)name_rmt->namelen,
-						valuelen);
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+			name = name_rmt->name;
+			namelen = name_rmt->namelen;
+			valuelen = be32_to_cpu(name_rmt->valuelen);
 		}
+
+		retval = context->put_listent(context, entry->flags,
+					      name, namelen, valuelen);
 		if (retval)
 			break;
 		if (context->seen_enough)
-- 
cgit v1.2.3-59-g8ed1b


From bb18782aa47d8cde90fed5cb0af312642e98a4fa Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 6 Apr 2016 08:11:25 +1000
Subject: xfs: build bios directly in xfs_add_to_ioend

Currently adding a buffer to the ioend and then building a bio from
the buffer list are two separate operations. We don't build the bios
and submit them until the ioend is submitted, and this places a
fixed dependency on bufferhead chaining in the ioend.

The first step to removing the bufferhead chaining in the ioend is
on the IO submission side. We can build the bio directly as we add
the buffers to the ioend chain, thereby removing the need for a
latter "buffer-to-bio" submission loop. This allows us to submit
bios on large ioends as soon as we cannot add more data to the bio.

These bios then get captured by the active plug, and hence will be
dispatched as soon as either the plug overflows or we schedule away
from the writeback context. This will reduce submission latency for
large IOs, but will also allow more timely request queue based
writeback blocking when the device becomes congested.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
[hch: various small updates]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 69 +++++++++++++++++++++++++------------------------------
 fs/xfs/xfs_aops.h |  1 +
 2 files changed, 32 insertions(+), 38 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d445a64b979e..d03946719992 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -274,6 +274,7 @@ xfs_alloc_ioend(
 	xfs_ioend_t		*ioend;
 
 	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
+	memset(ioend, 0, sizeof(*ioend));
 
 	/*
 	 * Set the count to 1 initially, which will prevent an I/O
@@ -281,16 +282,9 @@ xfs_alloc_ioend(
 	 * all the I/O from calling the completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
-	ioend->io_error = 0;
 	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = type;
 	ioend->io_inode = inode;
-	ioend->io_buffer_head = NULL;
-	ioend->io_buffer_tail = NULL;
-	ioend->io_offset = 0;
-	ioend->io_size = 0;
-	ioend->io_append_trans = NULL;
-
 	INIT_WORK(&ioend->io_work, xfs_end_io);
 	return ioend;
 }
@@ -452,13 +446,18 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 
 /*
- * Submit all of the bios for an ioend. We are only passed a single ioend at a
- * time; the caller is responsible for chaining prior to submission.
+ * Submit the bio for an ioend. We are passed an ioend with a bio attached to
+ * it, and we submit that bio. The ioend may be used for multiple bio
+ * submissions, so we only want to allocate an append transaction for the ioend
+ * once. In the case of multiple bio submission, each bio will take an IO
+ * reference to the ioend to ensure that the ioend completion is only done once
+ * all bios have been submitted and the ioend is really done.
  *
  * If @fail is non-zero, it means that we have a situation where some part of
  * the submission process has failed after we have marked paged for writeback
- * and unlocked them. In this situation, we need to fail the ioend chain rather
- * than submit it to IO. This typically only happens on a filesystem shutdown.
+ * and unlocked them. In this situation, we need to fail the bio and ioend
+ * rather than submit it to IO. This typically only happens on a filesystem
+ * shutdown.
  */
 STATIC int
 xfs_submit_ioend(
@@ -466,14 +465,13 @@ xfs_submit_ioend(
 	xfs_ioend_t		*ioend,
 	int			status)
 {
-	struct buffer_head	*bh;
-	struct bio		*bio;
-	sector_t		lastblock = 0;
-
 	/* Reserve log space if we might write beyond the on-disk inode size. */
 	if (!status &&
-	     ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+	    ioend->io_bio && ioend->io_type != XFS_IO_UNWRITTEN &&
+	    xfs_ioend_is_append(ioend) &&
+	    !ioend->io_append_trans)
 		status = xfs_setfilesize_trans_alloc(ioend);
+
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
 	 * error and finish it. This will run IO completion immediately
@@ -481,31 +479,15 @@ xfs_submit_ioend(
 	 * time.
 	 */
 	if (status) {
+		if (ioend->io_bio)
+			bio_put(ioend->io_bio);
 		ioend->io_error = status;
 		xfs_finish_ioend(ioend);
 		return status;
 	}
 
-	bio = NULL;
-	for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
-		if (!bio) {
-retry:
-			bio = xfs_alloc_ioend_bio(bh);
-		} else if (bh->b_blocknr != lastblock + 1) {
-			xfs_submit_ioend_bio(wbc, ioend, bio);
-			goto retry;
-		}
-
-		if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
-			xfs_submit_ioend_bio(wbc, ioend, bio);
-			goto retry;
-		}
-
-		lastblock = bh->b_blocknr;
-	}
-	if (bio)
-		xfs_submit_ioend_bio(wbc, ioend, bio);
+	xfs_submit_ioend_bio(wbc, ioend, ioend->io_bio);
+	ioend->io_bio = NULL;
 	xfs_finish_ioend(ioend);
 	return 0;
 }
@@ -523,6 +505,7 @@ xfs_add_to_ioend(
 	struct buffer_head	*bh,
 	xfs_off_t		offset,
 	struct xfs_writepage_ctx *wpc,
+	struct writeback_control *wbc,
 	struct list_head	*iolist)
 {
 	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
@@ -542,8 +525,18 @@ xfs_add_to_ioend(
 		wpc->ioend->io_buffer_tail->b_private = bh;
 		wpc->ioend->io_buffer_tail = bh;
 	}
-
 	bh->b_private = NULL;
+
+retry:
+	if (!wpc->ioend->io_bio)
+		wpc->ioend->io_bio = xfs_alloc_ioend_bio(bh);
+
+	if (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) {
+		xfs_submit_ioend_bio(wbc, wpc->ioend, wpc->ioend->io_bio);
+		wpc->ioend->io_bio = NULL;
+		goto retry;
+	}
+
 	wpc->ioend->io_size += bh->b_size;
 	wpc->last_block = bh->b_blocknr;
 	xfs_start_buffer_writeback(bh);
@@ -803,7 +796,7 @@ xfs_writepage_map(
 			lock_buffer(bh);
 			if (wpc->io_type != XFS_IO_OVERWRITE)
 				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+			xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
 			count++;
 		}
 
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b4421177b68d..8947991e0990 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -52,6 +52,7 @@ typedef struct xfs_ioend {
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
 	struct xfs_trans	*io_append_trans;/* xact. for size update */
+	struct bio		*io_bio;	/* bio being built */
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-- 
cgit v1.2.3-59-g8ed1b


From 37992c18bba3f578860c6448b7bae18a14e535d3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 6 Apr 2016 08:12:28 +1000
Subject: xfs: don't release bios on completion immediately

Completion of an ioend requires us to walk the bufferhead list to
end writback on all the bufferheads. This, in turn, is needed so
that we can end writeback on all the pages we just did IO on.

To remove our dependency on bufferheads in writeback, we need to
turn this around the other way - we need to walk the pages we've
just completed IO on, and then walk the buffers attached to the
pages and complete their IO. In doing this, we remove the
requirement for the ioend to track bufferheads directly.

To enable IO completion to walk all the pages we've submitted IO on,
we need to keep the bios that we used for IO around until the ioend
has been completed. We can do this simply by chaining the bios to
the ioend at completion time, and then walking their pages directly
just before destroying the ioend.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
[hch: changed the xfs_finish_page_writeback calling convention]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 95 +++++++++++++++++++++++++++++++++++++++----------------
 fs/xfs/xfs_aops.h |  5 +--
 2 files changed, 71 insertions(+), 29 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d03946719992..9d9a01b50078 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -84,20 +84,64 @@ xfs_find_bdev_for_inode(
 }
 
 /*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory.  Do not use the ioend after this.
+ * We're now finished for good with this page.  Update the page state via the
+ * associated buffer_heads, paying attention to the start and end offsets that
+ * we need to process on the page.
+ */
+static void
+xfs_finish_page_writeback(
+	struct inode		*inode,
+	struct bio_vec		*bvec,
+	int			error)
+{
+	unsigned int		blockmask = (1 << inode->i_blkbits) - 1;
+	unsigned int		end = bvec->bv_offset + bvec->bv_len - 1;
+	struct buffer_head	*head, *bh;
+	unsigned int		off = 0;
+
+	ASSERT(bvec->bv_offset < PAGE_SIZE);
+	ASSERT((bvec->bv_offset & blockmask) == 0);
+	ASSERT(end < PAGE_SIZE);
+	ASSERT((bvec->bv_len & blockmask) == 0);
+
+	bh = head = page_buffers(bvec->bv_page);
+
+	do {
+		if (off < bvec->bv_offset)
+			goto next_bh;
+		if (off > end)
+			break;
+		bh->b_end_io(bh, !error);
+next_bh:
+		off += bh->b_size;
+	} while ((bh = bh->b_this_page) != head);
+}
+
+/*
+ * We're now finished for good with this ioend structure.  Update the page
+ * state, release holds on bios, and finally free up memory.  Do not use the
+ * ioend after this.
  */
 STATIC void
 xfs_destroy_ioend(
-	xfs_ioend_t		*ioend)
+	struct xfs_ioend	*ioend)
 {
-	struct buffer_head	*bh, *next;
+	struct inode		*inode = ioend->io_inode;
+	int			error = ioend->io_error;
+	struct bio		*bio, *next;
+
+	for (bio = ioend->io_bio_done; bio; bio = next) {
+		struct bio_vec	*bvec;
+		int		i;
+
+		next = bio->bi_private;
+		bio->bi_private = NULL;
 
-	for (bh = ioend->io_buffer_head; bh; bh = next) {
-		next = bh->b_private;
-		bh->b_end_io(bh, !ioend->io_error);
+		/* walk each page on bio, ending page IO on them */
+		bio_for_each_segment_all(bvec, bio, i)
+			xfs_finish_page_writeback(inode, bvec, error);
+
+		bio_put(bio);
 	}
 
 	mempool_free(ioend, xfs_ioend_pool);
@@ -286,6 +330,7 @@ xfs_alloc_ioend(
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	INIT_WORK(&ioend->io_work, xfs_end_io);
+	spin_lock_init(&ioend->io_lock);
 	return ioend;
 }
 
@@ -365,15 +410,21 @@ STATIC void
 xfs_end_bio(
 	struct bio		*bio)
 {
-	xfs_ioend_t		*ioend = bio->bi_private;
-
-	if (!ioend->io_error)
-		ioend->io_error = bio->bi_error;
+	struct xfs_ioend	*ioend = bio->bi_private;
+	unsigned long		flags;
 
-	/* Toss bio and pass work off to an xfsdatad thread */
 	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
-	bio_put(bio);
+
+	spin_lock_irqsave(&ioend->io_lock, flags);
+	if (!ioend->io_error)
+	       ioend->io_error = bio->bi_error;
+	if (!ioend->io_bio_done)
+		ioend->io_bio_done = bio;
+	else
+		ioend->io_bio_done_tail->bi_private = bio;
+	ioend->io_bio_done_tail = bio;
+	spin_unlock_irqrestore(&ioend->io_lock, flags);
 
 	xfs_finish_ioend(ioend);
 }
@@ -511,21 +562,11 @@ xfs_add_to_ioend(
 	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 	    bh->b_blocknr != wpc->last_block + 1 ||
 	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
-		struct xfs_ioend	*new;
-
 		if (wpc->ioend)
 			list_add(&wpc->ioend->io_list, iolist);
-
-		new = xfs_alloc_ioend(inode, wpc->io_type);
-		new->io_offset = offset;
-		new->io_buffer_head = bh;
-		new->io_buffer_tail = bh;
-		wpc->ioend = new;
-	} else {
-		wpc->ioend->io_buffer_tail->b_private = bh;
-		wpc->ioend->io_buffer_tail = bh;
+		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type);
+		wpc->ioend->io_offset = offset;
 	}
-	bh->b_private = NULL;
 
 retry:
 	if (!wpc->ioend->io_bio)
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 8947991e0990..61a3dc3dbdf8 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -46,13 +46,14 @@ typedef struct xfs_ioend {
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
 	struct inode		*io_inode;	/* file being written to */
-	struct buffer_head	*io_buffer_head;/* buffer linked list head */
-	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
 	struct xfs_trans	*io_append_trans;/* xact. for size update */
 	struct bio		*io_bio;	/* bio being built */
+	struct bio		*io_bio_done;	/* bios completed */
+	struct bio		*io_bio_done_tail; /* bios completed */
+	spinlock_t		io_lock;	/* for bio completion list */
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-- 
cgit v1.2.3-59-g8ed1b


From 0e51a8e191dbd9b9c7b7bb0a1c28d57cd2be8e6a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 08:34:30 +1000
Subject: xfs: optimize bio handling in the buffer writeback path

This patch implements two closely related changes:  First it embeds
a bio the ioend structure so that we don't have to allocate one
separately.  Second it uses the block layer bio chaining mechanism
to chain additional bios off this first one if needed instead of
manually accounting for multiple bio completions in the ioend
structure.  Together this removes a memory allocation per ioend and
greatly simplifies the ioend setup and I/O completion path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c  | 247 ++++++++++++++++++++++++-----------------------------
 fs/xfs/xfs_aops.h  |  15 ++--
 fs/xfs/xfs_super.c |  26 ++----
 3 files changed, 123 insertions(+), 165 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9d9a01b50078..b5f1c66bbb58 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,18 +124,25 @@ next_bh:
  */
 STATIC void
 xfs_destroy_ioend(
-	struct xfs_ioend	*ioend)
+	struct xfs_ioend	*ioend,
+	int			error)
 {
 	struct inode		*inode = ioend->io_inode;
-	int			error = ioend->io_error;
+	struct bio		*last = ioend->io_bio;
 	struct bio		*bio, *next;
 
-	for (bio = ioend->io_bio_done; bio; bio = next) {
+	for (bio = &ioend->io_inline_bio; bio; bio = next) {
 		struct bio_vec	*bvec;
 		int		i;
 
-		next = bio->bi_private;
-		bio->bi_private = NULL;
+		/*
+		 * For the last bio, bi_private points to the ioend, so we
+		 * need to explicitly end the iteration here.
+		 */
+		if (bio == last)
+			next = NULL;
+		else
+			next = bio->bi_private;
 
 		/* walk each page on bio, ending page IO on them */
 		bio_for_each_segment_all(bvec, bio, i)
@@ -143,8 +150,6 @@ xfs_destroy_ioend(
 
 		bio_put(bio);
 	}
-
-	mempool_free(ioend, xfs_ioend_pool);
 }
 
 /*
@@ -218,7 +223,8 @@ xfs_setfilesize(
 
 STATIC int
 xfs_setfilesize_ioend(
-	struct xfs_ioend	*ioend)
+	struct xfs_ioend	*ioend,
+	int			error)
 {
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	struct xfs_trans	*tp = ioend->io_append_trans;
@@ -232,36 +238,14 @@ xfs_setfilesize_ioend(
 	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
 	/* we abort the update if there was an IO error */
-	if (ioend->io_error) {
+	if (error) {
 		xfs_trans_cancel(tp);
-		return ioend->io_error;
+		return error;
 	}
 
 	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
-/*
- * Schedule IO completion handling on the final put of an ioend.
- *
- * If there is no work to do we might as well call it a day and free the
- * ioend right now.
- */
-STATIC void
-xfs_finish_ioend(
-	struct xfs_ioend	*ioend)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining)) {
-		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
-
-		if (ioend->io_type == XFS_IO_UNWRITTEN)
-			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-		else if (ioend->io_append_trans)
-			queue_work(mp->m_data_workqueue, &ioend->io_work);
-		else
-			xfs_destroy_ioend(ioend);
-	}
-}
-
 /*
  * IO write completion.
  */
@@ -269,16 +253,17 @@ STATIC void
 xfs_end_io(
 	struct work_struct *work)
 {
-	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
-	struct xfs_inode *ip = XFS_I(ioend->io_inode);
-	int		error = 0;
+	struct xfs_ioend	*ioend =
+		container_of(work, struct xfs_ioend, io_work);
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	int			error = ioend->io_bio->bi_error;
 
 	/*
 	 * Set an error if the mount has shut down and proceed with end I/O
 	 * processing so it can perform whatever cleanups are necessary.
 	 */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		ioend->io_error = -EIO;
+		error = -EIO;
 
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
@@ -288,50 +273,33 @@ xfs_end_io(
 	 * on error.
 	 */
 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
-		if (ioend->io_error)
+		if (error)
 			goto done;
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						  ioend->io_size);
 	} else if (ioend->io_append_trans) {
-		error = xfs_setfilesize_ioend(ioend);
+		error = xfs_setfilesize_ioend(ioend, error);
 	} else {
 		ASSERT(!xfs_ioend_is_append(ioend));
 	}
 
 done:
-	if (error)
-		ioend->io_error = error;
-	xfs_destroy_ioend(ioend);
+	xfs_destroy_ioend(ioend, error);
 }
 
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
-	struct inode		*inode,
-	unsigned int		type)
+STATIC void
+xfs_end_bio(
+	struct bio		*bio)
 {
-	xfs_ioend_t		*ioend;
-
-	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-	memset(ioend, 0, sizeof(*ioend));
+	struct xfs_ioend	*ioend = bio->bi_private;
+	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
 
-	/*
-	 * Set the count to 1 initially, which will prevent an I/O
-	 * completion callback from happening before we have started
-	 * all the I/O from calling the completion routine too early.
-	 */
-	atomic_set(&ioend->io_remaining, 1);
-	INIT_LIST_HEAD(&ioend->io_list);
-	ioend->io_type = type;
-	ioend->io_inode = inode;
-	INIT_WORK(&ioend->io_work, xfs_end_io);
-	spin_lock_init(&ioend->io_lock);
-	return ioend;
+	if (ioend->io_type == XFS_IO_UNWRITTEN)
+		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+	else if (ioend->io_append_trans)
+		queue_work(mp->m_data_workqueue, &ioend->io_work);
+	else
+		xfs_destroy_ioend(ioend, bio->bi_error);
 }
 
 STATIC int
@@ -403,56 +371,6 @@ xfs_imap_valid(
 		offset < imap->br_startoff + imap->br_blockcount;
 }
 
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC void
-xfs_end_bio(
-	struct bio		*bio)
-{
-	struct xfs_ioend	*ioend = bio->bi_private;
-	unsigned long		flags;
-
-	bio->bi_private = NULL;
-	bio->bi_end_io = NULL;
-
-	spin_lock_irqsave(&ioend->io_lock, flags);
-	if (!ioend->io_error)
-	       ioend->io_error = bio->bi_error;
-	if (!ioend->io_bio_done)
-		ioend->io_bio_done = bio;
-	else
-		ioend->io_bio_done_tail->bi_private = bio;
-	ioend->io_bio_done_tail = bio;
-	spin_unlock_irqrestore(&ioend->io_lock, flags);
-
-	xfs_finish_ioend(ioend);
-}
-
-STATIC void
-xfs_submit_ioend_bio(
-	struct writeback_control *wbc,
-	xfs_ioend_t		*ioend,
-	struct bio		*bio)
-{
-	atomic_inc(&ioend->io_remaining);
-	bio->bi_private = ioend;
-	bio->bi_end_io = xfs_end_bio;
-	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
-	struct buffer_head	*bh)
-{
-	struct bio		*bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
-
-	ASSERT(bio->bi_private == NULL);
-	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-	bio->bi_bdev = bh->b_bdev;
-	return bio;
-}
-
 STATIC void
 xfs_start_buffer_writeback(
 	struct buffer_head	*bh)
@@ -513,16 +431,19 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 STATIC int
 xfs_submit_ioend(
 	struct writeback_control *wbc,
-	xfs_ioend_t		*ioend,
+	struct xfs_ioend	*ioend,
 	int			status)
 {
 	/* Reserve log space if we might write beyond the on-disk inode size. */
 	if (!status &&
-	    ioend->io_bio && ioend->io_type != XFS_IO_UNWRITTEN &&
+	    ioend->io_type != XFS_IO_UNWRITTEN &&
 	    xfs_ioend_is_append(ioend) &&
 	    !ioend->io_append_trans)
 		status = xfs_setfilesize_trans_alloc(ioend);
 
+	ioend->io_bio->bi_private = ioend;
+	ioend->io_bio->bi_end_io = xfs_end_bio;
+
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
 	 * error and finish it. This will run IO completion immediately
@@ -530,19 +451,75 @@ xfs_submit_ioend(
 	 * time.
 	 */
 	if (status) {
-		if (ioend->io_bio)
-			bio_put(ioend->io_bio);
-		ioend->io_error = status;
-		xfs_finish_ioend(ioend);
+		ioend->io_bio->bi_error = status;
+		bio_endio(ioend->io_bio);
 		return status;
 	}
 
-	xfs_submit_ioend_bio(wbc, ioend, ioend->io_bio);
-	ioend->io_bio = NULL;
-	xfs_finish_ioend(ioend);
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+		   ioend->io_bio);
 	return 0;
 }
 
+static void
+xfs_init_bio_from_bh(
+	struct bio		*bio,
+	struct buffer_head	*bh)
+{
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+}
+
+static struct xfs_ioend *
+xfs_alloc_ioend(
+	struct inode		*inode,
+	unsigned int		type,
+	xfs_off_t		offset,
+	struct buffer_head	*bh)
+{
+	struct xfs_ioend	*ioend;
+	struct bio		*bio;
+
+	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+	xfs_init_bio_from_bh(bio, bh);
+
+	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
+	INIT_LIST_HEAD(&ioend->io_list);
+	ioend->io_type = type;
+	ioend->io_inode = inode;
+	ioend->io_size = 0;
+	ioend->io_offset = offset;
+	INIT_WORK(&ioend->io_work, xfs_end_io);
+	ioend->io_append_trans = NULL;
+	ioend->io_bio = bio;
+	return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to do perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in xfs_destroy_ioend().
+ */
+static void
+xfs_chain_bio(
+	struct xfs_ioend	*ioend,
+	struct writeback_control *wbc,
+	struct buffer_head	*bh)
+{
+	struct bio *new;
+
+	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+	xfs_init_bio_from_bh(new, bh);
+
+	bio_chain(ioend->io_bio, new);
+	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+		   ioend->io_bio);
+	ioend->io_bio = new;
+}
+
 /*
  * Test to see if we've been building up a completion structure for
  * earlier buffers -- if so, we try to append to this ioend if we
@@ -564,19 +541,15 @@ xfs_add_to_ioend(
 	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 		if (wpc->ioend)
 			list_add(&wpc->ioend->io_list, iolist);
-		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type);
-		wpc->ioend->io_offset = offset;
+		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
 	}
 
-retry:
-	if (!wpc->ioend->io_bio)
-		wpc->ioend->io_bio = xfs_alloc_ioend_bio(bh);
-
-	if (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) {
-		xfs_submit_ioend_bio(wbc, wpc->ioend, wpc->ioend->io_bio);
-		wpc->ioend->io_bio = NULL;
-		goto retry;
-	}
+	/*
+	 * If the buffer doesn't fit into the bio we need to allocate a new
+	 * one.  This shouldn't happen more than once for a given buffer.
+	 */
+	while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
+		xfs_chain_bio(wpc->ioend, wbc, bh);
 
 	wpc->ioend->io_size += bh->b_size;
 	wpc->last_block = bh->b_blocknr;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 61a3dc3dbdf8..814aab790713 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,7 @@
 #ifndef __XFS_AOPS_H__
 #define __XFS_AOPS_H__
 
-extern mempool_t *xfs_ioend_pool;
+extern struct bio_set *xfs_ioend_bioset;
 
 /*
  * Types of I/O for bmap clustering and I/O completion tracking.
@@ -37,24 +37,19 @@ enum {
 	{ XFS_IO_OVERWRITE,		"overwrite" }
 
 /*
- * xfs_ioend struct manages large extent writes for XFS.
- * It can manage several multi-page bio's at once.
+ * Structure for buffered I/O completions.
  */
-typedef struct xfs_ioend {
+struct xfs_ioend {
 	struct list_head	io_list;	/* next ioend in chain */
 	unsigned int		io_type;	/* delalloc / unwritten */
-	int			io_error;	/* I/O error code */
-	atomic_t		io_remaining;	/* hold count */
 	struct inode		*io_inode;	/* file being written to */
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
 	struct xfs_trans	*io_append_trans;/* xact. for size update */
 	struct bio		*io_bio;	/* bio being built */
-	struct bio		*io_bio_done;	/* bios completed */
-	struct bio		*io_bio_done_tail; /* bios completed */
-	spinlock_t		io_lock;	/* for bio completion list */
-} xfs_ioend_t;
+	struct bio		io_inline_bio;	/* MUST BE LAST! */
+};
 
 extern const struct address_space_operations xfs_address_space_operations;
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d760934109b5..e52e9c1fd933 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -58,8 +58,7 @@
 #include <linux/parser.h>
 
 static const struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
+struct bio_set *xfs_ioend_bioset;
 
 static struct kset *xfs_kset;		/* top-level xfs sysfs dir */
 #ifdef DEBUG
@@ -1688,20 +1687,15 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-
-	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
-	if (!xfs_ioend_zone)
+	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+			offsetof(struct xfs_ioend, io_inline_bio));
+	if (!xfs_ioend_bioset)
 		goto out;
 
-	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
-						  xfs_ioend_zone);
-	if (!xfs_ioend_pool)
-		goto out_destroy_ioend_zone;
-
 	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
 						"xfs_log_ticket");
 	if (!xfs_log_ticket_zone)
-		goto out_destroy_ioend_pool;
+		goto out_free_ioend_bioset;
 
 	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
 						"xfs_bmap_free_item");
@@ -1797,10 +1791,8 @@ xfs_init_zones(void)
 	kmem_zone_destroy(xfs_bmap_free_item_zone);
  out_destroy_log_ticket_zone:
 	kmem_zone_destroy(xfs_log_ticket_zone);
- out_destroy_ioend_pool:
-	mempool_destroy(xfs_ioend_pool);
- out_destroy_ioend_zone:
-	kmem_zone_destroy(xfs_ioend_zone);
+ out_free_ioend_bioset:
+	bioset_free(xfs_ioend_bioset);
  out:
 	return -ENOMEM;
 }
@@ -1826,9 +1818,7 @@ xfs_destroy_zones(void)
 	kmem_zone_destroy(xfs_btree_cur_zone);
 	kmem_zone_destroy(xfs_bmap_free_item_zone);
 	kmem_zone_destroy(xfs_log_ticket_zone);
-	mempool_destroy(xfs_ioend_pool);
-	kmem_zone_destroy(xfs_ioend_zone);
-
+	bioset_free(xfs_ioend_bioset);
 }
 
 STATIC int __init
-- 
cgit v1.2.3-59-g8ed1b


From 253f4911f297b83745938b7f2c5649b94730b002 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 09:19:55 +1000
Subject: xfs: better xfs_trans_alloc interface

Merge xfs_trans_reserve and xfs_trans_alloc into a single function call
that returns a transaction with all the required log and block reservations,
and which allows passing transaction flags directly to avoid the cumbersome
_xfs_trans_alloc interface.

While we're at it we also get rid of the transaction type argument that has
been superflous since we stopped supporting the non-CIL logging mode.  The
guts of it will be removed in another patch.

[dchinner: fixed transaction leak in error path in xfs_setattr_nonsize]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_attr.c   | 58 +++++++-----------------------
 fs/xfs/libxfs/xfs_bmap.c   | 22 +++++-------
 fs/xfs/libxfs/xfs_sb.c     |  8 ++---
 fs/xfs/libxfs/xfs_shared.h |  5 +--
 fs/xfs/xfs_aops.c          | 19 ++++------
 fs/xfs/xfs_attr_inactive.c | 16 ++-------
 fs/xfs/xfs_bmap_util.c     | 45 +++++++-----------------
 fs/xfs/xfs_dquot.c         |  7 ++--
 fs/xfs/xfs_file.c          |  8 ++---
 fs/xfs/xfs_fsops.c         | 10 ++----
 fs/xfs/xfs_inode.c         | 60 ++++++++++++-------------------
 fs/xfs/xfs_ioctl.c         | 13 +++----
 fs/xfs/xfs_iomap.c         | 53 +++++++++-------------------
 fs/xfs/xfs_iops.c          | 29 ++++++---------
 fs/xfs/xfs_log_recover.c   | 10 +++---
 fs/xfs/xfs_pnfs.c          |  7 ++--
 fs/xfs/xfs_qm.c            |  9 ++---
 fs/xfs/xfs_qm_syscalls.c   | 26 ++++----------
 fs/xfs/xfs_rtalloc.c       | 21 +++++------
 fs/xfs/xfs_symlink.c       | 16 ++++-----
 fs/xfs/xfs_trans.c         | 88 +++++++++++++++++++++-------------------------
 fs/xfs/xfs_trans.h         |  8 ++---
 22 files changed, 191 insertions(+), 347 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index fa3b948ef9c2..4e126f41a0aa 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -242,37 +242,21 @@ xfs_attr_set(
 			return error;
 	}
 
-	/*
-	 * Start our first transaction of the day.
-	 *
-	 * All future transactions during this code must be "chained" off
-	 * this one via the trans_dup() call.  All transactions will contain
-	 * the inode, and the inode will always be marked with trans_ihold().
-	 * Since the inode will be locked in all transactions, we must log
-	 * the inode in every transaction to let it float upward through
-	 * the log.
-	 */
-	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+	tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+			 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+	tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
 
 	/*
 	 * Root fork attributes can use reserved data blocks for this
 	 * operation if necessary
 	 */
-
-	if (rsvd)
-		args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-	tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-			 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-	tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-	error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
-	if (error) {
-		xfs_trans_cancel(args.trans);
+	error = xfs_trans_alloc(mp, &tres, args.total, 0,
+			rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
+	if (error)
 		return error;
-	}
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
 
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
 	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
 				rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 				       XFS_QMOPT_RES_REGBLKS);
@@ -428,32 +412,16 @@ xfs_attr_remove(
 	if (error)
 		return error;
 
-	/*
-	 * Start our first transaction of the day.
-	 *
-	 * All future transactions during this code must be "chained" off
-	 * this one via the trans_dup() call.  All transactions will contain
-	 * the inode, and the inode will always be marked with trans_ihold().
-	 * Since the inode will be locked in all transactions, we must log
-	 * the inode in every transaction to let it float upward through
-	 * the log.
-	 */
-	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-
 	/*
 	 * Root fork attributes can use reserved data blocks for this
 	 * operation if necessary
 	 */
-
-	if (flags & ATTR_ROOT)
-		args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-	error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
-				  XFS_ATTRRM_SPACE_RES(mp), 0);
-	if (error) {
-		xfs_trans_cancel(args.trans);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
+			XFS_ATTRRM_SPACE_RES(mp), 0,
+			(flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
+			&args.trans);
+	if (error)
 		return error;
-	}
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
 	/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 041b6948aecc..e7ec8ccd3d64 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1121,15 +1121,14 @@ xfs_bmap_add_attrfork(
 
 	mp = ip->i_mount;
 	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+
 	blks = XFS_ADDAFORK_SPACE_RES(mp);
-	if (rsvd)
-		tp->t_flags |= XFS_TRANS_RESERVE;
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
+			rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+	if (error)
 		return error;
-	}
+
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
 			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
@@ -6026,13 +6025,10 @@ xfs_bmap_split_extent(
 	xfs_fsblock_t           firstfsb;
 	int                     error;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+	if (error)
 		return error;
-	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 8a53eaa349f4..12ca86778e02 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -838,12 +838,10 @@ xfs_sync_sb(
 	struct xfs_trans	*tp;
 	int			error;
 
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
+			XFS_TRANS_NO_WRITECOUNT, &tp);
+	if (error)
 		return error;
-	}
 
 	xfs_log_sb(tp);
 	if (wait)
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 81ac870834da..7d4ab6439081 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -181,8 +181,9 @@ int	xfs_log_calc_minimum_size(struct xfs_mount *);
 #define	XFS_TRANS_SYNC		0x08	/* make commit synchronous */
 #define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
 #define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT	0x40	/* Transaction has elevated writer
-					   count in superblock */
+#define XFS_TRANS_NO_WRITECOUNT 0x40	/* do not elevate SB writecount */
+#define XFS_TRANS_NOFS		0x80	/* pass KM_NOFS to kmem_alloc */
+
 /*
  * Field values for xfs_trans_mod_sb.
  */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d445a64b979e..3311db6ecbbd 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -120,13 +120,9 @@ xfs_setfilesize_trans_alloc(
 	struct xfs_trans	*tp;
 	int			error;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+	if (error)
 		return error;
-	}
 
 	ioend->io_append_trans = tp;
 
@@ -1391,13 +1387,10 @@ xfs_end_io_direct_write(
 
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp);
-			return error;
-		}
-		error = xfs_setfilesize(ip, tp, offset, size);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
+				&tp);
+		if (!error)
+			error = xfs_setfilesize(ip, tp, offset, size);
 	}
 
 	return error;
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2bb959ada45b..55d214981ed2 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -405,21 +405,11 @@ xfs_attr_inactive(
 		goto out_destroy_fork;
 	xfs_iunlock(dp, lock_mode);
 
-	/*
-	 * Start our first transaction of the day.
-	 *
-	 * All future transactions during this code must be "chained" off
-	 * this one via the trans_dup() call.  All transactions will contain
-	 * the inode, and the inode will always be marked with trans_ihold().
-	 * Since the inode will be locked in all transactions, we must log
-	 * the inode in every transaction to let it float upward through
-	 * the log.
-	 */
 	lock_mode = 0;
-	trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
-	error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans);
 	if (error)
-		goto out_cancel;
+		goto out_destroy_fork;
 
 	lock_mode = XFS_ILOCK_EXCL;
 	xfs_ilock(dp, lock_mode);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a32c1dcae2ff..3246ebc7cbd0 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -900,19 +900,15 @@ xfs_free_eofblocks(
 		 * Free them up now by truncating the file to
 		 * its current size.
 		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
 		if (need_iolock) {
-			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-				xfs_trans_cancel(tp);
+			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
 				return -EAGAIN;
-			}
 		}
 
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+				&tp);
 		if (error) {
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp);
 			if (need_iolock)
 				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 			return error;
@@ -1037,9 +1033,9 @@ xfs_alloc_file_space(
 		/*
 		 * Allocate and setup the transaction.
 		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-					  resblks, resrtextents);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+				resrtextents, 0, &tp);
+
 		/*
 		 * Check for running out of space
 		 */
@@ -1048,7 +1044,6 @@ xfs_alloc_file_space(
 			 * Free the transaction structure.
 			 */
 			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp);
 			break;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1311,18 +1306,10 @@ xfs_free_file_space(
 		 * transaction to dip into the reserve blocks to ensure
 		 * the freeing of the space succeeds at ENOSPC.
 		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
-
-		/*
-		 * check for running out of space
-		 */
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
+				&tp);
 		if (error) {
-			/*
-			 * Free the transaction structure.
-			 */
 			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp);
 			break;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1482,19 +1469,16 @@ xfs_shift_file_space(
 	}
 
 	while (!error && !done) {
-		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
 		/*
 		 * We would need to reserve permanent block for transaction.
 		 * This will come into picture when after shifting extent into
 		 * hole we found that adjacent extents can be merged which
 		 * may lead to freeing of a block during record update.
 		 */
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
-		if (error) {
-			xfs_trans_cancel(tp);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+		if (error)
 			break;
-		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
@@ -1747,12 +1731,9 @@ xfs_swap_extents(
 	if (error)
 		goto out_unlock;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
 		goto out_unlock;
-	}
 
 	/*
 	 * Lock and join the inodes to the tansaction so that transaction commit
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 316b2a1bdba5..23e24329b132 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -614,11 +614,10 @@ xfs_qm_dqread(
 	trace_xfs_dqread(dqp);
 
 	if (flags & XFS_QMOPT_DQALLOC) {
-		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
-					  XFS_QM_DQALLOC_SPACE_RES(mp), 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+				XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
 		if (error)
-			goto error1;
+			goto error0;
 	}
 
 	/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ac0fd32de31e..98bbd8f84c76 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -145,12 +145,10 @@ xfs_update_prealloc_flags(
 	struct xfs_trans	*tp;
 	int			error;
 
-	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
-	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+			0, 0, 0, &tp);
+	if (error)
 		return error;
-	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ee3aaa0a5317..9c563d480b3c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -198,14 +198,10 @@ xfs_growfs_data_private(
 			return error;
 	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
-	tp->t_flags |= XFS_TRANS_RESERVE;
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
-				  XFS_GROWFS_SPACE_RES(mp), 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+	if (error)
 		return error;
-	}
 
 	/*
 	 * Write new AG headers to disk. Non-transactional, but written
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96f606deee31..b82c729634f6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1161,11 +1161,9 @@ xfs_create(
 		rdev = 0;
 		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
 		tres = &M_RES(mp)->tr_mkdir;
-		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
 	} else {
 		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
 		tres = &M_RES(mp)->tr_create;
-		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 	}
 
 	/*
@@ -1174,20 +1172,19 @@ xfs_create(
 	 * the case we'll drop the one we have and get a more
 	 * appropriate transaction later.
 	 */
-	error = xfs_trans_reserve(tp, tres, resblks, 0);
+	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
 	if (error == -ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(mp);
-		error = xfs_trans_reserve(tp, tres, resblks, 0);
+		error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
 	}
 	if (error == -ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
-		error = xfs_trans_reserve(tp, tres, 0, 0);
+		error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
 	}
 	if (error)
-		goto out_trans_cancel;
-
+		goto out_release_inode;
 
 	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
 		      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -1337,17 +1334,16 @@ xfs_create_tmpfile(
 		return error;
 
 	resblks = XFS_IALLOC_SPACE_RES(mp);
-	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
-
 	tres = &M_RES(mp)->tr_create_tmpfile;
-	error = xfs_trans_reserve(tp, tres, resblks, 0);
+
+	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
 	if (error == -ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
-		error = xfs_trans_reserve(tp, tres, 0, 0);
+		error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
 	}
 	if (error)
-		goto out_trans_cancel;
+		goto out_release_inode;
 
 	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
 						pdqp, resblks, 1, 0);
@@ -1432,15 +1428,14 @@ xfs_link(
 	if (error)
 		goto std_return;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
 	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
 	if (error == -ENOSPC) {
 		resblks = 0;
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
 	}
 	if (error)
-		goto error_return;
+		goto std_return;
 
 	xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1710,11 +1705,9 @@ xfs_inactive_truncate(
 	struct xfs_trans	*tp;
 	int			error;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
 	if (error) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -1764,8 +1757,6 @@ xfs_inactive_ifree(
 	struct xfs_trans	*tp;
 	int			error;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
 	/*
 	 * The ifree transaction might need to allocate blocks for record
 	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
@@ -1781,9 +1772,8 @@ xfs_inactive_ifree(
 	 * now remains allocated and sits on the unlinked list until the fs is
 	 * repaired.
 	 */
-	tp->t_flags |= XFS_TRANS_RESERVE;
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
-				  XFS_IFREE_SPACE_RES(mp), 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
 	if (error) {
 		if (error == -ENOSPC) {
 			xfs_warn_ratelimited(mp,
@@ -1792,7 +1782,6 @@ xfs_inactive_ifree(
 		} else {
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
 		}
-		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -2525,11 +2514,6 @@ xfs_remove(
 	if (error)
 		goto std_return;
 
-	if (is_dir)
-		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-	else
-		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-
 	/*
 	 * We try to get the real space reservation first,
 	 * allowing for directory btree deletion(s) implying
@@ -2540,14 +2524,15 @@ xfs_remove(
 	 * block from the directory.
 	 */
 	resblks = XFS_REMOVE_SPACE_RES(mp);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
 	if (error == -ENOSPC) {
 		resblks = 0;
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
+				&tp);
 	}
 	if (error) {
 		ASSERT(error != -ENOSPC);
-		goto out_trans_cancel;
+		goto std_return;
 	}
 
 	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
@@ -2910,15 +2895,15 @@ xfs_rename(
 	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
 				inodes, &num_inodes);
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
 	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
 	if (error == -ENOSPC) {
 		spaceres = 0;
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+				&tp);
 	}
 	if (error)
-		goto out_trans_cancel;
+		goto out_release_wip;
 
 	/*
 	 * Attach the dquots to the inodes
@@ -3155,6 +3140,7 @@ out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 out_trans_cancel:
 	xfs_trans_cancel(tp);
+out_release_wip:
 	if (wip)
 		IRELE(wip);
 	return error;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcb6c19ce3ea..6f82187ee297 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -334,12 +334,10 @@ xfs_set_dmattrs(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
 		return error;
-	}
+
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
@@ -1141,10 +1139,9 @@ xfs_ioctl_setattr_get_trans(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		goto out_unlock;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
 	if (error)
-		goto out_cancel;
+		return ERR_PTR(error);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d81bdc080370..58391355a44d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -132,6 +132,7 @@ xfs_iomap_write_direct(
 	int		error;
 	int		lockmode;
 	int		bmapi_flags = XFS_BMAPI_PREALLOC;
+	uint		tflags = 0;
 
 	rt = XFS_IS_REALTIME_INODE(ip);
 	extsz = xfs_get_extsz_hint(ip);
@@ -191,11 +192,6 @@ xfs_iomap_write_direct(
 	if (error)
 		return error;
 
-	/*
-	 * Allocate and setup the transaction
-	 */
-	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-
 	/*
 	 * For DAX, we do not allocate unwritten extents, but instead we zero
 	 * the block before we commit the transaction.  Ideally we'd like to do
@@ -209,23 +205,17 @@ xfs_iomap_write_direct(
 	 * the reserve block pool for bmbt block allocation if there is no space
 	 * left but we need to do unwritten extent conversion.
 	 */
-
 	if (IS_DAX(VFS_I(ip))) {
 		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
 		if (ISUNWRITTEN(imap)) {
-			tp->t_flags |= XFS_TRANS_RESERVE;
+			tflags |= XFS_TRANS_RESERVE;
 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
 		}
 	}
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-				  resblks, resrtextents);
-	/*
-	 * Check for running out of space, note: need lock to return
-	 */
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
+			tflags, &tp);
+	if (error)
 		return error;
-	}
 
 	lockmode = XFS_ILOCK_EXCL;
 	xfs_ilock(ip, lockmode);
@@ -726,15 +716,13 @@ xfs_iomap_write_allocate(
 
 		nimaps = 0;
 		while (nimaps == 0) {
-			tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
-			tp->t_flags |= XFS_TRANS_RESERVE;
 			nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-						  nres, 0);
-			if (error) {
-				xfs_trans_cancel(tp);
+
+			error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+					0, XFS_TRANS_RESERVE, &tp);
+			if (error)
 				return error;
-			}
+
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
 			xfs_trans_ijoin(tp, ip, 0);
 
@@ -878,25 +866,18 @@ xfs_iomap_write_unwritten(
 
 	do {
 		/*
-		 * set up a transaction to convert the range of extents
+		 * Set up a transaction to convert the range of extents
 		 * from unwritten to real. Do allocations in a loop until
 		 * we have covered the range passed in.
 		 *
-		 * Note that we open code the transaction allocation here
-		 * to pass KM_NOFS--we can't risk to recursing back into
-		 * the filesystem here as we might be asked to write out
-		 * the same inode that we complete here and might deadlock
-		 * on the iolock.
+		 * Note that we can't risk to recursing back into the filesystem
+		 * here as we might be asked to write out the same inode that we
+		 * complete here and might deadlock on the iolock.
 		 */
-		sb_start_intwrite(mp->m_super);
-		tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
-		tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-					  resblks, 0);
-		if (error) {
-			xfs_trans_cancel(tp);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+				XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+		if (error)
 			return error;
-		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		xfs_trans_ijoin(tp, ip, 0);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fb7dc61f4a29..fc7766164dc9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -599,12 +599,12 @@ xfs_setattr_nonsize(
 			return error;
 	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
 	if (error)
-		goto out_trans_cancel;
+		goto out_dqrele;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
 
 	/*
 	 * Change file ownership.  Must be the owner or privileged.
@@ -633,12 +633,10 @@ xfs_setattr_nonsize(
 						NULL, capable(CAP_FOWNER) ?
 						XFS_QMOPT_FORCE_RES : 0);
 			if (error)	/* out of quota */
-				goto out_unlock;
+				goto out_cancel;
 		}
 	}
 
-	xfs_trans_ijoin(tp, ip, 0);
-
 	/*
 	 * Change file ownership.  Must be the owner or privileged.
 	 */
@@ -722,10 +720,9 @@ xfs_setattr_nonsize(
 
 	return 0;
 
-out_unlock:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_trans_cancel:
+out_cancel:
 	xfs_trans_cancel(tp);
+out_dqrele:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 	return error;
@@ -834,7 +831,7 @@ xfs_setattr_size(
 	 * We have to do all the page cache truncate work outside the
 	 * transaction context as the "lock" order is page lock->log space
 	 * reservation as defined by extent allocation in the writeback path.
-	 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+	 * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
 	 * having already truncated the in-memory version of the file (i.e. made
 	 * user visible changes). There's not much we can do about this, except
 	 * to hope that the caller sees ENOMEM and retries the truncate
@@ -849,10 +846,9 @@ xfs_setattr_size(
 		return error;
 	truncate_setsize(inode, newsize);
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
 	if (error)
-		goto out_trans_cancel;
+		return error;
 
 	lock_flags |= XFS_ILOCK_EXCL;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -971,12 +967,9 @@ xfs_vn_update_time(
 
 	trace_xfs_update_time(ip);
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+	if (error)
 		return error;
-	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (flags & S_CTIME)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 396565f43247..558f3d1d91ad 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4205,10 +4205,9 @@ xlog_recover_process_efi(
 		}
 	}
 
-	tp = xfs_trans_alloc(mp, 0);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
 	if (error)
-		goto abort_error;
+		return error;
 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
@@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket(
 	int		offset;
 	int		error;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
 	if (error)
-		goto out_abort;
+		goto out_error;
 
 	error = xfs_read_agi(mp, tp, agno, &agibp);
 	if (error)
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ade236e90bb3..3332baeac582 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -308,12 +308,9 @@ xfs_fs_commit_blocks(
 			goto out_drop_iolock;
 	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
 		goto out_drop_iolock;
-	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index be125e1758c1..a60d9e2739d1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -783,13 +783,10 @@ xfs_qm_qino_alloc(
 		}
 	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
-				  XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+			XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
+	if (error)
 		return error;
-	}
 
 	if (need_alloc) {
 		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index f4d0e0a8f517..475a3882a81f 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile(
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
 	if (error) {
-		xfs_trans_cancel(tp);
 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 		goto out_put;
 	}
@@ -436,12 +434,9 @@ xfs_qm_scall_setqlim(
 	defq = xfs_get_defquota(dqp, q);
 	xfs_dqunlock(dqp);
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
+	if (error)
 		goto out_rele;
-	}
 
 	xfs_dqlock(dqp);
 	xfs_trans_dqjoin(tp, dqp);
@@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end(
 	int			error;
 	xfs_qoff_logitem_t	*qoffi;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+	if (error)
 		return error;
-	}
 
 	qoffi = xfs_trans_get_qoff_item(tp, startqoff,
 					flags & XFS_ALL_QUOTA_ACCT);
@@ -603,12 +594,9 @@ xfs_qm_log_quotaoff(
 
 	*qoffstartp = NULL;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+	if (error)
 		goto out;
-	}
 
 	qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
 	xfs_trans_log_quotaoff_item(tp, qoffi);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index abf44435d04a..3938b37d1043 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,15 +780,14 @@ xfs_growfs_rt_alloc(
 	 * Allocate space to the file, as necessary.
 	 */
 	while (oblocks < nblocks) {
-		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
 		resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
 		/*
 		 * Reserve space & log for one extent added to the file.
 		 */
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
-					  resblks, 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
+				0, 0, &tp);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 		/*
 		 * Lock the inode.
 		 */
@@ -823,14 +822,13 @@ xfs_growfs_rt_alloc(
 		for (bno = map.br_startoff, fsbno = map.br_startblock;
 		     bno < map.br_startoff + map.br_blockcount;
 		     bno++, fsbno++) {
-			tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
 			/*
 			 * Reserve log for one block zeroing.
 			 */
-			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
-						  0, 0);
+			error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
+					0, 0, 0, &tp);
 			if (error)
-				goto out_trans_cancel;
+				return error;
 			/*
 			 * Lock the bitmap inode.
 			 */
@@ -994,11 +992,10 @@ xfs_growfs_rt(
 		/*
 		 * Start a transaction, get the log reservation.
 		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
-					  0, 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
+				&tp);
 		if (error)
-			goto error_cancel;
+			break;
 		/*
 		 * Lock out other callers by grabbing the bitmap inode lock.
 		 */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index b44284c1adda..c3aeaa884478 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -221,7 +221,6 @@ xfs_symlink(
 	if (error)
 		return error;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
 	/*
 	 * The symlink will fit into the inode data fork?
 	 * There can't be any attributes so we get the whole variable part.
@@ -231,13 +230,15 @@ xfs_symlink(
 	else
 		fs_blocks = xfs_symlink_blocks(mp, pathlen);
 	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
 	if (error == -ENOSPC && fs_blocks == 0) {
 		resblks = 0;
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0,
+				&tp);
 	}
 	if (error)
-		goto out_trans_cancel;
+		goto out_release_inode;
 
 	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
 		      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -455,12 +456,9 @@ xfs_inactive_symlink_rmt(
 	 */
 	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	if (error)
 		return error;
-	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 20c53666cb4b..b3669efb2a0a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -46,47 +46,6 @@ xfs_trans_init(
 	xfs_trans_resv_calc(mp, M_RES(mp));
 }
 
-/*
- * This routine is called to allocate a transaction structure.
- * The type parameter indicates the type of the transaction.  These
- * are enumerated in xfs_trans.h.
- *
- * Dynamically allocate the transaction structure from the transaction
- * zone, initialize it, and return it to the caller.
- */
-xfs_trans_t *
-xfs_trans_alloc(
-	xfs_mount_t	*mp,
-	uint		type)
-{
-	xfs_trans_t     *tp;
-
-	sb_start_intwrite(mp->m_super);
-	tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
-	tp->t_flags |= XFS_TRANS_FREEZE_PROT;
-	return tp;
-}
-
-xfs_trans_t *
-_xfs_trans_alloc(
-	xfs_mount_t	*mp,
-	uint		type,
-	xfs_km_flags_t	memflags)
-{
-	xfs_trans_t	*tp;
-
-	WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
-	atomic_inc(&mp->m_active_trans);
-
-	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
-	tp->t_magic = XFS_TRANS_HEADER_MAGIC;
-	tp->t_type = type;
-	tp->t_mountp = mp;
-	INIT_LIST_HEAD(&tp->t_items);
-	INIT_LIST_HEAD(&tp->t_busy);
-	return tp;
-}
-
 /*
  * Free the transaction structure.  If there is more clean up
  * to do when the structure is freed, add it here.
@@ -99,7 +58,7 @@ xfs_trans_free(
 	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
 	atomic_dec(&tp->t_mountp->m_active_trans);
-	if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+	if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_end_intwrite(tp->t_mountp->m_super);
 	xfs_trans_free_dqinfo(tp);
 	kmem_zone_free(xfs_trans_zone, tp);
@@ -125,7 +84,6 @@ xfs_trans_dup(
 	 * Initialize the new transaction structure.
 	 */
 	ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
-	ntp->t_type = tp->t_type;
 	ntp->t_mountp = tp->t_mountp;
 	INIT_LIST_HEAD(&ntp->t_items);
 	INIT_LIST_HEAD(&ntp->t_busy);
@@ -135,9 +93,9 @@ xfs_trans_dup(
 
 	ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
 		       (tp->t_flags & XFS_TRANS_RESERVE) |
-		       (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+		       (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
 	/* We gave our writer reference to the new transaction */
-	tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+	tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
 	ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
 	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
 	tp->t_blk_res = tp->t_blk_res_used;
@@ -165,7 +123,7 @@ xfs_trans_dup(
  * This does not do quota reservations. That typically is done by the
  * caller afterwards.
  */
-int
+static int
 xfs_trans_reserve(
 	struct xfs_trans	*tp,
 	struct xfs_trans_res	*resp,
@@ -219,7 +177,7 @@ xfs_trans_reserve(
 						resp->tr_logres,
 						resp->tr_logcount,
 						&tp->t_ticket, XFS_TRANSACTION,
-						permanent, tp->t_type);
+						permanent, 0);
 		}
 
 		if (error)
@@ -268,6 +226,42 @@ undo_blocks:
 	return error;
 }
 
+int
+xfs_trans_alloc(
+	struct xfs_mount	*mp,
+	struct xfs_trans_res	*resp,
+	uint			blocks,
+	uint			rtextents,
+	uint			flags,
+	struct xfs_trans	**tpp)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+		sb_start_intwrite(mp->m_super);
+
+	WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+	atomic_inc(&mp->m_active_trans);
+
+	tp = kmem_zone_zalloc(xfs_trans_zone,
+		(flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
+	tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_flags = flags;
+	tp->t_mountp = mp;
+	INIT_LIST_HEAD(&tp->t_items);
+	INIT_LIST_HEAD(&tp->t_busy);
+
+	error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+	if (error) {
+		xfs_trans_cancel(tp);
+		return error;
+	}
+
+	*tpp = tp;
+	return 0;
+}
+
 /*
  * Record the indicated change to the given field for application
  * to the file system's superblock when the transaction commits.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e7c49cf43fbc..9a462e892e4f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -90,7 +90,6 @@ void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
  */
 typedef struct xfs_trans {
 	unsigned int		t_magic;	/* magic number */
-	unsigned int		t_type;		/* transaction type */
 	unsigned int		t_log_res;	/* amt of log space resvd */
 	unsigned int		t_log_count;	/* count for perm log res */
 	unsigned int		t_blk_res;	/* # of blocks resvd */
@@ -148,10 +147,9 @@ typedef struct xfs_trans {
 /*
  * XFS transaction mechanism exported interfaces.
  */
-xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-int		xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
-				  uint, uint);
+int		xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
+			uint blocks, uint rtextents, uint flags,
+			struct xfs_trans **tpp);
 void		xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
 
 struct xfs_buf	*xfs_trans_get_buf_map(struct xfs_trans *tp,
-- 
cgit v1.2.3-59-g8ed1b


From 710b1e2c2948c1e5d0499def5273ecbc6472342d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 09:20:36 +1000
Subject: xfs: remove transaction types

These aren't used for CIL-style logging and can be dropped.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_log_format.h |  5 +++
 fs/xfs/libxfs/xfs_shared.h     | 97 ------------------------------------------
 fs/xfs/xfs_log.c               | 58 +------------------------
 fs/xfs/xfs_log.h               |  3 +-
 fs/xfs/xfs_log_cil.c           |  1 -
 fs/xfs/xfs_log_priv.h          |  1 -
 fs/xfs/xfs_trace.h             |  5 +--
 fs/xfs/xfs_trans.c             |  2 +-
 8 files changed, 10 insertions(+), 162 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index d54a8018b079..e8f49c029ff0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -211,6 +211,11 @@ typedef struct xfs_trans_header {
 
 #define	XFS_TRANS_HEADER_MAGIC	0x5452414e	/* TRAN */
 
+/*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT	40
+
 /*
  * Log item types.
  */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 7d4ab6439081..16002b5ec4eb 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -55,103 +55,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 
-/*
- * Transaction types.  Used to distinguish types of buffers. These never reach
- * the log.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE	1
-#define XFS_TRANS_SETATTR_SIZE		2
-#define XFS_TRANS_INACTIVE		3
-#define XFS_TRANS_CREATE		4
-#define XFS_TRANS_CREATE_TRUNC		5
-#define XFS_TRANS_TRUNCATE_FILE		6
-#define XFS_TRANS_REMOVE		7
-#define XFS_TRANS_LINK			8
-#define XFS_TRANS_RENAME		9
-#define XFS_TRANS_MKDIR			10
-#define XFS_TRANS_RMDIR			11
-#define XFS_TRANS_SYMLINK		12
-#define XFS_TRANS_SET_DMATTRS		13
-#define XFS_TRANS_GROWFS		14
-#define XFS_TRANS_STRAT_WRITE		15
-#define XFS_TRANS_DIOSTRAT		16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define	XFS_TRANS_WRITEID		18
-#define	XFS_TRANS_ADDAFORK		19
-#define	XFS_TRANS_ATTRINVAL		20
-#define	XFS_TRANS_ATRUNCATE		21
-#define	XFS_TRANS_ATTR_SET		22
-#define	XFS_TRANS_ATTR_RM		23
-#define	XFS_TRANS_ATTR_FLAG		24
-#define	XFS_TRANS_CLEAR_AGI_BUCKET	25
-#define XFS_TRANS_SB_CHANGE		26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1		27
-#define XFS_TRANS_DUMMY2		28
-#define XFS_TRANS_QM_QUOTAOFF		29
-#define XFS_TRANS_QM_DQALLOC		30
-#define XFS_TRANS_QM_SETQLIM		31
-#define XFS_TRANS_QM_DQCLUSTER		32
-#define XFS_TRANS_QM_QINOCREATE		33
-#define XFS_TRANS_QM_QUOTAOFF_END	34
-#define XFS_TRANS_FSYNC_TS		35
-#define	XFS_TRANS_GROWFSRT_ALLOC	36
-#define	XFS_TRANS_GROWFSRT_ZERO		37
-#define	XFS_TRANS_GROWFSRT_FREE		38
-#define	XFS_TRANS_SWAPEXT		39
-#define	XFS_TRANS_CHECKPOINT		40
-#define	XFS_TRANS_ICREATE		41
-#define	XFS_TRANS_CREATE_TMPFILE	42
-#define	XFS_TRANS_TYPE_MAX		43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
-	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
-	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
-	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
-	{ XFS_TRANS_CREATE,		"CREATE" }, \
-	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
-	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
-	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
-	{ XFS_TRANS_LINK,		"LINK" }, \
-	{ XFS_TRANS_RENAME,		"RENAME" }, \
-	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
-	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
-	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
-	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
-	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
-	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
-	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
-	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
-	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
-	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
-	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
-	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
-	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
-	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
-	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
-	{ XFS_TRANS_SB_CHANGE,		"SBCHANGE" }, \
-	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
-	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
-	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
-	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
-	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
-	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
-	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
-	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
-	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
-	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
-	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
-	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
-	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
-	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
-	{ XFS_TRANS_ICREATE,		"ICREATE" }, \
-	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
-	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
-
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b49ccf5c1d75..268f7d65cfd4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -435,8 +435,7 @@ xfs_log_reserve(
 	int		 	cnt,
 	struct xlog_ticket	**ticp,
 	__uint8_t	 	client,
-	bool			permanent,
-	uint		 	t_type)
+	bool			permanent)
 {
 	struct xlog		*log = mp->m_log;
 	struct xlog_ticket	*tic;
@@ -456,7 +455,6 @@ xfs_log_reserve(
 	if (!tic)
 		return -ENOMEM;
 
-	tic->t_trans_type = t_type;
 	*ticp = tic;
 
 	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	} while (iclog != first_iclog);
 #endif
 	if (! (XLOG_FORCED_SHUTDOWN(log))) {
-		error = xfs_log_reserve(mp, 600, 1, &tic,
-					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
+		error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
 		if (!error) {
 			/* the data section must be 32 bit size aligned */
 			struct {
@@ -2032,58 +2029,8 @@ xlog_print_tic_res(
 	    REG_TYPE_STR(ICREATE, "inode create")
 	};
 #undef REG_TYPE_STR
-#define TRANS_TYPE_STR(type)	[XFS_TRANS_##type] = #type
-	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
-	    TRANS_TYPE_STR(SETATTR_NOT_SIZE),
-	    TRANS_TYPE_STR(SETATTR_SIZE),
-	    TRANS_TYPE_STR(INACTIVE),
-	    TRANS_TYPE_STR(CREATE),
-	    TRANS_TYPE_STR(CREATE_TRUNC),
-	    TRANS_TYPE_STR(TRUNCATE_FILE),
-	    TRANS_TYPE_STR(REMOVE),
-	    TRANS_TYPE_STR(LINK),
-	    TRANS_TYPE_STR(RENAME),
-	    TRANS_TYPE_STR(MKDIR),
-	    TRANS_TYPE_STR(RMDIR),
-	    TRANS_TYPE_STR(SYMLINK),
-	    TRANS_TYPE_STR(SET_DMATTRS),
-	    TRANS_TYPE_STR(GROWFS),
-	    TRANS_TYPE_STR(STRAT_WRITE),
-	    TRANS_TYPE_STR(DIOSTRAT),
-	    TRANS_TYPE_STR(WRITEID),
-	    TRANS_TYPE_STR(ADDAFORK),
-	    TRANS_TYPE_STR(ATTRINVAL),
-	    TRANS_TYPE_STR(ATRUNCATE),
-	    TRANS_TYPE_STR(ATTR_SET),
-	    TRANS_TYPE_STR(ATTR_RM),
-	    TRANS_TYPE_STR(ATTR_FLAG),
-	    TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
-	    TRANS_TYPE_STR(SB_CHANGE),
-	    TRANS_TYPE_STR(DUMMY1),
-	    TRANS_TYPE_STR(DUMMY2),
-	    TRANS_TYPE_STR(QM_QUOTAOFF),
-	    TRANS_TYPE_STR(QM_DQALLOC),
-	    TRANS_TYPE_STR(QM_SETQLIM),
-	    TRANS_TYPE_STR(QM_DQCLUSTER),
-	    TRANS_TYPE_STR(QM_QINOCREATE),
-	    TRANS_TYPE_STR(QM_QUOTAOFF_END),
-	    TRANS_TYPE_STR(FSYNC_TS),
-	    TRANS_TYPE_STR(GROWFSRT_ALLOC),
-	    TRANS_TYPE_STR(GROWFSRT_ZERO),
-	    TRANS_TYPE_STR(GROWFSRT_FREE),
-	    TRANS_TYPE_STR(SWAPEXT),
-	    TRANS_TYPE_STR(CHECKPOINT),
-	    TRANS_TYPE_STR(ICREATE),
-	    TRANS_TYPE_STR(CREATE_TMPFILE)
-	};
-#undef TRANS_TYPE_STR
 
 	xfs_warn(mp, "xlog_write: reservation summary:");
-	xfs_warn(mp, "  trans type  = %s (%u)",
-		 ((ticket->t_trans_type <= 0 ||
-		   ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
-		  "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
-		 ticket->t_trans_type);
 	xfs_warn(mp, "  unit res    = %d bytes",
 		 ticket->t_unit_res);
 	xfs_warn(mp, "  current res = %d bytes",
@@ -3709,7 +3656,6 @@ xlog_ticket_alloc(
 	tic->t_tid		= prandom_u32();
 	tic->t_clientid		= client;
 	tic->t_flags		= XLOG_TIC_INITED;
-	tic->t_trans_type	= 0;
 	if (permanent)
 		tic->t_flags |= XLOG_TIC_PERM_RESERV;
 
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index aa533a7d50f2..80ba0c047090 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -161,8 +161,7 @@ int	  xfs_log_reserve(struct xfs_mount *mp,
 			  int		   count,
 			  struct xlog_ticket **ticket,
 			  __uint8_t	   clientid,
-			  bool		   permanent,
-			  uint		   t_type);
+			  bool		   permanent);
 int	  xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
 int	  xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4e7649351f5a..5e54e7955ea6 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -51,7 +51,6 @@ xlog_cil_ticket_alloc(
 
 	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
 				KM_SLEEP|KM_NOFS);
-	tic->t_trans_type = XFS_TRANS_CHECKPOINT;
 
 	/*
 	 * set the current reservation to zero so we know to steal the basic
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ed8896310c00..765f084759b5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -175,7 +175,6 @@ typedef struct xlog_ticket {
 	char		   t_cnt;	 /* current count		 : 1  */
 	char		   t_clientid;	 /* who does this belong to;	 : 1  */
 	char		   t_flags;	 /* properties of reservation	 : 1  */
-	uint		   t_trans_type; /* transaction type             : 4  */
 
         /* reservation array fields */
 	uint		   t_res_num;                    /* num in array : 4 */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8d58426008e..f08129444280 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -944,7 +944,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 	TP_ARGS(log, tic),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(unsigned, trans_type)
 		__field(char, ocnt)
 		__field(char, cnt)
 		__field(int, curr_res)
@@ -962,7 +961,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 	),
 	TP_fast_assign(
 		__entry->dev = log->l_mp->m_super->s_dev;
-		__entry->trans_type = tic->t_trans_type;
 		__entry->ocnt = tic->t_ocnt;
 		__entry->cnt = tic->t_cnt;
 		__entry->curr_res = tic->t_curr_res;
@@ -980,14 +978,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 		__entry->curr_block = log->l_curr_block;
 		__entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
 	),
-	TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+	TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
 		  "t_unit_res %u t_flags %s reserveq %s "
 		  "writeq %s grant_reserve_cycle %d "
 		  "grant_reserve_bytes %d grant_write_cycle %d "
 		  "grant_write_bytes %d curr_cycle %d curr_block %d "
 		  "tail_cycle %d tail_block %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
 		  __entry->ocnt,
 		  __entry->cnt,
 		  __entry->curr_res,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index b3669efb2a0a..5f3d33d16e67 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -177,7 +177,7 @@ xfs_trans_reserve(
 						resp->tr_logres,
 						resp->tr_logcount,
 						&tp->t_ticket, XFS_TRANSACTION,
-						permanent, 0);
+						permanent);
 		}
 
 		if (error)
-- 
cgit v1.2.3-59-g8ed1b


From 9f27889f3a96ff356ac92688cc0c4be3935ae3af Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 6 Apr 2016 09:46:30 +1000
Subject: xfs: Add caller function output to xfs_log_force tracepoint

I had sent this patch yesterday, but for some reason it didn't reach
xfs list, sending again.

Output the caller of xfs_log_force might be useful when tracing log
checkpoint problems without the need to build kernel with DEBUG.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_log.c   |  4 ++--
 fs/xfs/xfs_trace.h | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b49ccf5c1d75..8c08015bc094 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3378,7 +3378,7 @@ xfs_log_force(
 {
 	int	error;
 
-	trace_xfs_log_force(mp, 0);
+	trace_xfs_log_force(mp, 0, _RET_IP_);
 	error = _xfs_log_force(mp, flags, NULL);
 	if (error)
 		xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3527,7 +3527,7 @@ xfs_log_force_lsn(
 {
 	int	error;
 
-	trace_xfs_log_force(mp, lsn);
+	trace_xfs_log_force(mp, lsn, _RET_IP_);
 	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
 	if (error)
 		xfs_warn(mp, "%s: error %d returned.", __func__, error);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8d58426008e..384bb1791481 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1053,19 +1053,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
 )
 
 TRACE_EVENT(xfs_log_force,
-	TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
-	TP_ARGS(mp, lsn),
+	TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip),
+	TP_ARGS(mp, lsn, caller_ip),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_lsn_t, lsn)
+		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->lsn = lsn;
+		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d lsn 0x%llx",
+	TP_printk("dev %d:%d lsn 0x%llx caller %ps",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->lsn)
+		  __entry->lsn, (void *)__entry->caller_ip)
 )
 
 #define DEFINE_LOG_ITEM_EVENT(name) \
-- 
cgit v1.2.3-59-g8ed1b


From 664b60f6babc98ee03c2ff15b9482cc8c5e15a83 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Apr 2016 09:47:01 +1000
Subject: xfs: improve kmem_realloc

Use krealloc to implement our realloc function.  This helps to avoid
new allocations if we are still in the slab bucket.  At least for the
bmap btree root that's actually the common case.

This also allows removing the now unused oldsize argument.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/kmem.c                  | 26 +++++++++++++++-----------
 fs/xfs/kmem.h                  |  2 +-
 fs/xfs/libxfs/xfs_inode_fork.c | 10 +++-------
 fs/xfs/xfs_log_recover.c       |  2 +-
 fs/xfs/xfs_mount.c             |  1 -
 5 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 686ba6fb20dd..339c696bbc01 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 }
 
 void *
-kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
-	     xfs_km_flags_t flags)
+kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 {
-	void	*new;
+	int	retries = 0;
+	gfp_t	lflags = kmem_flags_convert(flags);
+	void	*ptr;
 
-	new = kmem_alloc(newsize, flags);
-	if (ptr) {
-		if (new)
-			memcpy(new, ptr,
-				((oldsize < newsize) ? oldsize : newsize));
-		kmem_free(ptr);
-	}
-	return new;
+	do {
+		ptr = krealloc(old, newsize, lflags);
+		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+			return ptr;
+		if (!(++retries % 100))
+			xfs_err(NULL,
+	"%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
+				current->comm, current->pid,
+				newsize, __func__, lflags);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	} while (1);
 }
 
 void *
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index d1c66e465ca5..689f746224e7 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void  kmem_free(const void *ptr)
 {
 	kvfree(ptr);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 11faf7df14c8..1c842dce44b6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -516,7 +516,6 @@ xfs_iroot_realloc(
 		new_max = cur_max + rec_diff;
 		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
 		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-				XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
 				KM_SLEEP | KM_NOFS);
 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
@@ -660,7 +659,6 @@ xfs_idata_realloc(
 				ifp->if_u1.if_data =
 					kmem_realloc(ifp->if_u1.if_data,
 							real_size,
-							ifp->if_real_bytes,
 							KM_SLEEP | KM_NOFS);
 			}
 		} else {
@@ -1376,8 +1374,7 @@ xfs_iext_realloc_direct(
 		if (rnew_size != ifp->if_real_bytes) {
 			ifp->if_u1.if_extents =
 				kmem_realloc(ifp->if_u1.if_extents,
-						rnew_size,
-						ifp->if_real_bytes, KM_NOFS);
+						rnew_size, KM_NOFS);
 		}
 		if (rnew_size > ifp->if_real_bytes) {
 			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -1461,9 +1458,8 @@ xfs_iext_realloc_indirect(
 	if (new_size == 0) {
 		xfs_iext_destroy(ifp);
 	} else {
-		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
-			kmem_realloc(ifp->if_u1.if_ext_irec,
-				new_size, size, KM_NOFS);
+		ifp->if_u1.if_ext_irec =
+			kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
 	}
 }
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 396565f43247..bf6e80703613 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans(
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-	ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+	ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
 	memcpy(&ptr[old_len], dp, len);
 	item->ri_buf[item->ri_cnt-1].i_len += len;
 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 536a0ee9cd5a..654799f716fc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -89,7 +89,6 @@ xfs_uuid_mount(
 	if (hole < 0) {
 		xfs_uuid_table = kmem_realloc(xfs_uuid_table,
 			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-			xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
 			KM_SLEEP);
 		hole = xfs_uuid_table_size++;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 6e3e6d55e51774ec7cfc24975749bbddb28a9051 Mon Sep 17 00:00:00 2001
From: Eryu Guan <guaneryu@gmail.com>
Date: Wed, 6 Apr 2016 09:47:21 +1000
Subject: xfs: mute some sparse warnings

These three warnings are fixed:

fs/xfs/xfs_inode.c:1033:44: warning: Using plain integer as NULL pointer
fs/xfs/xfs_inode_item.c:525:20: warning: context imbalance in 'xfs_inode_item_push' - unexpected unlock
fs/xfs/xfs_dquot.c:696:1: warning: symbol 'xfs_dq_get_next_id' was not declared. Should it be static?

Signed-off-by: Eryu Guan <guaneryu@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_dquot.c      | 2 +-
 fs/xfs/xfs_inode.c      | 2 +-
 fs/xfs/xfs_inode_item.c | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 316b2a1bdba5..8f51370c95c4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -692,7 +692,7 @@ error0:
  * end of the chunk, skip ahead to first id in next allocated chunk
  * using the SEEK_DATA interface.
  */
-int
+static int
 xfs_dq_get_next_id(
 	xfs_mount_t		*mp,
 	uint			type,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96f606deee31..1445a99a6868 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1030,7 +1030,7 @@ xfs_dir_ialloc(
 			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
 		}
 
-		code = xfs_trans_roll(&tp, 0);
+		code = xfs_trans_roll(&tp, NULL);
 		if (committed != NULL)
 			*committed = 1;
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c48b5b18d771..d02cbab18f45 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -479,6 +479,8 @@ STATIC uint
 xfs_inode_item_push(
 	struct xfs_log_item	*lip,
 	struct list_head	*buffer_list)
+		__releases(&lip->li_ailp->xa_lock)
+		__acquires(&lip->li_ailp->xa_lock)
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
-- 
cgit v1.2.3-59-g8ed1b


From 9bdd9bd69b826875531bb1b2efb6aeb8d70e6f72 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 18 May 2016 10:56:41 +1000
Subject: xfs: buffer ->bi_end_io function requires irq-safe lock

Reports have surfaced of a lockdep splat complaining about an
irq-safe -> irq-unsafe locking order in the xfs_buf_bio_end_io() bio
completion handler. This only occurs when I/O errors are present
because bp->b_lock is only acquired in this context to protect
setting an error on the buffer. The problem is that this lock can be
acquired with the (request_queue) q->queue_lock held. See
scsi_end_request() or ata_qc_schedule_eh(), for example.

Replace the locked test/set of b_io_error with a cmpxchg() call.
This eliminates the need for the lock and thus the lock ordering
problem goes away.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_buf.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9a2191b91137..e71cfbd5acb3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1100,22 +1100,18 @@ xfs_bwrite(
 	return error;
 }
 
-STATIC void
+static void
 xfs_buf_bio_end_io(
 	struct bio		*bio)
 {
-	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
+	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;
 
 	/*
 	 * don't overwrite existing errors - otherwise we can lose errors on
 	 * buffers that require multiple bios to complete.
 	 */
-	if (bio->bi_error) {
-		spin_lock(&bp->b_lock);
-		if (!bp->b_io_error)
-			bp->b_io_error = bio->bi_error;
-		spin_unlock(&bp->b_lock);
-	}
+	if (bio->bi_error)
+		cmpxchg(&bp->b_io_error, 0, bio->bi_error);
 
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
-- 
cgit v1.2.3-59-g8ed1b


From 192852be8b5fb14268c2133fe9ce5312e4745963 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 18 May 2016 10:58:51 +1000
Subject: xfs: configurable error behavior via sysfs

We need to be able to change the way XFS behaviours in error
conditions depending on the type of underlying storage. This is
necessary for handling non-traditional block devices with extended
error cases, such as thin provisioned devices that can return ENOSPC
as an IO error.

Introduce the basic sysfs infrastructure needed to define and
configure error behaviours. This is done to be generic enough to
extend to configuring behaviour in other error conditions, such as
ENOMEM, which also has different desired behaviours according to
machine configuration.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_mount.c | 10 +++++++++-
 fs/xfs/xfs_mount.h | 20 ++++++++++++++++++++
 fs/xfs/xfs_sysfs.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_sysfs.h |  3 +++
 4 files changed, 84 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 536a0ee9cd5a..677c3e0da472 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -690,10 +690,15 @@ xfs_mountfs(
 	if (error)
 		goto out_remove_sysfs;
 
-	error = xfs_uuid_mount(mp);
+	error = xfs_error_sysfs_init(mp);
 	if (error)
 		goto out_del_stats;
 
+
+	error = xfs_uuid_mount(mp);
+	if (error)
+		goto out_remove_error_sysfs;
+
 	/*
 	 * Set the minimum read and write sizes
 	 */
@@ -968,6 +973,8 @@ xfs_mountfs(
 	xfs_da_unmount(mp);
  out_remove_uuid:
 	xfs_uuid_unmount(mp);
+ out_remove_error_sysfs:
+	xfs_error_sysfs_del(mp);
  out_del_stats:
 	xfs_sysfs_del(&mp->m_stats.xs_kobj);
  out_remove_sysfs:
@@ -1056,6 +1063,7 @@ xfs_unmountfs(
 #endif
 	xfs_free_perag(mp);
 
+	xfs_error_sysfs_del(mp);
 	xfs_sysfs_del(&mp->m_stats.xs_kobj);
 	xfs_sysfs_del(&mp->m_kobj);
 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bac6b3435591..d639795b0310 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -37,6 +37,24 @@ enum {
 	XFS_LOWSP_MAX,
 };
 
+/*
+ * Error Configuration
+ *
+ * Error classes define the subsystem the configuration belongs to.
+ * Error numbers define the errors that are configurable.
+ */
+enum {
+	XFS_ERR_CLASS_MAX,
+};
+enum {
+	XFS_ERR_ERRNO_MAX,
+};
+
+struct xfs_error_cfg {
+	struct xfs_kobj	kobj;
+	int		max_retries;
+};
+
 typedef struct xfs_mount {
 	struct super_block	*m_super;
 	xfs_tid_t		m_tid;		/* next unused tid for fs */
@@ -127,6 +145,8 @@ typedef struct xfs_mount {
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 						/* low free space thresholds */
 	struct xfs_kobj		m_kobj;
+	struct xfs_kobj		m_error_kobj;
+	struct xfs_error_cfg	m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
 	struct xstats		m_stats;	/* per-fs stats */
 
 	struct workqueue_struct *m_buf_workqueue;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 6ced4f143494..74e394071242 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -17,10 +17,11 @@
  */
 
 #include "xfs.h"
-#include "xfs_sysfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_sysfs.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_stats.h"
@@ -362,3 +363,53 @@ struct kobj_type xfs_log_ktype = {
 	.sysfs_ops = &xfs_sysfs_ops,
 	.default_attrs = xfs_log_attrs,
 };
+
+/*
+ * Metadata IO error configuration
+ *
+ * The sysfs structure here is:
+ *	...xfs/<dev>/error/<class>/<errno>/<error_attrs>
+ *
+ * where <class> allows us to discriminate between data IO and metadata IO,
+ * and any other future type of IO (e.g. special inode or directory error
+ * handling) we care to support.
+ */
+static struct attribute *xfs_error_attrs[] = {
+	NULL,
+};
+
+static inline struct xfs_error_cfg *
+to_error_cfg(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+	return container_of(kobj, struct xfs_error_cfg, kobj);
+}
+
+struct kobj_type xfs_error_cfg_ktype = {
+	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_sysfs_ops,
+	.default_attrs = xfs_error_attrs,
+};
+
+struct kobj_type xfs_error_ktype = {
+	.release = xfs_sysfs_release,
+};
+
+int
+xfs_error_sysfs_init(
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	/* .../xfs/<dev>/error/ */
+	error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
+				&mp->m_kobj, "error");
+	return error;
+}
+
+void
+xfs_error_sysfs_del(
+	struct xfs_mount	*mp)
+{
+	xfs_sysfs_del(&mp->m_error_kobj);
+}
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index be692e59938d..d04637181ef2 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -58,4 +58,7 @@ xfs_sysfs_del(
 	wait_for_completion(&kobj->complete);
 }
 
+int	xfs_error_sysfs_init(struct xfs_mount *mp);
+void	xfs_error_sysfs_del(struct xfs_mount *mp);
+
 #endif	/* __XFS_SYSFS_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From ffd40ef697dfd3e06f44b1bb5fea93079de8c77d Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 18 May 2016 11:01:00 +1000
Subject: xfs: introduce metadata IO error class

Now we have the basic infrastructure, add the first error class so
we can build up the infrastructure in a meaningful way. Add the
metadata async write IO error class and sysfs entry, and introduce a
default configuration that matches the existing "retry forever"
behavior for async write metadata buffers.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_mount.h |  3 +++
 fs/xfs/xfs_sysfs.c | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d639795b0310..352a5c88d7e9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -44,9 +44,11 @@ enum {
  * Error numbers define the errors that are configurable.
  */
 enum {
+	XFS_ERR_METADATA,
 	XFS_ERR_CLASS_MAX,
 };
 enum {
+	XFS_ERR_DEFAULT,
 	XFS_ERR_ERRNO_MAX,
 };
 
@@ -146,6 +148,7 @@ typedef struct xfs_mount {
 						/* low free space thresholds */
 	struct xfs_kobj		m_kobj;
 	struct xfs_kobj		m_error_kobj;
+	struct xfs_kobj		m_error_meta_kobj;
 	struct xfs_error_cfg	m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
 	struct xstats		m_stats;	/* per-fs stats */
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 74e394071242..07c95999541e 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -399,11 +399,34 @@ int
 xfs_error_sysfs_init(
 	struct xfs_mount	*mp)
 {
+	struct xfs_error_cfg	*cfg;
 	int			error;
 
 	/* .../xfs/<dev>/error/ */
 	error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
 				&mp->m_kobj, "error");
+	if (error)
+		return error;
+
+	/* .../xfs/<dev>/error/metadata/ */
+	error = xfs_sysfs_init(&mp->m_error_meta_kobj, &xfs_error_ktype,
+				&mp->m_error_kobj, "metadata");
+	if (error)
+		goto out_error;
+
+	cfg = &mp->m_error_cfg[XFS_ERR_METADATA][XFS_ERR_DEFAULT];
+	error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
+				&mp->m_error_meta_kobj, "default");
+	if (error)
+		goto out_error_meta;
+	cfg->max_retries = -1;
+
+	return 0;
+
+out_error_meta:
+	xfs_sysfs_del(&mp->m_error_meta_kobj);
+out_error:
+	xfs_sysfs_del(&mp->m_error_kobj);
 	return error;
 }
 
@@ -411,5 +434,16 @@ void
 xfs_error_sysfs_del(
 	struct xfs_mount	*mp)
 {
+	struct xfs_error_cfg	*cfg;
+	int			i, j;
+
+	for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
+		for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
+			cfg = &mp->m_error_cfg[i][j];
+
+			xfs_sysfs_del(&cfg->kobj);
+		}
+	}
+	xfs_sysfs_del(&mp->m_error_meta_kobj);
 	xfs_sysfs_del(&mp->m_error_kobj);
 }
-- 
cgit v1.2.3-59-g8ed1b


From df3093907ccc718459c54c99da29dd774af41186 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 18 May 2016 11:05:33 +1000
Subject: xfs: add configurable error support to metadata buffers

With the error configuration handle for async metadata write errors
in place, we can now add initial support to the IO error processing
in xfs_buf_iodone_error().

Add an infrastructure function to look up the configuration handle,
and rearrange the error handling to prepare the way for different
error handling conigurations to be used.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_buf.h      |   1 +
 fs/xfs/xfs_buf_item.c | 112 ++++++++++++++++++++++++++++++--------------------
 fs/xfs/xfs_mount.h    |   3 ++
 fs/xfs/xfs_sysfs.c    |  17 ++++++++
 fs/xfs/xfs_trace.h    |   1 -
 5 files changed, 88 insertions(+), 46 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4eb89bd4ee73..adef116db0c3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -183,6 +183,7 @@ typedef struct xfs_buf {
 	unsigned int		b_page_count;	/* size of page array */
 	unsigned int		b_offset;	/* page offset in first page */
 	int			b_error;	/* error code on I/O */
+	int			b_last_error;	/* previous async I/O error */
 	const struct xfs_buf_ops	*b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 99e91a0e554e..b8d0cd4adb81 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1042,35 +1042,22 @@ xfs_buf_do_callbacks(
 	}
 }
 
-/*
- * This is the iodone() function for buffers which have had callbacks
- * attached to them by xfs_buf_attach_iodone().  It should remove each
- * log item from the buffer's list and call the callback of each in turn.
- * When done, the buffer's fsprivate field is set to NULL and the buffer
- * is unlocked with a call to iodone().
- */
-void
-xfs_buf_iodone_callbacks(
+static bool
+xfs_buf_iodone_callback_error(
 	struct xfs_buf		*bp)
 {
 	struct xfs_log_item	*lip = bp->b_fspriv;
 	struct xfs_mount	*mp = lip->li_mountp;
 	static ulong		lasttime;
 	static xfs_buftarg_t	*lasttarg;
-
-	if (likely(!bp->b_error))
-		goto do_callbacks;
+	struct xfs_error_cfg	*cfg;
 
 	/*
 	 * If we've already decided to shutdown the filesystem because of
 	 * I/O errors, there's no point in giving this a retry.
 	 */
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		xfs_buf_stale(bp);
-		bp->b_flags |= XBF_DONE;
-		trace_xfs_buf_item_iodone(bp, _RET_IP_);
-		goto do_callbacks;
-	}
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto out_stale;
 
 	if (bp->b_target != lasttarg ||
 	    time_after(jiffies, (lasttime + 5*HZ))) {
@@ -1079,45 +1066,80 @@ xfs_buf_iodone_callbacks(
 	}
 	lasttarg = bp->b_target;
 
+	/* synchronous writes will have callers process the error */
+	if (!(bp->b_flags & XBF_ASYNC))
+		goto out_stale;
+
+	trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+	ASSERT(bp->b_iodone != NULL);
+
 	/*
 	 * If the write was asynchronous then no one will be looking for the
-	 * error.  Clear the error state and write the buffer out again.
-	 *
-	 * XXX: This helps against transient write errors, but we need to find
-	 * a way to shut the filesystem down if the writes keep failing.
-	 *
-	 * In practice we'll shut the filesystem down soon as non-transient
-	 * errors tend to affect the whole device and a failing log write
-	 * will make us give up.  But we really ought to do better here.
+	 * error.  If this is the first failure of this type, clear the error
+	 * state and write the buffer out again. This means we always retry an
+	 * async write failure at least once, but we also need to set the buffer
+	 * up to behave correctly now for repeated failures.
 	 */
-	if (bp->b_flags & XBF_ASYNC) {
-		ASSERT(bp->b_iodone != NULL);
-
-		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+	if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+	     bp->b_last_error != bp->b_error) {
+		bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
+			        XBF_DONE | XBF_WRITE_FAIL);
+		bp->b_last_error = bp->b_error;
+		xfs_buf_ioerror(bp, 0);
+		xfs_buf_submit(bp);
+		return true;
+	}
 
-		xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
+	/*
+	 * Repeated failure on an async write. Take action according to the
+	 * error configuration we have been set up to use.
+	 */
+	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+	if (!cfg->max_retries)
+		goto permanent_error;
 
-		if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
-			bp->b_flags |= XBF_WRITE | XBF_ASYNC |
-				       XBF_DONE | XBF_WRITE_FAIL;
-			xfs_buf_submit(bp);
-		} else {
-			xfs_buf_relse(bp);
-		}
-
-		return;
-	}
+	/* still a transient error, higher layers will retry */
+	xfs_buf_ioerror(bp, 0);
+	xfs_buf_relse(bp);
+	return true;
 
 	/*
-	 * If the write of the buffer was synchronous, we want to make
-	 * sure to return the error to the caller of xfs_bwrite().
+	 * Permanent error - we need to trigger a shutdown if we haven't already
+	 * to indicate that inconsistency will result from this action.
 	 */
+permanent_error:
+	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+out_stale:
 	xfs_buf_stale(bp);
 	bp->b_flags |= XBF_DONE;
-
 	trace_xfs_buf_error_relse(bp, _RET_IP_);
+	return false;
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks attached
+ * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
+ * callback list, mark the buffer as having no more callbacks and then push the
+ * buffer through IO completion processing.
+ */
+void
+xfs_buf_iodone_callbacks(
+	struct xfs_buf		*bp)
+{
+	/*
+	 * If there is an error, process it. Some errors require us
+	 * to run callbacks after failure processing is done so we
+	 * detect that and take appropriate action.
+	 */
+	if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+		return;
+
+	/*
+	 * Successful IO or permanent error. Either way, we can clear the
+	 * retry state here in preparation for the next error that may occur.
+	 */
+	bp->b_last_error = 0;
 
-do_callbacks:
 	xfs_buf_do_callbacks(bp);
 	bp->b_fspriv = NULL;
 	bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 352a5c88d7e9..0c5a97644d78 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -387,4 +387,7 @@ extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 int	xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
 			xfs_off_t count_fsb);
 
+struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
+		int error_class, int error);
+
 #endif	/* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 07c95999541e..1cb5a85409af 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -447,3 +447,20 @@ xfs_error_sysfs_del(
 	xfs_sysfs_del(&mp->m_error_meta_kobj);
 	xfs_sysfs_del(&mp->m_error_kobj);
 }
+
+struct xfs_error_cfg *
+xfs_error_get_cfg(
+	struct xfs_mount	*mp,
+	int			error_class,
+	int			error)
+{
+	struct xfs_error_cfg	*cfg;
+
+	switch (error) {
+	default:
+		cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
+		break;
+	}
+
+	return cfg;
+}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8d58426008e..a133dd4c43bc 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -364,7 +364,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
 DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
-- 
cgit v1.2.3-59-g8ed1b


From ef6a50fbb1bba7951aa23adcfb40e99ca72dc51c Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 18 May 2016 11:06:44 +1000
Subject: xfs: introduce table-based init for error behaviors

Before we start expanding the number of error classes and errors we
can configure behaviour for, we need a simple and clear way to
define the default behaviour that we initialized each mount with.
Introduce a table based method for keeping the initial configuration
in, and apply that to the existing initialization code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_sysfs.c | 72 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 60 insertions(+), 12 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 1cb5a85409af..71046d904985 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -395,11 +395,67 @@ struct kobj_type xfs_error_ktype = {
 	.release = xfs_sysfs_release,
 };
 
+/*
+ * Error initialization tables. These need to be ordered in the same
+ * order as the enums used to index the array. All class init tables need to
+ * define a "default" behaviour as the first entry, all other entries can be
+ * empty.
+ */
+struct xfs_error_init {
+	char		*name;
+	int		max_retries;
+};
+
+static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
+	{ .name = "default",
+	  .max_retries = -1,
+	},
+};
+
+static int
+xfs_error_sysfs_init_class(
+	struct xfs_mount	*mp,
+	int			class,
+	const char		*parent_name,
+	struct xfs_kobj		*parent_kobj,
+	const struct xfs_error_init init[])
+{
+	struct xfs_error_cfg	*cfg;
+	int			error;
+	int			i;
+
+	ASSERT(class < XFS_ERR_CLASS_MAX);
+
+	error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
+				&mp->m_error_kobj, parent_name);
+	if (error)
+		return error;
+
+	for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
+		cfg = &mp->m_error_cfg[class][i];
+		error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
+					parent_kobj, init[i].name);
+		if (error)
+			goto out_error;
+
+		cfg->max_retries = init[i].max_retries;
+	}
+	return 0;
+
+out_error:
+	/* unwind the entries that succeeded */
+	for (i--; i >= 0; i--) {
+		cfg = &mp->m_error_cfg[class][i];
+		xfs_sysfs_del(&cfg->kobj);
+	}
+	xfs_sysfs_del(parent_kobj);
+	return error;
+}
+
 int
 xfs_error_sysfs_init(
 	struct xfs_mount	*mp)
 {
-	struct xfs_error_cfg	*cfg;
 	int			error;
 
 	/* .../xfs/<dev>/error/ */
@@ -409,22 +465,14 @@ xfs_error_sysfs_init(
 		return error;
 
 	/* .../xfs/<dev>/error/metadata/ */
-	error = xfs_sysfs_init(&mp->m_error_meta_kobj, &xfs_error_ktype,
-				&mp->m_error_kobj, "metadata");
+	error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
+				"metadata", &mp->m_error_meta_kobj,
+				xfs_error_meta_init);
 	if (error)
 		goto out_error;
 
-	cfg = &mp->m_error_cfg[XFS_ERR_METADATA][XFS_ERR_DEFAULT];
-	error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
-				&mp->m_error_meta_kobj, "default");
-	if (error)
-		goto out_error_meta;
-	cfg->max_retries = -1;
-
 	return 0;
 
-out_error_meta:
-	xfs_sysfs_del(&mp->m_error_meta_kobj);
 out_error:
 	xfs_sysfs_del(&mp->m_error_kobj);
 	return error;
-- 
cgit v1.2.3-59-g8ed1b


From a5ea70d25d76950e11690110b526374307d05d81 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 18 May 2016 11:08:15 +1000
Subject: xfs: add configuration of error failure speed

On reception of an error, we can fail immediately, perform some
bound amount of retries or retry indefinitely. The current behaviour
we have is to retry forever.

However, we'd like the ability to choose how long the filesystem
should try after an error, it can either fail immediately, retry a
few times, or retry forever. This is implemented by using
max_retries sysfs attribute, to hold the amount of times we allow
the filesystem to retry after an error. Being -1 a special case
where the filesystem will retry indefinitely.

Add both a maximum retry count and a retry timeout so that we can
bound by time and/or physical IO attempts.

Finally, plumb these into xfs_buf_iodone error processing so that
the error behaviour follows the selected configuration.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_buf.h      | 21 ++++++++++++-
 fs/xfs/xfs_buf_item.c | 13 +++++++--
 fs/xfs/xfs_mount.h    |  3 ++
 fs/xfs/xfs_sysfs.c    | 81 ++++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 111 insertions(+), 7 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index adef116db0c3..8bfb974f0772 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -183,7 +183,26 @@ typedef struct xfs_buf {
 	unsigned int		b_page_count;	/* size of page array */
 	unsigned int		b_offset;	/* page offset in first page */
 	int			b_error;	/* error code on I/O */
-	int			b_last_error;	/* previous async I/O error */
+
+	/*
+	 * async write failure retry count. Initialised to zero on the first
+	 * failure, then when it exceeds the maximum configured without a
+	 * success the write is considered to be failed permanently and the
+	 * iodone handler will take appropriate action.
+	 *
+	 * For retry timeouts, we record the jiffie of the first failure. This
+	 * means that we can change the retry timeout for buffers already under
+	 * I/O and thus avoid getting stuck in a retry loop with a long timeout.
+	 *
+	 * last_error is used to ensure that we are getting repeated errors, not
+	 * different errors. e.g. a block device might change ENOSPC to EIO when
+	 * a failure timeout occurs, so we want to re-initialise the error
+	 * retry behaviour appropriately when that happens.
+	 */
+	int			b_retries;
+	unsigned long		b_first_retry_time; /* in jiffies */
+	int			b_last_error;
+
 	const struct xfs_buf_ops	*b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index b8d0cd4adb81..0d95c59f7c68 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1085,6 +1085,9 @@ xfs_buf_iodone_callback_error(
 		bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
 			        XBF_DONE | XBF_WRITE_FAIL);
 		bp->b_last_error = bp->b_error;
+		bp->b_retries = 0;
+		bp->b_first_retry_time = jiffies;
+
 		xfs_buf_ioerror(bp, 0);
 		xfs_buf_submit(bp);
 		return true;
@@ -1095,8 +1098,13 @@ xfs_buf_iodone_callback_error(
 	 * error configuration we have been set up to use.
 	 */
 	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
-	if (!cfg->max_retries)
-		goto permanent_error;
+
+	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+	    ++bp->b_retries > cfg->max_retries)
+			goto permanent_error;
+	if (cfg->retry_timeout &&
+	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+			goto permanent_error;
 
 	/* still a transient error, higher layers will retry */
 	xfs_buf_ioerror(bp, 0);
@@ -1139,6 +1147,7 @@ xfs_buf_iodone_callbacks(
 	 * retry state here in preparation for the next error that may occur.
 	 */
 	bp->b_last_error = 0;
+	bp->b_retries = 0;
 
 	xfs_buf_do_callbacks(bp);
 	bp->b_fspriv = NULL;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0c5a97644d78..2fafa9438bcf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -52,9 +52,12 @@ enum {
 	XFS_ERR_ERRNO_MAX,
 };
 
+#define XFS_ERR_RETRY_FOREVER	-1
+
 struct xfs_error_cfg {
 	struct xfs_kobj	kobj;
 	int		max_retries;
+	unsigned long	retry_timeout;	/* in jiffies, 0 = no timeout */
 };
 
 typedef struct xfs_mount {
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 71046d904985..918d144febd9 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -374,10 +374,6 @@ struct kobj_type xfs_log_ktype = {
  * and any other future type of IO (e.g. special inode or directory error
  * handling) we care to support.
  */
-static struct attribute *xfs_error_attrs[] = {
-	NULL,
-};
-
 static inline struct xfs_error_cfg *
 to_error_cfg(struct kobject *kobject)
 {
@@ -385,6 +381,79 @@ to_error_cfg(struct kobject *kobject)
 	return container_of(kobj, struct xfs_error_cfg, kobj);
 }
 
+static ssize_t
+max_retries_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+}
+
+static ssize_t
+max_retries_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < -1)
+		return -EINVAL;
+
+	cfg->max_retries = val;
+	return count;
+}
+XFS_SYSFS_ATTR_RW(max_retries);
+
+static ssize_t
+retry_timeout_seconds_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+	return snprintf(buf, PAGE_SIZE, "%ld\n",
+			jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+}
+
+static ssize_t
+retry_timeout_seconds_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	/* 1 day timeout maximum */
+	if (val < 0 || val > 86400)
+		return -EINVAL;
+
+	cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+	return count;
+}
+XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
+
+static struct attribute *xfs_error_attrs[] = {
+	ATTR_LIST(max_retries),
+	ATTR_LIST(retry_timeout_seconds),
+	NULL,
+};
+
+
 struct kobj_type xfs_error_cfg_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
@@ -404,11 +473,13 @@ struct kobj_type xfs_error_ktype = {
 struct xfs_error_init {
 	char		*name;
 	int		max_retries;
+	int		retry_timeout;	/* in seconds */
 };
 
 static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
 	{ .name = "default",
 	  .max_retries = -1,
+	  .retry_timeout = 0,
 	},
 };
 
@@ -439,6 +510,8 @@ xfs_error_sysfs_init_class(
 			goto out_error;
 
 		cfg->max_retries = init[i].max_retries;
+		cfg->retry_timeout = msecs_to_jiffies(
+					init[i].retry_timeout * MSEC_PER_SEC);
 	}
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From e0a431b3a3cc3d0a4c38ccfca8c7320fde40efb6 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 18 May 2016 11:09:28 +1000
Subject: xfs: add configuration handlers for specific errors

now most of the infrastructure is in place, we can start adding
support for configuring specific errors such as ENODEV, ENOSPC, EIO,
etc. Add these error configurations and configure them all to have
appropriate behaviours. That is, all will be configured to retry
forever by default, except for ENODEV, which is an unrecoverable
error, so it will be configured to not retry on error

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_mount.h |  3 +++
 fs/xfs/xfs_sysfs.c | 22 +++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 2fafa9438bcf..72ec3e3c988e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -49,6 +49,9 @@ enum {
 };
 enum {
 	XFS_ERR_DEFAULT,
+	XFS_ERR_EIO,
+	XFS_ERR_ENOSPC,
+	XFS_ERR_ENODEV,
 	XFS_ERR_ERRNO_MAX,
 };
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 918d144febd9..084a606840a1 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -478,9 +478,20 @@ struct xfs_error_init {
 
 static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
 	{ .name = "default",
-	  .max_retries = -1,
+	  .max_retries = XFS_ERR_RETRY_FOREVER,
 	  .retry_timeout = 0,
 	},
+	{ .name = "EIO",
+	  .max_retries = XFS_ERR_RETRY_FOREVER,
+	  .retry_timeout = 0,
+	},
+	{ .name = "ENOSPC",
+	  .max_retries = XFS_ERR_RETRY_FOREVER,
+	  .retry_timeout = 0,
+	},
+	{ .name = "ENODEV",
+	  .max_retries = 0,
+	},
 };
 
 static int
@@ -578,6 +589,15 @@ xfs_error_get_cfg(
 	struct xfs_error_cfg	*cfg;
 
 	switch (error) {
+	case EIO:
+		cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
+		break;
+	case ENOSPC:
+		cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC];
+		break;
+	case ENODEV:
+		cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV];
+		break;
 	default:
 		cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From e6b3bb78962e65c4ad125598755cfbf2a8779e86 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Wed, 18 May 2016 11:11:27 +1000
Subject: xfs: add "fail at unmount" error handling configuration

If we take "retry forever" literally on metadata IO errors, we can
hang at unmount, once it retries those writes forever. This is the
default behavior, unfortunately.

Add an error configuration option for this behavior and default it
to "fail" so that an unmount will trigger actuall errors, a shutdown
and allow the unmount to succeed. It will be noisy, though, as it
will log the errors and shutdown that occurs.

To fix this, we need to mark the filesystem as being in the process
of unmounting. Do this with a mount flag that is added at the
appropriate time (i.e. before the blocking AIL sync). We also need
to add this flag if mount fails after the initial phase of log
recovery has been run.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_buf_item.c |  4 ++++
 fs/xfs/xfs_mount.c    | 12 ++++++++++++
 fs/xfs/xfs_mount.h    |  2 ++
 fs/xfs/xfs_sysfs.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 0d95c59f7c68..34257992934c 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1106,6 +1106,10 @@ xfs_buf_iodone_callback_error(
 	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
 			goto permanent_error;
 
+	/* At unmount we may treat errors differently */
+	if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+		goto permanent_error;
+
 	/* still a transient error, higher layers will retry */
 	xfs_buf_ioerror(bp, 0);
 	xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 677c3e0da472..7c05a22c1a73 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -681,6 +681,9 @@ xfs_mountfs(
 
 	xfs_set_maxicount(mp);
 
+	/* enable fail_at_unmount as default */
+	mp->m_fail_unmount = 1;
+
 	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
 	if (error)
 		goto out;
@@ -962,6 +965,7 @@ xfs_mountfs(
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
  out_log_dealloc:
+	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
 	xfs_log_mount_cancel(mp);
  out_fail_wait:
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -1012,6 +1016,14 @@ xfs_unmountfs(
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
 
+	/*
+	 * We now need to tell the world we are unmounting. This will allow
+	 * us to detect that the filesystem is going away and we should error
+	 * out anything that we have been retrying in the background. This will
+	 * prevent neverending retries in AIL pushing from hanging the unmount.
+	 */
+	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
 	/*
 	 * Flush all pending changes from the AIL.
 	 */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 72ec3e3c988e..9063a9c7b2fe 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -177,6 +177,7 @@ typedef struct xfs_mount {
 	 */
 	__uint32_t		m_generation;
 
+	bool			m_fail_unmount;
 #ifdef DEBUG
 	/*
 	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
@@ -195,6 +196,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
 						   must be synchronous except
 						   for space allocations */
+#define XFS_MOUNT_UNMOUNTING	(1ULL << 1)	/* filesystem is unmounting */
 #define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
 						   operations, typically for
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 084a606840a1..4c2c55086208 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -381,6 +381,13 @@ to_error_cfg(struct kobject *kobject)
 	return container_of(kobj, struct xfs_error_cfg, kobj);
 }
 
+static inline struct xfs_mount *
+err_to_mp(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+	return container_of(kobj, struct xfs_mount, m_error_kobj);
+}
+
 static ssize_t
 max_retries_show(
 	struct kobject	*kobject,
@@ -447,6 +454,38 @@ retry_timeout_seconds_store(
 }
 XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
 
+static ssize_t
+fail_at_unmount_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	struct xfs_mount	*mp = err_to_mp(kobject);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+}
+
+static ssize_t
+fail_at_unmount_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	struct xfs_mount	*mp = err_to_mp(kobject);
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < 0 || val > 1)
+		return -EINVAL;
+
+	mp->m_fail_unmount = val;
+	return count;
+}
+XFS_SYSFS_ATTR_RW(fail_at_unmount);
+
 static struct attribute *xfs_error_attrs[] = {
 	ATTR_LIST(max_retries),
 	ATTR_LIST(retry_timeout_seconds),
@@ -462,6 +501,7 @@ struct kobj_type xfs_error_cfg_ktype = {
 
 struct kobj_type xfs_error_ktype = {
 	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_sysfs_ops,
 };
 
 /*
@@ -548,6 +588,12 @@ xfs_error_sysfs_init(
 	if (error)
 		return error;
 
+	error = sysfs_create_file(&mp->m_error_kobj.kobject,
+				  ATTR_LIST(fail_at_unmount));
+
+	if (error)
+		goto out_error;
+
 	/* .../xfs/<dev>/error/metadata/ */
 	error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
 				"metadata", &mp->m_error_meta_kobj,
-- 
cgit v1.2.3-59-g8ed1b


From 8179c03629de67f515d3ab825b5a9428687d4b85 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 13:52:42 +1000
Subject: xfs: remove xfs_fs_evict_inode()

Joe Lawrence reported a list_add corruption with 4.6-rc1 when
testing some custom md administration code that made it's own
block device nodes for the md array. The simple test loop of:

for i in {0..100}; do
	mknod --mode=0600 $tmp/tmp_node b $MAJOR $MINOR
	mdadm --detail --export $tmp/tmp_node > /dev/null
	rm -f $tmp/tmp_node
done


Would produce this warning in bd_acquire() when mdadm opened the
device node:

list_add double add: new=ffff88043831c7b8, prev=ffff8804380287d8, next=ffff88043831c7b8.

And then produce this from bd_forget from kdevtmpfs evicting a block
dev inode:

list_del corruption. prev->next should be ffff8800bb83eb10, but was ffff88043831c7b8

This is a regression caused by commit c19b3b05 ("xfs: mode di_mode
to vfs inode"). The issue is that xfs_inactive() frees the
unlinked inode, and the above commit meant that this freeing zeroed
the mode in the struct inode. The problem is that after evict() has
called ->evict_inode, it expects the i_mode to be intact so that it
can call bd_forget() or cd_forget() to drop the reference to the
block device inode attached to the XFS inode.

In reality, the only thing we do in xfs_fs_evict_inode() that is not
generic is call xfs_inactive(). We can move the xfs_inactive() call
to xfs_fs_destroy_inode() without any problems at all, and this
will leave the VFS inode intact until it is completely done with it.

So, remove xfs_fs_evict_inode(), and do the work it used to do in
->destroy_inode instead.

cc: <stable@vger.kernel.org> # 4.6
Reported-by: Joe Lawrence <joe.lawrence@stratus.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_super.c | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d760934109b5..3c39f3ad2e32 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -928,7 +928,7 @@ xfs_fs_alloc_inode(
 
 /*
  * Now that the generic code is guaranteed not to be accessing
- * the linux inode, we can reclaim the inode.
+ * the linux inode, we can inactivate and reclaim the inode.
  */
 STATIC void
 xfs_fs_destroy_inode(
@@ -938,9 +938,14 @@ xfs_fs_destroy_inode(
 
 	trace_xfs_destroy_inode(ip);
 
-	XFS_STATS_INC(ip->i_mount, vn_reclaim);
+	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+	XFS_STATS_INC(ip->i_mount, vn_rele);
+	XFS_STATS_INC(ip->i_mount, vn_remove);
+
+	xfs_inactive(ip);
 
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+	XFS_STATS_INC(ip->i_mount, vn_reclaim);
 
 	/*
 	 * We should never get here with one of the reclaim flags already set.
@@ -987,24 +992,6 @@ xfs_fs_inode_init_once(
 		     "xfsino", ip->i_ino);
 }
 
-STATIC void
-xfs_fs_evict_inode(
-	struct inode		*inode)
-{
-	xfs_inode_t		*ip = XFS_I(inode);
-
-	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-
-	trace_xfs_evict_inode(ip);
-
-	truncate_inode_pages_final(&inode->i_data);
-	clear_inode(inode);
-	XFS_STATS_INC(ip->i_mount, vn_rele);
-	XFS_STATS_INC(ip->i_mount, vn_remove);
-
-	xfs_inactive(ip);
-}
-
 /*
  * We do an unlocked check for XFS_IDONTCACHE here because we are already
  * serialised against cache hits here via the inode->i_lock and igrab() in
@@ -1663,7 +1650,6 @@ xfs_fs_free_cached_objects(
 static const struct super_operations xfs_super_operations = {
 	.alloc_inode		= xfs_fs_alloc_inode,
 	.destroy_inode		= xfs_fs_destroy_inode,
-	.evict_inode		= xfs_fs_evict_inode,
 	.drop_inode		= xfs_fs_drop_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
-- 
cgit v1.2.3-59-g8ed1b


From b1438f477934f5a4d5a44df26f3079a7575d5946 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 13:53:42 +1000
Subject: xfs: xfs_iflush_cluster fails to abort on error

When a failure due to an inode buffer occurs, the error handling
fails to abort the inode writeback correctly. This can result in the
inode being reclaimed whilst still in the AIL, leading to
use-after-free situations as well as filesystems that cannot be
unmounted as the inode log items left in the AIL never get removed.

Fix this by ensuring fatal errors from xfs_imap_to_bp() result in
the inode flush being aborted correctly.

cc: <stable@vger.kernel.org> # 3.10.x-
Reported-by: Shyam Kaushik <shyam@zadarastorage.com>
Diagnosed-by: Shyam Kaushik <shyam@zadarastorage.com>
Tested-by: Shyam Kaushik <shyam@zadarastorage.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96f606deee31..696936cad0fa 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3327,7 +3327,7 @@ xfs_iflush(
 	struct xfs_buf		**bpp)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_buf		*bp;
+	struct xfs_buf		*bp = NULL;
 	struct xfs_dinode	*dip;
 	int			error;
 
@@ -3369,14 +3369,22 @@ xfs_iflush(
 	}
 
 	/*
-	 * Get the buffer containing the on-disk inode.
+	 * Get the buffer containing the on-disk inode. We are doing a try-lock
+	 * operation here, so we may get  an EAGAIN error. In that case, we
+	 * simply want to return with the inode still dirty.
+	 *
+	 * If we get any other error, we effectively have a corruption situation
+	 * and we cannot flush the inode, so we treat it the same as failing
+	 * xfs_iflush_int().
 	 */
 	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
 			       0);
-	if (error || !bp) {
+	if (error == -EAGAIN) {
 		xfs_ifunlock(ip);
 		return error;
 	}
+	if (error)
+		goto corrupt_out;
 
 	/*
 	 * First flush out the inode that xfs_iflush was called with.
@@ -3404,7 +3412,8 @@ xfs_iflush(
 	return 0;
 
 corrupt_out:
-	xfs_buf_relse(bp);
+	if (bp)
+		xfs_buf_relse(bp);
 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 cluster_corrupt_out:
 	error = -EFSCORRUPTED;
-- 
cgit v1.2.3-59-g8ed1b


From 51b07f30a71c27405259a0248206ed4e22adbee2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 13:54:22 +1000
Subject: xfs: fix inode validity check in xfs_iflush_cluster

Some careless idiot(*) wrote crap code in commit 1a3e8f3 ("xfs:
convert inode cache lookups to use RCU locking") back in late 2010,
and so xfs_iflush_cluster checks the wrong inode for whether it is
still valid under RCU protection. Fix it to lock and check the
correct inode.

(*) Careless-idiot: Dave Chinner <dchinner@redhat.com>

cc: <stable@vger.kernel.org> # 3.10.x-
Discovered-by: Brain Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 696936cad0fa..a955b0212453 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3205,13 +3205,13 @@ xfs_iflush_cluster(
 		 * We need to check under the i_flags_lock for a valid inode
 		 * here. Skip it if it is not valid or the wrong inode.
 		 */
-		spin_lock(&ip->i_flags_lock);
-		if (!ip->i_ino ||
+		spin_lock(&iq->i_flags_lock);
+		if (!iq->i_ino ||
 		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
-			spin_unlock(&ip->i_flags_lock);
+			spin_unlock(&iq->i_flags_lock);
 			continue;
 		}
-		spin_unlock(&ip->i_flags_lock);
+		spin_unlock(&iq->i_flags_lock);
 
 		/*
 		 * Do an un-protected check to see if the inode is dirty and
-- 
cgit v1.2.3-59-g8ed1b


From 7d3aa7fe970791f1a674b14572a411accf2f4d4e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 13:54:23 +1000
Subject: xfs: skip stale inodes in xfs_iflush_cluster

We don't write back stale inodes so we should skip them in
xfs_iflush_cluster, too.

cc: <stable@vger.kernel.org> # 3.10.x-
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a955b0212453..3cbc9031731b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3207,6 +3207,7 @@ xfs_iflush_cluster(
 		 */
 		spin_lock(&iq->i_flags_lock);
 		if (!iq->i_ino ||
+		    __xfs_iflags_test(iq, XFS_ISTALE) ||
 		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
 			spin_unlock(&iq->i_flags_lock);
 			continue;
-- 
cgit v1.2.3-59-g8ed1b


From 32b43ab6fb983e5a117048443e628c235cd2c5bd Mon Sep 17 00:00:00 2001
From: Alex Lyakas <alex@zadarastorage.com>
Date: Wed, 18 May 2016 14:01:52 +1000
Subject: xfs: optimise xfs_iext_destroy

When unmounting XFS, we call:

xfs_inode_free => xfs_idestroy_fork => xfs_iext_destroy

This goes over the whole indirection array and calls
xfs_iext_irec_remove for each one of the erps (from the last one to
the first one). As a result, we keep shrinking (reallocating
actually) the indirection array until we shrink out all of its
elements. When we have files with huge numbers of extents, umount
takes 30-80 sec, depending on the amount of files that XFS loaded
and the amount of indirection entries of each file. The unmount
stack looks like:

[<ffffffffc0b6d200>] xfs_iext_realloc_indirect+0x40/0x60 [xfs]
[<ffffffffc0b6cd8e>] xfs_iext_irec_remove+0xee/0xf0 [xfs]
[<ffffffffc0b6cdcd>] xfs_iext_destroy+0x3d/0xb0 [xfs]
[<ffffffffc0b6cef6>] xfs_idestroy_fork+0xb6/0xf0 [xfs]
[<ffffffffc0b87002>] xfs_inode_free+0xb2/0xc0 [xfs]
[<ffffffffc0b87260>] xfs_reclaim_inode+0x250/0x340 [xfs]
[<ffffffffc0b87583>] xfs_reclaim_inodes_ag+0x233/0x370 [xfs]
[<ffffffffc0b8823d>] xfs_reclaim_inodes+0x1d/0x20 [xfs]
[<ffffffffc0b96feb>] xfs_unmountfs+0x7b/0x1a0 [xfs]
[<ffffffffc0b98e4d>] xfs_fs_put_super+0x2d/0x70 [xfs]
[<ffffffff811e9e36>] generic_shutdown_super+0x76/0x100
[<ffffffff811ea207>] kill_block_super+0x27/0x70
[<ffffffff811ea519>] deactivate_locked_super+0x49/0x60
[<ffffffff811eaaee>] deactivate_super+0x4e/0x70
[<ffffffff81207593>] cleanup_mnt+0x43/0x90
[<ffffffff81207632>] __cleanup_mnt+0x12/0x20
[<ffffffff8108f8e7>] task_work_run+0xa7/0xe0
[<ffffffff81014ff7>] do_notify_resume+0x97/0xb0
[<ffffffff81717c6f>] int_signal+0x12/0x17

Further, this reallocation prevents us from freeing the extent list
from a RCU callback as allocation can block. Hence if the extent
list is in indirect format, optimise the freeing of the extent list
to only use kmem_free calls by freeing entire extent buffer pages at
a time, rather than extent by extent.

[dchinner: simplified freeing loop based on Christoph's suggestion]

Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_inode_fork.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 11faf7df14c8..8fe6617a8653 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -1496,6 +1496,24 @@ xfs_iext_indirect_to_direct(
 	}
 }
 
+/*
+ * Remove all records from the indirection array.
+ */
+STATIC void
+xfs_iext_irec_remove_all(
+	struct xfs_ifork *ifp)
+{
+	int		nlists;
+	int		i;
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	for (i = 0; i < nlists; i++)
+		kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
+	kmem_free(ifp->if_u1.if_ext_irec);
+	ifp->if_flags &= ~XFS_IFEXTIREC;
+}
+
 /*
  * Free incore file extents.
  */
@@ -1504,14 +1522,7 @@ xfs_iext_destroy(
 	xfs_ifork_t	*ifp)		/* inode fork pointer */
 {
 	if (ifp->if_flags & XFS_IFEXTIREC) {
-		int	erp_idx;
-		int	nlists;
-
-		nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-		for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-			xfs_iext_irec_remove(ifp, erp_idx);
-		}
-		ifp->if_flags &= ~XFS_IFEXTIREC;
+		xfs_iext_irec_remove_all(ifp);
 	} else if (ifp->if_real_bytes) {
 		kmem_free(ifp->if_u1.if_extents);
 	} else if (ifp->if_bytes) {
-- 
cgit v1.2.3-59-g8ed1b


From 1f2dcfe89edac4e3bf5b76c56f745191f921fd2a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 14:01:53 +1000
Subject: xfs: xfs_inode_free() isn't RCU safe

The xfs_inode freed in xfs_inode_free() has multiple allocated
structures attached to it. We free these in xfs_inode_free() before
we mark the inode as invalid, and before we run call_rcu() to queue
the structure for freeing.

Unfortunately, this freeing can race with other accesses that are in
the RCU current grace period that have found the inode in the radix
tree with a valid state.  This includes xfs_iflush_cluster(), which
calls xfs_inode_clean(), and that accesses the inode log item on the
xfs_inode.

The log item structure is freed in xfs_inode_free(), so there is the
possibility we can be accessing freed memory in xfs_iflush_cluster()
after validating the xfs_inode structure as being valid for this RCU
context. Hence we can get spuriously incorrect clean state returned
from such checks. This can lead to use thinking the inode is dirty
when it is, in fact, clean, and so incorrectly attaching it to the
buffer for IO and completion processing.

This then leads to use-after-free situations on the xfs_inode itself
if the IO completes after the current RCU grace period expires. The
buffer callbacks will access the xfs_inode and try to do all sorts
of things it shouldn't with freed memory.

IOWs, xfs_iflush_cluster() only works correctly when racing with
inode reclaim if the inode log item is present and correctly stating
the inode is clean. If the inode is being freed, then reclaim has
already made sure the inode is clean, and hence xfs_iflush_cluster
can skip it. However, we are accessing the inode inode under RCU
read lock protection and so also must ensure that all dynamically
allocated memory we reference in this context is not freed until the
RCU grace period expires.

To fix this, move all the potential memory freeing into
xfs_inode_free_callback() so that we are guarantee RCU protected
lookup code will always have the memory structures it needs
available during the RCU grace period that lookup races can occur
in.

Discovered-by: Brain Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_icache.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bf2d60749278..0c94cde41016 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -94,13 +94,6 @@ xfs_inode_free_callback(
 	struct inode		*inode = container_of(head, struct inode, i_rcu);
 	struct xfs_inode	*ip = XFS_I(inode);
 
-	kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-	struct xfs_inode	*ip)
-{
 	switch (VFS_I(ip)->i_mode & S_IFMT) {
 	case S_IFREG:
 	case S_IFDIR:
@@ -118,6 +111,13 @@ xfs_inode_free(
 		ip->i_itemp = NULL;
 	}
 
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always
 	 * appears to be reclaimed with an invalid inode number when in the
-- 
cgit v1.2.3-59-g8ed1b


From 8a17d7ddedb4d9031f046ae0e97c40b46aa69db5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 14:09:12 +1000
Subject: xfs: mark reclaimed inodes invalid earlier

The last thing we do before using call_rcu() on an xfs_inode to be
freed is mark it as invalid. This means there is a window between
when we know for certain that the inode is going to be freed and
when we do actually mark it as "freed".

This is important in the context of RCU lookups - we can look up the
inode, find that it is valid, and then use it as such not realising
that it is in the final stages of being freed.

As such, mark the inode as being invalid the moment we know it is
going to be reclaimed. This can be done while we still hold the
XFS_ILOCK_EXCL and the flush lock in xfs_inode_reclaim, meaning that
it occurs well before we remove it from the radix tree, and that
the i_flags_lock, the XFS_ILOCK and the inode flush lock all act as
synchronisation points for detecting that an inode is about to go
away.

For defensive purposes, this allows us to add a further check to
xfs_iflush_cluster to ensure we skip inodes that are being freed
after we grab the XFS_ILOCK_SHARED and the flush lock - we know that
if the inode number if valid while we have these locks held we know
that it has not progressed through reclaim to the point where it is
clean and is about to be freed.

[bfoster: fixed __xfs_inode_clear_reclaim() using ip->i_ino after it
	  had already been zeroed.]

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_icache.c | 46 ++++++++++++++++++++++++++++++++++------------
 fs/xfs/xfs_inode.c  | 13 +++++++++++++
 2 files changed, 47 insertions(+), 12 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0c94cde41016..57fcd5917a66 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -114,6 +114,18 @@ xfs_inode_free_callback(
 	kmem_zone_free(xfs_inode_zone, ip);
 }
 
+static void
+__xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!xfs_isiflocked(ip));
+	XFS_STATS_DEC(ip->i_mount, vn_active);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
 void
 xfs_inode_free(
 	struct xfs_inode	*ip)
@@ -129,12 +141,7 @@ xfs_inode_free(
 	ip->i_ino = 0;
 	spin_unlock(&ip->i_flags_lock);
 
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!xfs_isiflocked(ip));
-	XFS_STATS_DEC(ip->i_mount, vn_active);
-
-	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+	__xfs_inode_free(ip);
 }
 
 /*
@@ -772,8 +779,7 @@ __xfs_inode_set_reclaim_tag(
 	if (!pag->pag_ici_reclaimable) {
 		/* propagate the reclaim tag up into the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree, pag->pag_agno,
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
@@ -817,8 +823,7 @@ __xfs_inode_clear_reclaim(
 	if (!pag->pag_ici_reclaimable) {
 		/* clear the reclaim tag from the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree, pag->pag_agno,
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
 		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
 	int			sync_mode)
 {
 	struct xfs_buf		*bp = NULL;
+	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
 	int			error;
 
 restart:
@@ -993,6 +999,22 @@ restart:
 
 	xfs_iflock(ip);
 reclaim:
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always appears
+	 * to be reclaimed with an invalid inode number when in the free state.
+	 * We do this as early as possible under the ILOCK and flush lock so
+	 * that xfs_iflush_cluster() can be guaranteed to detect races with us
+	 * here. By doing this, we guarantee that once xfs_iflush_cluster has
+	 * locked both the XFS_ILOCK and the flush lock that it will see either
+	 * a valid, flushable inode that will serialise correctly against the
+	 * locks below, or it will see a clean (and invalid) inode that it can
+	 * skip.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
 	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -1006,7 +1028,7 @@ reclaim:
 	 */
 	spin_lock(&pag->pag_ici_lock);
 	if (!radix_tree_delete(&pag->pag_ici_root,
-				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+				XFS_INO_TO_AGINO(ip->i_mount, ino)))
 		ASSERT(0);
 	__xfs_inode_clear_reclaim(pag, ip);
 	spin_unlock(&pag->pag_ici_lock);
@@ -1023,7 +1045,7 @@ reclaim:
 	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-	xfs_inode_free(ip);
+	__xfs_inode_free(ip);
 	return error;
 
 out_ifunlock:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3cbc9031731b..e3b27982b3b2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3239,6 +3239,19 @@ xfs_iflush_cluster(
 			continue;
 		}
 
+
+		/*
+		 * Check the inode number again, just to be certain we are not
+		 * racing with freeing in xfs_reclaim_inode(). See the comments
+		 * in that function for more information as to why the initial
+		 * check is not sufficient.
+		 */
+		if (!iq->i_ino) {
+			xfs_ifunlock(iq);
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+
 		/*
 		 * arriving here means that this inode can be flushed.  First
 		 * re-check that it's dirty before flushing.
-- 
cgit v1.2.3-59-g8ed1b


From 5a90e53e8124d3ebe4b2a6309fa3c3225c23a62a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 14:09:13 +1000
Subject: xfs: xfs_iflush_cluster has range issues

xfs_iflush_cluster() does a gang lookup on the radix tree, meaning
it can find inodes beyond the current cluster if there is sparse
cache population. gang lookups return results in ascending index
order, so stop trying to cluster inodes once the first inode outside
the cluster mask is detected.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e3b27982b3b2..6c746d79925e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3207,11 +3207,20 @@ xfs_iflush_cluster(
 		 */
 		spin_lock(&iq->i_flags_lock);
 		if (!iq->i_ino ||
-		    __xfs_iflags_test(iq, XFS_ISTALE) ||
-		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+		    __xfs_iflags_test(iq, XFS_ISTALE)) {
 			spin_unlock(&iq->i_flags_lock);
 			continue;
 		}
+
+		/*
+		 * Once we fall off the end of the cluster, no point checking
+		 * any more inodes in the list because they will also all be
+		 * outside the cluster.
+		 */
+		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+			spin_unlock(&iq->i_flags_lock);
+			break;
+		}
 		spin_unlock(&iq->i_flags_lock);
 
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From 194293631d009254348f43710a7673bbb84a4172 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 14:09:46 +1000
Subject: xfs: rename variables in xfs_iflush_cluster for clarity

The cluster inode variable uses unconventional naming - iq - which
makes it hard to distinguish it between the inode passed into the
function - ip - and that is a vector for mistakes to be made.
Rename all the cluster inode variables to use a more conventional
prefixes to reduce potential future confusion (cilist, cilist_size,
cip).

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c | 74 +++++++++++++++++++++++++++---------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6c746d79925e..54cc7b05708f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3162,16 +3162,16 @@ out_trans_cancel:
 
 STATIC int
 xfs_iflush_cluster(
-	xfs_inode_t	*ip,
-	xfs_buf_t	*bp)
+	struct xfs_inode	*ip,
+	struct xfs_buf		*bp)
 {
-	xfs_mount_t		*mp = ip->i_mount;
+	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_perag	*pag;
 	unsigned long		first_index, mask;
 	unsigned long		inodes_per_cluster;
-	int			ilist_size;
-	xfs_inode_t		**ilist;
-	xfs_inode_t		*iq;
+	int			cilist_size;
+	struct xfs_inode	**cilist;
+	struct xfs_inode	*cip;
 	int			nr_found;
 	int			clcount = 0;
 	int			bufwasdelwri;
@@ -3180,23 +3180,23 @@ xfs_iflush_cluster(
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 
 	inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
-	ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
-	if (!ilist)
+	cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+	cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
+	if (!cilist)
 		goto out_put;
 
 	mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
 	rcu_read_lock();
 	/* really need a gang lookup range call here */
-	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
 					first_index, inodes_per_cluster);
 	if (nr_found == 0)
 		goto out_free;
 
 	for (i = 0; i < nr_found; i++) {
-		iq = ilist[i];
-		if (iq == ip)
+		cip = cilist[i];
+		if (cip == ip)
 			continue;
 
 		/*
@@ -3205,10 +3205,10 @@ xfs_iflush_cluster(
 		 * We need to check under the i_flags_lock for a valid inode
 		 * here. Skip it if it is not valid or the wrong inode.
 		 */
-		spin_lock(&iq->i_flags_lock);
-		if (!iq->i_ino ||
-		    __xfs_iflags_test(iq, XFS_ISTALE)) {
-			spin_unlock(&iq->i_flags_lock);
+		spin_lock(&cip->i_flags_lock);
+		if (!cip->i_ino ||
+		    __xfs_iflags_test(cip, XFS_ISTALE)) {
+			spin_unlock(&cip->i_flags_lock);
 			continue;
 		}
 
@@ -3217,18 +3217,18 @@ xfs_iflush_cluster(
 		 * any more inodes in the list because they will also all be
 		 * outside the cluster.
 		 */
-		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
-			spin_unlock(&iq->i_flags_lock);
+		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
+			spin_unlock(&cip->i_flags_lock);
 			break;
 		}
-		spin_unlock(&iq->i_flags_lock);
+		spin_unlock(&cip->i_flags_lock);
 
 		/*
 		 * Do an un-protected check to see if the inode is dirty and
 		 * is a candidate for flushing.  These checks will be repeated
 		 * later after the appropriate locks are acquired.
 		 */
-		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+		if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
 			continue;
 
 		/*
@@ -3236,15 +3236,15 @@ xfs_iflush_cluster(
 		 * then this inode cannot be flushed and is skipped.
 		 */
 
-		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+		if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
 			continue;
-		if (!xfs_iflock_nowait(iq)) {
-			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+		if (!xfs_iflock_nowait(cip)) {
+			xfs_iunlock(cip, XFS_ILOCK_SHARED);
 			continue;
 		}
-		if (xfs_ipincount(iq)) {
-			xfs_ifunlock(iq);
-			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+		if (xfs_ipincount(cip)) {
+			xfs_ifunlock(cip);
+			xfs_iunlock(cip, XFS_ILOCK_SHARED);
 			continue;
 		}
 
@@ -3255,9 +3255,9 @@ xfs_iflush_cluster(
 		 * in that function for more information as to why the initial
 		 * check is not sufficient.
 		 */
-		if (!iq->i_ino) {
-			xfs_ifunlock(iq);
-			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+		if (!cip->i_ino) {
+			xfs_ifunlock(cip);
+			xfs_iunlock(cip, XFS_ILOCK_SHARED);
 			continue;
 		}
 
@@ -3265,18 +3265,18 @@ xfs_iflush_cluster(
 		 * arriving here means that this inode can be flushed.  First
 		 * re-check that it's dirty before flushing.
 		 */
-		if (!xfs_inode_clean(iq)) {
+		if (!xfs_inode_clean(cip)) {
 			int	error;
-			error = xfs_iflush_int(iq, bp);
+			error = xfs_iflush_int(cip, bp);
 			if (error) {
-				xfs_iunlock(iq, XFS_ILOCK_SHARED);
+				xfs_iunlock(cip, XFS_ILOCK_SHARED);
 				goto cluster_corrupt_out;
 			}
 			clcount++;
 		} else {
-			xfs_ifunlock(iq);
+			xfs_ifunlock(cip);
 		}
-		xfs_iunlock(iq, XFS_ILOCK_SHARED);
+		xfs_iunlock(cip, XFS_ILOCK_SHARED);
 	}
 
 	if (clcount) {
@@ -3286,7 +3286,7 @@ xfs_iflush_cluster(
 
 out_free:
 	rcu_read_unlock();
-	kmem_free(ilist);
+	kmem_free(cilist);
 out_put:
 	xfs_perag_put(pag);
 	return 0;
@@ -3329,8 +3329,8 @@ cluster_corrupt_out:
 	/*
 	 * Unlocks the flush lock
 	 */
-	xfs_iflush_abort(iq, false);
-	kmem_free(ilist);
+	xfs_iflush_abort(cip, false);
+	kmem_free(cilist);
 	xfs_perag_put(pag);
 	return -EFSCORRUPTED;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 545c0889d26d47e1139c527002eb131343d13b63 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 14:11:41 +1000
Subject: xfs: simplify inode reclaim tagging interfaces

Inode radix tree tagging for reclaim passes a lot of unnecessary
variables around. Over time the xfs-perag has grown a xfs_mount
backpointer, and an internal agno so we don't need to pass other
variables into the tagging functions to supply this information.

Rework the functions to pass the minimal variable set required
and simplify the internal logic and flow.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_icache.c | 98 +++++++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 48 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 57fcd5917a66..789f8c32e65f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,8 +37,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
-				struct xfs_perag *pag, struct xfs_inode *ip);
+STATIC void xfs_inode_clear_reclaim_tag(struct xfs_perag *pag, xfs_ino_t ino);
 
 /*
  * Allocate and initialise an xfs_inode.
@@ -271,7 +270,7 @@ xfs_iget_cache_hit(
 		 */
 		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 		ip->i_flags |= XFS_INEW;
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
 		inode->i_state = I_NEW;
 
 		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
@@ -768,30 +767,46 @@ xfs_reclaim_worker(
 }
 
 static void
-__xfs_inode_set_reclaim_tag(
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip)
+xfs_perag_set_reclaim_tag(
+	struct xfs_perag	*pag)
 {
-	radix_tree_tag_set(&pag->pag_ici_root,
-			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+	struct xfs_mount	*mp = pag->pag_mount;
+
+	ASSERT(spin_is_locked(&pag->pag_ici_lock));
+	if (pag->pag_ici_reclaimable++)
+		return;
+
+	/* propagate the reclaim tag up into the perag radix tree */
+	spin_lock(&mp->m_perag_lock);
+	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
 			   XFS_ICI_RECLAIM_TAG);
+	spin_unlock(&mp->m_perag_lock);
 
-	if (!pag->pag_ici_reclaimable) {
-		/* propagate the reclaim tag up into the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_set(&ip->i_mount->m_perag_tree, pag->pag_agno,
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
+	/* schedule periodic background inode reclaim */
+	xfs_reclaim_work_queue(mp);
 
-		/* schedule periodic background inode reclaim */
-		xfs_reclaim_work_queue(ip->i_mount);
+	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
 
-		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-	pag->pag_ici_reclaimable++;
+static void
+xfs_perag_clear_reclaim_tag(
+	struct xfs_perag	*pag)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+
+	ASSERT(spin_is_locked(&pag->pag_ici_lock));
+	if (--pag->pag_ici_reclaimable)
+		return;
+
+	/* clear the reclaim tag from the perag radix tree */
+	spin_lock(&mp->m_perag_lock);
+	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+			     XFS_ICI_RECLAIM_TAG);
+	spin_unlock(&mp->m_perag_lock);
+	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
 }
 
+
 /*
  * We set the inode flag atomically with the radix tree tag.
  * Once we get tag lookups on the radix tree, this inode flag
@@ -799,47 +814,34 @@ __xfs_inode_set_reclaim_tag(
  */
 void
 xfs_inode_set_reclaim_tag(
-	xfs_inode_t	*ip)
+	struct xfs_inode	*ip)
 {
-	struct xfs_mount *mp = ip->i_mount;
-	struct xfs_perag *pag;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
-	__xfs_inode_set_reclaim_tag(pag, ip);
+
+	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+	xfs_perag_set_reclaim_tag(pag);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+
 	spin_unlock(&ip->i_flags_lock);
 	spin_unlock(&pag->pag_ici_lock);
 	xfs_perag_put(pag);
 }
 
 STATIC void
-__xfs_inode_clear_reclaim(
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
-{
-	pag->pag_ici_reclaimable--;
-	if (!pag->pag_ici_reclaimable) {
-		/* clear the reclaim tag from the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_clear(&ip->i_mount->m_perag_tree, pag->pag_agno,
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
-		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-}
-
-STATIC void
-__xfs_inode_clear_reclaim_tag(
-	xfs_mount_t	*mp,
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
+xfs_inode_clear_reclaim_tag(
+	struct xfs_perag	*pag,
+	xfs_ino_t		ino)
 {
 	radix_tree_tag_clear(&pag->pag_ici_root,
-			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-	__xfs_inode_clear_reclaim(pag, ip);
+			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
+			     XFS_ICI_RECLAIM_TAG);
+	xfs_perag_clear_reclaim_tag(pag);
 }
 
 /*
@@ -1030,7 +1032,7 @@ reclaim:
 	if (!radix_tree_delete(&pag->pag_ici_root,
 				XFS_INO_TO_AGINO(ip->i_mount, ino)))
 		ASSERT(0);
-	__xfs_inode_clear_reclaim(pag, ip);
+	xfs_perag_clear_reclaim_tag(pag);
 	spin_unlock(&pag->pag_ici_lock);
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From ad438c4038968e5ca5248f851212634e474983e8 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 May 2016 14:20:08 +1000
Subject: xfs: move reclaim tagging functions

Rearrange the inode tagging functions so that they are higher up in
xfs_cache.c and so there is no need for forward prototypes to be
defined. This is purely code movement, no other change.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_icache.c | 234 ++++++++++++++++++++++++++--------------------------
 1 file changed, 116 insertions(+), 118 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 789f8c32e65f..99ee6eee5e0b 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,8 +37,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-STATIC void xfs_inode_clear_reclaim_tag(struct xfs_perag *pag, xfs_ino_t ino);
-
 /*
  * Allocate and initialise an xfs_inode.
  */
@@ -143,6 +141,122 @@ xfs_inode_free(
 	__xfs_inode_free(ip);
 }
 
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+	struct xfs_mount        *mp)
+{
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_reclaim_work_queue(mp);
+}
+
+static void
+xfs_perag_set_reclaim_tag(
+	struct xfs_perag	*pag)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+
+	ASSERT(spin_is_locked(&pag->pag_ici_lock));
+	if (pag->pag_ici_reclaimable++)
+		return;
+
+	/* propagate the reclaim tag up into the perag radix tree */
+	spin_lock(&mp->m_perag_lock);
+	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
+			   XFS_ICI_RECLAIM_TAG);
+	spin_unlock(&mp->m_perag_lock);
+
+	/* schedule periodic background inode reclaim */
+	xfs_reclaim_work_queue(mp);
+
+	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+static void
+xfs_perag_clear_reclaim_tag(
+	struct xfs_perag	*pag)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+
+	ASSERT(spin_is_locked(&pag->pag_ici_lock));
+	if (--pag->pag_ici_reclaimable)
+		return;
+
+	/* clear the reclaim tag from the perag radix tree */
+	spin_lock(&mp->m_perag_lock);
+	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+			     XFS_ICI_RECLAIM_TAG);
+	spin_unlock(&mp->m_perag_lock);
+	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+
+	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+	xfs_perag_set_reclaim_tag(pag);
+	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+
+	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
+STATIC void
+xfs_inode_clear_reclaim_tag(
+	struct xfs_perag	*pag,
+	xfs_ino_t		ino)
+{
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
+			     XFS_ICI_RECLAIM_TAG);
+	xfs_perag_clear_reclaim_tag(pag);
+}
+
 /*
  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
  * part of the structure. This is made more complex by the fact we store
@@ -728,122 +842,6 @@ xfs_inode_ag_iterator_tag(
 	return last_error;
 }
 
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
-	struct xfs_mount        *mp)
-{
-
-	rcu_read_lock();
-	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
-			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-	}
-	rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_reclaim_work);
-
-	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_reclaim_work_queue(mp);
-}
-
-static void
-xfs_perag_set_reclaim_tag(
-	struct xfs_perag	*pag)
-{
-	struct xfs_mount	*mp = pag->pag_mount;
-
-	ASSERT(spin_is_locked(&pag->pag_ici_lock));
-	if (pag->pag_ici_reclaimable++)
-		return;
-
-	/* propagate the reclaim tag up into the perag radix tree */
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
-			   XFS_ICI_RECLAIM_TAG);
-	spin_unlock(&mp->m_perag_lock);
-
-	/* schedule periodic background inode reclaim */
-	xfs_reclaim_work_queue(mp);
-
-	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
-}
-
-static void
-xfs_perag_clear_reclaim_tag(
-	struct xfs_perag	*pag)
-{
-	struct xfs_mount	*mp = pag->pag_mount;
-
-	ASSERT(spin_is_locked(&pag->pag_ici_lock));
-	if (--pag->pag_ici_reclaimable)
-		return;
-
-	/* clear the reclaim tag from the perag radix tree */
-	spin_lock(&mp->m_perag_lock);
-	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
-			     XFS_ICI_RECLAIM_TAG);
-	spin_unlock(&mp->m_perag_lock);
-	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
-}
-
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	spin_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-
-	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
-			   XFS_ICI_RECLAIM_TAG);
-	xfs_perag_set_reclaim_tag(pag);
-	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-
-	spin_unlock(&ip->i_flags_lock);
-	spin_unlock(&pag->pag_ici_lock);
-	xfs_perag_put(pag);
-}
-
-STATIC void
-xfs_inode_clear_reclaim_tag(
-	struct xfs_perag	*pag,
-	xfs_ino_t		ino)
-{
-	radix_tree_tag_clear(&pag->pag_ici_root,
-			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
-			     XFS_ICI_RECLAIM_TAG);
-	xfs_perag_clear_reclaim_tag(pag);
-}
-
 /*
  * Grab the inode for reclaim exclusively.
  * Return 0 if we grabbed it, non-zero otherwise.
-- 
cgit v1.2.3-59-g8ed1b


From 690a7871225b7f91f8fe13c465a79e183a61cfe3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 20 May 2016 10:29:15 +1000
Subject: xfs: fix warning in xfs_finish_page_writeback for non-debug builds

blockmask is unused if ASSERTs are disabled.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b5f1c66bbb58..736461a861b1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -94,15 +94,14 @@ xfs_finish_page_writeback(
 	struct bio_vec		*bvec,
 	int			error)
 {
-	unsigned int		blockmask = (1 << inode->i_blkbits) - 1;
 	unsigned int		end = bvec->bv_offset + bvec->bv_len - 1;
 	struct buffer_head	*head, *bh;
 	unsigned int		off = 0;
 
 	ASSERT(bvec->bv_offset < PAGE_SIZE);
-	ASSERT((bvec->bv_offset & blockmask) == 0);
+	ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
 	ASSERT(end < PAGE_SIZE);
-	ASSERT((bvec->bv_len & blockmask) == 0);
+	ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
 
 	bh = head = page_buffers(bvec->bv_page);
 
-- 
cgit v1.2.3-59-g8ed1b