From 0ee7a3f6b5b2f22bb69bfc6c60d0ea0777003098 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:44:14 +1100
Subject: xfs: don't take the IOLOCK exclusive for direct I/O page invalidation

XFS historically took the iolock exclusive when invalidating pages
before direct I/O operations to protect against writeback starvation.

But these writeback starvation issues were fixed a long time ago
in the core writeback code, and all other file systems manage to do
without the exclusive lock.  Convert XFS over to avoid the exclusive
lock in this case, and also move to range invalidations as is done
by the other file systems.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 98 +++++++++++++++++--------------------------------------
 1 file changed, 30 insertions(+), 68 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a314fc7b56fa..0dc9971d3c84 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -249,6 +249,7 @@ xfs_file_dio_aio_read(
 	struct xfs_inode	*ip = XFS_I(inode);
 	loff_t			isize = i_size_read(inode);
 	size_t			count = iov_iter_count(to);
+	loff_t			end = iocb->ki_pos + count - 1;
 	struct iov_iter		data;
 	struct xfs_buftarg	*target;
 	ssize_t			ret = 0;
@@ -272,49 +273,21 @@ xfs_file_dio_aio_read(
 
 	file_accessed(iocb->ki_filp);
 
-	/*
-	 * Locking is a bit tricky here. If we take an exclusive lock for direct
-	 * IO, we effectively serialise all new concurrent read IO to this file
-	 * and block it behind IO that is currently in progress because IO in
-	 * progress holds the IO lock shared. We only need to hold the lock
-	 * exclusive to blow away the page cache, so only take lock exclusively
-	 * if the page cache needs invalidation. This allows the normal direct
-	 * IO case of no page cache pages to proceeed concurrently without
-	 * serialisation.
-	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 	if (mapping->nrpages) {
-		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+		if (ret)
+			goto out_unlock;
 
 		/*
-		 * The generic dio code only flushes the range of the particular
-		 * I/O. Because we take an exclusive lock here, this whole
-		 * sequence is considerably more expensive for us. This has a
-		 * noticeable performance impact for any file with cached pages,
-		 * even when outside of the range of the particular I/O.
-		 *
-		 * Hence, amortize the cost of the lock against a full file
-		 * flush and reduce the chances of repeated iolock cycles going
-		 * forward.
+		 * Invalidate whole pages. This can return an error if we fail
+		 * to invalidate a page, but this should never happen on XFS.
+		 * Warn if it does fail.
 		 */
-		if (mapping->nrpages) {
-			ret = filemap_write_and_wait(mapping);
-			if (ret) {
-				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
-				return ret;
-			}
-
-			/*
-			 * Invalidate whole pages. This can return an error if
-			 * we fail to invalidate a page, but this should never
-			 * happen on XFS. Warn if it does fail.
-			 */
-			ret = invalidate_inode_pages2(mapping);
-			WARN_ON_ONCE(ret);
-			ret = 0;
-		}
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 	}
 
 	data = *to;
@@ -324,8 +297,9 @@ xfs_file_dio_aio_read(
 		iocb->ki_pos += ret;
 		iov_iter_advance(to, ret);
 	}
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+out_unlock:
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
@@ -570,61 +544,49 @@ xfs_file_dio_aio_write(
 	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
-	/* "unaligned" here means not aligned to a filesystem block */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask))
-		unaligned_io = 1;
-
 	/*
-	 * We don't need to take an exclusive lock unless there page cache needs
-	 * to be invalidated or unaligned IO is being executed. We don't need to
-	 * consider the EOF extension case here because
-	 * xfs_file_aio_write_checks() will relock the inode as necessary for
-	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 * Don't take the exclusive iolock here unless the I/O is unaligned to
+	 * the file system block size.  We don't need to consider the EOF
+	 * extension case here because xfs_file_aio_write_checks() will relock
+	 * the inode as necessary for EOF zeroing cases and fill out the new
+	 * inode size as appropriate.
 	 */
-	if (unaligned_io || mapping->nrpages)
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
+		unaligned_io = 1;
 		iolock = XFS_IOLOCK_EXCL;
-	else
+	} else {
 		iolock = XFS_IOLOCK_SHARED;
-	xfs_rw_ilock(ip, iolock);
-
-	/*
-	 * Recheck if there are cached pages that need invalidate after we got
-	 * the iolock to protect against other threads adding new pages while
-	 * we were waiting for the iolock.
-	 */
-	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
-		xfs_rw_iunlock(ip, iolock);
-		iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, iolock);
 	}
 
+	xfs_rw_ilock(ip, iolock);
+
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 	count = iov_iter_count(from);
 	end = iocb->ki_pos + count - 1;
 
-	/*
-	 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
-	 */
 	if (mapping->nrpages) {
-		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
 		if (ret)
 			goto out;
+
 		/*
 		 * Invalidate whole pages. This can return an error if we fail
 		 * to invalidate a page, but this should never happen on XFS.
 		 * Warn if it does fail.
 		 */
-		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+		ret = invalidate_inode_pages2_range(mapping,
+				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 		WARN_ON_ONCE(ret);
 		ret = 0;
 	}
 
 	/*
 	 * If we are doing unaligned IO, wait for all other IO to drain,
-	 * otherwise demote the lock if we had to flush cached pages
+	 * otherwise demote the lock if we had to take the exclusive lock
+	 * for other reasons in xfs_file_aio_write_checks.
 	 */
 	if (unaligned_io)
 		inode_dio_wait(inode);
-- 
cgit v1.2.3-59-g8ed1b


From 4fbc2c65255f77b315a4ee3ccac397d677a35737 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:48:54 +1100
Subject: xfs: remove the same fs check from xfs_file_share_range

The VFS already does the check, and the placement of this duplicate
is in the way of the following locking rework.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0dc9971d3c84..194f8f396e4d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -965,9 +965,6 @@ xfs_file_share_range(
 	    IS_SWAPFILE(inode_out))
 		return -ETXTBSY;
 
-	/* Reflink only works within this filesystem. */
-	if (inode_in->i_sb != inode_out->i_sb)
-		return -EXDEV;
 	same_inode = (inode_in->i_ino == inode_out->i_ino);
 
 	/* Don't reflink dirs, pipes, sockets... */
-- 
cgit v1.2.3-59-g8ed1b


From a62e82b35b97e60e9e22a4e303900f342139822f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:49:03 +1100
Subject: xfs: fix the same_inode check in xfs_file_share_range

The VFS i_ino is an unsigned long, while XFS inode numbers are 64-bit
wide, so checking i_ino for equality could lead to rare false positives
on 32-bit architectures.  Just compare the inode pointers themselves
to be safe.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 194f8f396e4d..d5b835e82b2d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -965,7 +965,7 @@ xfs_file_share_range(
 	    IS_SWAPFILE(inode_out))
 		return -ETXTBSY;
 
-	same_inode = (inode_in->i_ino == inode_out->i_ino);
+	same_inode = (inode_in == inode_out);
 
 	/* Don't reflink dirs, pipes, sockets... */
 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-- 
cgit v1.2.3-59-g8ed1b


From 576177818e6f1e65f6109ed4a8fae8b60131c861 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:49:19 +1100
Subject: xfs: move inode locking from xfs_reflink_remap_range to
 xfs_file_share_range

We need the iolock protection to stabilize the IS_SWAPFILE and
IS_IMMUTABLE values, as well as preventing new buffered writers
re-dirtying the file data that we just wrote out.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c    | 62 ++++++++++++++++++++++++++++++++++------------------
 fs/xfs/xfs_reflink.c | 15 -------------
 2 files changed, 41 insertions(+), 36 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d5b835e82b2d..663761edd778 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -958,38 +958,54 @@ xfs_file_share_range(
 	inode_out = file_inode(file_out);
 	bs = inode_out->i_sb->s_blocksize;
 
+	/* Lock both files against IO */
+	same_inode = (inode_in == inode_out);
+	if (same_inode) {
+		xfs_ilock(XFS_I(inode_in), XFS_IOLOCK_EXCL);
+		xfs_ilock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL);
+	} else {
+		xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out),
+				XFS_IOLOCK_EXCL);
+		xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out),
+				XFS_MMAPLOCK_EXCL);
+	}
+
 	/* Don't touch certain kinds of inodes */
+	ret = -EPERM;
 	if (IS_IMMUTABLE(inode_out))
-		return -EPERM;
-	if (IS_SWAPFILE(inode_in) ||
-	    IS_SWAPFILE(inode_out))
-		return -ETXTBSY;
-
-	same_inode = (inode_in == inode_out);
+		goto out_unlock;
+	ret = -ETXTBSY;
+	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+		goto out_unlock;
 
 	/* Don't reflink dirs, pipes, sockets... */
+	ret = -EISDIR;
 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
+		goto out_unlock;
+	ret = -EINVAL;
 	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
-		return -EINVAL;
+		goto out_unlock;
 	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
+		goto out_unlock;
 
 	/* Don't share DAX file data for now. */
 	if (IS_DAX(inode_in) || IS_DAX(inode_out))
-		return -EINVAL;
+		goto out_unlock;
 
 	/* Are we going all the way to the end? */
 	isize = i_size_read(inode_in);
-	if (isize == 0)
-		return 0;
+	if (isize == 0) {
+		ret = 0;
+		goto out_unlock;
+	}
+
 	if (len == 0)
 		len = isize - pos_in;
 
 	/* Ensure offsets don't wrap and the input is inside i_size */
 	if (pos_in + len < pos_in || pos_out + len < pos_out ||
 	    pos_in + len > isize)
-		return -EINVAL;
+		goto out_unlock;
 
 	/* Don't allow dedupe past EOF in the dest file */
 	if (is_dedupe) {
@@ -997,7 +1013,7 @@ xfs_file_share_range(
 
 		disize = i_size_read(inode_out);
 		if (pos_out >= disize || pos_out + len > disize)
-			return -EINVAL;
+			goto out_unlock;
 	}
 
 	/* If we're linking to EOF, continue to the block boundary. */
@@ -1009,28 +1025,32 @@ xfs_file_share_range(
 	/* Only reflink if we're aligned to block boundaries */
 	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
 	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-		return -EINVAL;
+		goto out_unlock;
 
 	/* Don't allow overlapped reflink within the same file */
 	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
-		return -EINVAL;
+		goto out_unlock;
 
 	/* Wait for the completion of any pending IOs on srcfile */
 	ret = xfs_file_wait_for_io(inode_in, pos_in, len);
 	if (ret)
-		goto out;
+		goto out_unlock;
 	ret = xfs_file_wait_for_io(inode_out, pos_out, len);
 	if (ret)
-		goto out;
+		goto out_unlock;
 
 	if (is_dedupe)
 		flags |= XFS_REFLINK_DEDUPE;
 	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
 			pos_out, len, flags);
-	if (ret < 0)
-		goto out;
 
-out:
+out_unlock:
+	xfs_iunlock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(XFS_I(inode_in), XFS_IOLOCK_EXCL);
+	if (!same_inode) {
+		xfs_iunlock(XFS_I(inode_out), XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(XFS_I(inode_out), XFS_IOLOCK_EXCL);
+	}
 	return ret;
 }
 
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d48a7cc2fe00..3b1c1a6bb5da 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1341,15 +1341,6 @@ xfs_reflink_remap_range(
 
 	trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff);
 
-	/* Lock both files against IO */
-	if (src->i_ino == dest->i_ino) {
-		xfs_ilock(src, XFS_IOLOCK_EXCL);
-		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
-	} else {
-		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
-		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
-	}
-
 	/*
 	 * Check that the extents are the same.
 	 */
@@ -1401,12 +1392,6 @@ xfs_reflink_remap_range(
 		goto out_error;
 
 out_error:
-	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
-	xfs_iunlock(src, XFS_IOLOCK_EXCL);
-	if (src->i_ino != dest->i_ino) {
-		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
-		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
-	}
 	if (error)
 		trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_);
 	return error;
-- 
cgit v1.2.3-59-g8ed1b


From ec40759902556f21f37641ad9f19d02c4dd4b555 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:49:55 +1100
Subject: xfs: remove xfs_file_wait_for_io

filemap_write_and_wait_range operates on full pages, so there is no
need for the rounding operations.  Additionally this allows us to
micro-optimize by skipping the second inode_dio_wait for an
intra-file clone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 39 ++++++++++-----------------------------
 1 file changed, 10 insertions(+), 29 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 663761edd778..93729752bccb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -909,32 +909,6 @@ out_unlock:
 	return error;
 }
 
-/*
- * Flush all file writes out to disk.
- */
-static int
-xfs_file_wait_for_io(
-	struct inode	*inode,
-	loff_t		offset,
-	size_t		len)
-{
-	loff_t		rounding;
-	loff_t		ioffset;
-	loff_t		iendoffset;
-	loff_t		bs;
-	int		ret;
-
-	bs = inode->i_sb->s_blocksize;
-	inode_dio_wait(inode);
-
-	rounding = max_t(xfs_off_t, bs, PAGE_SIZE);
-	ioffset = round_down(offset, rounding);
-	iendoffset = round_up(offset + len, rounding) - 1;
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-					   iendoffset);
-	return ret;
-}
-
 /* Hook up to the VFS reflink function */
 STATIC int
 xfs_file_share_range(
@@ -1031,11 +1005,18 @@ xfs_file_share_range(
 	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
 		goto out_unlock;
 
-	/* Wait for the completion of any pending IOs on srcfile */
-	ret = xfs_file_wait_for_io(inode_in, pos_in, len);
+	/* Wait for the completion of any pending IOs on both files */
+	inode_dio_wait(inode_in);
+	if (!same_inode)
+		inode_dio_wait(inode_out);
+
+	ret = filemap_write_and_wait_range(inode_in->i_mapping,
+			pos_in, pos_in + len - 1);
 	if (ret)
 		goto out_unlock;
-	ret = xfs_file_wait_for_io(inode_out, pos_out, len);
+
+	ret = filemap_write_and_wait_range(inode_out->i_mapping,
+			pos_out, pos_out + len - 1);
 	if (ret)
 		goto out_unlock;
 
-- 
cgit v1.2.3-59-g8ed1b


From 5faaf4fa0a20d38edc4df57baf24ea35b7e91178 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:50:07 +1100
Subject: xfs: merge xfs_reflink_remap_range and xfs_file_share_range

There is no clear division of responsibility between those functions, so
just merge them into one to keep the code simple.  This also moves the
I/O wait and flush logic (formerly xfs_file_wait_for_io, open-coded by
the previous patch) into xfs_reflink.c together with its only caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c    | 132 +-------------------------------------
 fs/xfs/xfs_reflink.c | 178 +++++++++++++++++++++++++++++++++++++++------------
 fs/xfs/xfs_reflink.h |   7 +-
 3 files changed, 143 insertions(+), 174 deletions(-)

(limited to 'fs/xfs/xfs_file.c')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 93729752bccb..6e4f7f900fea 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -909,132 +909,6 @@ out_unlock:
 	return error;
 }
 
-/* Hook up to the VFS reflink function */
-STATIC int
-xfs_file_share_range(
-	struct file	*file_in,
-	loff_t		pos_in,
-	struct file	*file_out,
-	loff_t		pos_out,
-	u64		len,
-	bool		is_dedupe)
-{
-	struct inode	*inode_in;
-	struct inode	*inode_out;
-	ssize_t		ret;
-	loff_t		bs;
-	loff_t		isize;
-	int		same_inode;
-	loff_t		blen;
-	unsigned int	flags = 0;
-
-	inode_in = file_inode(file_in);
-	inode_out = file_inode(file_out);
-	bs = inode_out->i_sb->s_blocksize;
-
-	/* Lock both files against IO */
-	same_inode = (inode_in == inode_out);
-	if (same_inode) {
-		xfs_ilock(XFS_I(inode_in), XFS_IOLOCK_EXCL);
-		xfs_ilock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL);
-	} else {
-		xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out),
-				XFS_IOLOCK_EXCL);
-		xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out),
-				XFS_MMAPLOCK_EXCL);
-	}
-
-	/* Don't touch certain kinds of inodes */
-	ret = -EPERM;
-	if (IS_IMMUTABLE(inode_out))
-		goto out_unlock;
-	ret = -ETXTBSY;
-	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
-		goto out_unlock;
-
-	/* Don't reflink dirs, pipes, sockets... */
-	ret = -EISDIR;
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		goto out_unlock;
-	ret = -EINVAL;
-	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
-		goto out_unlock;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		goto out_unlock;
-
-	/* Don't share DAX file data for now. */
-	if (IS_DAX(inode_in) || IS_DAX(inode_out))
-		goto out_unlock;
-
-	/* Are we going all the way to the end? */
-	isize = i_size_read(inode_in);
-	if (isize == 0) {
-		ret = 0;
-		goto out_unlock;
-	}
-
-	if (len == 0)
-		len = isize - pos_in;
-
-	/* Ensure offsets don't wrap and the input is inside i_size */
-	if (pos_in + len < pos_in || pos_out + len < pos_out ||
-	    pos_in + len > isize)
-		goto out_unlock;
-
-	/* Don't allow dedupe past EOF in the dest file */
-	if (is_dedupe) {
-		loff_t	disize;
-
-		disize = i_size_read(inode_out);
-		if (pos_out >= disize || pos_out + len > disize)
-			goto out_unlock;
-	}
-
-	/* If we're linking to EOF, continue to the block boundary. */
-	if (pos_in + len == isize)
-		blen = ALIGN(isize, bs) - pos_in;
-	else
-		blen = len;
-
-	/* Only reflink if we're aligned to block boundaries */
-	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-		goto out_unlock;
-
-	/* Don't allow overlapped reflink within the same file */
-	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
-		goto out_unlock;
-
-	/* Wait for the completion of any pending IOs on both files */
-	inode_dio_wait(inode_in);
-	if (!same_inode)
-		inode_dio_wait(inode_out);
-
-	ret = filemap_write_and_wait_range(inode_in->i_mapping,
-			pos_in, pos_in + len - 1);
-	if (ret)
-		goto out_unlock;
-
-	ret = filemap_write_and_wait_range(inode_out->i_mapping,
-			pos_out, pos_out + len - 1);
-	if (ret)
-		goto out_unlock;
-
-	if (is_dedupe)
-		flags |= XFS_REFLINK_DEDUPE;
-	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
-			pos_out, len, flags);
-
-out_unlock:
-	xfs_iunlock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL);
-	xfs_iunlock(XFS_I(inode_in), XFS_IOLOCK_EXCL);
-	if (!same_inode) {
-		xfs_iunlock(XFS_I(inode_out), XFS_MMAPLOCK_EXCL);
-		xfs_iunlock(XFS_I(inode_out), XFS_IOLOCK_EXCL);
-	}
-	return ret;
-}
-
 STATIC ssize_t
 xfs_file_copy_range(
 	struct file	*file_in,
@@ -1046,7 +920,7 @@ xfs_file_copy_range(
 {
 	int		error;
 
-	error = xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+	error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
 				     len, false);
 	if (error)
 		return error;
@@ -1061,7 +935,7 @@ xfs_file_clone_range(
 	loff_t		pos_out,
 	u64		len)
 {
-	return xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
 				     len, false);
 }
 
@@ -1084,7 +958,7 @@ xfs_file_dedupe_range(
 	if (len > XFS_MAX_DEDUPE_LEN)
 		len = XFS_MAX_DEDUPE_LEN;
 
-	error = xfs_file_share_range(src_file, loff, dst_file, dst_loff,
+	error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
 				     len, true);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3b1c1a6bb5da..6592daa833a4 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1312,19 +1312,26 @@ out_error:
  */
 int
 xfs_reflink_remap_range(
-	struct xfs_inode	*src,
-	xfs_off_t		srcoff,
-	struct xfs_inode	*dest,
-	xfs_off_t		destoff,
-	xfs_off_t		len,
-	unsigned int		flags)
+	struct file		*file_in,
+	loff_t			pos_in,
+	struct file		*file_out,
+	loff_t			pos_out,
+	u64			len,
+	bool			is_dedupe)
 {
+	struct inode		*inode_in = file_inode(file_in);
+	struct xfs_inode	*src = XFS_I(inode_in);
+	struct inode		*inode_out = file_inode(file_out);
+	struct xfs_inode	*dest = XFS_I(inode_out);
 	struct xfs_mount	*mp = src->i_mount;
+	loff_t			bs = inode_out->i_sb->s_blocksize;
+	bool			same_inode = (inode_in == inode_out);
 	xfs_fileoff_t		sfsbno, dfsbno;
 	xfs_filblks_t		fsblen;
-	int			error;
 	xfs_extlen_t		cowextsize;
-	bool			is_same;
+	loff_t			isize;
+	ssize_t			ret;
+	loff_t			blen;
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
 		return -EOPNOTSUPP;
@@ -1332,48 +1339,135 @@ xfs_reflink_remap_range(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
+	/* Lock both files against IO */
+	if (same_inode) {
+		xfs_ilock(src, XFS_IOLOCK_EXCL);
+		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
+	} else {
+		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
+	}
+
+	/* Don't touch certain kinds of inodes */
+	ret = -EPERM;
+	if (IS_IMMUTABLE(inode_out))
+		goto out_unlock;
+
+	ret = -ETXTBSY;
+	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+		goto out_unlock;
+
+
+	/* Don't reflink dirs, pipes, sockets... */
+	ret = -EISDIR;
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		goto out_unlock;
+	ret = -EINVAL;
+	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+		goto out_unlock;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		goto out_unlock;
+
 	/* Don't reflink realtime inodes */
 	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
-		return -EINVAL;
+		goto out_unlock;
+
+	/* Don't share DAX file data for now. */
+	if (IS_DAX(inode_in) || IS_DAX(inode_out))
+		goto out_unlock;
+
+	/* Are we going all the way to the end? */
+	isize = i_size_read(inode_in);
+	if (isize == 0) {
+		ret = 0;
+		goto out_unlock;
+	}
+
+	if (len == 0)
+		len = isize - pos_in;
+
+	/* Ensure offsets don't wrap and the input is inside i_size */
+	if (pos_in + len < pos_in || pos_out + len < pos_out ||
+	    pos_in + len > isize)
+		goto out_unlock;
 
-	if (flags & ~XFS_REFLINK_ALL)
-		return -EINVAL;
+	/* Don't allow dedupe past EOF in the dest file */
+	if (is_dedupe) {
+		loff_t	disize;
 
-	trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff);
+		disize = i_size_read(inode_out);
+		if (pos_out >= disize || pos_out + len > disize)
+			goto out_unlock;
+	}
+
+	/* If we're linking to EOF, continue to the block boundary. */
+	if (pos_in + len == isize)
+		blen = ALIGN(isize, bs) - pos_in;
+	else
+		blen = len;
+
+	/* Only reflink if we're aligned to block boundaries */
+	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+		goto out_unlock;
+
+	/* Don't allow overlapped reflink within the same file */
+	if (same_inode) {
+		if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+			goto out_unlock;
+	}
+
+	/* Wait for the completion of any pending IOs on both files */
+	inode_dio_wait(inode_in);
+	if (!same_inode)
+		inode_dio_wait(inode_out);
+
+	ret = filemap_write_and_wait_range(inode_in->i_mapping,
+			pos_in, pos_in + len - 1);
+	if (ret)
+		goto out_unlock;
+
+	ret = filemap_write_and_wait_range(inode_out->i_mapping,
+			pos_out, pos_out + len - 1);
+	if (ret)
+		goto out_unlock;
+
+	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
 
 	/*
 	 * Check that the extents are the same.
 	 */
-	if (flags & XFS_REFLINK_DEDUPE) {
-		is_same = false;
-		error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
-				destoff, len, &is_same);
-		if (error)
-			goto out_error;
+	if (is_dedupe) {
+		bool		is_same = false;
+
+		ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out,
+				len, &is_same);
+		if (ret)
+			goto out_unlock;
 		if (!is_same) {
-			error = -EBADE;
-			goto out_error;
+			ret = -EBADE;
+			goto out_unlock;
 		}
 	}
 
-	error = xfs_reflink_set_inode_flag(src, dest);
-	if (error)
-		goto out_error;
+	ret = xfs_reflink_set_inode_flag(src, dest);
+	if (ret)
+		goto out_unlock;
 
 	/*
 	 * Invalidate the page cache so that we can clear any CoW mappings
 	 * in the destination file.
 	 */
-	truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff,
-				   PAGE_ALIGN(destoff + len) - 1);
+	truncate_inode_pages_range(&inode_out->i_data, pos_out,
+				   PAGE_ALIGN(pos_out + len) - 1);
 
-	dfsbno = XFS_B_TO_FSBT(mp, destoff);
-	sfsbno = XFS_B_TO_FSBT(mp, srcoff);
+	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
+	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
 	fsblen = XFS_B_TO_FSB(mp, len);
-	error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
-			destoff + len);
-	if (error)
-		goto out_error;
+	ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
+			pos_out + len);
+	if (ret)
+		goto out_unlock;
 
 	/*
 	 * Carry the cowextsize hint from src to dest if we're sharing the
@@ -1381,20 +1475,24 @@ xfs_reflink_remap_range(
 	 * has a cowextsize hint, and the destination file does not.
 	 */
 	cowextsize = 0;
-	if (srcoff == 0 && len == i_size_read(VFS_I(src)) &&
+	if (pos_in == 0 && len == i_size_read(inode_in) &&
 	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
-	    destoff == 0 && len >= i_size_read(VFS_I(dest)) &&
+	    pos_out == 0 && len >= i_size_read(inode_out) &&
 	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
 		cowextsize = src->i_d.di_cowextsize;
 
-	error = xfs_reflink_update_dest(dest, destoff + len, cowextsize);
-	if (error)
-		goto out_error;
+	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
 
-out_error:
-	if (error)
-		trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_);
-	return error;
+out_unlock:
+	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(src, XFS_IOLOCK_EXCL);
+	if (src->i_ino != dest->i_ino) {
+		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
+	}
+	if (ret)
+		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+	return ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 5dc3c8ac12aa..7ddd9f69560d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -43,11 +43,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-#define XFS_REFLINK_DEDUPE	1	/* only reflink if contents match */
-#define XFS_REFLINK_ALL		(XFS_REFLINK_DEDUPE)
-extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff,
-		struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
-		unsigned int flags);
+extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+		struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
 extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
 		struct xfs_trans **tpp);
 extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
-- 
cgit v1.2.3-59-g8ed1b