From 0d41e1d28c2e969094ef7933b8521f1e08d30251 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 5 Oct 2018 19:04:22 +1000
Subject: xfs: refactor clonerange preparation into a separate helper

Refactor all the reflink preparation steps into a separate helper
that we'll use to land all the upcoming fixes for insufficient input
checks.

This rework also moves the invalidation of the destination range to
the prep function so that it is done before the range is remapped.
This ensures that nobody can access the data in range being remapped
until the remap is complete.

[dgc: fix xfs_reflink_remap_prep() return value and caller check to
handle vfs_clone_file_prep_inodes() returning 0 to mean "nothing to
do". ]

[dgc: make sure length changed by vfs_clone_file_prep_inodes() gets
propagated back to XFS code that does the remapping. ]

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_reflink.c | 100 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 5289e22cb081..da1a447bef51 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1220,35 +1220,47 @@ retry:
 	return 0;
 }
 
+/* Unlock both inodes after they've been prepped for a range clone. */
+STATIC void
+xfs_reflink_remap_unlock(
+	struct file		*file_in,
+	struct file		*file_out)
+{
+	struct inode		*inode_in = file_inode(file_in);
+	struct xfs_inode	*src = XFS_I(inode_in);
+	struct inode		*inode_out = file_inode(file_out);
+	struct xfs_inode	*dest = XFS_I(inode_out);
+	bool			same_inode = (inode_in == inode_out);
+
+	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+	if (!same_inode)
+		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+	inode_unlock(inode_out);
+	if (!same_inode)
+		inode_unlock_shared(inode_in);
+}
+
 /*
- * Link a range of blocks from one file to another.
+ * Prepare two files for range cloning.  Upon a successful return both inodes
+ * will have the iolock and mmaplock held, the page cache of the out file
+ * will be truncated, and any leases on the out file will have been broken.
  */
-int
-xfs_reflink_remap_range(
+STATIC int
+xfs_reflink_remap_prep(
 	struct file		*file_in,
 	loff_t			pos_in,
 	struct file		*file_out,
 	loff_t			pos_out,
-	u64			len,
+	u64			*len,
 	bool			is_dedupe)
 {
 	struct inode		*inode_in = file_inode(file_in);
 	struct xfs_inode	*src = XFS_I(inode_in);
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
-	struct xfs_mount	*mp = src->i_mount;
 	bool			same_inode = (inode_in == inode_out);
-	xfs_fileoff_t		sfsbno, dfsbno;
-	xfs_filblks_t		fsblen;
-	xfs_extlen_t		cowextsize;
 	ssize_t			ret;
 
-	if (!xfs_sb_version_hasreflink(&mp->m_sb))
-		return -EOPNOTSUPP;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
 	/* Lock both files against IO */
 	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
 	if (ret)
@@ -1270,7 +1282,7 @@ xfs_reflink_remap_range(
 		goto out_unlock;
 
 	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-			&len, is_dedupe);
+			len, is_dedupe);
 	if (ret <= 0)
 		goto out_unlock;
 
@@ -1279,8 +1291,6 @@ xfs_reflink_remap_range(
 	if (ret)
 		goto out_unlock;
 
-	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
 	/*
 	 * Clear out post-eof preallocations because we don't have page cache
 	 * backing the delayed allocations and they'll never get freed on
@@ -1297,6 +1307,51 @@ xfs_reflink_remap_range(
 	if (ret)
 		goto out_unlock;
 
+	/* Zap any page cache for the destination file's range. */
+	truncate_inode_pages_range(&inode_out->i_data, pos_out,
+				   PAGE_ALIGN(pos_out + *len) - 1);
+	return 1;
+out_unlock:
+	xfs_reflink_remap_unlock(file_in, file_out);
+	return ret;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ */
+int
+xfs_reflink_remap_range(
+	struct file		*file_in,
+	loff_t			pos_in,
+	struct file		*file_out,
+	loff_t			pos_out,
+	u64			len,
+	bool			is_dedupe)
+{
+	struct inode		*inode_in = file_inode(file_in);
+	struct xfs_inode	*src = XFS_I(inode_in);
+	struct inode		*inode_out = file_inode(file_out);
+	struct xfs_inode	*dest = XFS_I(inode_out);
+	struct xfs_mount	*mp = src->i_mount;
+	xfs_fileoff_t		sfsbno, dfsbno;
+	xfs_filblks_t		fsblen;
+	xfs_extlen_t		cowextsize;
+	ssize_t			ret;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* Prepare and then clone file data. */
+	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+			&len, is_dedupe);
+	if (ret <= 0)
+		return ret;
+
+	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
 	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
 	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
 	fsblen = XFS_B_TO_FSB(mp, len);
@@ -1305,10 +1360,6 @@ xfs_reflink_remap_range(
 	if (ret)
 		goto out_unlock;
 
-	/* Zap any page cache for the destination file's range. */
-	truncate_inode_pages_range(&inode_out->i_data, pos_out,
-				   PAGE_ALIGN(pos_out + len) - 1);
-
 	/*
 	 * Carry the cowextsize hint from src to dest if we're sharing the
 	 * entire source file to the entire destination file, the source file
@@ -1325,12 +1376,7 @@ xfs_reflink_remap_range(
 			is_dedupe);
 
 out_unlock:
-	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
-	if (!same_inode)
-		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
-	inode_unlock(inode_out);
-	if (!same_inode)
-		inode_unlock_shared(inode_in);
+	xfs_reflink_remap_unlock(file_in, file_out);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 410fdc72b05afabef3afb51167085799dcc7b3cf Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 5 Oct 2018 19:04:27 +1000
Subject: xfs: zero posteof blocks when cloning above eof

When we're reflinking between two files and the destination file range
is well beyond the destination file's EOF marker, zero any posteof
speculative preallocations in the destination file so that we don't
expose stale disk contents.  The previous strategy of trying to clear
the preallocations does not work if the destination file has the
PREALLOC flag set.

Uncovered by shared/010.

Reported-by: Zorro Lang <zlang@redhat.com>
Bugzilla-id: https://bugzilla.kernel.org/show_bug.cgi?id=201259
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_reflink.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index da1a447bef51..f135748d8282 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1240,6 +1240,26 @@ xfs_reflink_remap_unlock(
 		inode_unlock_shared(inode_in);
 }
 
+/*
+ * If we're reflinking to a point past the destination file's EOF, we must
+ * zero any speculative post-EOF preallocations that sit between the old EOF
+ * and the destination file offset.
+ */
+static int
+xfs_reflink_zero_posteof(
+	struct xfs_inode	*ip,
+	loff_t			pos)
+{
+	loff_t			isize = i_size_read(VFS_I(ip));
+
+	if (pos <= isize)
+		return 0;
+
+	trace_xfs_zero_eof(ip, isize, pos - isize);
+	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
+			&xfs_iomap_ops);
+}
+
 /*
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file
@@ -1292,15 +1312,12 @@ xfs_reflink_remap_prep(
 		goto out_unlock;
 
 	/*
-	 * Clear out post-eof preallocations because we don't have page cache
-	 * backing the delayed allocations and they'll never get freed on
-	 * their own.
+	 * Zero existing post-eof speculative preallocations in the destination
+	 * file.
 	 */
-	if (xfs_can_free_eofblocks(dest, true)) {
-		ret = xfs_free_eofblocks(dest);
-		if (ret)
-			goto out_unlock;
-	}
+	ret = xfs_reflink_zero_posteof(dest, pos_out);
+	if (ret)
+		goto out_unlock;
 
 	/* Set flags and remap blocks. */
 	ret = xfs_reflink_set_inode_flag(src, dest);
-- 
cgit v1.2.3-59-g8ed1b


From 7debbf015f580693680f3d2a3cef0cf99dcef688 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 5 Oct 2018 19:05:41 +1000
Subject: xfs: update ctime and remove suid before cloning files

Before cloning into a file, update the ctime and remove sensitive
attributes like suid, just like we'd do for a regular file write.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_reflink.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f135748d8282..59da9708e9c1 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1264,6 +1264,7 @@ xfs_reflink_zero_posteof(
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file
  * will be truncated, and any leases on the out file will have been broken.
+ * This function borrows heavily from xfs_file_aio_write_checks.
  */
 STATIC int
 xfs_reflink_remap_prep(
@@ -1327,6 +1328,30 @@ xfs_reflink_remap_prep(
 	/* Zap any page cache for the destination file's range. */
 	truncate_inode_pages_range(&inode_out->i_data, pos_out,
 				   PAGE_ALIGN(pos_out + *len) - 1);
+
+	/* If we're altering the file contents... */
+	if (!is_dedupe) {
+		/*
+		 * ...update the timestamps (which will grab the ilock again
+		 * from xfs_fs_dirty_inode, so we have to call it before we
+		 * take the ilock).
+		 */
+		if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+			ret = file_update_time(file_out);
+			if (ret)
+				goto out_unlock;
+		}
+
+		/*
+		 * ...clear the security bits if the process is not being run
+		 * by root.  This keeps people from modifying setuid and setgid
+		 * binaries.
+		 */
+		ret = file_remove_privs(file_out);
+		if (ret)
+			goto out_unlock;
+	}
+
 	return 1;
 out_unlock:
 	xfs_reflink_remap_unlock(file_in, file_out);
-- 
cgit v1.2.3-59-g8ed1b


From dceeb47b0ed65e14de53507a8a9c32a90831cfa1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sat, 6 Oct 2018 11:44:19 +1000
Subject: xfs: fix data corruption w/ unaligned dedupe ranges

A deduplication data corruption is Exposed by fstests generic/505 on
XFS. It is caused by extending the block match range to include the
partial EOF block, but then allowing unknown data beyond EOF to be
considered a "match" to data in the destination file because the
comparison is only made to the end of the source file. This corrupts
the destination file when the source extent is shared with it.

XFS only supports whole block dedupe, but we still need to appear to
support whole file dedupe correctly.  Hence if the dedupe request
includes the last block of the souce file, don't include it in the
actual XFS dedupe operation. If the rest of the range dedupes
successfully, then report the partial last block as deduped, too, so
that userspace sees it as a successful dedupe rather than return
EINVAL because we can't dedupe unaligned blocks.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_reflink.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 59da9708e9c1..f889398e25d6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1265,6 +1265,19 @@ xfs_reflink_zero_posteof(
  * will have the iolock and mmaplock held, the page cache of the out file
  * will be truncated, and any leases on the out file will have been broken.
  * This function borrows heavily from xfs_file_aio_write_checks.
+ *
+ * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
+ * checked that the bytes beyond EOF physically match. Hence we cannot use the
+ * EOF block in the source dedupe range because it's not a complete block match,
+ * hence can introduce a corruption into the file that has it's
+ * block replaced.
+ *
+ * Despite this issue, we still need to report that range as successfully
+ * deduped to avoid confusing userspace with EINVAL errors on completely
+ * matching file data. The only time that an unaligned length will be passed to
+ * us is when it spans the EOF block of the source file, so if we simply mask it
+ * down to be block aligned here the we will dedupe everything but that partial
+ * EOF block.
  */
 STATIC int
 xfs_reflink_remap_prep(
@@ -1307,6 +1320,14 @@ xfs_reflink_remap_prep(
 	if (ret <= 0)
 		goto out_unlock;
 
+	/*
+	 * If the dedupe data matches, chop off the partial EOF block
+	 * from the source file so we don't try to dedupe the partial
+	 * EOF block.
+	 */
+	if (is_dedupe)
+		*len &= ~((u64)i_blocksize(inode_in) - 1);
+
 	/* Attach dquots to dest inode before changing block map */
 	ret = xfs_qm_dqattach(dest);
 	if (ret)
-- 
cgit v1.2.3-59-g8ed1b


From b39989009bdb84992915c9869f58094ed5becf10 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sat, 6 Oct 2018 11:44:39 +1000
Subject: xfs: fix data corruption w/ unaligned reflink ranges

When reflinking sub-file ranges, a data corruption can occur when
the source file range includes a partial EOF block. This shares the
unknown data beyond EOF into the second file at a position inside
EOF, exposing stale data in the second file.

XFS only supports whole block sharing, but we still need to
support whole file reflink correctly.  Hence if the reflink
request includes the last block of the souce file, only proceed with
the reflink operation if it lands at or past the destination file's
current EOF. If it lands within the destination file EOF, reject the
entire request with -EINVAL and make the caller go the hard way.

This avoids the data corruption vector, but also avoids disruption
of returning EINVAL to userspace for the common case of whole file
cloning.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_reflink.c | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f889398e25d6..42ea7bab9144 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1262,22 +1262,32 @@ xfs_reflink_zero_posteof(
 
 /*
  * Prepare two files for range cloning.  Upon a successful return both inodes
- * will have the iolock and mmaplock held, the page cache of the out file
- * will be truncated, and any leases on the out file will have been broken.
- * This function borrows heavily from xfs_file_aio_write_checks.
+ * will have the iolock and mmaplock held, the page cache of the out file will
+ * be truncated, and any leases on the out file will have been broken.  This
+ * function borrows heavily from xfs_file_aio_write_checks.
  *
  * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
  * checked that the bytes beyond EOF physically match. Hence we cannot use the
  * EOF block in the source dedupe range because it's not a complete block match,
- * hence can introduce a corruption into the file that has it's
- * block replaced.
+ * hence can introduce a corruption into the file that has it's block replaced.
  *
- * Despite this issue, we still need to report that range as successfully
- * deduped to avoid confusing userspace with EINVAL errors on completely
- * matching file data. The only time that an unaligned length will be passed to
- * us is when it spans the EOF block of the source file, so if we simply mask it
- * down to be block aligned here the we will dedupe everything but that partial
- * EOF block.
+ * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
+ * "block aligned" for the purposes of cloning entire files.  However, if the
+ * source file range includes the EOF block and it lands within the existing EOF
+ * of the destination file, then we can expose stale data from beyond the source
+ * file EOF in the destination file.
+ *
+ * XFS doesn't support partial block sharing, so in both cases we have check
+ * these cases ourselves. For dedupe, we can simply round the length to dedupe
+ * down to the previous whole block and ignore the partial EOF block. While this
+ * means we can't dedupe the last block of a file, this is an acceptible
+ * tradeoff for simplicity on implementation.
+ *
+ * For cloning, we want to share the partial EOF block if it is also the new EOF
+ * block of the destination file. If the partial EOF block lies inside the
+ * existing destination EOF, then we have to abort the clone to avoid exposing
+ * stale data in the destination file. Hence we reject these clone attempts with
+ * -EINVAL in this case.
  */
 STATIC int
 xfs_reflink_remap_prep(
@@ -1293,6 +1303,7 @@ xfs_reflink_remap_prep(
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
 	bool			same_inode = (inode_in == inode_out);
+	u64			blkmask = i_blocksize(inode_in) - 1;
 	ssize_t			ret;
 
 	/* Lock both files against IO */
@@ -1325,8 +1336,18 @@ xfs_reflink_remap_prep(
 	 * from the source file so we don't try to dedupe the partial
 	 * EOF block.
 	 */
-	if (is_dedupe)
-		*len &= ~((u64)i_blocksize(inode_in) - 1);
+	if (is_dedupe) {
+		*len &= ~blkmask;
+	} else if (*len & blkmask) {
+		/*
+		 * The user is attempting to share a partial EOF block,
+		 * if it's inside the destination EOF then reject it.
+		 */
+		if (pos_out + *len < i_size_read(inode_out)) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+	}
 
 	/* Attach dquots to dest inode before changing block map */
 	ret = xfs_qm_dqattach(dest);
-- 
cgit v1.2.3-59-g8ed1b