From 0d41e1d28c2e969094ef7933b8521f1e08d30251 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 5 Oct 2018 19:04:22 +1000 Subject: xfs: refactor clonerange preparation into a separate helper Refactor all the reflink preparation steps into a separate helper that we'll use to land all the upcoming fixes for insufficient input checks. This rework also moves the invalidation of the destination range to the prep function so that it is done before the range is remapped. This ensures that nobody can access the data in range being remapped until the remap is complete. [dgc: fix xfs_reflink_remap_prep() return value and caller check to handle vfs_clone_file_prep_inodes() returning 0 to mean "nothing to do". ] [dgc: make sure length changed by vfs_clone_file_prep_inodes() gets propagated back to XFS code that does the remapping. ] Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 100 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5289e22cb081..da1a447bef51 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1220,35 +1220,47 @@ retry: return 0; } +/* Unlock both inodes after they've been prepped for a range clone. */ +STATIC void +xfs_reflink_remap_unlock( + struct file *file_in, + struct file *file_out) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + bool same_inode = (inode_in == inode_out); + + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + inode_unlock(inode_out); + if (!same_inode) + inode_unlock_shared(inode_in); +} + /* - * Link a range of blocks from one file to another. + * Prepare two files for range cloning. Upon a successful return both inodes + * will have the iolock and mmaplock held, the page cache of the out file + * will be truncated, and any leases on the out file will have been broken. */ -int -xfs_reflink_remap_range( +STATIC int +xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, + u64 *len, bool is_dedupe) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); - struct xfs_mount *mp = src->i_mount; bool same_inode = (inode_in == inode_out); - xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; - xfs_extlen_t cowextsize; ssize_t ret; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return -EOPNOTSUPP; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* Lock both files against IO */ ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); if (ret) @@ -1270,7 +1282,7 @@ xfs_reflink_remap_range( goto out_unlock; ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - &len, is_dedupe); + len, is_dedupe); if (ret <= 0) goto out_unlock; @@ -1279,8 +1291,6 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; - trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* * Clear out post-eof preallocations because we don't have page cache * backing the delayed allocations and they'll never get freed on @@ -1297,6 +1307,51 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + *len) - 1); + return 1; +out_unlock: + xfs_reflink_remap_unlock(file_in, file_out); + return ret; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t sfsbno, dfsbno; + xfs_filblks_t fsblen; + xfs_extlen_t cowextsize; + ssize_t ret; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, is_dedupe); + if (ret <= 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); @@ -1305,10 +1360,6 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; - /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - /* * Carry the cowextsize hint from src to dest if we're sharing the * entire source file to the entire destination file, the source file @@ -1325,12 +1376,7 @@ xfs_reflink_remap_range( is_dedupe); out_unlock: - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_SHARED); - inode_unlock(inode_out); - if (!same_inode) - inode_unlock_shared(inode_in); + xfs_reflink_remap_unlock(file_in, file_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; -- cgit v1.2.3-59-g8ed1b From 410fdc72b05afabef3afb51167085799dcc7b3cf Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 5 Oct 2018 19:04:27 +1000 Subject: xfs: zero posteof blocks when cloning above eof When we're reflinking between two files and the destination file range is well beyond the destination file's EOF marker, zero any posteof speculative preallocations in the destination file so that we don't expose stale disk contents. The previous strategy of trying to clear the preallocations does not work if the destination file has the PREALLOC flag set. Uncovered by shared/010. Reported-by: Zorro Lang Bugzilla-id: https://bugzilla.kernel.org/show_bug.cgi?id=201259 Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index da1a447bef51..f135748d8282 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1240,6 +1240,26 @@ xfs_reflink_remap_unlock( inode_unlock_shared(inode_in); } +/* + * If we're reflinking to a point past the destination file's EOF, we must + * zero any speculative post-EOF preallocations that sit between the old EOF + * and the destination file offset. + */ +static int +xfs_reflink_zero_posteof( + struct xfs_inode *ip, + loff_t pos) +{ + loff_t isize = i_size_read(VFS_I(ip)); + + if (pos <= isize) + return 0; + + trace_xfs_zero_eof(ip, isize, pos - isize); + return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, + &xfs_iomap_ops); +} + /* * Prepare two files for range cloning. Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file @@ -1292,15 +1312,12 @@ xfs_reflink_remap_prep( goto out_unlock; /* - * Clear out post-eof preallocations because we don't have page cache - * backing the delayed allocations and they'll never get freed on - * their own. + * Zero existing post-eof speculative preallocations in the destination + * file. */ - if (xfs_can_free_eofblocks(dest, true)) { - ret = xfs_free_eofblocks(dest); - if (ret) - goto out_unlock; - } + ret = xfs_reflink_zero_posteof(dest, pos_out); + if (ret) + goto out_unlock; /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); -- cgit v1.2.3-59-g8ed1b From 7debbf015f580693680f3d2a3cef0cf99dcef688 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 5 Oct 2018 19:05:41 +1000 Subject: xfs: update ctime and remove suid before cloning files Before cloning into a file, update the ctime and remove sensitive attributes like suid, just like we'd do for a regular file write. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f135748d8282..59da9708e9c1 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1264,6 +1264,7 @@ xfs_reflink_zero_posteof( * Prepare two files for range cloning. Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file * will be truncated, and any leases on the out file will have been broken. + * This function borrows heavily from xfs_file_aio_write_checks. */ STATIC int xfs_reflink_remap_prep( @@ -1327,6 +1328,30 @@ xfs_reflink_remap_prep( /* Zap any page cache for the destination file's range. */ truncate_inode_pages_range(&inode_out->i_data, pos_out, PAGE_ALIGN(pos_out + *len) - 1); + + /* If we're altering the file contents... */ + if (!is_dedupe) { + /* + * ...update the timestamps (which will grab the ilock again + * from xfs_fs_dirty_inode, so we have to call it before we + * take the ilock). + */ + if (!(file_out->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file_out); + if (ret) + goto out_unlock; + } + + /* + * ...clear the security bits if the process is not being run + * by root. This keeps people from modifying setuid and setgid + * binaries. + */ + ret = file_remove_privs(file_out); + if (ret) + goto out_unlock; + } + return 1; out_unlock: xfs_reflink_remap_unlock(file_in, file_out); -- cgit v1.2.3-59-g8ed1b From dceeb47b0ed65e14de53507a8a9c32a90831cfa1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 6 Oct 2018 11:44:19 +1000 Subject: xfs: fix data corruption w/ unaligned dedupe ranges A deduplication data corruption is Exposed by fstests generic/505 on XFS. It is caused by extending the block match range to include the partial EOF block, but then allowing unknown data beyond EOF to be considered a "match" to data in the destination file because the comparison is only made to the end of the source file. This corrupts the destination file when the source extent is shared with it. XFS only supports whole block dedupe, but we still need to appear to support whole file dedupe correctly. Hence if the dedupe request includes the last block of the souce file, don't include it in the actual XFS dedupe operation. If the rest of the range dedupes successfully, then report the partial last block as deduped, too, so that userspace sees it as a successful dedupe rather than return EINVAL because we can't dedupe unaligned blocks. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 59da9708e9c1..f889398e25d6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1265,6 +1265,19 @@ xfs_reflink_zero_posteof( * will have the iolock and mmaplock held, the page cache of the out file * will be truncated, and any leases on the out file will have been broken. * This function borrows heavily from xfs_file_aio_write_checks. + * + * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't + * checked that the bytes beyond EOF physically match. Hence we cannot use the + * EOF block in the source dedupe range because it's not a complete block match, + * hence can introduce a corruption into the file that has it's + * block replaced. + * + * Despite this issue, we still need to report that range as successfully + * deduped to avoid confusing userspace with EINVAL errors on completely + * matching file data. The only time that an unaligned length will be passed to + * us is when it spans the EOF block of the source file, so if we simply mask it + * down to be block aligned here the we will dedupe everything but that partial + * EOF block. */ STATIC int xfs_reflink_remap_prep( @@ -1307,6 +1320,14 @@ xfs_reflink_remap_prep( if (ret <= 0) goto out_unlock; + /* + * If the dedupe data matches, chop off the partial EOF block + * from the source file so we don't try to dedupe the partial + * EOF block. + */ + if (is_dedupe) + *len &= ~((u64)i_blocksize(inode_in) - 1); + /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) -- cgit v1.2.3-59-g8ed1b From b39989009bdb84992915c9869f58094ed5becf10 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 6 Oct 2018 11:44:39 +1000 Subject: xfs: fix data corruption w/ unaligned reflink ranges When reflinking sub-file ranges, a data corruption can occur when the source file range includes a partial EOF block. This shares the unknown data beyond EOF into the second file at a position inside EOF, exposing stale data in the second file. XFS only supports whole block sharing, but we still need to support whole file reflink correctly. Hence if the reflink request includes the last block of the souce file, only proceed with the reflink operation if it lands at or past the destination file's current EOF. If it lands within the destination file EOF, reject the entire request with -EINVAL and make the caller go the hard way. This avoids the data corruption vector, but also avoids disruption of returning EINVAL to userspace for the common case of whole file cloning. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f889398e25d6..42ea7bab9144 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1262,22 +1262,32 @@ xfs_reflink_zero_posteof( /* * Prepare two files for range cloning. Upon a successful return both inodes - * will have the iolock and mmaplock held, the page cache of the out file - * will be truncated, and any leases on the out file will have been broken. - * This function borrows heavily from xfs_file_aio_write_checks. + * will have the iolock and mmaplock held, the page cache of the out file will + * be truncated, and any leases on the out file will have been broken. This + * function borrows heavily from xfs_file_aio_write_checks. * * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't * checked that the bytes beyond EOF physically match. Hence we cannot use the * EOF block in the source dedupe range because it's not a complete block match, - * hence can introduce a corruption into the file that has it's - * block replaced. + * hence can introduce a corruption into the file that has it's block replaced. * - * Despite this issue, we still need to report that range as successfully - * deduped to avoid confusing userspace with EINVAL errors on completely - * matching file data. The only time that an unaligned length will be passed to - * us is when it spans the EOF block of the source file, so if we simply mask it - * down to be block aligned here the we will dedupe everything but that partial - * EOF block. + * In similar fashion, the VFS file cloning also allows partial EOF blocks to be + * "block aligned" for the purposes of cloning entire files. However, if the + * source file range includes the EOF block and it lands within the existing EOF + * of the destination file, then we can expose stale data from beyond the source + * file EOF in the destination file. + * + * XFS doesn't support partial block sharing, so in both cases we have check + * these cases ourselves. For dedupe, we can simply round the length to dedupe + * down to the previous whole block and ignore the partial EOF block. While this + * means we can't dedupe the last block of a file, this is an acceptible + * tradeoff for simplicity on implementation. + * + * For cloning, we want to share the partial EOF block if it is also the new EOF + * block of the destination file. If the partial EOF block lies inside the + * existing destination EOF, then we have to abort the clone to avoid exposing + * stale data in the destination file. Hence we reject these clone attempts with + * -EINVAL in this case. */ STATIC int xfs_reflink_remap_prep( @@ -1293,6 +1303,7 @@ xfs_reflink_remap_prep( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); bool same_inode = (inode_in == inode_out); + u64 blkmask = i_blocksize(inode_in) - 1; ssize_t ret; /* Lock both files against IO */ @@ -1325,8 +1336,18 @@ xfs_reflink_remap_prep( * from the source file so we don't try to dedupe the partial * EOF block. */ - if (is_dedupe) - *len &= ~((u64)i_blocksize(inode_in) - 1); + if (is_dedupe) { + *len &= ~blkmask; + } else if (*len & blkmask) { + /* + * The user is attempting to share a partial EOF block, + * if it's inside the destination EOF then reject it. + */ + if (pos_out + *len < i_size_read(inode_out)) { + ret = -EINVAL; + goto out_unlock; + } + } /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); -- cgit v1.2.3-59-g8ed1b