From 2c25716dcc25a0420c4ad49d6e6bf61e60a21434 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:44 +1030 Subject: btrfs: zlib: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 1 (zlib) Which has an inline compressed extent at file offset 0, and its decompressed size is 4K, allowing us to reflink that 4K range to another location (which will not be compressed). If we do such reflink on a subpage system, it would fail like this: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zlib_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should switch our output buffer. In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer, since the whole input/output buffer should never exceed one sector. Note: The above assumption is only not true if we're going to support multi-page sectorsize. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modifications: - Rename @start_byte to @dest_pgoff to properly express its meaning - Add an extra ASSERT() inside btrfs_decompress() to make sure the input/output size never exceeds one sector. - Use Z_FINISH flag to make sure the decompression happens in one go - Remove the loop needed to switch input/output buffers - Use correct destination offset inside the destination page - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And resulted a correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/compression.h') diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974dee..2b4dfb1b010c 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); -- cgit v1.2.3-59-g8ed1b From 6a69631ec9b1b23784c012a855bbb23012a6dbeb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:45 +1030 Subject: btrfs: lzo: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 2 (lzo) Then trying to reflink that extent in an aarch64 system with 64K page size, the reflink would just fail: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zlib_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should error out, without any proper explanation (this is from the very beginning of btrfs). In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer nor error out, since the whole input/output buffer should never exceed one sector. Note: The above assumption is only not true if we're going to support multi-page sectorsize. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modifications: - Rename @start_byte to @dest_pgoff to properly express its meaning - Use @sectorsize other than PAGE_SIZE to properly initialize the output buffer size - Use correct destination offset inside the destination page - Use memcpy_to_page() to copy the contents to the destination page - Use memzero_page() to zero out the tailing part - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And results the correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- fs/btrfs/lzo.c | 34 +++++++++------------------------- 2 files changed, 10 insertions(+), 26 deletions(-) (limited to 'fs/btrfs/compression.h') diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2b4dfb1b010c..afd7e50d073d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d61..e43bc0fdc74e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. - */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } -- cgit v1.2.3-59-g8ed1b From 1e7f6def8b2370ecefb54b3c8f390ff894b0c51b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:46 +1030 Subject: btrfs: zstd: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 3 (zstd) Then trying to reflink that extent in an aarch64 system with 64K page size, the reflink would just fail: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zstd_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should error out, without any proper explanation (this is copied from other decompression code). In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer nor error out, since the whole input/output buffer should never exceed one sector, thus we should not need to do any buffer switch. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modification: - Rename @start_byte to @dest_pgoff to properly express its meaning - Use @sectorsize other than PAGE_SIZE to properly initialize the output buffer size - Use correct destination offset inside the destination page - Simplify the main loop Since the input/output buffer should never switch, we only need one zstd_decompress_stream() call. - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And results the correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- fs/btrfs/zstd.c | 75 +++++++++++++++----------------------------------- 2 files changed, 23 insertions(+), 54 deletions(-) (limited to 'fs/btrfs/compression.h') diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index afd7e50d073d..97fe3ebf11a2 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -169,7 +169,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1d4..346c46d88d07 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -20,6 +20,7 @@ #include "misc.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -618,80 +619,48 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); - ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n"); - ret = -EIO; - goto finish; - } - ret2 = zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (zstd_is_error(ret)) { + pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", + zstd_get_error_code(ret)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_page(dest_page, dest_pgoff + to_copy, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. */ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } -- cgit v1.2.3-59-g8ed1b From e01a83e12604aa2f8d4ab359ec44e341a2248b4a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 22 Jan 2024 15:39:01 -0800 Subject: Revert "btrfs: zstd: fix and simplify the inline extent decompression" This reverts commit 1e7f6def8b2370ecefb54b3c8f390ff894b0c51b. It causes my machine to not even boot, and Klara Modin reports that the cause is that small zstd-compressed files return garbage when read. Reported-by: Klara Modin Link: https://lore.kernel.org/linux-btrfs/CABq1_vj4GpUeZpVG49OHCo-3sdbe2-2ROcu_xDvUG-6-5zPRXg@mail.gmail.com/ Reported-and-bisected-by: Linus Torvalds Acked-by: David Sterba Cc: Qu Wenruo Signed-off-by: Linus Torvalds --- fs/btrfs/compression.h | 2 +- fs/btrfs/zstd.c | 75 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 23 deletions(-) (limited to 'fs/btrfs/compression.h') diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 97fe3ebf11a2..afd7e50d073d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -169,7 +169,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 346c46d88d07..0d66db8bc1d4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -20,7 +20,6 @@ #include "misc.h" #include "compression.h" #include "ctree.h" -#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -619,48 +618,80 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); - const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - unsigned long to_copy = 0; + size_t ret2; + unsigned long total_out = 0; + unsigned long pg_offset = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); + ret = -EIO; goto finish; } + destlen = min_t(size_t, destlen, PAGE_SIZE); + workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = sectorsize; - - /* - * Since both input and output buffers should not exceed one sector, - * one call should end the decompression. - */ - ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (zstd_is_error(ret)) { - pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", - zstd_get_error_code(ret)); - goto finish; + workspace->out_buf.size = PAGE_SIZE; + + ret2 = 1; + while (pg_offset < destlen + && workspace->in_buf.pos < workspace->in_buf.size) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + /* Check if the frame is over and we still need more input */ + if (ret2 == 0) { + pr_debug("BTRFS: zstd_decompress_stream ended early\n"); + ret = -EIO; + goto finish; + } + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, + &workspace->in_buf); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); + ret = -EIO; + goto finish; + } + + buf_start = total_out; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; + + if (total_out <= start_byte) + continue; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min_t(unsigned long, destlen - pg_offset, + workspace->out_buf.size - buf_offset); + + memcpy_to_page(dest_page, pg_offset, + workspace->out_buf.dst + buf_offset, bytes); + + pg_offset += bytes; } - to_copy = workspace->out_buf.pos; - memcpy_to_page(dest_page, dest_pgoff + to_copy, workspace->out_buf.dst, to_copy); + ret = 0; finish: - /* Error or early end. */ - if (unlikely(to_copy < destlen)) { - ret = -EIO; - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + if (pg_offset < destlen) { + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } -- cgit v1.2.3-59-g8ed1b