aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
-rw-r--r--arch/parisc/include/asm/cacheflush.h6
-rw-r--r--arch/parisc/kernel/cache.c2
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/backref.c88
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/block-group.c34
-rw-r--r--fs/btrfs/block-rsv.c21
-rw-r--r--fs/btrfs/block-rsv.h15
-rw-r--r--fs/btrfs/btrfs_inode.h25
-rw-r--r--fs/btrfs/compression.c359
-rw-r--r--fs/btrfs/compression.h18
-rw-r--r--fs/btrfs/ctree.h105
-rw-r--r--fs/btrfs/delalloc-space.c6
-rw-r--r--fs/btrfs/delayed-inode.c395
-rw-r--r--fs/btrfs/delayed-inode.h11
-rw-r--r--fs/btrfs/delayed-ref.c4
-rw-r--r--fs/btrfs/dev-replace.c3
-rw-r--r--fs/btrfs/disk-io.c268
-rw-r--r--fs/btrfs/disk-io.h17
-rw-r--r--fs/btrfs/extent-tree.c149
-rw-r--r--fs/btrfs/extent_io.c873
-rw-r--r--fs/btrfs/extent_io.h15
-rw-r--r--fs/btrfs/file.c29
-rw-r--r--fs/btrfs/free-space-cache.c3
-rw-r--r--fs/btrfs/inode.c764
-rw-r--r--fs/btrfs/ioctl.c150
-rw-r--r--fs/btrfs/lzo.c28
-rw-r--r--fs/btrfs/ordered-data.c40
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/raid56.c792
-rw-r--r--fs/btrfs/raid56.h168
-rw-r--r--fs/btrfs/reflink.c19
-rw-r--r--fs/btrfs/scrub.c71
-rw-r--r--fs/btrfs/send.c781
-rw-r--r--fs/btrfs/send.h169
-rw-r--r--fs/btrfs/space-info.c110
-rw-r--r--fs/btrfs/space-info.h8
-rw-r--r--fs/btrfs/struct-funcs.c11
-rw-r--r--fs/btrfs/subpage.c4
-rw-r--r--fs/btrfs/super.c36
-rw-r--r--fs/btrfs/sysfs.c186
-rw-r--r--fs/btrfs/tests/btrfs-tests.c1
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c3
-rw-r--r--fs/btrfs/transaction.c26
-rw-r--r--fs/btrfs/tree-log.c29
-rw-r--r--fs/btrfs/tree-log.h3
-rw-r--r--fs/btrfs/volumes.c362
-rw-r--r--fs/btrfs/volumes.h46
-rw-r--r--fs/btrfs/zlib.c42
-rw-r--r--fs/btrfs/zoned.c131
-rw-r--r--fs/btrfs/zoned.h18
-rw-r--r--fs/btrfs/zstd.c33
-rw-r--r--include/linux/blkdev.h5
-rw-r--r--include/linux/highmem-internal.h10
-rw-r--r--include/trace/events/btrfs.h158
-rw-r--r--include/uapi/linux/btrfs.h10
-rw-r--r--mm/highmem.c2
57 files changed, 3842 insertions, 2829 deletions
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 8d03b3b26229..0bdee6724132 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -22,7 +22,7 @@ void flush_kernel_icache_range_asm(unsigned long, unsigned long);
void flush_user_dcache_range_asm(unsigned long, unsigned long);
void flush_kernel_dcache_range_asm(unsigned long, unsigned long);
void purge_kernel_dcache_range_asm(unsigned long, unsigned long);
-void flush_kernel_dcache_page_asm(void *);
+void flush_kernel_dcache_page_asm(const void *addr);
void flush_kernel_icache_page(void *);
/* Cache flush operations */
@@ -31,7 +31,7 @@ void flush_cache_all_local(void);
void flush_cache_all(void);
void flush_cache_mm(struct mm_struct *mm);
-void flush_kernel_dcache_page_addr(void *addr);
+void flush_kernel_dcache_page_addr(const void *addr);
#define flush_kernel_dcache_range(start,size) \
flush_kernel_dcache_range_asm((start), (start)+(size));
@@ -75,7 +75,7 @@ void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr);
void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr);
#define ARCH_HAS_FLUSH_ON_KUNMAP
-static inline void kunmap_flush_on_unmap(void *addr)
+static inline void kunmap_flush_on_unmap(const void *addr)
{
flush_kernel_dcache_page_addr(addr);
}
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index a9bc578e4c52..993999a65e54 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -549,7 +549,7 @@ extern void purge_kernel_dcache_page_asm(unsigned long);
extern void clear_user_page_asm(void *, unsigned long);
extern void copy_user_page_asm(void *, void *, unsigned long);
-void flush_kernel_dcache_page_addr(void *addr)
+void flush_kernel_dcache_page_addr(const void *addr)
{
unsigned long flags;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 07960529b360..6e2596ddae10 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,7 +13,6 @@ struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-typedef void (*btrfs_work_func_t)(struct work_struct *arg);
struct btrfs_work {
btrfs_func_t func;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ebc392ea1d74..d385357e19b6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2028,10 +2028,29 @@ out:
return ret;
}
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+ void *ctx, bool ignore_offset)
{
int ret;
u64 extent_item_pos;
@@ -2049,17 +2068,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, search_commit_root,
- iterate, ctx, ignore_offset);
+ build_ino_list, ctx, ignore_offset);
return ret;
}
-typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx);
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath);
-static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
{
int ret = 0;
int slot;
@@ -2068,6 +2085,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u32 name_len;
u64 parent = 0;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2103,8 +2122,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
fs_root->root_key.objectid);
- ret = iterate(parent, name_len,
- (unsigned long)(iref + 1), eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)(iref + 1), eb, ipath);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -2118,15 +2137,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
{
int ret;
int slot;
u64 offset = 0;
u64 parent;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_extref *extref;
u32 item_size;
@@ -2162,8 +2181,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
parent = btrfs_inode_extref_parent(eb, extref);
name_len = btrfs_inode_extref_name_len(eb, extref);
- ret = iterate(parent, name_len,
- (unsigned long)&extref->name, eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)&extref->name, eb, ipath);
if (ret)
break;
@@ -2180,34 +2199,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path, iterate_irefs_t *iterate,
- void *ctx)
-{
- int ret;
- int found_refs = 0;
-
- ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
- if (!ret)
- ++found_refs;
- else if (ret != -ENOENT)
- return ret;
-
- ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
- if (ret == -ENOENT && found_refs)
- return 0;
-
- return ret;
-}
-
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx)
+ struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
- struct inode_fs_paths *ipath = ctx;
char *fspath;
char *fspath_min;
int i = ipath->fspath->elem_cnt;
@@ -2248,8 +2246,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
*/
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
- return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, ipath);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, ipath);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ba454032dbe2..2759de7d324c 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -35,8 +35,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
bool ignore_offset);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
+ struct btrfs_path *path, void *ctx,
bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ede389f2602d..c3aecfb0a71d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1051,8 +1051,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
< block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
+ WARN_ON(block_group->zone_is_active &&
+ block_group->space_info->active_total_bytes
+ < block_group->length);
}
block_group->space_info->total_bytes -= block_group->length;
+ if (block_group->zone_is_active)
+ block_group->space_info->active_total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
block_group->space_info->bytes_zone_unusable -=
@@ -1816,11 +1821,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
}
/*
* The remaining case would be for RAID56, multiply by
@@ -2108,7 +2112,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
trace_btrfs_add_block_group(info, cache, 0);
btrfs_update_space_info(info, cache->flags, cache->length,
cache->used, cache->bytes_super,
- cache->zone_unusable, &space_info);
+ cache->zone_unusable, cache->zone_is_active,
+ &space_info);
cache->space_info = space_info;
@@ -2178,7 +2183,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
}
btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, 0, &space_info);
+ 0, 0, false, &space_info);
bg->space_info = space_info;
link_block_group(bg);
@@ -2559,7 +2564,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
cache->bytes_super, cache->zone_unusable,
- &cache->space_info);
+ cache->zone_is_active, &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2659,6 +2664,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
+ /*
+ * We have allocated a new chunk. We also need to activate that chunk to
+ * grant metadata tickets for zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ if (ret < 0)
+ goto out;
+
ret = inc_block_group_ro(cache, 0);
if (ret == -ETXTBSY)
goto unlock_out;
@@ -3761,6 +3774,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
@@ -3884,6 +3898,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
ret = PTR_ERR(bg);
} else {
/*
+ * We have a new chunk. We also need to activate it for
+ * zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ if (ret < 0)
+ return;
+
+ /*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index b3ee49b0b1e8..06be0644dd37 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
block_rsv->reserved = block_rsv->size;
- block_rsv->full = 1;
+ block_rsv->full = true;
} else {
num_bytes = 0;
}
@@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
bytes_to_add = min(num_bytes, bytes_to_add);
dest->reserved += bytes_to_add;
if (dest->reserved >= dest->size)
- dest->full = 1;
+ dest->full = true;
num_bytes -= bytes_to_add;
}
spin_unlock(&dest->lock);
@@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
@@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
btrfs_init_block_rsv(rsv, type);
rsv->space_info = btrfs_find_space_info(fs_info,
@@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
}
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
struct btrfs_block_rsv *block_rsv;
@@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
if (block_rsv->reserved >= num_bytes) {
block_rsv->reserved -= num_bytes;
if (block_rsv->reserved < block_rsv->size)
- block_rsv->full = 0;
+ block_rsv->full = false;
ret = 0;
}
spin_unlock(&block_rsv->lock);
@@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
if (update_size)
block_rsv->size += num_bytes;
else if (block_rsv->reserved >= block_rsv->size)
- block_rsv->full = 1;
+ block_rsv->full = true;
spin_unlock(&block_rsv->lock);
}
@@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
}
global_rsv->reserved -= num_bytes;
if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
+ global_rsv->full = false;
spin_unlock(&global_rsv->lock);
btrfs_block_rsv_add_bytes(dest, num_bytes, true);
@@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
btrfs_try_granting_tickets(fs_info, sinfo);
}
- if (block_rsv->reserved == block_rsv->size)
- block_rsv->full = 1;
- else
- block_rsv->full = 0;
+ block_rsv->full = (block_rsv->reserved == block_rsv->size);
if (block_rsv->size >= sinfo->total_bytes)
sinfo->force_alloc = CHUNK_ALLOC_FORCE;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 3b67ff08d434..0c183709be00 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum;
/*
* Types of block reserves
*/
-enum {
+enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_GLOBAL,
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
@@ -25,9 +25,10 @@ struct btrfs_block_rsv {
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned short full;
- unsigned short type;
- unsigned short failfast;
+ bool full;
+ bool failfast;
+ /* Block reserve type, one of BTRFS_BLOCK_RSV_* */
+ enum btrfs_rsv_type type:8;
/*
* Qgroup equivalent for @size @reserved
@@ -49,13 +50,13 @@ struct btrfs_block_rsv {
u64 qgroup_rsv_reserved;
};
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type);
void btrfs_init_root_block_rsv(struct btrfs_root *root);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 33811e896623..b160b8e124e0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -279,19 +279,31 @@ static inline void btrfs_insert_inode_hash(struct inode *inode)
__insert_inode_hash(inode, h);
}
+#if BITS_PER_LONG == 32
+
+/*
+ * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
+ * we use the inode's location objectid which is a u64 to avoid truncation.
+ */
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
u64 ino = inode->location.objectid;
- /*
- * !ino: btree_inode
- * type == BTRFS_ROOT_ITEM_KEY: subvol dir
- */
- if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
+ if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
ino = inode->vfs_inode.i_ino;
return ino;
}
+#else
+
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
+{
+ return inode->vfs_inode.i_ino;
+}
+
+#endif
+
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
@@ -305,8 +317,7 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
if (root == root->fs_info->tree_root &&
btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
return true;
- if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
- return true;
+
return false;
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a82b9f17f476..e84d22c5c6a8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -136,109 +136,14 @@ static int compression_decompress(int type, struct list_head *ws,
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
- unsigned long disk_size)
-{
- return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
-}
-
-static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
- u64 disk_start)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- const u32 csum_size = fs_info->csum_size;
- const u32 sectorsize = fs_info->sectorsize;
- struct page *page;
- unsigned int i;
- char *kaddr;
- u8 csum[BTRFS_CSUM_SIZE];
- struct compressed_bio *cb = bio->bi_private;
- u8 *cb_sum = cb->sums;
-
- if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
- return 0;
-
- shash->tfm = fs_info->csum_shash;
-
- for (i = 0; i < cb->nr_pages; i++) {
- u32 pg_offset;
- u32 bytes_left = PAGE_SIZE;
- page = cb->compressed_pages[i];
-
- /* Determine the remaining bytes inside the page first */
- if (i == cb->nr_pages - 1)
- bytes_left = cb->compressed_len - i * PAGE_SIZE;
-
- /* Hash through the page sector by sector */
- for (pg_offset = 0; pg_offset < bytes_left;
- pg_offset += sectorsize) {
- kaddr = kmap_atomic(page);
- crypto_shash_digest(shash, kaddr + pg_offset,
- sectorsize, csum);
- kunmap_atomic(kaddr);
-
- if (memcmp(&csum, cb_sum, csum_size) != 0) {
- btrfs_print_data_csum_error(inode, disk_start,
- csum, cb_sum, cb->mirror_num);
- if (btrfs_bio(bio)->device)
- btrfs_dev_stat_inc_and_print(
- btrfs_bio(bio)->device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- return -EIO;
- }
- cb_sum += csum_size;
- disk_start += sectorsize;
- }
- }
- return 0;
-}
-
-/*
- * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
- *
- * Return true if there is no pending bio nor io.
- * Return false otherwise.
- */
-static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- unsigned int bi_size = 0;
- bool last_io = false;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * At endio time, bi_iter.bi_size doesn't represent the real bio size.
- * Thus here we have to iterate through all segments to grab correct
- * bio size.
- */
- bio_for_each_segment_all(bvec, bio, iter_all)
- bi_size += bvec->bv_len;
-
- if (bio->bi_status)
- cb->status = bio->bi_status;
-
- ASSERT(bi_size && bi_size <= cb->compressed_len);
- last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
- &cb->pending_sectors);
- /*
- * Here we must wake up the possible error handler after all other
- * operations on @cb finished, or we can race with
- * finish_compressed_bio_*() which may free @cb.
- */
- wake_up_var(cb);
-
- return last_io;
-}
-
static void finish_compressed_bio_read(struct compressed_bio *cb)
{
unsigned int index;
struct page *page;
+ if (cb->status == BLK_STS_OK)
+ cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
+
/* Release the compressed pages */
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
@@ -247,85 +152,63 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
}
/* Do io completion on the original bio */
- if (cb->status != BLK_STS_OK) {
+ if (cb->status != BLK_STS_OK)
cb->orig_bio->bi_status = cb->status;
- bio_endio(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * We have verified the checksum already, set page checked so
- * the end_io handlers know about it
- */
- ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
- u64 bvec_start = page_offset(bvec->bv_page) +
- bvec->bv_offset;
-
- btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
- bvec->bv_page, bvec_start,
- bvec->bv_len);
- }
-
- bio_endio(cb->orig_bio);
- }
+ bio_endio(cb->orig_bio);
/* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
}
-/* when we finish reading compressed pages from the disk, we
- * decompress them and then run the bio end_io routines on the
- * decompressed pages (in the inode address space).
- *
- * This allows the checksumming and other IO error handling routines
- * to work normally
- *
- * The compressed pages are freed here, and it must be run
- * in process context
+/*
+ * Verify the checksums and kick off repair if needed on the uncompressed data
+ * before decompressing it into the original bio and freeing the uncompressed
+ * pages.
*/
static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- unsigned int mirror = btrfs_bio(bio)->mirror_num;
- int ret = 0;
-
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
-
- /*
- * Record the correct mirror_num in cb->orig_bio so that
- * read-repair can work properly.
- */
- btrfs_bio(cb->orig_bio)->mirror_num = mirror;
- cb->mirror_num = mirror;
-
- /*
- * Some IO in this cb have failed, just skip checksum as there
- * is no way it could be correct.
- */
- if (cb->status != BLK_STS_OK)
- goto csum_failed;
-
- inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), bio,
- bio->bi_iter.bi_sector << 9);
- if (ret)
- goto csum_failed;
+ struct inode *inode = cb->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ blk_status_t status = bio->bi_status;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (!status &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
+ clean_io_failure(fs_info, &bi->io_failure_tree,
+ &bi->io_tree, start, bv.bv_page,
+ btrfs_ino(bi), bv.bv_offset);
+ } else {
+ int ret;
+
+ refcount_inc(&cb->pending_ios);
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ btrfs_submit_data_read_bio);
+ if (ret) {
+ refcount_dec(&cb->pending_ios);
+ status = errno_to_blk_status(ret);
+ }
+ }
+ }
- /* ok, we're the last bio for this extent, lets start
- * the decompression.
- */
- ret = btrfs_decompress_bio(cb);
+ if (status)
+ cb->status = status;
-csum_failed:
- if (ret)
- cb->status = errno_to_blk_status(ret);
- finish_compressed_bio_read(cb);
-out:
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ btrfs_bio_free_csum(bbio);
bio_put(bio);
}
@@ -403,6 +286,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
kfree(cb);
}
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
/*
* Do the cleanup once all the compressed pages hit the disk. This will clear
* writeback on the file pages and free the compressed pages.
@@ -414,29 +305,18 @@ static void end_compressed_bio_write(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
+ if (bio->bi_status)
+ cb->status = bio->bi_status;
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ if (refcount_dec_and_test(&cb->pending_ios)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- finish_compressed_bio_write(cb);
-out:
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ }
bio_put(bio);
}
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct bio *bio, int mirror_num)
-{
- blk_status_t ret;
-
- ASSERT(bio->bi_iter.bi_size);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- return ret;
-}
-
/*
* Allocate a compressed_bio, which will be used to read/write on-disk
* (aka, compressed) * data.
@@ -487,7 +367,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
return ERR_PTR(ret);
}
*next_stripe_start = disk_bytenr + geom.len;
-
+ refcount_inc(&cb->pending_ios);
return bio;
}
@@ -514,26 +394,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct compressed_bio *cb;
u64 cur_disk_bytenr = disk_start;
u64 next_stripe_start;
- blk_status_t ret;
+ blk_status_t ret = BLK_STS_OK;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
- cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->writeback = writeback;
- cb->orig_bio = NULL;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
if (blkcg_css)
@@ -554,8 +433,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
&next_stripe_start);
if (IS_ERR(bio)) {
ret = errno_to_blk_status(PTR_ERR(bio));
- bio = NULL;
- goto finish_cb;
+ break;
}
if (blkcg_css)
bio->bi_opf |= REQ_CGROUP_PUNT;
@@ -599,44 +477,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (submit) {
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, true);
- if (ret)
- goto finish_cb;
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ break;
+ }
}
- ret = submit_compressed_bio(fs_info, bio, 0);
- if (ret)
- goto finish_cb;
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, bio, 0);
bio = NULL;
}
cond_resched();
}
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
-
- return 0;
-finish_cb:
if (blkcg_css)
kthread_associate_blkcg(NULL);
- if (bio) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
- /* Last byte of @cb is submitted, endio will free @cb */
- if (cur_disk_bytenr == disk_start + compressed_len)
- return ret;
-
- wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
- (disk_start + compressed_len - cur_disk_bytenr) >>
- fs_info->sectorsize_bits);
- /*
- * Even with previous bio ended, we should still have io not yet
- * submitted, thus need to finish manually.
- */
- ASSERT(refcount_read(&cb->pending_sectors));
- /* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_write(cb);
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_write(cb);
return ret;
}
@@ -765,7 +624,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int zeros;
zeros = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, zeros);
- flush_dcache_page(page);
}
}
@@ -819,7 +677,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
blk_status_t ret;
int ret2;
int i;
- u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -837,17 +694,15 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb) {
ret = BLK_STS_RESOURCE;
goto out;
}
- refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = inode;
- cb->mirror_num = mirror_num;
- sums = cb->sums;
cb->start = em->orig_start;
em_len = em->len;
@@ -893,9 +748,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
REQ_OP_READ, end_compressed_bio_read,
&next_stripe_start);
if (IS_ERR(comp_bio)) {
- ret = errno_to_blk_status(PTR_ERR(comp_bio));
- comp_bio = NULL;
- goto finish_cb;
+ cb->status = errno_to_blk_status(PTR_ERR(comp_bio));
+ break;
}
}
/*
@@ -931,22 +785,33 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
submit = true;
if (submit) {
- unsigned int nr_sectors;
+ /* Save the original iter for read repair */
+ if (bio_op(comp_bio) == REQ_OP_READ)
+ btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- if (ret)
- goto finish_cb;
+ /*
+ * Save the initial offset of this chunk, as there
+ * is no direct correlation between compressed pages and
+ * the original file offset. The field is only used for
+ * priting error messages.
+ */
+ btrfs_bio(comp_bio)->file_offset = file_offset;
- nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
- fs_info->sectorsize);
- sums += fs_info->csum_size * nr_sectors;
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
+ if (ret) {
+ comp_bio->bi_status = ret;
+ bio_endio(comp_bio);
+ break;
+ }
- ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
- if (ret)
- goto finish_cb;
+ ASSERT(comp_bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, comp_bio, mirror_num);
comp_bio = NULL;
}
}
+
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
return;
fail:
@@ -964,25 +829,6 @@ out:
bio->bi_status = ret;
bio_endio(bio);
return;
-finish_cb:
- if (comp_bio) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
- /* All bytes of @cb is submitted, endio will free @cb */
- if (cur_disk_byte == disk_bytenr + compressed_len)
- return;
-
- wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
- (disk_bytenr + compressed_len - cur_disk_byte) >>
- fs_info->sectorsize_bits);
- /*
- * Even with previous bio ended, we should still have io not yet
- * submitted, thus need to finish @cb manually.
- */
- ASSERT(refcount_read(&cb->pending_sectors));
- /* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_read(cb);
}
/*
@@ -1481,7 +1327,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
ASSERT(copy_start - decompressed < buf_len);
memcpy_to_page(bvec.bv_page, bvec.bv_offset,
buf + copy_start - decompressed, copy_len);
- flush_dcache_page(bvec.bv_page);
cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 2b56d63e01ce..1aa02903de69 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* Number of sectors with unfinished IO (unsubmitted or unfinished) */
- refcount_t pending_sectors;
+ /* Number of outstanding bios */
+ refcount_t pending_ios;
/* Number of compressed pages in the array */
unsigned int nr_pages;
@@ -59,16 +59,12 @@ struct compressed_bio {
/* IO errors */
blk_status_t status;
- int mirror_num;
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
-
- /*
- * the start of a variable length array of checksums only
- * used by reads
- */
- u8 sums[];
+ union {
+ /* For reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+ struct work_struct write_end_work;
+ };
};
static inline unsigned int btrfs_compress_type(unsigned int type_level)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9c21e214d29e..4db85b9dc7ed 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -107,14 +107,6 @@ struct btrfs_ioctl_encoded_io_args;
#define BTRFS_STAT_CURR 0
#define BTRFS_STAT_PREV 1
-/*
- * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
- */
-static inline u32 count_max_extents(u64 size)
-{
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-}
-
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -230,6 +222,13 @@ struct btrfs_root_backup {
#define BTRFS_SUPER_INFO_SIZE 4096
/*
+ * The reserved space at the beginning of each device.
+ * It covers the primary super block and leaves space for potential use by other
+ * tools like bootloaders or to lower potential damage of accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
@@ -248,8 +247,12 @@ struct btrfs_super_block {
__le64 chunk_root;
__le64 log_root;
- /* this will help find the new super based on the log root */
- __le64 log_root_transid;
+ /*
+ * This member has never been utilized since the very beginning, thus
+ * it's always 0 regardless of kernel version. We always use
+ * generation + 1 to read log tree root. So here we mark it deprecated.
+ */
+ __le64 __unused_log_root_transid;
__le64 total_bytes;
__le64 bytes_used;
__le64 root_dir_objectid;
@@ -635,6 +638,9 @@ enum {
/* Indicate we have half completed snapshot deletions pending. */
BTRFS_FS_UNFINISHED_DROPS,
+ /* Indicate we have to finish a zone to do next allocation. */
+ BTRFS_FS_NEED_ZONE_FINISH,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -656,6 +662,18 @@ enum btrfs_exclusive_operation {
BTRFS_EXCLOP_SWAP_ACTIVATE,
};
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -850,11 +868,11 @@ struct btrfs_fs_info {
struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
- struct btrfs_workqueue *endio_workers;
- struct btrfs_workqueue *endio_meta_workers;
- struct btrfs_workqueue *endio_raid56_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *endio_raid56_workers;
struct workqueue_struct *rmw_workers;
- struct btrfs_workqueue *endio_meta_write_workers;
+ struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
@@ -1032,6 +1050,12 @@ struct btrfs_fs_info {
u32 csums_per_leaf;
u32 stripesize;
+ /*
+ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
+ * filesystem, on zoned it depends on the device constraints.
+ */
+ u64 max_extent_size;
+
/* Block groups and devices containing active swapfiles. */
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
@@ -1047,6 +1071,8 @@ struct btrfs_fs_info {
*/
u64 zone_size;
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
@@ -1063,6 +1089,11 @@ struct btrfs_fs_info {
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
+ /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */
+ wait_queue_head_t zone_finish_wait;
+
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
@@ -2475,8 +2506,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
chunk_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
log_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
- log_root_transid, 64);
BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
log_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
@@ -2733,8 +2762,16 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
+static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums,
+ u64 offset)
+{
+ u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
+
+ return csums + offset_in_sectors * fs_info->csum_size;
+}
+
/*
- * Take the number of bytes to be checksummmed and figure out how many leaves
+ * Take the number of bytes to be checksummed and figure out how many leaves
* it would require to store the csums for that many bytes.
*/
static inline u64 btrfs_csum_bytes_to_leaves(
@@ -3251,11 +3288,18 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3305,9 +3349,9 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits);
+ u32 bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, u32 bits);
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
@@ -3353,6 +3397,12 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
@@ -4009,6 +4059,19 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
return fs_info->zone_size > 0;
}
+/*
+ * Count how many fs_info->max_extent_size cover the @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
{
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 36ab0859a263..1e8f17ff829e 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -273,7 +273,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
- u64 nr_extents = count_max_extents(num_bytes);
+ u64 nr_extents = count_max_extents(fs_info, num_bytes);
u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
@@ -350,7 +350,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* needs to free the reservation we just made.
*/
spin_lock(&inode->lock);
- nr_extents = count_max_extents(num_bytes);
+ nr_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
@@ -413,7 +413,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
unsigned num_extents;
spin_lock(&inode->lock);
- num_extents = count_max_extents(num_bytes);
+ num_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, -num_extents);
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 748bf6b0d860..e7f34871a132 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node(
INIT_LIST_HEAD(&delayed_node->p_list);
}
-static inline int btrfs_is_continuous_delayed_item(
- struct btrfs_delayed_item *item1,
- struct btrfs_delayed_item *item2)
-{
- if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
- item1->key.objectid == item2->key.objectid &&
- item1->key.type == item2->key.type &&
- item1->key.offset + 1 == item2->key.offset)
- return 1;
- return 0;
-}
-
static struct btrfs_delayed_node *btrfs_get_delayed_node(
struct btrfs_inode *btrfs_inode)
{
@@ -398,8 +386,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
}
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
- struct btrfs_delayed_item *ins,
- int action)
+ struct btrfs_delayed_item *ins)
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
@@ -408,9 +395,9 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
int cmp;
bool leftmost = true;
- if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
- else if (action == BTRFS_DELAYED_DELETION_ITEM)
+ else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
root = &delayed_node->del_root;
else
BUG();
@@ -436,32 +423,19 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
ins->delayed_node = delayed_node;
- ins->ins_or_del = action;
- if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
- action == BTRFS_DELAYED_INSERTION_ITEM &&
+ /* Delayed items are always for dir index items. */
+ ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY);
+
+ if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM &&
ins->key.offset >= delayed_node->index_cnt)
- delayed_node->index_cnt = ins->key.offset + 1;
+ delayed_node->index_cnt = ins->key.offset + 1;
delayed_node->count++;
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
return 0;
}
-static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_INSERTION_ITEM);
-}
-
-static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_DELETION_ITEM);
-}
-
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
@@ -573,7 +547,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid,
num_bytes, 1);
- item->bytes_reserved = num_bytes;
+ /*
+ * For insertions we track reserved metadata space by accounting
+ * for the number of leaves that will be used, based on the delayed
+ * node's index_items_size field.
+ */
+ if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
+ item->bytes_reserved = num_bytes;
}
return ret;
@@ -599,6 +579,21 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
+static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
+ unsigned int num_leaves)
+{
+ struct btrfs_fs_info *fs_info = node->root->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
+
+ /* There are no space reservations during log replay, bail out. */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
+ bytes, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
+}
+
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -672,22 +667,53 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}
/*
- * Insert a single delayed item or a batch of delayed items that have consecutive
- * keys if they exist.
+ * Insert a single delayed item or a batch of delayed items, as many as possible
+ * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
+ * in the rbtree, and if there's a gap between two consecutive dir index items,
+ * then it means at some point we had delayed dir indexes to add but they got
+ * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
+ * into the subvolume tree. Dir index keys also have their offsets coming from a
+ * monotonically increasing counter, so we can't get new keys with an offset that
+ * fits within a gap between delayed dir index items.
*/
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_node *node = first_item->delayed_node;
LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
- const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_item_batch batch;
int total_size;
char *ins_data = NULL;
int ret;
+ bool continuous_keys_only = false;
+
+ lockdep_assert_held(&node->mutex);
+
+ /*
+ * During normal operation the delayed index offset is continuously
+ * increasing, so we can batch insert all items as there will not be any
+ * overlapping keys in the tree.
+ *
+ * The exception to this is log replay, where we may have interleaved
+ * offsets in the tree, so our batch needs to be continuous keys only in
+ * order to ensure we do not end up with out of order items in our leaf.
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ continuous_keys_only = true;
+
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
+ */
+ ASSERT(first_item->bytes_reserved == 0);
list_add_tail(&first_item->tree_list, &item_list);
batch.total_data_size = first_item->data_len;
@@ -699,9 +725,19 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
int next_size;
next = __btrfs_next_delayed_item(curr);
- if (!next || !btrfs_is_continuous_delayed_item(curr, next))
+ if (!next)
+ break;
+
+ /*
+ * We cannot allow gaps in the key space if we're doing log
+ * replay.
+ */
+ if (continuous_keys_only &&
+ (next->key.offset != curr->key.offset + 1))
break;
+ ASSERT(next->bytes_reserved == 0);
+
next_size = next->data_len + sizeof(struct btrfs_item);
if (total_size + next_size > max_size)
break;
@@ -758,9 +794,41 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * For normal operations we will batch an entire leaf's worth of delayed
+ * items, so if there are more items to process we can decrement
+ * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
+ *
+ * However for log replay we may not have inserted an entire leaf's
+ * worth of items, we may have not had continuous items, so decrementing
+ * here would mess up the index_item_leaves accounting. For this case
+ * only clean up the accounting when there are no items left.
+ */
+ if (next && !continuous_keys_only) {
+ /*
+ * We inserted one batch of items into a leaf a there are more
+ * items to flush in a future batch, now release one unit of
+ * metadata space from the delayed block reserve, corresponding
+ * the leaf we just flushed to.
+ */
+ btrfs_delayed_item_release_leaves(node, 1);
+ node->index_item_leaves--;
+ } else if (!next) {
+ /*
+ * There are no more items to insert. We can have a number of
+ * reserved leaves > 1 here - this happens when many dir index
+ * items are added and then removed before they are flushed (file
+ * names with a very short life, never span a transaction). So
+ * release all remaining leaves.
+ */
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
- btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
}
out:
@@ -796,62 +864,75 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- struct list_head head;
- int nitems, i, last_item;
- int ret = 0;
+ struct extent_buffer *leaf = path->nodes[0];
+ LIST_HEAD(batch_list);
+ int nitems, slot, last_slot;
+ int ret;
+ u64 total_reserved_size = item->bytes_reserved;
- BUG_ON(!path->nodes[0]);
+ ASSERT(leaf != NULL);
- leaf = path->nodes[0];
+ slot = path->slots[0];
+ last_slot = btrfs_header_nritems(leaf) - 1;
+ /*
+ * Our caller always gives us a path pointing to an existing item, so
+ * this can not happen.
+ */
+ ASSERT(slot <= last_slot);
+ if (WARN_ON(slot > last_slot))
+ return -ENOENT;
- i = path->slots[0];
- last_item = btrfs_header_nritems(leaf) - 1;
- if (i > last_item)
- return -ENOENT; /* FIXME: Is errno suitable? */
+ nitems = 1;
+ curr = item;
+ list_add_tail(&curr->tree_list, &batch_list);
- next = item;
- INIT_LIST_HEAD(&head);
- btrfs_item_key_to_cpu(leaf, &key, i);
- nitems = 0;
/*
- * count the number of the dir index items that we can delete in batch
+ * Keep checking if the next delayed item matches the next item in the
+ * leaf - if so, we can add it to the batch of items to delete from the
+ * leaf.
*/
- while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
- list_add_tail(&next->tree_list, &head);
- nitems++;
+ while (slot < last_slot) {
+ struct btrfs_key key;
- curr = next;
next = __btrfs_next_delayed_item(curr);
if (!next)
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
- break;
-
- i++;
- if (i > last_item)
+ slot++;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_comp_cpu_keys(&next->key, &key) != 0)
break;
- btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems++;
+ curr = next;
+ list_add_tail(&curr->tree_list, &batch_list);
+ total_reserved_size += curr->bytes_reserved;
}
- if (!nitems)
- return 0;
-
ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
if (ret)
- goto out;
+ return ret;
+
+ /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
+ if (total_reserved_size > 0) {
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we
+ * don't need to release/reserve qgroup space.
+ */
+ trace_btrfs_space_reservation(fs_info, "delayed_item",
+ item->key.objectid, total_reserved_size,
+ 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
+ total_reserved_size, NULL);
+ }
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(root, curr);
+ list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-out:
- return ret;
+ return 0;
}
static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
@@ -859,43 +940,52 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_deletion_item(node);
- if (!curr)
- goto delete_fail;
+ while (ret == 0) {
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_deletion_item(node);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+
+ ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1);
+ if (ret > 0) {
+ /*
+ * There's no matching item in the leaf. This means we
+ * have already deleted this item in a past run of the
+ * delayed items. We ignore errors when running delayed
+ * items from an async context, through a work queue job
+ * running btrfs_async_run_delayed_root(), and don't
+ * release delayed items that failed to complete. This
+ * is because we will retry later, and at transaction
+ * commit time we always run delayed items and will
+ * then deal with errors if they fail to run again.
+ *
+ * So just release delayed items for which we can't find
+ * an item in the tree, and move to the next item.
+ */
+ btrfs_release_path(path);
+ btrfs_release_delayed_item(item);
+ ret = 0;
+ } else if (ret == 0) {
+ ret = btrfs_batch_delete_items(trans, root, path, item);
+ btrfs_release_path(path);
+ }
- ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- if (ret < 0)
- goto delete_fail;
- else if (ret > 0) {
/*
- * can't find the item which the node points to, so this node
- * is invalid, just drop it.
+ * We unlock and relock on each iteration, this is to prevent
+ * blocking other tasks for too long while we are being run from
+ * the async context (work queue job). Those tasks are typically
+ * running system calls like creat/mkdir/rename/unlink/etc which
+ * need to add delayed items to this delayed node.
*/
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- btrfs_release_delayed_item(prev);
- ret = 0;
- btrfs_release_path(path);
- if (curr) {
- mutex_unlock(&node->mutex);
- goto do_again;
- } else
- goto delete_fail;
+ mutex_unlock(&node->mutex);
}
- btrfs_batch_delete_items(trans, root, path, curr);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-delete_fail:
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
return ret;
}
@@ -1354,9 +1444,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_disk_key *disk_key, u8 type,
u64 index)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
+ bool reserve_leaf_space;
+ u32 data_len;
int ret;
delayed_node = btrfs_get_or_create_delayed_node(dir);
@@ -1372,6 +1466,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
delayed_item->key.objectid = btrfs_ino(dir);
delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
delayed_item->key.offset = index;
+ delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
dir_item->location = *disk_key;
@@ -1381,15 +1476,52 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
+ data_len = delayed_item->data_len + sizeof(struct btrfs_item);
mutex_lock(&delayed_node->mutex);
- ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+
+ if (delayed_node->index_item_leaves == 0 ||
+ delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
+ delayed_node->curr_index_batch_size = data_len;
+ reserve_leaf_space = true;
+ } else {
+ delayed_node->curr_index_batch_size += data_len;
+ reserve_leaf_space = false;
+ }
+
+ if (reserve_leaf_space) {
+ ret = btrfs_delayed_item_reserve_metadata(trans, dir->root,
+ delayed_item);
+ /*
+ * Space was reserved for a dir index item insertion when we
+ * started the transaction, so getting a failure here should be
+ * impossible.
+ */
+ if (WARN_ON(ret)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_item(delayed_item);
+ goto release_node;
+ }
+
+ delayed_node->index_item_leaves++;
+ } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we had a transaction start and excludes the
+ * transaction join case (when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+ }
+
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1417,8 +1549,37 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
return 1;
}
- btrfs_delayed_item_release_metadata(node->root, item);
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
+ */
+ ASSERT(item->bytes_reserved == 0);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * If there's only one leaf reserved, we can decrement this item from the
+ * current batch, otherwise we can not because we don't know which leaf
+ * it belongs to. With the current limit on delayed items, we rarely
+ * accumulate enough dir index items to fill more than one leaf (even
+ * when using a leaf size of 4K).
+ */
+ if (node->index_item_leaves == 1) {
+ const u32 data_len = item->data_len + sizeof(struct btrfs_item);
+
+ ASSERT(node->curr_index_batch_size >= data_len);
+ node->curr_index_batch_size -= data_len;
+ }
+
btrfs_release_delayed_item(item);
+
+ /* If we now have no more dir index items, we can release all leaves. */
+ if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
mutex_unlock(&node->mutex);
return 0;
}
@@ -1451,6 +1612,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
item->key = item_key;
+ item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM;
ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
/*
@@ -1465,7 +1627,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
mutex_lock(&node->mutex);
- ret = __btrfs_add_delayed_deletion_item(node, item);
+ ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1833,12 +1995,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
}
+ if (delayed_node->index_item_leaves > 0) {
+ btrfs_delayed_item_release_leaves(delayed_node,
+ delayed_node->index_item_leaves);
+ delayed_node->index_item_leaves = 0;
+ }
+
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
btrfs_delayed_item_release_metadata(root, curr_item);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index b2412160c5bc..9795dc295a18 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -58,6 +58,17 @@ struct btrfs_delayed_node {
u64 index_cnt;
unsigned long flags;
int count;
+ /*
+ * The size of the next batch of dir index items to insert (if this
+ * node is from a directory inode). Protected by @mutex.
+ */
+ u32 curr_index_batch_size;
+ /*
+ * Number of leaves reserved for inserting dir index items (if this
+ * node belongs to a directory inode). This may be larger then the
+ * actual number of leaves we end up using. Protected by @mutex.
+ */
+ u32 index_item_leaves;
};
struct btrfs_delayed_item {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 99f37fca2e96..36a3debe9493 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = 0;
+ delayed_rsv->full = false;
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
}
@@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
if (num_bytes)
delayed_refs_rsv->reserved += num_bytes;
if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = 1;
+ delayed_refs_rsv->full = true;
spin_unlock(&delayed_refs_rsv->lock);
if (num_bytes)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a7dd6ba25e99..f43196a893ca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -587,7 +587,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
ASSERT(!IS_ERR(em));
map = em->map_lookup;
- num_extents = cur_extent = 0;
+ num_extents = 0;
+ cur_extent = 0;
for (i = 0; i < map->num_stripes; i++) {
/* We have more device extent to copy */
if (srcdev != map->stripes[i].dev)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1b23d806a647..4c3166f3c725 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -51,7 +51,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -64,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete. This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
- struct bio *bio;
- bio_end_io_t *end_io;
- void *private;
- struct btrfs_fs_info *info;
- blk_status_t status;
- enum btrfs_wq_endio_type metadata;
- struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
- btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
- sizeof(struct btrfs_end_io_wq),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_end_io_wq_cache)
- return -ENOMEM;
- return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
-{
- kmem_cache_destroy(btrfs_end_io_wq_cache);
-}
-
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
if (fs_info->csum_shash)
@@ -256,8 +221,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
goto out;
}
btrfs_err_rl(eb->fs_info,
- "parent transid verify failed on %llu wanted %llu found %llu",
- eb->start,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror,
parent_transid, btrfs_header_generation(eb));
ret = 1;
clear_extent_buffer_uptodate(eb);
@@ -587,21 +552,23 @@ static int validate_extent_buffer(struct extent_buffer *eb)
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
- eb->start, found_start);
+ btrfs_err_rl(fs_info,
+ "bad tree block start, mirror %u want %llu have %llu",
+ eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
if (check_tree_block_fsid(eb)) {
- btrfs_err_rl(fs_info, "bad fsid on block %llu",
- eb->start);
+ btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "bad tree block level %d on %llu",
- (int)btrfs_header_level(eb), eb->start);
+ btrfs_err(fs_info,
+ "bad tree block level, mirror %u level %d on logical %llu",
+ eb->read_mirror, btrfs_header_level(eb), eb->start);
ret = -EIO;
goto out;
}
@@ -612,8 +579,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
- "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
- eb->start,
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start, eb->read_mirror,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
@@ -638,8 +605,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
set_extent_buffer_uptodate(eb);
else
btrfs_err(fs_info,
- "block=%llu read time tree block corruption detected",
- eb->start);
+ "read time tree block corruption detected on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
out:
return ret;
}
@@ -740,58 +707,6 @@ err:
return ret;
}
-static void end_workqueue_bio(struct bio *bio)
-{
- struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
- struct btrfs_fs_info *fs_info;
- struct btrfs_workqueue *wq;
-
- fs_info = end_io_wq->info;
- end_io_wq->status = bio->bi_status;
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- wq = fs_info->endio_meta_write_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- wq = fs_info->endio_freespace_worker;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else
- wq = fs_info->endio_write_workers;
- } else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else if (end_io_wq->metadata)
- wq = fs_info->endio_meta_workers;
- else
- wq = fs_info->endio_workers;
- }
-
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
- btrfs_queue_work(wq, &end_io_wq->work);
-}
-
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata)
-{
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
- if (!end_io_wq)
- return BLK_STS_RESOURCE;
-
- end_io_wq->private = bio->bi_private;
- end_io_wq->end_io = bio->bi_end_io;
- end_io_wq->info = info;
- end_io_wq->status = 0;
- end_io_wq->bio = bio;
- end_io_wq->metadata = metadata;
-
- bio->bi_private = end_io_wq;
- bio->bi_end_io = end_workqueue_bio;
- return 0;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -816,7 +731,6 @@ static void run_one_async_done(struct btrfs_work *work)
{
struct async_submit_bio *async;
struct inode *inode;
- blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
inode = async->inode;
@@ -834,11 +748,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
- if (ret) {
- async->bio->bi_status = ret;
- bio_endio(async->bio);
- }
+ btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -849,16 +759,23 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start)
+/*
+ * Submit bio to an async queue.
+ *
+ * Retrun:
+ * - true if the work has been succesfuly submitted
+ * - false in case of error
+ */
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
- return BLK_STS_RESOURCE;
+ return false;
async->inode = inode;
async->bio = bio;
@@ -876,7 +793,7 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
btrfs_queue_work(fs_info->hipri_workers, &async->work);
else
btrfs_queue_work(fs_info->workers, &async->work);
- return 0;
+ return true;
}
static blk_status_t btree_csum_one_bio(struct bio *bio)
@@ -902,7 +819,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
{
/*
* when we're called for a write, we're already in the async
- * submission context. Just jump into btrfs_map_bio
+ * submission context. Just jump into btrfs_submit_bio.
*/
return btree_csum_one_bio(bio);
}
@@ -924,32 +841,29 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
+ bio->bi_opf |= REQ_META;
+
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- /*
- * called for a read, do the setup so that checksum validation
- * can happen in the async kernel threads
- */
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_METADATA);
- if (!ret)
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
- ret = btree_csum_one_bio(bio);
- if (!ret)
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else {
- /*
- * kthread helpers are used to submit writes so that
- * checksumming can happen in parallel across all CPUs
- */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- btree_submit_bio_start);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
}
+ /*
+ * Kthread helpers are used to submit writes so that checksumming can
+ * happen in parallel across all CPUs.
+ */
+ if (should_async_write(fs_info, BTRFS_I(inode)) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ return;
+
+ ret = btree_csum_one_bio(bio);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
+ return;
}
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
#ifdef CONFIG_MIGRATION
@@ -1870,7 +1784,7 @@ again:
fail:
/*
* If our caller provided us an anonymous device, then it's his
- * responsability to free it in case we fail. So we have to set our
+ * responsibility to free it in case we fail. So we have to set our
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
@@ -1953,25 +1867,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
return root;
}
-/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions. This is where read checksum verification actually happens
- */
-static void end_workqueue_fn(struct btrfs_work *work)
-{
- struct bio *bio;
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
- bio = end_io_wq->bio;
-
- bio->bi_status = end_io_wq->status;
- bio->bi_private = end_io_wq->private;
- bio->bi_end_io = end_io_wq->end_io;
- bio_endio(bio);
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-}
-
static int cleaner_kthread(void *arg)
{
struct btrfs_fs_info *fs_info = arg;
@@ -2278,10 +2173,14 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->delalloc_workers);
btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
- btrfs_destroy_workqueue(fs_info->endio_workers);
- btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ if (fs_info->endio_workers)
+ destroy_workqueue(fs_info->endio_workers);
+ if (fs_info->endio_raid56_workers)
+ destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->compressed_write_workers)
+ destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2295,8 +2194,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
* the queues used for metadata I/O, since tasks from those other work
* queues can do metadata I/O operations.
*/
- btrfs_destroy_workqueue(fs_info->endio_meta_workers);
- btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ if (fs_info->endio_meta_workers)
+ destroy_workqueue(fs_info->endio_meta_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2426,7 +2325,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
- memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
+ BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
+ BTRFS_I(inode)->location.type = 0;
+ BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
btrfs_insert_inode_hash(inode);
}
@@ -2475,25 +2376,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
fs_info->endio_workers =
- btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+ alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
- max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
- max_active, 2);
+ alloc_workqueue("btrfs-endio-meta", flags, max_active);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
- max_active, 4);
+ alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
+ fs_info->compressed_write_workers =
+ alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
@@ -2508,7 +2402,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
if (!(fs_info->workers && fs_info->hipri_workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
+ fs_info->compressed_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
@@ -2535,6 +2429,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(csum_shash));
return 0;
}
@@ -3253,6 +3150,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
init_waitqueue_head(&fs_info->delayed_iputs_wait);
+ init_waitqueue_head(&fs_info->zone_finish_wait);
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
@@ -3260,6 +3158,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
+
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
@@ -3591,16 +3491,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
- /*
- * Flag our filesystem as having big metadata blocks if they are bigger
- * than the page size.
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
@@ -3638,8 +3528,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
- if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- btrfs_info(fs_info, "has skinny extents");
+ /*
+ * Flag our filesystem as having big metadata blocks if they are bigger
+ * than the page size.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
/*
* mixed block groups end up with duplicate but slightly offset
@@ -3668,6 +3562,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
err = -EINVAL;
goto fail_alloc;
}
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata write, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (unlikely(features && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4ee8c42c9f78..8993b428e09c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,13 +17,6 @@
*/
#define BTRFS_BDEV_BLOCKSIZE (4096)
-enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA,
- BTRFS_WQ_ENDIO_METADATA,
- BTRFS_WQ_ENDIO_FREE_SPACE,
- BTRFS_WQ_ENDIO_RAID56,
-};
-
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = SZ_16K;
@@ -121,11 +114,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
int level, struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata);
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start);
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
@@ -145,8 +136,6 @@ int btree_lock_page_hook(struct page *page, void *data,
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(u64 objectid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a3afc15430ce..ea3ec1e761e8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bioc has devices
- * associated to its stripes that don't go away while we are discarding.
+ * Avoid races with device replace and make sure the devices in the
+ * stripes don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_io_stripe *stripe;
+ struct btrfs_discard_stripe *stripes;
+ unsigned int num_stripes;
int i;
num_bytes = end - cur;
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bioc, 0);
- /*
- * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
- * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
- * thus we can't continue anyway.
- */
- if (ret < 0)
- goto out;
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ if (IS_ERR(stripes)) {
+ ret = PTR_ERR(stripes);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ break;
+ }
- stripe = bioc->stripes;
- for (i = 0; i < bioc->num_stripes; i++, stripe++) {
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_discard_stripe *stripe = stripes + i;
u64 bytes;
- struct btrfs_device *device = stripe->dev;
- if (!device->bdev) {
+ if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &stripe->dev->dev_state))
continue;
ret = do_discard_extent(stripe, &bytes);
- if (!ret) {
- discarded_bytes += bytes;
- } else if (ret != -EOPNOTSUPP) {
+ if (ret) {
/*
- * Logic errors or -ENOMEM, or -EIO, but
- * unlikely to happen.
- *
- * And since there are two loops, explicitly
- * go to out to avoid confusion.
+ * Keep going if discard is not supported by the
+ * device.
*/
- btrfs_put_bioc(bioc);
- goto out;
+ if (ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
+ } else {
+ discarded_bytes += bytes;
}
-
- /*
- * Just in case we get back EOPNOTSUPP for some reason,
- * just ignore the return value so we don't screw up
- * people calling discard_extent.
- */
- ret = 0;
}
- btrfs_put_bioc(bioc);
+ kfree(stripes);
+ if (ret)
+ break;
cur += num_bytes;
}
-out:
btrfs_bio_counter_dec(fs_info);
-
if (actual_bytes)
*actual_bytes = discarded_bytes;
-
-
- if (ret == -EOPNOTSUPP)
- ret = 0;
return ret;
}
@@ -3981,23 +3965,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
}
}
-static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
- struct find_free_extent_ctl *ffe_ctl)
+static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ /* If we can activate new zone, just allocate a chunk and use it */
+ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+ return 0;
+
+ /*
+ * We already reached the max active zones. Try to finish one block
+ * group to make a room for a new block group. This is only possible
+ * for a data block group because btrfs_zone_finish() may need to wait
+ * for a running transaction which can cause a deadlock for metadata
+ * allocation.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int ret = btrfs_zone_finish_one_bg(fs_info);
+
+ if (ret == 1)
+ return 0;
+ else if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * If we have enough free space left in an already active block group
+ * and we can't activate any other zone now, do not allow allocating a
+ * new chunk and let find_free_extent() retry with a smaller size.
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+ return -ENOSPC;
+
+ /*
+ * Even min_alloc_size is not left in any block groups. Since we cannot
+ * activate a new block group, allocating it may not help. Let's tell a
+ * caller to try again and hope it progress something by writing some
+ * parts of the region. That is only possible for data block groups,
+ * where a part of the region can be written.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
+ return -EAGAIN;
+
+ /*
+ * We cannot activate a new block group and no enough space left in any
+ * block groups. So, allocating a new block group may not help. But,
+ * there is nothing to do anyway, so let's go with it.
+ */
+ return 0;
+}
+
+static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
- return true;
+ return 0;
case BTRFS_EXTENT_ALLOC_ZONED:
- /*
- * If we have enough free space left in an already
- * active block group and we can't activate any other
- * zone now, do not allow allocating a new chunk and
- * let find_free_extent() retry with a smaller size.
- */
- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
- return false;
- return true;
+ return can_allocate_chunk_zoned(fs_info, ffe_ctl);
default:
BUG();
}
@@ -4079,8 +4103,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
int exist = 0;
/*Check if allocation policy allows to create a new chunk */
- if (!can_allocate_chunk(fs_info, ffe_ctl))
- return -ENOSPC;
+ ret = can_allocate_chunk(fs_info, ffe_ctl);
+ if (ret)
+ return ret;
trans = current->journal_info;
if (trans)
@@ -5992,7 +6017,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
*/
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
- u64 start = SZ_1M, len = 0, end = 0;
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
int ret;
*trimmed = 0;
@@ -6036,8 +6061,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
break;
}
- /* Ensure we skip the reserved area in the first 1M */
- start = max_t(u64, start, SZ_1M);
+ /* Ensure we skip the reserved space on each device. */
+ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
/*
* If find_first_clear_extent_bit find a range that spans the
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9eb3056b94d4..bfae67c593c5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -144,6 +144,7 @@ struct tree_entry {
*/
struct btrfs_bio_ctrl {
struct bio *bio;
+ int mirror_num;
enum btrfs_compression_type compress_type;
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
@@ -178,61 +179,56 @@ static int add_extent_changeset(struct extent_state *state, u32 bits,
return ret;
}
-static void submit_one_bio(struct bio *bio, int mirror_num,
- enum btrfs_compression_type compress_type)
+static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
- struct extent_io_tree *tree = bio->bi_private;
+ struct bio *bio;
+ struct bio_vec *bv;
+ struct inode *inode;
+ int mirror_num;
+
+ if (!bio_ctrl->bio)
+ return;
- bio->bi_private = NULL;
+ bio = bio_ctrl->bio;
+ bv = bio_first_bvec_all(bio);
+ inode = bv->bv_page->mapping->host;
+ mirror_num = bio_ctrl->mirror_num;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
- if (is_data_inode(tree->private_data))
- btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
- compress_type);
- else
- btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
- /*
- * Above submission hooks will handle the error by ending the bio,
- * which will do the cleanup properly. So here we should not return
- * any error, or the caller of submit_extent_page() will do cleanup
- * again, causing problems.
- */
-}
+ btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
-/* Cleanup unsubmitted bios */
-static void end_write_bio(struct extent_page_data *epd, int ret)
-{
- struct bio *bio = epd->bio_ctrl.bio;
+ if (!is_data_inode(inode))
+ btrfs_submit_metadata_bio(inode, bio, mirror_num);
+ else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_submit_data_write_bio(inode, bio, mirror_num);
+ else
+ btrfs_submit_data_read_bio(inode, bio, mirror_num,
+ bio_ctrl->compress_type);
- if (bio) {
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- epd->bio_ctrl.bio = NULL;
- }
+ /* The bio is owned by the bi_end_io handler now */
+ bio_ctrl->bio = NULL;
}
/*
- * Submit bio from extent page data via submit_one_bio
- *
- * Return 0 if everything is OK.
- * Return <0 for error.
+ * Submit or fail the current bio in an extent_page_data structure.
*/
-static void flush_write_bio(struct extent_page_data *epd)
+static void submit_write_bio(struct extent_page_data *epd, int ret)
{
struct bio *bio = epd->bio_ctrl.bio;
- if (bio) {
- submit_one_bio(bio, 0, 0);
- /*
- * Clean up of epd->bio is handled by its endio function.
- * And endio is either triggered by successful bio execution
- * or the error handler of submit bio hook.
- * So at this point, no matter what happened, we don't need
- * to clean up epd->bio.
- */
+ if (!bio)
+ return;
+
+ if (ret) {
+ ASSERT(ret < 0);
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ /* The bio is owned by the bi_end_io handler now */
epd->bio_ctrl.bio = NULL;
+ } else {
+ submit_one_bio(&epd->bio_ctrl);
}
}
@@ -376,131 +372,121 @@ void free_extent_state(struct extent_state *state)
}
}
-static struct rb_node *tree_insert(struct rb_root *root,
- struct rb_node *search_start,
- u64 offset,
- struct rb_node *node,
- struct rb_node ***p_in,
- struct rb_node **parent_in)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct tree_entry *entry;
-
- if (p_in && parent_in) {
- p = *p_in;
- parent = *parent_in;
- goto do_insert;
- }
-
- p = search_start ? &search_start : &root->rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- p = &(*p)->rb_left;
- else if (offset > entry->end)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
-do_insert:
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
/**
* Search @tree for an entry that contains @offset. Such entry would have
* entry->start <= offset && entry->end >= offset.
*
* @tree: the tree to search
* @offset: offset that should fall within an entry in @tree
- * @next_ret: pointer to the first entry whose range ends after @offset
- * @prev_ret: pointer to the first entry whose range begins before @offset
- * @p_ret: pointer where new node should be anchored (used when inserting an
+ * @node_ret: pointer where new node should be anchored (used when inserting an
* entry in the tree)
* @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
- * This function returns a pointer to the entry that contains @offset byte
- * address. If no such entry exists, then NULL is returned and the other
- * pointer arguments to the function are filled, otherwise the found entry is
- * returned and other pointers are left untouched.
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return pointer to entry that ends before @offset
+ * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
*/
-static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **next_ret,
- struct rb_node **prev_ret,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
+static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
{
struct rb_root *root = &tree->state;
- struct rb_node **n = &root->rb_node;
+ struct rb_node **node = &root->rb_node;
struct rb_node *prev = NULL;
- struct rb_node *orig_prev = NULL;
struct tree_entry *entry;
- struct tree_entry *prev_entry = NULL;
- while (*n) {
- prev = *n;
+ while (*node) {
+ prev = *node;
entry = rb_entry(prev, struct tree_entry, rb_node);
- prev_entry = entry;
if (offset < entry->start)
- n = &(*n)->rb_left;
+ node = &(*node)->rb_left;
else if (offset > entry->end)
- n = &(*n)->rb_right;
+ node = &(*node)->rb_right;
else
- return *n;
+ return *node;
}
- if (p_ret)
- *p_ret = n;
+ if (node_ret)
+ *node_ret = node;
if (parent_ret)
*parent_ret = prev;
- if (next_ret) {
- orig_prev = prev;
- while (prev && offset > prev_entry->end) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *next_ret = prev;
- prev = orig_prev;
+ /* Search neighbors until we find the first one past the end */
+ while (prev && offset > entry->end) {
+ prev = rb_next(prev);
+ entry = rb_entry(prev, struct tree_entry, rb_node);
}
- if (prev_ret) {
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *prev_ret = prev;
- }
- return NULL;
+ return prev;
}
-static inline struct rb_node *
-tree_search_for_insert(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
{
- struct rb_node *next= NULL;
- struct rb_node *ret;
-
- ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
- if (!ret)
- return next;
- return ret;
+ return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static inline struct rb_node *tree_search(struct extent_io_tree *tree,
- u64 offset)
+/**
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret)
{
- return tree_search_for_insert(tree, offset, NULL, NULL);
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct tree_entry *entry;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return *node;
+ }
+
+ orig_prev = prev;
+ while (prev && offset > entry->end) {
+ prev = rb_next(prev);
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *next_ret = prev;
+ prev = orig_prev;
+
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+ while (prev && offset < entry->start) {
+ prev = rb_prev(prev);
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *prev_ret = prev;
+
+ return NULL;
}
/*
@@ -554,7 +540,7 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, u32 *bits,
+ struct extent_state *state, u32 bits,
struct extent_changeset *changeset);
/*
@@ -568,37 +554,56 @@ static void set_state_bits(struct extent_io_tree *tree,
* probably isn't what you want to call (see set/clear_extent_bit).
*/
static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state, u64 start, u64 end,
- struct rb_node ***p,
- struct rb_node **parent,
- u32 *bits, struct extent_changeset *changeset)
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
{
- struct rb_node *node;
-
- if (end < start) {
- btrfs_err(tree->fs_info,
- "insert state: end < start %llu %llu", end, start);
- WARN_ON(1);
- }
- state->start = start;
- state->end = end;
+ struct rb_node **node;
+ struct rb_node *parent;
+ const u64 end = state->end;
set_state_bits(tree, state, bits, changeset);
- node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
- if (node) {
- struct extent_state *found;
- found = rb_entry(node, struct extent_state, rb_node);
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- found->start, found->end, start, end);
- return -EEXIST;
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct tree_entry *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
}
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
merge_state(tree, state);
return 0;
}
/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
* offset inside 'orig' where it should be split.
@@ -615,7 +620,8 @@ static int insert_state(struct extent_io_tree *tree,
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
- struct rb_node *node;
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
if (tree->private_data && is_data_inode(tree->private_data))
btrfs_split_delalloc_extent(tree->private_data, orig, split);
@@ -625,12 +631,27 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
prealloc->state = orig->state;
orig->start = split;
- node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
- &prealloc->rb_node, NULL, NULL);
- if (node) {
- free_extent_state(prealloc);
- return -EEXIST;
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct tree_entry *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
}
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
return 0;
}
@@ -652,11 +673,11 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- u32 *bits, int wake,
+ u32 bits, int wake,
struct extent_changeset *changeset)
{
struct extent_state *next;
- u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -818,8 +839,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake,
- changeset);
+ state = clear_state_bit(tree, state, bits, wake, changeset);
goto next;
}
goto search_again;
@@ -840,13 +860,13 @@ hit_next:
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, &bits, wake, changeset);
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, &bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, changeset);
next:
if (last_end == (u64)-1)
goto out;
@@ -937,9 +957,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- u32 *bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
- u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data && is_data_inode(tree->private_data))
@@ -1033,11 +1053,9 @@ again:
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
@@ -1060,7 +1078,7 @@ hit_next:
goto out;
}
- set_state_bits(tree, state, &bits, changeset);
+ set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -1116,7 +1134,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits, changeset);
+ set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -1150,8 +1168,9 @@ hit_next:
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, changeset);
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -1179,7 +1198,7 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits, changeset);
+ set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
@@ -1274,10 +1293,9 @@ again:
err = -ENOMEM;
goto out;
}
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
@@ -1294,9 +1312,9 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
+ set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1335,10 +1353,9 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
+ set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0,
- NULL);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1372,8 +1389,9 @@ hit_next:
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, NULL);
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1398,9 +1416,9 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits, NULL);
+ set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
@@ -1674,7 +1692,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
/* Find first extent with bits cleared */
while (1) {
- node = __etree_search(tree, start, &next, &prev, NULL, NULL);
+ node = tree_search_prev_next(tree, start, &prev, &next);
if (!node && !next && !prev) {
/*
* Tree is completely empty, send full range and let
@@ -2007,10 +2025,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u64 orig_start = *start;
const u64 orig_end = *end;
- u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
+ /* The sanity tests may not set a valid fs_info. */
+ u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
bool found;
@@ -2418,6 +2438,20 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
return ret;
}
+static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == failrec->num_copies)
+ return cur_mirror + 1 - failrec->num_copies;
+ return cur_mirror + 1;
+}
+
+static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == 1)
+ return failrec->num_copies;
+ return cur_mirror - 1;
+}
+
/*
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
@@ -2430,7 +2464,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
u64 private;
struct io_failure_record *failrec;
struct extent_state *state;
- int num_copies;
+ int mirror;
int ret;
private = 0;
@@ -2454,20 +2488,19 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
EXTENT_LOCKED);
spin_unlock(&io_tree->lock);
- if (state && state->start <= failrec->start &&
- state->end >= failrec->start + failrec->len - 1) {
- num_copies = btrfs_num_copies(fs_info, failrec->logical,
- failrec->len);
- if (num_copies > 1) {
- repair_io_failure(fs_info, ino, start, failrec->len,
- failrec->logical, page, pg_offset,
- failrec->failed_mirror);
- }
- }
+ if (!state || state->start > failrec->start ||
+ state->end < failrec->start + failrec->len - 1)
+ goto out;
+
+ mirror = failrec->this_mirror;
+ do {
+ mirror = prev_mirror(failrec, mirror);
+ repair_io_failure(fs_info, ino, start, failrec->len,
+ failrec->logical, page, pg_offset, mirror);
+ } while (mirror != failrec->failed_mirror);
out:
free_io_failure(failure_tree, io_tree, failrec);
-
return 0;
}
@@ -2506,17 +2539,16 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
}
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
- u64 start)
+ struct btrfs_bio *bbio,
+ unsigned int bio_offset)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 start = bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
- struct extent_map *em;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
const u32 sectorsize = fs_info->sectorsize;
int ret;
- u64 logical;
failrec = get_state_failrec(failure_tree, start);
if (!IS_ERR(failrec)) {
@@ -2528,7 +2560,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
-
+ ASSERT(failrec->this_mirror == bbio->mirror_num);
+ ASSERT(failrec->len == fs_info->sectorsize);
return failrec;
}
@@ -2538,41 +2571,28 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
failrec->start = start;
failrec->len = sectorsize;
- failrec->this_mirror = 0;
- failrec->compress_type = BTRFS_COMPRESS_NONE;
+ failrec->failed_mirror = bbio->mirror_num;
+ failrec->this_mirror = bbio->mirror_num;
+ failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (!em) {
- read_unlock(&em_tree->lock);
- kfree(failrec);
- return ERR_PTR(-EIO);
- }
+ btrfs_debug(fs_info,
+ "new io failure record logical %llu start %llu",
+ failrec->logical, start);
- if (em->start > start || em->start + em->len <= start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
- if (!em) {
+ failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
+ if (failrec->num_copies == 1) {
+ /*
+ * We only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. No
+ * matter what the error is, it is very likely to persist.
+ */
+ btrfs_debug(fs_info,
+ "cannot repair logical %llu num_copies %d",
+ failrec->logical, failrec->num_copies);
kfree(failrec);
return ERR_PTR(-EIO);
}
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->compress_type = em->compress_type;
- }
-
- btrfs_debug(fs_info,
- "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
- logical, start, failrec->len);
-
- failrec->logical = logical;
- free_extent_map(em);
-
/* Set the bits in the private failure tree */
ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
EXTENT_LOCKED | EXTENT_DIRTY);
@@ -2589,65 +2609,16 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return failrec;
}
-static bool btrfs_check_repairable(struct inode *inode,
- struct io_failure_record *failrec,
- int failed_mirror)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int num_copies;
-
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
- if (num_copies == 1) {
- /*
- * we only have a single copy of the data, so don't bother with
- * all the retry and error correction code that follows. no
- * matter what the error is, it is very likely to persist.
- */
- btrfs_debug(fs_info,
- "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return false;
- }
-
- /* The failure record should only contain one sector */
- ASSERT(failrec->len == fs_info->sectorsize);
-
- /*
- * There are two premises:
- * a) deliver good data to the caller
- * b) correct the bad sectors on disk
- *
- * Since we're only doing repair for one sector, we only need to get
- * a good copy of the failed sector and if we succeed, we have setup
- * everything for repair_io_failure to do the rest for us.
- */
- ASSERT(failed_mirror);
- failrec->failed_mirror = failed_mirror;
- failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
-
- if (failrec->this_mirror > num_copies) {
- btrfs_debug(fs_info,
- "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return false;
- }
-
- return true;
-}
-
-int btrfs_repair_one_sector(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, int failed_mirror,
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
submit_bio_hook_t *submit_bio_hook)
{
+ u64 start = failed_bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
+ struct bio *failed_bio = &failed_bbio->bio;
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
struct btrfs_bio *repair_bbio;
@@ -2657,12 +2628,24 @@ int btrfs_repair_one_sector(struct inode *inode,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- failrec = btrfs_get_io_failure_record(inode, start);
+ failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
if (IS_ERR(failrec))
return PTR_ERR(failrec);
-
- if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
+ /*
+ * There are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ *
+ * Since we're only doing repair for one sector, we only need to get
+ * a good copy of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
+ */
+ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
+ if (failrec->this_mirror == failrec->failed_mirror) {
+ btrfs_debug(fs_info,
+ "failed to repair num_copies %d this_mirror %d failed_mirror %d",
+ failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
@@ -2695,7 +2678,7 @@ int btrfs_repair_one_sector(struct inode *inode,
* will be handled by the endio on the repair_bio, so we can't return an
* error here.
*/
- submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type);
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
return BLK_STS_OK;
}
@@ -2727,21 +2710,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_subpage_end_reader(fs_info, page, start, len);
}
-static blk_status_t submit_data_read_repair(struct inode *inode,
- struct bio *failed_bio,
- u32 bio_offset, struct page *page,
- unsigned int pgoff,
- u64 start, u64 end,
- int failed_mirror,
- unsigned int error_bitmap)
+static void end_sector_io(struct page *page, u64 offset, bool uptodate)
+{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct extent_state *cached = NULL;
+
+ end_page_read(page, uptodate, offset, sectorsize);
+ if (uptodate)
+ set_extent_uptodate(&inode->io_tree, offset,
+ offset + sectorsize - 1, &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(&inode->io_tree, offset,
+ offset + sectorsize - 1, &cached);
+}
+
+static void submit_data_read_repair(struct inode *inode,
+ struct btrfs_bio *failed_bbio,
+ u32 bio_offset, const struct bio_vec *bvec,
+ unsigned int error_bitmap)
{
+ const unsigned int pgoff = bvec->bv_offset;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct page *page = bvec->bv_page;
+ const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
+ const u64 end = start + bvec->bv_len - 1;
const u32 sectorsize = fs_info->sectorsize;
const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
- int error = 0;
int i;
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+ BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
/* This repair is only for data */
ASSERT(is_data_inode(inode));
@@ -2753,12 +2750,11 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
* We only get called on buffered IO, thus page must be mapped and bio
* must not be cloned.
*/
- ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
+ ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
/* Iterate through all the sectors in the range */
for (i = 0; i < nr_bits; i++) {
const unsigned int offset = i * sectorsize;
- struct extent_state *cached = NULL;
bool uptodate = false;
int ret;
@@ -2771,10 +2767,9 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
goto next;
}
- ret = btrfs_repair_one_sector(inode, failed_bio,
- bio_offset + offset,
- page, pgoff + offset, start + offset,
- failed_mirror, btrfs_submit_data_bio);
+ ret = btrfs_repair_one_sector(inode, failed_bbio,
+ bio_offset + offset, page, pgoff + offset,
+ btrfs_submit_data_read_bio);
if (!ret) {
/*
* We have submitted the read repair, the page release
@@ -2785,24 +2780,12 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
continue;
}
/*
- * Repair failed, just record the error but still continue.
- * Or the remaining sectors will not be properly unlocked.
+ * Continue on failed repair, otherwise the remaining sectors
+ * will not be properly unlocked.
*/
- if (!error)
- error = ret;
next:
- end_page_read(page, uptodate, start + offset, sectorsize);
- if (uptodate)
- set_extent_uptodate(&BTRFS_I(inode)->io_tree,
- start + offset,
- start + offset + sectorsize - 1,
- &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
- start + offset,
- start + offset + sectorsize - 1,
- &cached);
+ end_sector_io(page, start + offset, uptodate);
}
- return errno_to_blk_status(error);
}
/* lots and lots of room for performance fixes in the end_bio funcs */
@@ -3017,7 +3000,6 @@ static void end_bio_extent_readpage(struct bio *bio)
*/
u32 bio_offset = 0;
int mirror;
- int ret;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -3028,6 +3010,7 @@ static void end_bio_extent_readpage(struct bio *bio)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
unsigned int error_bitmap = (unsigned int)-1;
+ bool repair = false;
u64 start;
u64 end;
u32 len;
@@ -3065,57 +3048,23 @@ static void end_bio_extent_readpage(struct bio *bio)
if (is_data_inode(inode)) {
error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
- ret = error_bitmap;
+ if (error_bitmap)
+ uptodate = false;
} else {
- ret = btrfs_validate_metadata_buffer(bbio,
- page, start, end, mirror);
+ if (btrfs_validate_metadata_buffer(bbio,
+ page, start, end, mirror))
+ uptodate = false;
}
- if (ret)
- uptodate = false;
- else
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, tree, start,
- page,
- btrfs_ino(BTRFS_I(inode)), 0);
}
- if (likely(uptodate))
- goto readpage_ok;
-
- if (is_data_inode(inode)) {
- /*
- * If we failed to submit the IO at all we'll have a
- * mirror_num == 0, in which case we need to just mark
- * the page with an error and unlock it and carry on.
- */
- if (mirror == 0)
- goto readpage_ok;
-
- /*
- * submit_data_read_repair() will handle all the good
- * and bad sectors, we just continue to the next bvec.
- */
- submit_data_read_repair(inode, bio, bio_offset, page,
- start - page_offset(page),
- start, end, mirror,
- error_bitmap);
-
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
- continue;
- } else {
- struct extent_buffer *eb;
-
- eb = find_extent_buffer_readpage(fs_info, page, start);
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- eb->read_mirror = mirror;
- atomic_dec(&eb->io_pages);
- }
-readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
+ clean_io_failure(BTRFS_I(inode)->root->fs_info,
+ failure_tree, tree, start, page,
+ btrfs_ino(BTRFS_I(inode)), 0);
+
/*
* Zero out the remaining part if this range straddles
* i_size.
@@ -3132,14 +3081,44 @@ readpage_ok:
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);
}
+ } else if (is_data_inode(inode)) {
+ /*
+ * Only try to repair bios that actually made it to a
+ * device. If the bio failed to be submitted mirror
+ * is 0 and we need to fail it without retrying.
+ *
+ * This also includes the high level bios for compressed
+ * extents - these never make it to a device and repair
+ * is already handled on the lower compressed bio.
+ */
+ if (mirror > 0)
+ repair = true;
+ } else {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer_readpage(fs_info, page, start);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = mirror;
+ atomic_dec(&eb->io_pages);
+ }
+
+ if (repair) {
+ /*
+ * submit_data_read_repair() will handle all the good
+ * and bad sectors, we just continue to the next bvec.
+ */
+ submit_data_read_repair(inode, bbio, bio_offset, bvec,
+ error_bitmap);
+ } else {
+ /* Update page status and unlock */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, PageUptodate(page));
}
+
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
- /* Update page status and unlock */
- end_page_read(page, uptodate, start, len);
- endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, PageUptodate(page));
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3208,19 +3187,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
return bio;
}
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
-{
- struct btrfs_bio *bbio;
- struct bio *new;
-
- /* Bio allocation backed by a bioset does not fail */
- new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
- bbio = btrfs_bio(new);
- btrfs_bio_init(bbio);
- bbio->iter = bio->bi_iter;
- return new;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
@@ -3380,7 +3346,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio_ctrl->bio = bio;
bio_ctrl->compress_type = compress_type;
bio->bi_end_io = end_io_func;
- bio->bi_private = &inode->io_tree;
bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
@@ -3445,7 +3410,6 @@ static int submit_extent_page(blk_opf_t opf,
struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
bio_end_io_t end_io_func,
- int mirror_num,
enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
@@ -3457,10 +3421,8 @@ static int submit_extent_page(blk_opf_t opf,
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
- if (force_bio_submit && bio_ctrl->bio) {
- submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
- bio_ctrl->bio = NULL;
- }
+ if (force_bio_submit)
+ submit_one_bio(bio_ctrl);
while (cur < pg_offset + size) {
u32 offset = cur - pg_offset;
@@ -3500,8 +3462,7 @@ static int submit_extent_page(blk_opf_t opf,
if (added < size - offset) {
/* The bio should contain some page(s) */
ASSERT(bio_ctrl->bio->bi_iter.bi_size);
- submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
- bio_ctrl->bio = NULL;
+ submit_one_bio(bio_ctrl);
}
cur += added;
}
@@ -3649,7 +3610,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, iosize);
- flush_dcache_page(page);
}
}
begin_page_read(fs_info, page);
@@ -3664,7 +3624,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = PAGE_SIZE - pg_offset;
memzero_page(page, pg_offset, iosize);
- flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
@@ -3748,7 +3707,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct extent_state *cached = NULL;
memzero_page(page, pg_offset, iosize);
- flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
@@ -3781,10 +3739,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
bio_ctrl, page, disk_bytenr, iosize,
- pg_offset,
- end_bio_extent_readpage, 0,
- this_bio_flag,
- force_bio_submit);
+ pg_offset, end_bio_extent_readpage,
+ this_bio_flag, force_bio_submit);
if (ret) {
/*
* We have to unlock the remaining range, or the page
@@ -3817,8 +3773,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
* If btrfs_do_readpage() failed we will want to submit the assembled
* bio to do the cleanup.
*/
- if (bio_ctrl.bio)
- submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
+ submit_one_bio(&bio_ctrl);
return ret;
}
@@ -4101,7 +4056,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
disk_bytenr, iosize,
cur - page_offset(page),
end_bio_extent_writepage,
- 0, 0, false);
+ 0, false);
if (ret) {
has_error = true;
if (!saved_ret)
@@ -4166,10 +4121,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- if (page->index == end_index) {
+ if (page->index == end_index)
memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
- flush_dcache_page(page);
- }
ret = set_page_extent_mapped(page);
if (ret < 0) {
@@ -4278,7 +4231,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
flush = 1;
btrfs_tree_lock(eb);
}
@@ -4288,7 +4241,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!epd->sync_io)
return 0;
if (!flush) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
flush = 1;
}
while (1) {
@@ -4335,7 +4288,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
flush = 1;
}
lock_page(p);
@@ -4577,7 +4530,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
- blk_opf_t write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
bool no_dirty_ebs = false;
int ret;
@@ -4596,7 +4549,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
&epd->bio_ctrl, page, eb->start, eb->len,
eb->start - page_offset(page),
- end_bio_subpage_eb_writepage, 0, 0, false);
+ end_bio_subpage_eb_writepage, 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
@@ -4622,7 +4575,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
{
u64 disk_bytenr = eb->start;
int i, num_pages;
- blk_opf_t write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
int ret = 0;
prepare_eb_write(eb);
@@ -4637,7 +4590,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
&epd->bio_ctrl, p, disk_bytenr,
PAGE_SIZE, 0,
end_bio_extent_buffer_writepage,
- 0, 0, false);
+ 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
@@ -4751,7 +4704,7 @@ static int submit_eb_subpage(struct page *page,
cleanup:
/* We hit error, end bio for the submitted extent buffers */
- end_write_bio(epd, ret);
+ submit_write_bio(epd, ret);
return ret;
}
@@ -4930,10 +4883,6 @@ retry:
index = 0;
goto retry;
}
- if (ret < 0) {
- end_write_bio(&epd, ret);
- goto out;
- }
/*
* If something went wrong, don't allow any metadata write bio to be
* submitted.
@@ -4960,21 +4909,17 @@ retry:
* Now such dirty tree block will not be cleaned by any dirty
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
- */
- if (!BTRFS_FS_ERROR(fs_info)) {
- flush_write_bio(&epd);
- } else {
- ret = -EROFS;
- end_write_bio(&epd, ret);
- }
-out:
- btrfs_zoned_meta_io_unlock(fs_info);
- /*
+ *
* We can get ret > 0 from submit_extent_page() indicating how many ebs
* were submitted. Reset it to 0 to avoid false alerts for the caller.
*/
if (ret > 0)
ret = 0;
+ if (!ret && BTRFS_FS_ERROR(fs_info))
+ ret = -EROFS;
+ submit_write_bio(&epd, ret);
+
+ btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
@@ -5076,7 +5021,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
lock_page(page);
}
@@ -5087,7 +5032,7 @@ retry:
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page))
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
wait_on_page_writeback(page);
}
@@ -5127,7 +5072,7 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- flush_write_bio(epd);
+ submit_write_bio(epd, 0);
goto retry;
}
@@ -5138,26 +5083,6 @@ retry:
return ret;
}
-int extent_write_full_page(struct page *page, struct writeback_control *wbc)
-{
- int ret;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
- .extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
- };
-
- ret = __extent_writepage(page, wbc, &epd);
- ASSERT(ret <= 0);
- if (ret < 0) {
- end_write_bio(&epd, ret);
- return ret;
- }
-
- flush_write_bio(&epd);
- return ret;
-}
-
/*
* Submit the pages in the range to bio for call sites which delalloc range has
* already been ran (aka, ordered extent inserted) and all pages are still
@@ -5215,10 +5140,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
cur = cur_end + 1;
}
- if (!found_error)
- flush_write_bio(&epd);
- else
- end_write_bio(&epd, ret);
+ submit_write_bio(&epd, found_error ? ret : 0);
wbc_detach_inode(&wbc_writepages);
if (found_error)
@@ -5243,13 +5165,7 @@ int extent_writepages(struct address_space *mapping,
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
- ASSERT(ret <= 0);
- if (ret < 0) {
- btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
- end_write_bio(&epd, ret);
- return ret;
- }
- flush_write_bio(&epd);
+ submit_write_bio(&epd, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
@@ -5272,9 +5188,7 @@ void extent_readahead(struct readahead_control *rac)
if (em_cached)
free_extent_map(em_cached);
-
- if (bio_ctrl.bio)
- submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
+ submit_one_bio(&bio_ctrl);
}
/*
@@ -6206,7 +6120,7 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
return -EINVAL;
}
if (fs_info->nodesize >= PAGE_SIZE &&
- !IS_ALIGNED(start, PAGE_SIZE)) {
+ !PAGE_ALIGNED(start)) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
start, fs_info->nodesize);
@@ -6608,7 +6522,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
int ret = 0;
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
@@ -6640,11 +6556,10 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
+ ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
page, eb->start, eb->len,
eb->start - page_offset(page),
- end_bio_extent_readpage, mirror_num, 0,
- true);
+ end_bio_extent_readpage, 0, true);
if (ret) {
/*
* In the endio function, if we hit something wrong we will
@@ -6653,10 +6568,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
- if (bio_ctrl.bio) {
- submit_one_bio(bio_ctrl.bio, mirror_num, 0);
- bio_ctrl.bio = NULL;
- }
+ submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -6676,7 +6588,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -6747,10 +6661,10 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ err = submit_extent_page(REQ_OP_READ, NULL,
&bio_ctrl, page, page_offset(page),
PAGE_SIZE, 0, end_bio_extent_readpage,
- mirror_num, 0, false);
+ 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
@@ -6767,10 +6681,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
- if (bio_ctrl.bio) {
- submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type);
- bio_ctrl.bio = NULL;
- }
+ submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
return ret;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 23d4103c8831..4bc72a87b9a9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -57,6 +57,7 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+struct btrfs_bio;
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
@@ -142,15 +143,10 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
struct extent_map_tree;
-typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
-
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
-int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
@@ -247,7 +243,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
@@ -266,15 +261,13 @@ struct io_failure_record {
u64 start;
u64 len;
u64 logical;
- enum btrfs_compression_type compress_type;
int this_mirror;
int failed_mirror;
+ int num_copies;
};
-int btrfs_repair_one_sector(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, int failed_mirror,
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
submit_bio_hook_t *submit_bio_hook);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 30e6aef9739f..66c822182ecc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1954,11 +1954,25 @@ again:
btrfs_inode_unlock(inode, ilock_flags);
- /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
+ /*
+ * If 'err' is -ENOTBLK or we have not written all data, then it means
+ * we must fallback to buffered IO.
+ */
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
goto out;
buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * since we currently don't have NOWAIT semantics support for buffered IO
+ * and may block there for many reasons (reserving space for example).
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ err = -EAGAIN;
+ goto out;
+ }
+
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
@@ -2041,9 +2055,11 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
num_written = btrfs_encoded_write(iocb, from, encoded);
num_sync = encoded->len;
} else if (iocb->ki_flags & IOCB_DIRECT) {
- num_written = num_sync = btrfs_direct_write(iocb, from);
+ num_written = btrfs_direct_write(iocb, from);
+ num_sync = num_written;
} else {
- num_written = num_sync = btrfs_buffered_write(iocb, from);
+ num_written = btrfs_buffered_write(iocb, from);
+ num_sync = num_written;
}
btrfs_set_inode_last_sub_trans(inode);
@@ -2291,7 +2307,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
/* we've logged all the items and now have a consistent
@@ -2717,7 +2733,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
goto out;
}
rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ rsv->failfast = true;
/*
* 1 - update the inode
@@ -3083,7 +3099,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b1ae3ba2ca2c..996da650ecdc 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3536,7 +3536,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
* data, keep it dense.
*/
if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
- cont1_bytes = min_bytes = bytes + empty_size;
+ cont1_bytes = bytes + empty_size;
+ min_bytes = cont1_bytes;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
cont1_bytes = bytes;
min_bytes = fs_info->sectorsize;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3867cb0646e9..f0c97d25b4a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock);
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate);
-
/*
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
*
@@ -195,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = page_offset(locked_page);
- u64 page_end = page_start + PAGE_SIZE - 1;
-
+ u64 page_start, page_end;
struct page *page;
+ if (locked_page) {
+ page_start = page_offset(locked_page);
+ page_end = page_start + PAGE_SIZE - 1;
+ }
+
while (index <= end_index) {
/*
* For locked page, we will call end_extent_writepage() on it
@@ -212,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
- if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ if (locked_page && index == (page_start >> PAGE_SHIFT)) {
index++;
continue;
}
@@ -223,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
/*
* Here we just clear all Ordered bits for every page in the
- * range, then __endio_write_update_ordered() will handle
+ * range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
@@ -231,20 +230,23 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
put_page(page);
}
- /* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
- return;
- /*
- * In case this page belongs to the delalloc range being instantiated
- * then skip it, since the first page of a range is going to be
- * properly cleaned up by the caller of run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
- offset = page_offset(locked_page) + PAGE_SIZE;
+ if (locked_page) {
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_start + PAGE_SIZE)
+ return;
+ /*
+ * In case this page belongs to the delalloc range being
+ * instantiated then skip it, since the first page of a range is
+ * going to be properly cleaned up by the caller of
+ * run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
+ }
}
- return __endio_write_update_ordered(inode, offset, bytes, false);
+ return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -332,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = kmap_local_page(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
i++;
ptr += cur_size;
@@ -345,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
} else {
page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
write_extent_buffer(leaf, kaddr, ptr, size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_page(page);
}
btrfs_mark_buffer_dirty(leaf);
@@ -560,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
* will unlock the full page.
*/
if (fs_info->sectorsize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(end + 1, PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(end + 1))
return 0;
}
@@ -678,8 +680,8 @@ again:
* Thus we must also check against @actual_end, not just @end.
*/
if (blocksize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(round_up(actual_end, blocksize)))
goto cleanup_and_bail_uncompressed;
}
@@ -920,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
* can directly submit them without interruption.
*/
ret = cow_file_range(inode, locked_page, start, end, &page_started,
- &nr_written, 0);
+ &nr_written, 0, NULL);
/* Inline extent inserted, page gets unlocked and everything is done */
if (page_started) {
ret = 0;
goto out;
}
if (ret < 0) {
- if (locked_page)
+ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
+ if (locked_page) {
+ const u64 page_start = page_offset(locked_page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
+
+ btrfs_page_set_error(inode->root->fs_info, locked_page,
+ page_start, PAGE_SIZE);
+ set_page_writeback(locked_page);
+ end_page_writeback(locked_page);
+ end_extent_writepage(locked_page, ret, page_start, page_end);
unlock_page(locked_page);
+ }
goto out;
}
@@ -1133,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* *page_started is set to one if we unlock locked_page and do everything
* required to start IO on it. It may be clean and already done with
* IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, page locking state will be as follow:
+ *
+ * - page_started == 1 (return value)
+ * - All the pages are unlocked. IO is started.
+ * - Note that this can happen only on success
+ * - unlock == 1
+ * - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ * - On success, all the pages are locked for writing out them
+ * - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
+ * example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock)
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
+ u64 orig_start = start;
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
@@ -1329,18 +1365,62 @@ out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ /*
+ * If done_offset is non-NULL and ret == -EAGAIN, we expect the
+ * caller to write out the successfully allocated region and retry.
+ */
+ if (done_offset && ret == -EAGAIN) {
+ if (orig_start < start)
+ *done_offset = start - 1;
+ else
+ *done_offset = start;
+ return ret;
+ } else if (ret == -EAGAIN) {
+ /* Convert to -ENOSPC since the caller cannot retry. */
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Now, we have three regions to clean up:
+ *
+ * |-------(1)----|---(2)---|-------------(3)----------|
+ * `- orig_start `- start `- start + cur_alloc_size `- end
+ *
+ * We process each region below.
+ */
+
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
+ /*
+ * For the range (1). We have already instantiated the ordered extents
+ * for this region. They are cleaned up by
+ * btrfs_cleanup_ordered_extents() in e.g,
+ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+ * function.
+ *
+ * However, in case of unlock == 0, we still need to unlock the pages
+ * (except @locked_page) to ensure all the pages are unlocked.
+ */
+ if (!unlock && orig_start < start) {
+ if (!locked_page)
+ mapping_set_error(inode->vfs_inode.i_mapping, ret);
+ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+ locked_page, 0, page_ops);
+ }
+
/*
- * If we reserved an extent for our delalloc range (or a subrange) and
- * failed to create the respective ordered extent, then it means that
- * when we reserved the extent we decremented the extent's size from
- * the data space_info's bytes_may_use counter and incremented the
- * space_info's bytes_reserved counter by the same amount. We must make
- * sure extent_clear_unlock_delalloc() does not try to decrement again
- * the data space_info's bytes_may_use counter, therefore we do not pass
- * it the flag EXTENT_CLEAR_DATA_RESV.
+ * For the range (2). If we reserved an extent for our delalloc range
+ * (or a subrange) and failed to create the respective ordered extent,
+ * then it means that when we reserved the extent we decremented the
+ * extent's size from the data space_info's bytes_may_use counter and
+ * incremented the space_info's bytes_reserved counter by the same
+ * amount. We must make sure extent_clear_unlock_delalloc() does not try
+ * to decrement again the data space_info's bytes_may_use counter,
+ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
@@ -1350,12 +1430,19 @@ out_unlock:
page_ops);
start += cur_alloc_size;
if (start >= end)
- goto out;
+ return ret;
}
+
+ /*
+ * For the range (3). We never touched the region. In addition to the
+ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+ * space_info's bytes_may_use counter, reserved in
+ * btrfs_check_data_free_space().
+ */
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits | EXTENT_CLEAR_DATA_RESV,
page_ops);
- goto out;
+ return ret;
}
/*
@@ -1538,19 +1625,42 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
u64 end, int *page_started,
unsigned long *nr_written)
{
+ u64 done_offset = end;
int ret;
+ bool locked_page_done = false;
- ret = cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 0);
- if (ret)
- return ret;
+ while (start <= end) {
+ ret = cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 0, &done_offset);
+ if (ret && ret != -EAGAIN)
+ return ret;
- if (*page_started)
- return 0;
+ if (*page_started) {
+ ASSERT(ret == 0);
+ return 0;
+ }
+
+ if (ret == 0)
+ done_offset = end;
+
+ if (done_offset == start) {
+ struct btrfs_fs_info *info = inode->root->fs_info;
+
+ wait_var_event(&info->zone_finish_wait,
+ !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags));
+ continue;
+ }
+
+ if (!locked_page_done) {
+ __set_page_dirty_nobuffers(locked_page);
+ account_page_redirty(locked_page);
+ }
+ locked_page_done = true;
+ extent_write_locked_range(&inode->vfs_inode, start, done_offset);
+
+ start = done_offset + 1;
+ }
- __set_page_dirty_nobuffers(locked_page);
- account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
@@ -1642,7 +1752,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
}
return cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 1);
+ nr_written, 1, NULL);
}
struct can_nocow_file_extent_args {
@@ -2115,7 +2225,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
page_started, nr_written);
else
ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ page_started, nr_written, 1, NULL);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
@@ -2131,6 +2241,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 size;
/* not delalloc, ignore it */
@@ -2138,7 +2249,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
return;
size = orig->end - orig->start + 1;
- if (size > BTRFS_MAX_EXTENT_SIZE) {
+ if (size > fs_info->max_extent_size) {
u32 num_extents;
u64 new_size;
@@ -2147,10 +2258,10 @@ void btrfs_split_delalloc_extent(struct inode *inode,
* applies here, just in reverse.
*/
new_size = orig->end - split + 1;
- num_extents = count_max_extents(new_size);
+ num_extents = count_max_extents(fs_info, new_size);
new_size = split - orig->start;
- num_extents += count_max_extents(new_size);
- if (count_max_extents(size) >= num_extents)
+ num_extents += count_max_extents(fs_info, new_size);
+ if (count_max_extents(fs_info, size) >= num_extents)
return;
}
@@ -2167,6 +2278,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size, old_size;
u32 num_extents;
@@ -2180,7 +2292,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
new_size = other->end - new->start + 1;
/* we're not bigger than the max, unreserve the space and go */
- if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ if (new_size <= fs_info->max_extent_size) {
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
@@ -2206,10 +2318,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
* this case.
*/
old_size = other->end - other->start + 1;
- num_extents = count_max_extents(old_size);
+ num_extents = count_max_extents(fs_info, old_size);
old_size = new->end - new->start + 1;
- num_extents += count_max_extents(old_size);
- if (count_max_extents(new_size) >= num_extents)
+ num_extents += count_max_extents(fs_info, old_size);
+ if (count_max_extents(fs_info, new_size) >= num_extents)
return;
spin_lock(&BTRFS_I(inode)->lock);
@@ -2274,21 +2386,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* list of inodes that have pending delalloc work to be done.
*/
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits)
+ u32 bits)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
spin_lock(&BTRFS_I(inode)->lock);
@@ -2303,7 +2415,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- if (*bits & EXTENT_DEFRAG)
+ if (bits & EXTENT_DEFRAG)
BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
@@ -2312,7 +2424,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
state->start;
@@ -2325,14 +2437,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* accounting happens.
*/
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
- struct extent_state *state, unsigned *bits)
+ struct extent_state *state, u32 bits)
{
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
- if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
+ if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
spin_unlock(&inode->lock);
@@ -2343,7 +2455,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
bool do_list = !btrfs_is_free_space_inode(inode);
@@ -2356,7 +2468,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* don't need to call delalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_CLEAR_META_RESV &&
+ if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len, false);
@@ -2366,7 +2478,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
- (*bits & EXTENT_CLEAR_DATA_RESV))
+ (bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
@@ -2381,11 +2493,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
- if (*bits & EXTENT_ADD_INODE_BYTES)
+ if (bits & EXTENT_ADD_INODE_BYTES)
inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
@@ -2580,95 +2692,78 @@ out:
return errno_to_blk_status(ret);
}
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read: sync submit
- *
- * b) write without checksum: sync submit
- *
- * c) write with checksum:
- * c-1) if bio is issued by fsync: sync submit
- * (sync_writers != 0)
- *
- * c-2) if root is reloc root: sync submit
- * (only in case of buffered IO)
- *
- * c-3) otherwise: async submit
- */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type)
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- blk_status_t ret = 0;
- int skip_sum;
- int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ blk_status_t ret;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct page *page = bio_first_bvec_all(bio)->bv_page;
- loff_t file_offset = page_offset(page);
-
- ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ ret = extract_ordered_extent(bi, bio,
+ page_offset(bio_first_bvec_all(bio)->bv_page));
if (ret)
goto out;
}
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
- if (ret)
- goto out;
-
- if (compress_type != BTRFS_COMPRESS_NONE) {
- /*
- * btrfs_submit_compressed_read will handle completing
- * the bio if there were any errors, so just return
- * here.
- */
- btrfs_submit_compressed_read(inode, bio, mirror_num);
+ /*
+ * If we need to checksum, and the I/O is not issued by fsync and
+ * friends, that is ->sync_writers != 0, defer the submission to a
+ * workqueue to parallelize it.
+ *
+ * Csum items for reloc roots have already been cloned at this point,
+ * so they are handled as part of the no-checksum case.
+ */
+ if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !btrfs_is_data_reloc_root(bi->root)) {
+ if (!atomic_read(&bi->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ btrfs_submit_bio_start))
return;
- } else {
- /*
- * Lookup bio sums does extra checks around whether we
- * need to csum or not, which is why we ignore skip_sum
- * here.
- */
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
- if (ret)
- goto out;
- }
- goto mapit;
- } else if (async && !skip_sum) {
- /* csum items have already been cloned */
- if (btrfs_is_data_reloc_root(root))
- goto mapit;
- /* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num,
- 0, btrfs_submit_bio_start);
- goto out;
- } else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
+
+ ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
if (ret)
goto out;
}
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
+out:
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
+}
-mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
-out:
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ /*
+ * btrfs_submit_compressed_read will handle completing the bio
+ * if there were any errors, so just return here.
+ */
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
+ }
+
+ /* Save the original iter for read repair */
+ btrfs_bio(bio)->iter = bio->bi_iter;
+
+ /*
+ * Lookup bio sums does extra checks around whether we need to csum or
+ * not, which is why we ignore skip_sum here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
+ return;
}
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
/*
@@ -3075,8 +3170,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
- num_bytes = ram_bytes = oe->truncated_len;
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ num_bytes = oe->truncated_len;
+ ram_bytes = num_bytes;
+ }
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3102,7 +3199,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
struct btrfs_root *root = inode->root;
@@ -3311,65 +3408,71 @@ out:
return ret;
}
-static void finish_ordered_fn(struct btrfs_work *work)
-{
- struct btrfs_ordered_extent *ordered_extent;
- ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
- btrfs_finish_ordered_io(ordered_extent);
-}
-
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
- finish_ordered_fn, uptodate);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
+}
+
+/*
+ * Verify the checksum for a single sector without any extra action that depend
+ * on the type of I/O.
+ */
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected)
+{
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ char *kaddr;
+
+ ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+
+ shash->tfm = fs_info->csum_shash;
+
+ kaddr = kmap_local_page(page) + pgoff;
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ kunmap_local(kaddr);
+
+ if (memcmp(csum, csum_expected, fs_info->csum_size))
+ return -EIO;
+ return 0;
}
/*
* check_data_csum - verify checksum of one sector of uncompressed data
* @inode: inode
- * @io_bio: btrfs_io_bio which contains the csum
+ * @bbio: btrfs_bio which contains the csum
* @bio_offset: offset to the beginning of the bio (in bytes)
* @page: page where is the data to be verified
* @pgoff: offset inside the page
- * @start: logical offset in the file
*
* The length of such check is always one sector size.
+ *
+ * When csum mismatch is detected, we will also report the error and fill the
+ * corrupted range with zero. (Thus it needs the extra parameters)
*/
-static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff,
- u64 start)
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
u32 len = fs_info->sectorsize;
- const u32 csum_size = fs_info->csum_size;
- unsigned int offset_sectors;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
ASSERT(pgoff + len <= PAGE_SIZE);
- offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
-
- kaddr = kmap_atomic(page);
- shash->tfm = fs_info->csum_shash;
-
- crypto_shash_digest(shash, kaddr + pgoff, len, csum);
- kunmap_atomic(kaddr);
+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
- if (memcmp(csum, csum_expected, csum_size))
+ if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
goto zeroit;
-
return 0;
+
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- bbio->mirror_num);
+ btrfs_print_data_csum_error(BTRFS_I(inode),
+ bbio->file_offset + bio_offset,
+ csum, csum_expected, bbio->mirror_num);
if (bbio->device)
btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -3401,11 +3504,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 pg_off;
unsigned int result = 0;
- if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
- btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
- return 0;
- }
-
/*
* This only happens for NODATASUM or compressed read.
* Normally this should be covered by above check for compressed read
@@ -3438,8 +3536,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
- page_offset(page) + pg_off);
+ ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
root->fs_info->sectorsize_bits;
@@ -4228,7 +4325,7 @@ skip_backref:
/*
* If we are in a rename context, we don't need to update anything in the
* log. That will be done later during the rename by btrfs_log_new_name().
- * Besides that, doing it here would only cause extra unncessary btree
+ * Besides that, doing it here would only cause extra unnecessary btree
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
@@ -4256,8 +4353,9 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
- dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
+ dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
@@ -4419,7 +4517,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime;
ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -4858,7 +4957,6 @@ again:
else
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
- flush_dcache_page(page);
}
btrfs_page_clear_checked(fs_info, page, block_start,
block_end + 1 - block_start);
@@ -5061,9 +5159,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
*/
if (newsize != oldsize) {
inode_inc_iversion(inode);
- if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
- inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
}
if (newsize > oldsize) {
@@ -5371,7 +5470,7 @@ void btrfs_evict_inode(struct inode *inode)
if (!rsv)
goto no_delete;
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ rsv->failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5763,14 +5862,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret != -ENOENT)
inode = ERR_PTR(ret);
else
- inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ inode = new_simple_dir(dir->i_sb, &location, root);
} else {
inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
- }
- if (root != sub_root)
btrfs_put_root(sub_root);
- if (!IS_ERR(inode) && root != sub_root) {
+ if (IS_ERR(inode))
+ return inode;
+
down_read(&fs_info->cleanup_work_sem);
if (!sb_rdonly(inode->i_sb))
ret = btrfs_orphan_cleanup(sub_root);
@@ -6366,7 +6465,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(path);
+ /*
+ * We don't need the path anymore, plus inheriting properties, adding
+ * ACLs, security xattrs, orphan item or adding the link, will result in
+ * allocating yet another path. So just free our path.
+ */
+ btrfs_free_path(path);
+ path = NULL;
if (args->subvol) {
struct inode *parent;
@@ -6423,8 +6528,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- ret = 0;
- goto out;
+ return 0;
discard:
/*
@@ -7506,7 +7610,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
btrfs_dec_nocow_writers(bg);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
- *map = em = em2;
+ *map = em2;
+ em = em2;
}
if (IS_ERR(em2)) {
@@ -7588,8 +7693,12 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
const u64 data_alloc_len = length;
bool unlock_extents = false;
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
if (!write)
- len = min_t(u64, len, fs_info->sectorsize);
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
lockstart = start;
lockend = start + len - 1;
@@ -7824,8 +7933,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
pos += submitted;
length -= submitted;
if (write)
- __endio_write_update_ordered(BTRFS_I(inode), pos,
- length, false);
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+ pos, length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1);
@@ -7847,10 +7956,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
return;
if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
- __endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->file_offset,
- dip->bytes,
- !dip->bio.bi_status);
+ btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+ dip->file_offset, dip->bytes,
+ !dip->bio.bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
dip->file_offset,
@@ -7870,12 +7978,8 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
- return;
-
refcount_inc(&dip->refs);
- if (btrfs_map_bio(fs_info, bio, mirror_num))
- refcount_dec(&dip->refs);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
@@ -7884,56 +7988,35 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
- struct bio_vec bvec;
- struct bvec_iter iter;
- u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (uptodate &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
+ bv.bv_offset))) {
+ clean_io_failure(fs_info, failure_tree, io_tree, start,
+ bv.bv_page, btrfs_ino(BTRFS_I(inode)),
+ bv.bv_offset);
+ } else {
+ int ret;
- __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
- unsigned int i, nr_sectors, pgoff;
-
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
- pgoff = bvec.bv_offset;
- for (i = 0; i < nr_sectors; i++) {
- u64 start = bbio->file_offset + bio_offset;
-
- ASSERT(pgoff < PAGE_SIZE);
- if (uptodate &&
- (!csum || !check_data_csum(inode, bbio,
- bio_offset, bvec.bv_page,
- pgoff, start))) {
- clean_io_failure(fs_info, failure_tree, io_tree,
- start, bvec.bv_page,
- btrfs_ino(BTRFS_I(inode)),
- pgoff);
- } else {
- int ret;
-
- ret = btrfs_repair_one_sector(inode, &bbio->bio,
- bio_offset, bvec.bv_page, pgoff,
- start, bbio->mirror_num,
- submit_dio_repair_bio);
- if (ret)
- err = errno_to_blk_status(ret);
- }
- ASSERT(bio_offset + sectorsize > bio_offset);
- bio_offset += sectorsize;
- pgoff += sectorsize;
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
}
- return err;
-}
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate)
-{
- btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
- finish_ordered_fn, uptodate);
+ return err;
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -7968,51 +8051,43 @@ static void btrfs_end_dio_bio(struct bio *bio)
btrfs_dio_private_put(dip);
}
-static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
- struct inode *inode, u64 file_offset, int async_submit)
+static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+ u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
- bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
- /* Check btrfs_submit_bio_hook() for rules about async submit. */
- if (async_submit)
- async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- if (!write) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- goto err;
- }
+ /* Save the original iter for read repair */
+ if (btrfs_op(bio) == BTRFS_MAP_READ)
+ btrfs_bio(bio)->iter = bio->bi_iter;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
goto map;
- if (write && async_submit) {
- ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset,
- btrfs_submit_bio_start_direct_io);
- goto err;
- } else if (write) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ /* Check btrfs_submit_data_write_bio() for async submit rules */
+ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, 0, file_offset,
+ btrfs_submit_bio_start_direct_io))
+ return;
+
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
- if (ret)
- goto err;
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ return;
+ }
} else {
- u64 csum_offset;
-
- csum_offset = file_offset - dip->file_offset;
- csum_offset >>= fs_info->sectorsize_bits;
- csum_offset *= fs_info->csum_size;
- btrfs_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
+ file_offset - dip->file_offset);
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0);
-err:
- return ret;
+ btrfs_submit_bio(fs_info, bio, 0);
}
static void btrfs_submit_direct(const struct iomap_iter *iter,
@@ -8125,14 +8200,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
async_submit = 1;
}
- status = btrfs_submit_dio_bio(bio, inode, file_offset,
- async_submit);
- if (status) {
- bio_put(bio);
- if (submit_len > 0)
- refcount_dec(&dip->refs);
- goto out_err_em;
- }
+ btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
dio_data->submitted += clone_len;
clone_offset += clone_len;
@@ -8181,31 +8249,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- int ret;
-
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
-
- /*
- * If we are under memory pressure we will call this directly from the
- * VM, we need to make sure we have the inode referenced for the ordered
- * extent. If not just return like we didn't do anything.
- */
- if (!igrab(inode)) {
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
- ret = extent_write_full_page(page, wbc);
- btrfs_add_delayed_iput(inode);
- return ret;
-}
-
static int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -8503,7 +8546,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* Reserving delalloc space after obtaining the page lock can lead to
* deadlock. For example, if a dirty page is locked by this function
* and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepage() function could
+ * dirty page write out, then the btrfs_writepages() function could
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
@@ -8594,10 +8637,9 @@ again:
else
zero_start = PAGE_SIZE;
- if (zero_start != PAGE_SIZE) {
+ if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- flush_dcache_page(page);
- }
+
btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
@@ -8680,7 +8722,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
if (!rsv)
return -ENOMEM;
rsv->size = min_size;
- rsv->failfast = 1;
+ rsv->failfast = true;
/*
* 1 for the truncate slack space
@@ -9201,8 +9243,10 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
inode_inc_iversion(new_inode);
- old_dir->i_ctime = old_dir->i_mtime = ctime;
- new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_dir->i_mtime = ctime;
+ old_dir->i_ctime = ctime;
+ new_dir->i_mtime = ctime;
+ new_dir->i_ctime = ctime;
old_inode->i_ctime = ctime;
new_inode->i_ctime = ctime;
@@ -9465,9 +9509,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- old_inode->i_ctime = current_time(old_dir);
+ old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_ctime = old_dir->i_mtime;
+ new_dir->i_mtime = old_dir->i_mtime;
+ new_dir->i_ctime = old_dir->i_mtime;
+ old_inode->i_ctime = old_dir->i_mtime;
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
@@ -9555,15 +9601,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
+ int ret;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
- new_dentry);
+ ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+ else
+ ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
- return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
- new_dentry, flags);
+ btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
+
+ return ret;
}
struct btrfs_delalloc_work {
@@ -10183,9 +10235,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
}
}
-static int btrfs_encoded_io_compression_from_extent(
- struct btrfs_fs_info *fs_info,
- int compress_type)
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type)
{
switch (compress_type) {
case BTRFS_COMPRESS_NONE:
@@ -10308,7 +10359,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
struct bio *bio, int mirror_num)
{
struct btrfs_encoded_read_private *priv = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
@@ -10318,19 +10368,9 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
return ret;
}
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret) {
- btrfs_bio_free_csum(bbio);
- return ret;
- }
-
atomic_inc(&priv->pending);
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- if (ret) {
- atomic_dec(&priv->pending);
- btrfs_bio_free_csum(bbio);
- }
- return ret;
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return BLK_STS_OK;
}
static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
@@ -10342,7 +10382,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
u32 sectorsize = fs_info->sectorsize;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
- u64 start = priv->file_offset;
u32 bio_offset = 0;
if (priv->skip_csum || !uptodate)
@@ -10355,10 +10394,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
pgoff = bvec->bv_offset;
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
- if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
- bvec->bv_page, pgoff, start))
+ if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ bvec->bv_page, pgoff))
return BLK_STS_IOERR;
- start += sectorsize;
bio_offset += sectorsize;
pgoff += sectorsize;
}
@@ -10390,11 +10428,9 @@ static void btrfs_encoded_read_endio(struct bio *bio)
bio_put(bio);
}
-static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset,
- u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages)
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size, struct page **pages)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private priv = {
@@ -10625,7 +10661,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = count = em->block_len;
+ disk_io_size = em->block_len;
+ count = em->block_len;
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
@@ -10788,15 +10825,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = -ENOMEM;
goto out_pages;
}
- kaddr = kmap(pages[i]);
+ kaddr = kmap_local_page(pages[i]);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
- kunmap(pages[i]);
+ kunmap_local(kaddr);
ret = -EFAULT;
goto out_pages;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
- kunmap(pages[i]);
+ kunmap_local(kaddr);
}
for (;;) {
@@ -11425,7 +11462,6 @@ static const struct file_operations btrfs_dir_file_operations = {
*/
static const struct address_space_operations btrfs_aops = {
.read_folio = btrfs_read_folio,
- .writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0f79af919bc4..fe0cc816b4eb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1230,16 +1230,18 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
return em;
}
-static u32 get_extent_max_capacity(const struct extent_map *em)
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+ const struct extent_map *em)
{
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
return BTRFS_MAX_COMPRESSED;
- return BTRFS_MAX_EXTENT_SIZE;
+ return fs_info->max_extent_size;
}
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
u32 extent_thresh, u64 newer_than, bool locked)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *next;
bool ret = false;
@@ -1263,7 +1265,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
* If the next extent is at its max capacity, defragging current extent
* makes no sense, as the total number of extents won't change.
*/
- if (next->len >= get_extent_max_capacity(em))
+ if (next->len >= get_extent_max_capacity(fs_info, em))
goto out;
/* Skip older extent */
if (next->generation < newer_than)
@@ -1400,6 +1402,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
bool locked, struct list_head *target_list,
u64 *last_scanned_ret)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
bool last_is_target = false;
u64 cur = start;
int ret = 0;
@@ -1484,7 +1487,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* Skip extents already at its max capacity, this is mostly for
* compressed extents, which max cap is only 128K.
*/
- if (em->len >= get_extent_max_capacity(em))
+ if (em->len >= get_extent_max_capacity(fs_info, em))
goto next;
/*
@@ -4243,26 +4246,6 @@ out:
return ret;
}
-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
-{
- struct btrfs_data_container *inodes = ctx;
- const size_t c = 3 * sizeof(u64);
-
- if (inodes->bytes_left >= c) {
- inodes->bytes_left -= c;
- inodes->val[inodes->elem_cnt] = inum;
- inodes->val[inodes->elem_cnt + 1] = offset;
- inodes->val[inodes->elem_cnt + 2] = root;
- inodes->elem_cnt += 3;
- } else {
- inodes->bytes_missing += c - inodes->bytes_left;
- inodes->bytes_left = 0;
- inodes->elem_missed += 3;
- }
-
- return 0;
-}
-
static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
void __user *arg, int version)
{
@@ -4312,7 +4295,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
}
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- build_ino_list, inodes, ignore_offset);
+ inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4355,13 +4338,79 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->balance_lock);
}
+/**
+ * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
+ * required.
+ *
+ * @fs_info: the filesystem
+ * @excl_acquired: ptr to boolean value which is set to false in case balance
+ * is being resumed
+ *
+ * Return 0 on success in which case both fs_info::balance is acquired as well
+ * as exclusive ops are blocked. In case of failure return an error code.
+ */
+static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired)
+{
+ int ret;
+
+ /*
+ * Exclusive operation is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
+ while (1) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ *excl_acquired = true;
+ mutex_lock(&fs_info->balance_mutex);
+ return 0;
+ }
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* This is either (2) or (3) */
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (2) */
+ ret = -EINPROGRESS;
+ goto out_failure;
+
+ } else {
+ mutex_unlock(&fs_info->balance_mutex);
+ /*
+ * Lock released to allow other waiters to
+ * continue, we'll reexamine the status again.
+ */
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (3) */
+ *excl_acquired = false;
+ return 0;
+ }
+ }
+ } else {
+ /* This is (1) */
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_failure;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ }
+
+out_failure:
+ mutex_unlock(&fs_info->balance_mutex);
+ *excl_acquired = false;
+ return ret;
+}
+
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
- bool need_unlock; /* for mut. excl. ops lock */
+ bool need_unlock = true;
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -4378,53 +4427,12 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
goto out;
}
-again:
- if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
- mutex_lock(&fs_info->balance_mutex);
- need_unlock = true;
- goto locked;
- }
-
- /*
- * mut. excl. ops lock is locked. Three possibilities:
- * (1) some other op is running
- * (2) balance is running
- * (3) balance is paused -- special case (think resume)
- */
- mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl) {
- /* this is either (2) or (3) */
- if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- mutex_unlock(&fs_info->balance_mutex);
- /*
- * Lock released to allow other waiters to continue,
- * we'll reexamine the status again.
- */
- mutex_lock(&fs_info->balance_mutex);
-
- if (fs_info->balance_ctl &&
- !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- /* this is (3) */
- need_unlock = false;
- goto locked;
- }
-
- mutex_unlock(&fs_info->balance_mutex);
- goto again;
- } else {
- /* this is (2) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = -EINPROGRESS;
- goto out;
- }
- } else {
- /* this is (1) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ ret = btrfs_try_lock_balance(fs_info, &need_unlock);
+ if (ret)
goto out;
- }
-locked:
+ lockdep_assert_held(&fs_info->balance_mutex);
+
if (bargs->flags & BTRFS_BALANCE_RESUME) {
if (!fs_info->balance_ctl) {
ret = -ENOTCONN;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 430ad36b8b08..89bc5f825e0a 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -155,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
out_pages[*cur_out / PAGE_SIZE] = cur_page;
}
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
write_compress_length(kaddr + offset_in_page(*cur_out),
compressed_size);
*cur_out += LZO_LEN;
@@ -167,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
orig_out + compressed_size - *cur_out);
- kunmap(cur_page);
+ kunmap_local(kaddr);
if ((*cur_out / PAGE_SIZE) >= max_nr_page)
return -E2BIG;
@@ -180,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
}
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
memcpy(kaddr + offset_in_page(*cur_out),
compressed_data + *cur_out - orig_out, copy_len);
@@ -202,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
*cur_out += sector_bytes_left;
out:
- kunmap(cur_page);
+ kunmap_local(kaddr);
return 0;
}
@@ -248,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap(page_in);
+ data_in = kmap_local_page(page_in);
ret = lzo1x_1_compress(data_in +
offset_in_page(cur_in), in_len,
workspace->cbuf, &out_len,
workspace->mem);
- kunmap(page_in);
+ kunmap_local(data_in);
if (ret < 0) {
pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
@@ -310,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- char *kaddr;
struct page *cur_page;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
@@ -318,11 +317,8 @@ static void copy_compressed_segment(struct compressed_bio *cb,
ASSERT(copy_len);
cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
- kaddr = kmap(cur_page);
- memcpy(dest + *cur_in - orig_in,
- kaddr + offset_in_page(*cur_in),
- copy_len);
- kunmap(cur_page);
+ memcpy_from_page(dest + *cur_in - orig_in, cur_page,
+ offset_in_page(*cur_in), copy_len);
*cur_in += copy_len;
}
@@ -342,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- kaddr = kmap(cb->compressed_pages[0]);
+ kaddr = kmap_local_page(cb->compressed_pages[0]);
len_in = read_compress_length(kaddr);
- kunmap(cb->compressed_pages[0]);
+ kunmap_local(kaddr);
cur_in += LZO_LEN;
/*
@@ -378,9 +374,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
ASSERT(cur_page);
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
- kunmap(cur_page);
+ kunmap_local(kaddr);
cur_in += LZO_LEN;
if (seg_len > WORKSPACE_CBUF_LENGTH) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1957b14b329a..1952ac85222c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -272,25 +272,30 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
spin_unlock_irq(&tree->lock);
}
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
+}
+
/*
* Mark all ordered extents io inside the specified range finished.
*
- * @page: The invovled page for the opeartion.
+ * @page: The involved page for the operation.
* For uncompressed buffered IO, the page status also needs to be
* updated to indicate whether the pending ordered io is finished.
* Can be NULL for direct IO and compressed write.
* For these cases, callers are ensured they won't execute the
* endio function twice.
- * @finish_func: The function to be executed when all the IO of an ordered
- * extent are finished.
*
* This function is called for endio, thus the range must have ordered
- * extent(s) coveri it.
+ * extent(s) covering it.
*/
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
- struct page *page, u64 file_offset,
- u64 num_bytes, btrfs_func_t finish_func,
- bool uptodate)
+ struct page *page, u64 file_offset,
+ u64 num_bytes, bool uptodate)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -401,8 +406,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
cond_wake_up(&entry->wait);
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_mark_finished(inode, entry);
spin_unlock_irqrestore(&tree->lock, flags);
- btrfs_init_work(&entry->work, finish_func, NULL, NULL);
+ btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
btrfs_queue_work(wq, &entry->work);
spin_lock_irqsave(&tree->lock, flags);
}
@@ -473,6 +479,7 @@ out:
if (finished && cached && entry) {
*cached = entry;
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
spin_unlock_irqrestore(&tree->lock, flags);
return finished;
@@ -807,8 +814,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup(inode, entry);
+ }
out:
spin_unlock_irqrestore(&tree->lock, flags);
return entry;
@@ -848,8 +857,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
break;
}
out:
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_range(inode, entry);
+ }
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -878,6 +889,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
ASSERT(list_empty(&ordered->log_list));
list_add_tail(&ordered->log_list, list);
refcount_inc(&ordered->refs);
+ trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
spin_unlock_irq(&tree->lock);
}
@@ -901,6 +913,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
spin_unlock_irq(&tree->lock);
return entry;
@@ -975,8 +988,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
/* No ordered extent in the range */
entry = NULL;
out:
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
+ }
+
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -1055,6 +1071,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret = 0;
+ trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+
spin_lock_irq(&tree->lock);
/* Remove from tree once */
node = &ordered->rb_node;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ecad67a2c745..87792f85e2c4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -180,13 +180,14 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
- u64 num_bytes, btrfs_func_t finish_func,
- bool uptodate);
+ u64 num_bytes, bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index c520412d1f86..2feb5c20641a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -63,137 +63,6 @@ struct sector_ptr {
unsigned int uptodate:8;
};
-enum btrfs_rbio_ops {
- BTRFS_RBIO_WRITE,
- BTRFS_RBIO_READ_REBUILD,
- BTRFS_RBIO_PARITY_SCRUB,
- BTRFS_RBIO_REBUILD_MISSING,
-};
-
-struct btrfs_raid_bio {
- struct btrfs_io_context *bioc;
-
- /* while we're doing rmw on a stripe
- * we put it into a hash table so we can
- * lock the stripe and merge more rbios
- * into it.
- */
- struct list_head hash_list;
-
- /*
- * LRU list for the stripe cache
- */
- struct list_head stripe_cache;
-
- /*
- * for scheduling work in the helper threads
- */
- struct work_struct work;
-
- /*
- * bio list and bio_list_lock are used
- * to add more bios into the stripe
- * in hopes of avoiding the full rmw
- */
- struct bio_list bio_list;
- spinlock_t bio_list_lock;
-
- /* also protected by the bio_list_lock, the
- * plug list is used by the plugging code
- * to collect partial bios while plugged. The
- * stripe locking code also uses it to hand off
- * the stripe lock to the next pending IO
- */
- struct list_head plug_list;
-
- /*
- * flags that tell us if it is safe to
- * merge with this bio
- */
- unsigned long flags;
-
- /*
- * set if we're doing a parity rebuild
- * for a read from higher up, which is handled
- * differently from a parity rebuild as part of
- * rmw
- */
- enum btrfs_rbio_ops operation;
-
- /* Size of each individual stripe on disk */
- u32 stripe_len;
-
- /* How many pages there are for the full stripe including P/Q */
- u16 nr_pages;
-
- /* How many sectors there are for the full stripe including P/Q */
- u16 nr_sectors;
-
- /* Number of data stripes (no p/q) */
- u8 nr_data;
-
- /* Numer of all stripes (including P/Q) */
- u8 real_stripes;
-
- /* How many pages there are for each stripe */
- u8 stripe_npages;
-
- /* How many sectors there are for each stripe */
- u8 stripe_nsectors;
-
- /* First bad stripe, -1 means no corruption */
- s8 faila;
-
- /* Second bad stripe (for RAID6 use) */
- s8 failb;
-
- /* Stripe number that we're scrubbing */
- u8 scrubp;
-
- /*
- * size of all the bios in the bio_list. This
- * helps us decide if the rbio maps to a full
- * stripe or not
- */
- int bio_list_bytes;
-
- int generic_bio_cnt;
-
- refcount_t refs;
-
- atomic_t stripes_pending;
-
- atomic_t error;
- /*
- * these are two arrays of pointers. We allocate the
- * rbio big enough to hold them both and setup their
- * locations when the rbio is allocated
- */
-
- /* pointers to pages that we allocated for
- * reading/writing stripes directly from the disk (including P/Q)
- */
- struct page **stripe_pages;
-
- /* Pointers to the sectors in the bio_list, for faster lookup */
- struct sector_ptr *bio_sectors;
-
- /*
- * For subpage support, we need to map each sector to above
- * stripe_pages.
- */
- struct sector_ptr *stripe_sectors;
-
- /* Bitmap to record which horizontal stripe has data */
- unsigned long *dbitmap;
-
- /* allocated with real_stripes-many pointers for finish_*() calls */
- void **finish_pointers;
-
- /* Allocated with stripe_nsectors-many bits for finish_*() calls */
- unsigned long *finish_pbitmap;
-};
-
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct work_struct *work);
@@ -347,6 +216,24 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
}
}
+static void steal_rbio_page(struct btrfs_raid_bio *src,
+ struct btrfs_raid_bio *dest, int page_nr)
+{
+ const u32 sectorsize = src->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ if (dest->stripe_pages[page_nr])
+ __free_page(dest->stripe_pages[page_nr]);
+ dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
+ src->stripe_pages[page_nr] = NULL;
+
+ /* Also update the sector->uptodate bits. */
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page; i++)
+ dest->stripe_sectors[i].uptodate = true;
+}
+
/*
* Stealing an rbio means taking all the uptodate pages from the stripe array
* in the source rbio and putting them into the destination rbio.
@@ -358,7 +245,6 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
struct page *s;
- struct page *d;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
@@ -368,12 +254,7 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
if (!s || !full_page_sectors_uptodate(src, i))
continue;
- d = dest->stripe_pages[i];
- if (d)
- __free_page(d);
-
- dest->stripe_pages[i] = s;
- src->stripe_pages[i] = NULL;
+ steal_rbio_page(src, dest, i);
}
index_stripe_sectors(dest);
index_stripe_sectors(src);
@@ -391,6 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
+ dest->stripe_nsectors);
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -590,9 +474,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio)
int ret = 1;
spin_lock_irqsave(&rbio->bio_list_lock, flags);
- if (size != rbio->nr_data * rbio->stripe_len)
+ if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
ret = 0;
- BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
return ret;
@@ -932,6 +816,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1023,29 +913,30 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_io_context *bioc,
- u32 stripe_len)
+ struct btrfs_io_context *bioc)
{
const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
- const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
+ const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
const unsigned int num_pages = stripe_npages * real_stripes;
- const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
+ const unsigned int stripe_nsectors =
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
- int nr_data = 0;
void *p;
- ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+ /*
+ * Our current stripe len should be fixed to 64k thus stripe_nsectors
+ * (at most 16) should be no larger than BITS_PER_LONG.
+ */
+ ASSERT(stripe_nsectors <= BITS_PER_LONG);
rbio = kzalloc(sizeof(*rbio) +
sizeof(*rbio->stripe_pages) * num_pages +
sizeof(*rbio->bio_sectors) * num_sectors +
sizeof(*rbio->stripe_sectors) * num_sectors +
- sizeof(*rbio->finish_pointers) * real_stripes +
- sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
- sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
+ sizeof(*rbio->finish_pointers) * real_stripes,
GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -1056,7 +947,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
rbio->bioc = bioc;
- rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->nr_sectors = num_sectors;
rbio->real_stripes = real_stripes;
@@ -1081,18 +971,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
- CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
- CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
#undef CONSUME_ALLOC
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
- nr_data = real_stripes - 1;
- else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
- nr_data = real_stripes - 2;
- else
- BUG();
+ ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
+ rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
- rbio->nr_data = nr_data;
return rbio;
}
@@ -1135,7 +1018,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector,
unsigned int stripe_nr,
unsigned int sector_nr,
- unsigned long bio_max_len,
enum req_op op)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1180,7 +1062,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
+ bio = bio_alloc(stripe->dev->bdev,
+ max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
op, GFP_NOFS);
bio->bi_iter.bi_sector = disk_start >> 9;
bio->bi_private = rbio;
@@ -1215,9 +1098,6 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
rbio->bioc->raid_map[0];
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_bio(bio)->iter;
-
bio_for_each_segment(bvec, bio, iter) {
u32 bvec_offset;
@@ -1252,6 +1132,34 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
spin_unlock_irq(&rbio->bio_list_lock);
}
+static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
+ struct raid56_bio_trace_info *trace_info)
+{
+ const struct btrfs_io_context *bioc = rbio->bioc;
+ int i;
+
+ ASSERT(bioc);
+
+ /* We rely on bio->bi_bdev to find the stripe number. */
+ if (!bio->bi_bdev)
+ goto not_found;
+
+ for (i = 0; i < bioc->num_stripes; i++) {
+ if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
+ continue;
+ trace_info->stripe_nr = i;
+ trace_info->devid = bioc->stripes[i].dev->devid;
+ trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ bioc->stripes[i].physical;
+ return;
+ }
+
+not_found:
+ trace_info->devid = -1;
+ trace_info->offset = -1;
+ trace_info->stripe_nr = -1;
+}
+
/*
* this is called from one of two situations. We either
* have a full stripe from the higher layers, or we've read all
@@ -1266,7 +1174,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
+ /* The total sector number inside the full stripe. */
+ int total_sector_nr;
int stripe;
+ /* Sector number inside a stripe. */
int sectornr;
bool has_qstripe;
struct bio_list bio_list;
@@ -1282,6 +1193,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1348,55 +1262,71 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
/*
- * time to start writing. Make bios for everything from the
- * higher layers (the bio_list in our rbio) and our p/q. Ignore
- * everything else.
+ * Start writing. Make bios for everything from the higher layers (the
+ * bio_list in our rbio) and our P/Q. Ignore everything else.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
-
- if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
- continue;
- } else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- }
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
- sectornr, rbio->stripe_len,
- REQ_OP_WRITE);
- if (ret)
- goto cleanup;
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
if (likely(!bioc->num_tgtdevs))
goto write_data;
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bioc->tgtdev_map[stripe])
- continue;
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
- if (stripe < rbio->nr_data) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (!sector)
- continue;
- } else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- }
+ if (!bioc->tgtdev_map[stripe]) {
+ /*
+ * We can skip the whole stripe completely, note
+ * total_sector_nr will be increased by one anyway.
+ */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
+ continue;
+ }
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- rbio->bioc->tgtdev_map[stripe],
- sectornr, rbio->stripe_len,
- REQ_OP_WRITE);
- if (ret)
- goto cleanup;
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ rbio->bioc->tgtdev_map[stripe],
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
write_data:
@@ -1406,6 +1336,12 @@ write_data:
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_write_end_io;
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -1433,7 +1369,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bioc->num_stripes; i++) {
stripe = &rbio->bioc->stripes[i];
- if (in_range(physical, stripe->physical, rbio->stripe_len) &&
+ if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
@@ -1455,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->nr_data; i++) {
u64 stripe_start = rbio->bioc->raid_map[i];
- if (in_range(logical, stripe_start, rbio->stripe_len))
+ if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
return i;
}
return -1;
@@ -1552,15 +1488,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
}
}
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid_rmw_end_io(struct bio *bio)
+static void raid56_bio_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1571,23 +1499,34 @@ static void raid_rmw_end_io(struct bio *bio)
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+ &rbio->end_io_work);
+}
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+/*
+ * End io handler for the read phase of the RMW cycle. All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ return;
+ }
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write but if there
+ * are any failed stripes we'll reconstruct from parity first.
*/
validate_rbio_for_rmw(rbio);
- return;
-
-cleanup:
-
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
@@ -1598,9 +1537,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
struct bio_list bio_list;
+ const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
int ret;
- int sectornr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -1612,38 +1551,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
+ /* Build a list of bios to read all the missing data sectors. */
+ for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
- /*
- * We want to find all the sectors missing from the
- * rbio and read them from the disk. If * sector_in_rbio()
- * finds a page in the bio list we don't need to read
- * it off the stripe.
- */
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
- continue;
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a page
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- /*
- * The bio cache may have handed us an uptodate page.
- * If so, be happy and use it.
- */
- if (sector->uptodate)
- continue;
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate page. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, rbio->stripe_len,
- REQ_OP_READ);
- if (ret)
- goto cleanup;
- }
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -1662,11 +1597,16 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_rmw_end_io;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_read_partial_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_read_partial(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -1833,27 +1773,53 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bioc->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ fs_info->sectorsize_bits) % rbio->stripe_nsectors;
+
+ set_bit(bit, &rbio->dbitmap);
+ }
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
- int ret;
+ int ret = 0;
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
btrfs_put_bioc(bioc);
- return PTR_ERR(rbio);
+ ret = PTR_ERR(rbio);
+ goto out_dec_counter;
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
- btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
/*
@@ -1863,8 +1829,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
if (ret)
- btrfs_bio_counter_dec(fs_info);
- return ret;
+ goto out_dec_counter;
+ return;
}
cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
@@ -1875,13 +1841,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
- ret = 0;
} else {
ret = __raid56_parity_write(rbio);
if (ret)
- btrfs_bio_counter_dec(fs_info);
+ goto out_dec_counter;
}
- return ret;
+
+ return;
+
+out_dec_counter:
+ btrfs_bio_counter_dec(fs_info);
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
}
/*
@@ -1939,7 +1910,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* which we have data when doing parity scrub.
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(sectornr, rbio->dbitmap))
+ !test_bit(sectornr, &rbio->dbitmap))
continue;
/*
@@ -2108,25 +2079,13 @@ cleanup_io:
}
/*
- * This is called only for stripes we've read from disk to
- * reconstruct the parity.
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
*/
-static void raid_recover_end_io(struct bio *bio)
+static void raid_recover_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- /*
- * we only read stripe pages off the disk, set them
- * up to date if there were no errors
- */
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
@@ -2147,8 +2106,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int sectornr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2160,33 +2118,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
if (rbio->faila == stripe || rbio->failb == stripe) {
atomic_inc(&rbio->error);
+ /* Skip the current stripe. */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
continue;
}
-
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- if (sector->uptodate)
- continue;
-
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, rbio->stripe_len,
- REQ_OP_READ);
- if (ret < 0)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret < 0)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2209,11 +2165,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_recover_end_io;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_recover_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
@@ -2236,28 +2197,27 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u32 stripe_len, int mirror_num, int generic_io)
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num, bool generic_io)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- int ret;
if (generic_io) {
ASSERT(bioc->mirror_num == mirror_num);
btrfs_bio(bio)->mirror_num = mirror_num;
+ } else {
+ btrfs_get_bioc(bioc);
}
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- if (generic_io)
- btrfs_put_bioc(bioc);
- return PTR_ERR(rbio);
+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
+ goto out_end_bio;
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
@@ -2265,18 +2225,13 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bioc->map_type);
- if (generic_io)
- btrfs_put_bioc(bioc);
kfree(rbio);
- return -EIO;
+ bio->bi_status = BLK_STS_IOERR;
+ goto out_end_bio;
}
- if (generic_io) {
- btrfs_bio_counter_inc_noblocked(fs_info);
+ if (generic_io)
rbio->generic_bio_cnt = 1;
- } else {
- btrfs_get_bioc(bioc);
- }
/*
* Loop retry:
@@ -2295,24 +2250,20 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
rbio->failb--;
}
- ret = lock_stripe_add(rbio);
+ if (lock_stripe_add(rbio))
+ return;
/*
- * __raid56_parity_recover will end the bio with
- * any errors it hits. We don't want to return
- * its error value up the stack because our caller
- * will end up calling bio_endio with any nonzero
- * return
- */
- if (ret == 0)
- __raid56_parity_recover(rbio);
- /*
- * our rbio has been added to the list of
- * rbios that will be handled after the
- * currently lock owner is done
+ * This adds our rbio to the list of rbios that will be handled after
+ * the current lock owner is done.
*/
- return 0;
+ __raid56_parity_recover(rbio);
+ return;
+out_end_bio:
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_put_bioc(bioc);
+ bio_endio(bio);
}
static void rmw_work(struct work_struct *work)
@@ -2343,14 +2294,14 @@ static void read_rebuild_work(struct work_struct *work)
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc,
- u32 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2374,7 +2325,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
}
ASSERT(i < rbio->real_stripes);
- bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+ bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
/*
* We have already increased bio_counter when getting bioc, record it
@@ -2395,7 +2346,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
ASSERT(logical >= rbio->bioc->raid_map[0]);
ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
- rbio->stripe_len * rbio->nr_data);
+ BTRFS_STRIPE_LEN * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
index = stripe_offset / sectorsize;
rbio->bio_sectors[index].page = page;
@@ -2409,23 +2360,22 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- int stripe;
- int sectornr;
-
- for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- struct page *page;
- int index = (stripe * rbio->stripe_nsectors + sectornr) *
- sectorsize >> PAGE_SHIFT;
+ int total_sector_nr;
- if (rbio->stripe_pages[index])
- continue;
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct page *page;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[index] = page;
- }
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+ if (rbio->stripe_pages[index])
+ continue;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[index] = page;
}
index_stripe_sectors(rbio);
return 0;
@@ -2437,7 +2387,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
struct btrfs_io_context *bioc = rbio->bioc;
const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
- unsigned long *pbitmap = rbio->finish_pbitmap;
+ unsigned long *pbitmap = &rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
int sectornr;
@@ -2460,7 +2410,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
- bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
+ bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
}
/*
@@ -2497,7 +2447,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* Map the parity stripe just once */
pointers[nr_data] = kmap_local_page(p_sector.page);
- for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
void *parity;
@@ -2525,7 +2475,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
memcpy(parity, pointers[rbio->scrubp], sectorsize);
else
/* Parity is right, needn't writeback */
- bitmap_clear(rbio->dbitmap, sectornr, 1);
+ bitmap_clear(&rbio->dbitmap, sectornr, 1);
kunmap_local(parity);
for (stripe = nr_data - 1; stripe >= 0; stripe--)
@@ -2547,12 +2497,12 @@ writeback:
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
- for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
- sectornr, rbio->stripe_len, REQ_OP_WRITE);
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2566,7 +2516,7 @@ writeback:
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
ret = rbio_add_io_sector(rbio, &bio_list, sector,
bioc->tgtdev_map[rbio->scrubp],
- sectornr, rbio->stripe_len, REQ_OP_WRITE);
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2584,6 +2534,12 @@ submit_write:
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_write_end_io;
+ if (trace_raid56_scrub_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -2671,24 +2627,14 @@ cleanup:
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid56_parity_scrub_end_io(struct bio *bio)
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write, but if there
+ * are any failed stripes we'll reconstruct from parity first
*/
validate_rbio_for_parity_scrub(rbio);
}
@@ -2698,8 +2644,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int sectornr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2709,37 +2654,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
goto cleanup;
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) {
- struct sector_ptr *sector;
- /*
- * We want to find all the sectors missing from the
- * rbio and read them from the disk. If * sector_in_rbio()
- * finds a sector in the bio list we don't need to read
- * it off the stripe.
- */
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
- continue;
+ /* Build a list of bios to read all the missing parts. */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ struct sector_ptr *sector;
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- /*
- * The bio cache may have handed us an uptodate sector.
- * If so, be happy and use it.
- */
- if (sector->uptodate)
- continue;
+ /* No data in the vertical stripe, no need to read. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, rbio->stripe_len,
- REQ_OP_READ);
- if (ret)
- goto cleanup;
- }
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a sector
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
+
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate sector. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2758,11 +2704,16 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_parity_scrub_end_io;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -2797,13 +2748,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
- u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bioc, length);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index aaad08aefd7d..6f48f9e4c869 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,45 +7,179 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
-static inline int nr_parity_stripes(const struct map_lookup *map)
-{
- if (map->type & BTRFS_BLOCK_GROUP_RAID5)
- return 1;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- return 2;
- else
- return 0;
-}
+#include <linux/workqueue.h>
+#include "volumes.h"
+
+enum btrfs_rbio_ops {
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
+};
+
+struct btrfs_raid_bio {
+ struct btrfs_io_context *bioc;
+
+ /*
+ * While we're doing RMW on a stripe we put it into a hash table so we
+ * can lock the stripe and merge more rbios into it.
+ */
+ struct list_head hash_list;
+
+ /* LRU list for the stripe cache */
+ struct list_head stripe_cache;
+
+ /* For scheduling work in the helper threads */
+ struct work_struct work;
+
+ /*
+ * bio_list and bio_list_lock are used to add more bios into the stripe
+ * in hopes of avoiding the full RMW
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /*
+ * Also protected by the bio_list_lock, the plug list is used by the
+ * plugging code to collect partial bios while plugged. The stripe
+ * locking code also uses it to hand off the stripe lock to the next
+ * pending IO.
+ */
+ struct list_head plug_list;
+
+ /* Flags that tell us if it is safe to merge with this bio. */
+ unsigned long flags;
+
+ /*
+ * Set if we're doing a parity rebuild for a read from higher up, which
+ * is handled differently from a parity rebuild as part of RMW.
+ */
+ enum btrfs_rbio_ops operation;
+
+ /* How many pages there are for the full stripe including P/Q */
+ u16 nr_pages;
+
+ /* How many sectors there are for the full stripe including P/Q */
+ u16 nr_sectors;
+
+ /* Number of data stripes (no p/q) */
+ u8 nr_data;
+
+ /* Numer of all stripes (including P/Q) */
+ u8 real_stripes;
+
+ /* How many pages there are for each stripe */
+ u8 stripe_npages;
+
+ /* How many sectors there are for each stripe */
+ u8 stripe_nsectors;
+
+ /* First bad stripe, -1 means no corruption */
+ s8 faila;
+
+ /* Second bad stripe (for RAID6 use) */
+ s8 failb;
+
+ /* Stripe number that we're scrubbing */
+ u8 scrubp;
+
+ /*
+ * Size of all the bios in the bio_list. This helps us decide if the
+ * rbio maps to a full stripe or not.
+ */
+ int bio_list_bytes;
+
+ int generic_bio_cnt;
+
+ refcount_t refs;
+
+ atomic_t stripes_pending;
+
+ atomic_t error;
+
+ struct work_struct end_io_work;
+
+ /* Bitmap to record which horizontal stripe has data */
+ unsigned long dbitmap;
+
+ /* Allocated with stripe_nsectors-many bits for finish_*() calls */
+ unsigned long finish_pbitmap;
+
+ /*
+ * These are two arrays of pointers. We allocate the rbio big enough
+ * to hold them both and setup their locations when the rbio is
+ * allocated.
+ */
+
+ /*
+ * Pointers to pages that we allocated for reading/writing stripes
+ * directly from the disk (including P/Q).
+ */
+ struct page **stripe_pages;
+
+ /* Pointers to the sectors in the bio_list, for faster lookup */
+ struct sector_ptr *bio_sectors;
+
+ /*
+ * For subpage support, we need to map each sector to above
+ * stripe_pages.
+ */
+ struct sector_ptr *stripe_sectors;
+
+ /* Allocated with real_stripes-many pointers for finish_*() calls */
+ void **finish_pointers;
+};
+
+/*
+ * For trace event usage only. Records useful debug info for each bio submitted
+ * by RAID56 to each physical device.
+ *
+ * No matter signed or not, (-1) is always the one indicating we can not grab
+ * the proper stripe number.
+ */
+struct raid56_bio_trace_info {
+ u64 devid;
+
+ /* The offset inside the stripe. (<= STRIPE_LEN) */
+ u32 offset;
+
+ /*
+ * Stripe number.
+ * 0 is the first data stripe, and nr_data for P stripe,
+ * nr_data + 1 for Q stripe.
+ * >= real_stripes for
+ */
+ u8 stripe_nr;
+};
static inline int nr_data_stripes(const struct map_lookup *map)
{
- return map->num_stripes - nr_parity_stripes(map);
+ return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
+
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
((x) == RAID6_Q_STRIPE))
-struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u32 stripe_len, int mirror_num, int generic_io);
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len);
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num, bool generic_io);
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
unsigned int pgoff, u64 logical);
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
- struct btrfs_io_context *bioc, u32 stripe_len,
+ struct btrfs_io_context *bioc,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
- u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index a3549d587464..9acf47b11fe6 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -5,6 +5,7 @@
#include "compression.h"
#include "ctree.h"
#include "delalloc-space.h"
+#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
@@ -22,8 +23,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
int ret;
inode_inc_iversion(inode);
- if (!no_time_update)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ if (!no_time_update) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -110,7 +113,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (comp_type == BTRFS_COMPRESS_NONE) {
memcpy_to_page(page, offset_in_page(file_offset), data_start,
datal);
- flush_dcache_page(page);
} else {
ret = btrfs_decompress(comp_type, data_start, page,
offset_in_page(file_offset),
@@ -132,10 +134,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
*
* So what's in the range [500, 4095] corresponds to zeroes.
*/
- if (datal < block_size) {
+ if (datal < block_size)
memzero_page(page, datal, block_size - datal);
- flush_dcache_page(page);
- }
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
@@ -658,7 +658,8 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
- const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ const u64 bs = fs_info->sb->s_blocksize;
int ret;
/*
@@ -669,6 +670,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ btrfs_btree_balance_dirty(fs_info);
+
return ret;
}
@@ -778,6 +781,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
+ btrfs_btree_balance_dirty(fs_info);
+
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e7b0323e6efd..3afe5fa50a63 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -135,15 +135,13 @@ struct scrub_parity {
struct work_struct work;
/* Mark the parity blocks which have data */
- unsigned long *dbitmap;
+ unsigned long dbitmap;
/*
* Mark the parity blocks which have data, but errors happen when
* read data or check data
*/
- unsigned long *ebitmap;
-
- unsigned long bitmap[];
+ unsigned long ebitmap;
};
struct scrub_ctx {
@@ -1218,7 +1216,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
u64 *raid_map,
- u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
u64 *stripe_offset)
@@ -1233,7 +1230,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
continue;
if (logical >= raid_map[i] &&
- logical < raid_map[i] + mapped_length)
+ logical < raid_map[i] + BTRFS_STRIPE_LEN)
break;
}
@@ -1337,7 +1334,6 @@ leave_nomem:
scrub_stripe_index_and_offset(logical,
bioc->map_type,
bioc->raid_map,
- mapped_length,
bioc->num_stripes -
bioc->num_tgtdevs,
mirror_index,
@@ -1380,19 +1376,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct scrub_sector *sector)
{
DECLARE_COMPLETION_ONSTACK(done);
- int ret;
- int mirror_num;
bio->bi_iter.bi_sector = sector->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
-
- mirror_num = sector->sblock->sectors[0]->mirror_num;
- ret = raid56_parity_recover(bio, sector->recover->bioc,
- sector->recover->map_length,
- mirror_num, 0);
- if (ret)
- return ret;
+ raid56_parity_recover(bio, sector->recover->bioc,
+ sector->sblock->sectors[0]->mirror_num, false);
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
@@ -2197,7 +2186,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(bio, bioc, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc);
if (!rbio)
goto rbio_out;
@@ -2406,13 +2395,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
}
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
}
static void scrub_block_complete(struct scrub_block *sblock)
@@ -2763,7 +2752,7 @@ static void scrub_free_parity(struct scrub_parity *sparity)
struct scrub_sector *curr, *next;
int nbits;
- nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+ nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
if (nbits) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors += nbits;
@@ -2795,8 +2784,8 @@ static void scrub_parity_bio_endio(struct bio *bio)
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_status)
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
- sparity->nsectors);
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
+ &sparity->dbitmap, sparity->nsectors);
bio_put(bio);
@@ -2814,8 +2803,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
u64 length;
int ret;
- if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
- sparity->nsectors))
+ if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
+ &sparity->ebitmap, sparity->nsectors))
goto out;
length = sparity->logic_end - sparity->logic_start;
@@ -2831,9 +2820,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
sparity->scrub_dev,
- sparity->dbitmap,
+ &sparity->dbitmap,
sparity->nsectors);
if (!rbio)
goto rbio_out;
@@ -2847,7 +2836,7 @@ rbio_out:
bioc_out:
btrfs_bio_counter_dec(fs_info);
btrfs_put_bioc(bioc);
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2856,11 +2845,6 @@ out:
scrub_free_parity(sparity);
}
-static inline int scrub_calc_parity_bitmap_len(int nsectors)
-{
- return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
-}
-
static void scrub_parity_get(struct scrub_parity *sparity)
{
refcount_inc(&sparity->refs);
@@ -3131,7 +3115,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int ret;
struct scrub_parity *sparity;
int nsectors;
- int bitmap_len;
path = btrfs_alloc_path();
if (!path) {
@@ -3145,9 +3128,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
ASSERT(map->stripe_len <= U32_MAX);
nsectors = map->stripe_len >> fs_info->sectorsize_bits;
- bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
- sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
- GFP_NOFS);
+ ASSERT(nsectors <= BITS_PER_LONG);
+ sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
if (!sparity) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -3165,8 +3147,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->logic_end = logic_end;
refcount_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->sectors_list);
- sparity->dbitmap = sparity->bitmap;
- sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
ret = 0;
for (cur_logical = logic_start; cur_logical < logic_end;
@@ -3429,20 +3409,22 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct extent_map *em,
struct btrfs_device *scrub_dev,
- int stripe_index, u64 dev_extent_len)
+ int stripe_index)
{
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root;
struct btrfs_root *csum_root;
struct blk_plug plug;
+ struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
u64 physical = map->stripes[stripe_index].physical;
- const u64 physical_end = physical + dev_extent_len;
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
/* The logical increment after finishing one stripe */
@@ -3569,8 +3551,8 @@ next:
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
if (stop_loop)
- sctx->stat.last_physical = map->stripes[stripe_index].physical +
- dev_extent_len;
+ sctx->stat.last_physical =
+ map->stripes[stripe_index].physical + dev_stripe_len;
else
sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
@@ -3639,8 +3621,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
- dev_extent_len);
+ ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
if (ret)
goto out;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c7dea639a56f..e7671afcee4f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -17,6 +17,7 @@
#include <linux/crc32c.h>
#include "send.h"
+#include "ctree.h"
#include "backref.h"
#include "locking.h"
#include "disk-io.h"
@@ -82,8 +83,12 @@ struct send_ctx {
char *send_buf;
u32 send_size;
u32 send_max_size;
- u64 total_send_size;
- u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ /*
+ * Whether BTRFS_SEND_A_DATA attribute was already added to current
+ * command (since protocol v2, data must be the last attribute).
+ */
+ bool put_data;
+ struct page **send_buf_pages;
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
/* Protocol version compatibility requested */
u32 proto;
@@ -113,14 +118,14 @@ struct send_ctx {
*/
u64 cur_ino;
u64 cur_inode_gen;
- int cur_inode_new;
- int cur_inode_new_gen;
- int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ bool cur_inode_new;
+ bool cur_inode_new_gen;
+ bool cur_inode_deleted;
bool ignore_cur_inode;
u64 send_progress;
@@ -235,6 +240,9 @@ struct send_ctx {
* Indexed by the inode number of the directory to be deleted.
*/
struct rb_root orphan_dirs;
+
+ struct rb_root rbtree_new_refs;
+ struct rb_root rbtree_deleted_refs;
};
struct pending_dir_move {
@@ -335,8 +343,8 @@ __maybe_unused
static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
{
switch (sctx->proto) {
- case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
- case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
default: return false;
}
}
@@ -577,15 +585,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
while (pos < len) {
ret = kernel_write(filp, buf + pos, len - pos, off);
- /* TODO handle that correctly */
- /*if (ret == -ERESTARTSYS) {
- continue;
- }*/
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (ret == 0)
return -EIO;
- }
pos += ret;
}
@@ -598,6 +601,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
int total_len = sizeof(*hdr) + len;
int left = sctx->send_max_size - sctx->send_size;
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+
if (unlikely(left < total_len))
return -EOVERFLOW;
@@ -618,6 +624,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
}
+TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
@@ -693,8 +700,7 @@ static int send_header(struct send_ctx *sctx)
struct btrfs_stream_header hdr;
strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
- hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
-
+ hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
}
@@ -734,9 +740,8 @@ static int send_cmd(struct send_ctx *sctx)
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
- sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
+ sctx->put_data = false;
return ret;
}
@@ -842,7 +847,7 @@ out:
*/
static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
- u64 *gid, u64 *rdev)
+ u64 *gid, u64 *rdev, u64 *fileattr)
{
int ret;
struct btrfs_inode_item *ii;
@@ -872,6 +877,12 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
*gid = btrfs_inode_gid(path->nodes[0], ii);
if (rdev)
*rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ /*
+ * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
+ * otherwise logically split to 32/32 parts.
+ */
+ if (fileattr)
+ *fileattr = btrfs_inode_flags(path->nodes[0], ii);
return ret;
}
@@ -879,7 +890,7 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
static int get_inode_info(struct btrfs_root *root,
u64 ino, u64 *size, u64 *gen,
u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev)
+ u64 *rdev, u64 *fileattr)
{
struct btrfs_path *path;
int ret;
@@ -888,7 +899,7 @@ static int get_inode_info(struct btrfs_root *root,
if (!path)
return -ENOMEM;
ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
- rdev);
+ rdev, fileattr);
btrfs_free_path(path);
return ret;
}
@@ -1634,7 +1645,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
u64 right_gen;
ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
left_ret = ret;
@@ -1643,7 +1654,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
right_ret = -ENOENT;
} else {
ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
- NULL, NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
right_ret = ret;
@@ -1806,7 +1817,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
if (dir_gen) {
ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
}
@@ -1878,7 +1889,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
*/
if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1906,7 +1917,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, who_mode, NULL, NULL, NULL);
+ who_gen, who_mode, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
@@ -1945,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
if (dir != BTRFS_FIRST_FREE_OBJECTID) {
ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1968,7 +1979,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
}
ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret < 0)
goto out;
@@ -2184,7 +2195,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
/*
* If the inode is not existent yet, add the orphan name and return 1.
* This should only happen for the parent dir that we determine in
- * __record_new_ref
+ * record_new_ref_if_needed().
*/
ret = is_inode_existent(sctx, ino, gen);
if (ret < 0)
@@ -2499,6 +2510,39 @@ out:
return ret;
}
+static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ if (sctx->proto < 2)
+ return 0;
+
+ btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
@@ -2578,7 +2622,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
- /* TODO Add otime support when the otime patches get into upstream */
+ if (sctx->proto >= 2)
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
ret = send_cmd(sctx);
@@ -2612,7 +2657,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
if (ino != sctx->cur_ino) {
ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
- NULL, NULL, &rdev);
+ NULL, NULL, &rdev, NULL);
if (ret < 0)
goto out;
} else {
@@ -2751,48 +2796,50 @@ struct recorded_ref {
u64 dir;
u64 dir_gen;
int name_len;
+ struct rb_node node;
+ struct rb_root *root;
};
-static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+static struct recorded_ref *recorded_ref_alloc(void)
{
- ref->full_path = path;
- ref->name = (char *)kbasename(ref->full_path->start);
- ref->name_len = ref->full_path->end - ref->name;
+ struct recorded_ref *ref;
+
+ ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ if (!ref)
+ return NULL;
+ RB_CLEAR_NODE(&ref->node);
+ INIT_LIST_HEAD(&ref->list);
+ return ref;
}
-/*
- * We need to process new refs before deleted refs, but compare_tree gives us
- * everything mixed. So we first record all refs and later process them.
- * This function is a helper to record one ref.
- */
-static int __record_ref(struct list_head *head, u64 dir,
- u64 dir_gen, struct fs_path *path)
+static void recorded_ref_free(struct recorded_ref *ref)
{
- struct recorded_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_KERNEL);
if (!ref)
- return -ENOMEM;
+ return;
+ if (!RB_EMPTY_NODE(&ref->node))
+ rb_erase(&ref->node, ref->root);
+ list_del(&ref->list);
+ fs_path_free(ref->full_path);
+ kfree(ref);
+}
- ref->dir = dir;
- ref->dir_gen = dir_gen;
- set_ref_path(ref, path);
- list_add_tail(&ref->list, head);
- return 0;
+static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+{
+ ref->full_path = path;
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
}
static int dup_ref(struct recorded_ref *ref, struct list_head *list)
{
struct recorded_ref *new;
- new = kmalloc(sizeof(*ref), GFP_KERNEL);
+ new = recorded_ref_alloc();
if (!new)
return -ENOMEM;
new->dir = ref->dir;
new->dir_gen = ref->dir_gen;
- new->full_path = NULL;
- INIT_LIST_HEAD(&new->list);
list_add_tail(&new->list, list);
return 0;
}
@@ -2803,9 +2850,7 @@ static void __free_recorded_refs(struct list_head *head)
while (!list_empty(head)) {
cur = list_entry(head->next, struct recorded_ref, list);
- fs_path_free(cur->full_path);
- list_del(&cur->list);
- kfree(cur);
+ recorded_ref_free(cur);
}
}
@@ -3315,7 +3360,7 @@ finish:
* The parent inode might have been deleted in the send snapshot
*/
ret = get_inode_info(sctx->send_root, cur->dir, NULL,
- NULL, NULL, NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL, NULL);
if (ret == -ENOENT) {
ret = 0;
continue;
@@ -3490,11 +3535,11 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
}
ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
- &left_gen, NULL, NULL, NULL, NULL);
+ &left_gen, NULL, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
- &right_gen, NULL, NULL, NULL, NULL);
+ &right_gen, NULL, NULL, NULL, NULL, NULL);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
@@ -3625,7 +3670,7 @@ static int is_ancestor(struct btrfs_root *root,
}
ret = get_inode_info(root, parent, NULL, &parent_gen,
- NULL, NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = check_ino_in_path(root, ino1, ino1_gen,
@@ -3717,7 +3762,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
ret = get_inode_info(sctx->parent_root, ino, NULL,
&parent_ino_gen, NULL, NULL, NULL,
- NULL);
+ NULL, NULL);
if (ret < 0)
goto out;
if (ino_gen == parent_ino_gen) {
@@ -4311,185 +4356,169 @@ out:
return ret;
}
-static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
- void *ctx, struct list_head *refs)
+static int rbtree_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+ int result;
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ if (data->name_len > ref->name_len)
+ return 1;
+ if (data->name_len < ref->name_len)
+ return -1;
+ result = strcmp(data->name, ref->name);
+ if (result > 0)
+ return 1;
+ if (result < 0)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_ref_comp(entry, parent) < 0;
+}
+
+static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
+ struct fs_path *name, u64 dir, u64 dir_gen,
+ struct send_ctx *sctx)
{
int ret = 0;
- struct send_ctx *sctx = ctx;
- struct fs_path *p;
- u64 gen;
+ struct fs_path *path = NULL;
+ struct recorded_ref *ref = NULL;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ path = fs_path_alloc();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
+ ref = recorded_ref_alloc();
+ if (!ref) {
+ ret = -ENOMEM;
goto out;
+ }
- ret = get_cur_path(sctx, dir, gen, p);
+ ret = get_cur_path(sctx, dir, dir_gen, path);
if (ret < 0)
goto out;
- ret = fs_path_add_path(p, name);
+ ret = fs_path_add_path(path, name);
if (ret < 0)
goto out;
- ret = __record_ref(refs, dir, gen, p);
-
+ ref->dir = dir;
+ ref->dir_gen = dir_gen;
+ set_ref_path(ref, path);
+ list_add_tail(&ref->list, refs);
+ rb_add(&ref->node, root, rbtree_ref_less);
+ ref->root = root;
out:
- if (ret)
- fs_path_free(p);
+ if (ret) {
+ if (path && (!ref || !ref->full_path))
+ fs_path_free(path);
+ recorded_ref_free(ref);
+ }
return ret;
}
-static int __record_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
-{
- struct send_ctx *sctx = ctx;
- return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
-}
-
-
-static int __record_deleted_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_new_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
{
+ int ret = 0;
struct send_ctx *sctx = ctx;
- return record_ref(sctx->parent_root, dir, name, ctx,
- &sctx->deleted_refs);
-}
-
-static int record_new_ref(struct send_ctx *sctx)
-{
- int ret;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_new_ref, sctx);
+ ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
- ret = 0;
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_new_refs,
+ &sctx->new_refs, name, dir, dir_gen,
+ sctx);
+ }
out:
return ret;
}
-static int record_deleted_ref(struct send_ctx *sctx)
+static int record_deleted_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
{
- int ret;
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
- ret = 0;
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_deleted_refs,
+ &sctx->deleted_refs, name, dir,
+ dir_gen, sctx);
+ }
out:
return ret;
}
-struct find_ref_ctx {
- u64 dir;
- u64 dir_gen;
- struct btrfs_root *root;
- struct fs_path *name;
- int found_idx;
-};
-
-static int __find_iref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx_)
-{
- struct find_ref_ctx *ctx = ctx_;
- u64 dir_gen;
- int ret;
-
- if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
- strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
- /*
- * To avoid doing extra lookups we'll only do this if everything
- * else matches.
- */
- ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
- if (dir_gen != ctx->dir_gen)
- return 0;
- ctx->found_idx = num;
- return 1;
- }
- return 0;
-}
-
-static int find_iref(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key,
- u64 dir, u64 dir_gen, struct fs_path *name)
+static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- struct find_ref_ctx ctx;
-
- ctx.dir = dir;
- ctx.name = name;
- ctx.dir_gen = dir_gen;
- ctx.found_idx = -1;
- ctx.root = root;
- ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- return ret;
-
- if (ctx.found_idx == -1)
- return -ENOENT;
-
- return ctx.found_idx;
-}
-
-static int __record_changed_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
-{
- u64 dir_gen;
- int ret;
- struct send_ctx *sctx = ctx;
-
- ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
-
- ret = find_iref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, dir, dir_gen, name);
- if (ret == -ENOENT)
- ret = __record_new_ref(num, dir, index, name, sctx);
- else if (ret > 0)
- ret = 0;
+ goto out;
+ ret = 0;
+out:
return ret;
}
-static int __record_changed_deleted_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_deleted_ref(struct send_ctx *sctx)
{
- u64 dir_gen;
int ret;
- struct send_ctx *sctx = ctx;
-
- ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
- ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
- dir, dir_gen, name);
- if (ret == -ENOENT)
- ret = __record_deleted_ref(num, dir, index, name, sctx);
- else if (ret > 0)
- ret = 0;
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, record_deleted_ref_if_needed,
+ sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+out:
return ret;
}
@@ -4498,11 +4527,11 @@ static int record_changed_ref(struct send_ctx *sctx)
int ret = 0;
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_changed_new_ref, sctx);
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
goto out;
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
+ sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -4533,10 +4562,10 @@ static int process_all_refs(struct send_ctx *sctx,
if (cmd == BTRFS_COMPARE_TREE_NEW) {
root = sctx->send_root;
- cb = __record_new_ref;
+ cb = record_new_ref_if_needed;
} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
root = sctx->parent_root;
- cb = __record_deleted_ref;
+ cb = record_deleted_ref_if_needed;
} else {
btrfs_err(sctx->send_root->fs_info,
"Wrong command %d in process_all_refs", cmd);
@@ -4864,14 +4893,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx)
static int put_data_header(struct send_ctx *sctx, u32 len)
{
- struct btrfs_tlv_header *hdr;
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+ sctx->put_data = true;
+ if (sctx->proto >= 2) {
+ /*
+ * Since v2, the data attribute header doesn't include a length,
+ * it is implicitly to the end of the command.
+ */
+ if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
+ return -EOVERFLOW;
+ put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
+ sctx->send_size += sizeof(__le16);
+ } else {
+ struct btrfs_tlv_header *hdr;
- if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
- return -EOVERFLOW;
- hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
- put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
- put_unaligned_le16(len, &hdr->tlv_len);
- sctx->send_size += sizeof(*hdr);
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ }
return 0;
}
@@ -5014,7 +5057,7 @@ static int send_clone(struct send_ctx *sctx,
if (clone_root->root == sctx->send_root) {
ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
- &gen, NULL, NULL, NULL, NULL);
+ &gen, NULL, NULL, NULL, NULL, NULL);
if (ret < 0)
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -5141,17 +5184,214 @@ tlv_put_failure:
return ret;
}
-static int send_extent_data(struct send_ctx *sctx,
- const u64 offset,
- const u64 len)
+static int send_encoded_inline_extent(struct send_ctx *sctx,
+ struct btrfs_path *path, u64 offset,
+ u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 ram_bytes;
+ size_t inline_size;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + ram_bytes - offset, len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+
+ ret = put_data_header(sctx, inline_size);
+ if (ret < 0)
+ goto out;
+ read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
+ btrfs_file_extent_inline_start(ei), inline_size);
+ sctx->send_size += inline_size;
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
+ u64 offset, u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 disk_bytenr, disk_num_bytes;
+ u32 data_offset;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
+ len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
+ btrfs_file_extent_ram_bytes(leaf, ei));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
+ offset - key.offset + btrfs_file_extent_offset(leaf, ei));
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);
+
+ ret = put_data_header(sctx, disk_num_bytes);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We want to do I/O directly into the send buffer, so get the next page
+ * boundary in the send buffer. This means that there may be a gap
+ * between the beginning of the command and the file data.
+ */
+ data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
+ if (data_offset > sctx->send_max_size ||
+ sctx->send_max_size - data_offset < disk_num_bytes) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ /*
+ * Note that send_buf is a mapping of send_buf_pages, so this is really
+ * reading into send_buf.
+ */
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ disk_bytenr, disk_num_bytes,
+ sctx->send_buf_pages +
+ (data_offset >> PAGE_SHIFT));
+ if (ret)
+ goto out;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
+ hdr->crc = 0;
+ crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ hdr->crc = cpu_to_le32(crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+ if (!ret) {
+ ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
+ disk_num_bytes, &sctx->send_off);
+ }
+ sctx->send_size = 0;
+ sctx->put_data = false;
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
+ const u64 offset, const u64 len)
{
const u64 end = offset + len;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+ btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
+ bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_INLINE);
+
+ /*
+ * Send the compressed extent unless the compressed data is
+ * larger than the decompressed data. This can happen if we're
+ * not sending the entire extent, either because it has been
+ * partially overwritten/truncated or because this is a part of
+ * the extent that we couldn't clone in clone_range().
+ */
+ if (is_inline &&
+ btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]) <= len) {
+ return send_encoded_inline_extent(sctx, path, offset,
+ len);
+ } else if (!is_inline &&
+ btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
+ return send_encoded_extent(sctx, path, offset, len);
+ }
+ }
+
if (sctx->cur_inode == NULL) {
struct btrfs_root *root = sctx->send_root;
@@ -5289,12 +5529,9 @@ out:
return ret;
}
-static int clone_range(struct send_ctx *sctx,
- struct clone_root *clone_root,
- const u64 disk_byte,
- u64 data_offset,
- u64 offset,
- u64 len)
+static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
+ struct clone_root *clone_root, const u64 disk_byte,
+ u64 data_offset, u64 offset, u64 len)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -5318,7 +5555,7 @@ static int clone_range(struct send_ctx *sctx,
*/
if (clone_root->offset == 0 &&
len == sctx->send_root->fs_info->sectorsize)
- return send_extent_data(sctx, offset, len);
+ return send_extent_data(sctx, dst_path, offset, len);
path = alloc_path_for_send();
if (!path)
@@ -5329,7 +5566,8 @@ static int clone_range(struct send_ctx *sctx,
* accept clones from these extents.
*/
ret = __get_inode_info(clone_root->root, path, clone_root->ino,
- &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
+ &clone_src_i_size, NULL, NULL, NULL, NULL, NULL,
+ NULL);
btrfs_release_path(path);
if (ret < 0)
goto out;
@@ -5415,7 +5653,8 @@ static int clone_range(struct send_ctx *sctx,
if (hole_len > len)
hole_len = len;
- ret = send_extent_data(sctx, offset, hole_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ hole_len);
if (ret < 0)
goto out;
@@ -5488,14 +5727,16 @@ static int clone_range(struct send_ctx *sctx,
if (ret < 0)
goto out;
}
- ret = send_extent_data(sctx, offset + slen,
+ ret = send_extent_data(sctx, dst_path,
+ offset + slen,
clone_len - slen);
} else {
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
} else {
- ret = send_extent_data(sctx, offset, clone_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ clone_len);
}
if (ret < 0)
@@ -5527,7 +5768,7 @@ next:
}
if (len > 0)
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
out:
@@ -5558,10 +5799,10 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
- ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, end - offset);
+ ret = clone_range(sctx, path, clone_root, disk_byte,
+ data_offset, offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, end - offset);
+ ret = send_extent_data(sctx, path, offset, end - offset);
}
sctx->cur_inode_next_write_offset = end;
return ret;
@@ -6021,11 +6262,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
u64 left_mode;
u64 left_uid;
u64 left_gid;
+ u64 left_fileattr;
u64 right_mode;
u64 right_uid;
u64 right_gid;
+ u64 right_fileattr;
int need_chmod = 0;
int need_chown = 0;
+ bool need_fileattr = false;
int need_truncate = 1;
int pending_move = 0;
int refs_processed = 0;
@@ -6059,7 +6303,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
- &left_mode, &left_uid, &left_gid, NULL);
+ &left_mode, &left_uid, &left_gid, NULL, &left_fileattr);
if (ret < 0)
goto out;
@@ -6074,7 +6318,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
&old_size, NULL, &right_mode, &right_uid,
- &right_gid, NULL);
+ &right_gid, NULL, &right_fileattr);
if (ret < 0)
goto out;
@@ -6082,6 +6326,8 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
need_chmod = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
+ need_fileattr = true;
if ((old_size == sctx->cur_inode_size) ||
(sctx->cur_inode_size > old_size &&
sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
@@ -6125,6 +6371,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret < 0)
goto out;
}
+ if (need_fileattr) {
+ ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_fileattr);
+ if (ret < 0)
+ goto out;
+ }
ret = send_capabilities(sctx);
if (ret < 0)
@@ -6165,8 +6417,13 @@ static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
{
struct parent_paths_ctx *ppctx = ctx;
- return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
- ppctx->refs);
+ /*
+ * Pass 0 as the generation for the directory, we don't care about it
+ * here as we have no new references to add, we just want to delete all
+ * references for an inode.
+ */
+ return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs,
+ name, dir, 0, ppctx->sctx);
}
/*
@@ -6220,9 +6477,7 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
ret = send_unlink(sctx, ref->full_path);
if (ret < 0)
goto out;
- fs_path_free(ref->full_path);
- list_del(&ref->list);
- kfree(ref);
+ recorded_ref_free(ref);
}
ret = 0;
out:
@@ -6269,7 +6524,7 @@ static int changed_inode(struct send_ctx *sctx,
close_current_inode(sctx);
sctx->cur_ino = key->objectid;
- sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_new_gen = false;
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
@@ -6310,7 +6565,7 @@ static int changed_inode(struct send_ctx *sctx,
*/
if (left_gen != right_gen &&
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
- sctx->cur_inode_new_gen = 1;
+ sctx->cur_inode_new_gen = true;
}
/*
@@ -6342,8 +6597,8 @@ static int changed_inode(struct send_ctx *sctx,
if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6354,8 +6609,8 @@ static int changed_inode(struct send_ctx *sctx,
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
sctx->cur_inode_size = btrfs_inode_size(
sctx->right_path->nodes[0], right_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6373,8 +6628,8 @@ static int changed_inode(struct send_ctx *sctx,
* First, process the inode as if it was deleted.
*/
sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
sctx->cur_inode_size = btrfs_inode_size(
sctx->right_path->nodes[0], right_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6388,8 +6643,8 @@ static int changed_inode(struct send_ctx *sctx,
* Now process the inode as if it was new.
*/
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6421,9 +6676,9 @@ static int changed_inode(struct send_ctx *sctx,
goto out;
} else {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_new_gen = 0;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_new_gen = false;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6536,12 +6791,12 @@ static int dir_changed(struct send_ctx *sctx, u64 dir)
int ret;
ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
- NULL, NULL);
+ NULL, NULL, NULL);
if (ret)
return ret;
ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (ret)
return ret;
@@ -7537,6 +7792,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
} else {
sctx->proto = 1;
}
+ if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
+ ret = -EINVAL;
+ goto out;
+ }
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
@@ -7556,8 +7815,31 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
sctx->clone_roots_cnt = arg->clone_sources_count;
- sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ if (sctx->proto >= 2) {
+ u32 send_buf_num_pages;
+
+ sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
+ sctx->send_buf_pages = kcalloc(send_buf_num_pages,
+ sizeof(*sctx->send_buf_pages),
+ GFP_KERNEL);
+ if (!sctx->send_buf_pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < send_buf_num_pages; i++) {
+ sctx->send_buf_pages[i] =
+ vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
+ }
+ } else {
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ }
if (!sctx->send_buf) {
ret = -ENOMEM;
goto out;
@@ -7566,6 +7848,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
+ sctx->rbtree_new_refs = RB_ROOT;
+ sctx->rbtree_deleted_refs = RB_ROOT;
sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
arg->clone_sources_count + 1,
@@ -7750,6 +8034,7 @@ out:
fput(sctx->send_filp);
kvfree(sctx->clone_roots);
+ kfree(sctx->send_buf_pages);
kvfree(sctx->send_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 08602fdd600a..4bb4e6a638cb 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -7,12 +7,19 @@
#ifndef BTRFS_SEND_H
#define BTRFS_SEND_H
-#include "ctree.h"
+#include <linux/types.h>
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
-#define BTRFS_SEND_STREAM_VERSION 1
+#define BTRFS_SEND_STREAM_VERSION 2
-#define BTRFS_SEND_BUF_SIZE SZ_64K
+/*
+ * In send stream v1, no command is larger than 64K. In send stream v2, no limit
+ * should be assumed.
+ */
+#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
+
+struct inode;
+struct btrfs_ioctl_send_args;
enum btrfs_tlv_type {
BTRFS_TLV_U8,
@@ -46,87 +53,117 @@ struct btrfs_tlv_header {
/* commands */
enum btrfs_send_cmd {
- BTRFS_SEND_C_UNSPEC,
+ BTRFS_SEND_C_UNSPEC = 0,
/* Version 1 */
- BTRFS_SEND_C_SUBVOL,
- BTRFS_SEND_C_SNAPSHOT,
+ BTRFS_SEND_C_SUBVOL = 1,
+ BTRFS_SEND_C_SNAPSHOT = 2,
- BTRFS_SEND_C_MKFILE,
- BTRFS_SEND_C_MKDIR,
- BTRFS_SEND_C_MKNOD,
- BTRFS_SEND_C_MKFIFO,
- BTRFS_SEND_C_MKSOCK,
- BTRFS_SEND_C_SYMLINK,
+ BTRFS_SEND_C_MKFILE = 3,
+ BTRFS_SEND_C_MKDIR = 4,
+ BTRFS_SEND_C_MKNOD = 5,
+ BTRFS_SEND_C_MKFIFO = 6,
+ BTRFS_SEND_C_MKSOCK = 7,
+ BTRFS_SEND_C_SYMLINK = 8,
- BTRFS_SEND_C_RENAME,
- BTRFS_SEND_C_LINK,
- BTRFS_SEND_C_UNLINK,
- BTRFS_SEND_C_RMDIR,
+ BTRFS_SEND_C_RENAME = 9,
+ BTRFS_SEND_C_LINK = 10,
+ BTRFS_SEND_C_UNLINK = 11,
+ BTRFS_SEND_C_RMDIR = 12,
- BTRFS_SEND_C_SET_XATTR,
- BTRFS_SEND_C_REMOVE_XATTR,
+ BTRFS_SEND_C_SET_XATTR = 13,
+ BTRFS_SEND_C_REMOVE_XATTR = 14,
- BTRFS_SEND_C_WRITE,
- BTRFS_SEND_C_CLONE,
+ BTRFS_SEND_C_WRITE = 15,
+ BTRFS_SEND_C_CLONE = 16,
- BTRFS_SEND_C_TRUNCATE,
- BTRFS_SEND_C_CHMOD,
- BTRFS_SEND_C_CHOWN,
- BTRFS_SEND_C_UTIMES,
+ BTRFS_SEND_C_TRUNCATE = 17,
+ BTRFS_SEND_C_CHMOD = 18,
+ BTRFS_SEND_C_CHOWN = 19,
+ BTRFS_SEND_C_UTIMES = 20,
- BTRFS_SEND_C_END,
- BTRFS_SEND_C_UPDATE_EXTENT,
- __BTRFS_SEND_C_MAX_V1,
+ BTRFS_SEND_C_END = 21,
+ BTRFS_SEND_C_UPDATE_EXTENT = 22,
+ BTRFS_SEND_C_MAX_V1 = 22,
/* Version 2 */
- __BTRFS_SEND_C_MAX_V2,
+ BTRFS_SEND_C_FALLOCATE = 23,
+ BTRFS_SEND_C_FILEATTR = 24,
+ BTRFS_SEND_C_ENCODED_WRITE = 25,
+ BTRFS_SEND_C_MAX_V2 = 25,
/* End */
- __BTRFS_SEND_C_MAX,
+ BTRFS_SEND_C_MAX = 25,
};
-#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
/* attributes in send stream */
enum {
- BTRFS_SEND_A_UNSPEC,
-
- BTRFS_SEND_A_UUID,
- BTRFS_SEND_A_CTRANSID,
-
- BTRFS_SEND_A_INO,
- BTRFS_SEND_A_SIZE,
- BTRFS_SEND_A_MODE,
- BTRFS_SEND_A_UID,
- BTRFS_SEND_A_GID,
- BTRFS_SEND_A_RDEV,
- BTRFS_SEND_A_CTIME,
- BTRFS_SEND_A_MTIME,
- BTRFS_SEND_A_ATIME,
- BTRFS_SEND_A_OTIME,
-
- BTRFS_SEND_A_XATTR_NAME,
- BTRFS_SEND_A_XATTR_DATA,
-
- BTRFS_SEND_A_PATH,
- BTRFS_SEND_A_PATH_TO,
- BTRFS_SEND_A_PATH_LINK,
-
- BTRFS_SEND_A_FILE_OFFSET,
- BTRFS_SEND_A_DATA,
-
- BTRFS_SEND_A_CLONE_UUID,
- BTRFS_SEND_A_CLONE_CTRANSID,
- BTRFS_SEND_A_CLONE_PATH,
- BTRFS_SEND_A_CLONE_OFFSET,
- BTRFS_SEND_A_CLONE_LEN,
-
- __BTRFS_SEND_A_MAX,
+ BTRFS_SEND_A_UNSPEC = 0,
+
+ /* Version 1 */
+ BTRFS_SEND_A_UUID = 1,
+ BTRFS_SEND_A_CTRANSID = 2,
+
+ BTRFS_SEND_A_INO = 3,
+ BTRFS_SEND_A_SIZE = 4,
+ BTRFS_SEND_A_MODE = 5,
+ BTRFS_SEND_A_UID = 6,
+ BTRFS_SEND_A_GID = 7,
+ BTRFS_SEND_A_RDEV = 8,
+ BTRFS_SEND_A_CTIME = 9,
+ BTRFS_SEND_A_MTIME = 10,
+ BTRFS_SEND_A_ATIME = 11,
+ BTRFS_SEND_A_OTIME = 12,
+
+ BTRFS_SEND_A_XATTR_NAME = 13,
+ BTRFS_SEND_A_XATTR_DATA = 14,
+
+ BTRFS_SEND_A_PATH = 15,
+ BTRFS_SEND_A_PATH_TO = 16,
+ BTRFS_SEND_A_PATH_LINK = 17,
+
+ BTRFS_SEND_A_FILE_OFFSET = 18,
+ /*
+ * As of send stream v2, this attribute is special: it must be the last
+ * attribute in a command, its header contains only the type, and its
+ * length is implicitly the remaining length of the command.
+ */
+ BTRFS_SEND_A_DATA = 19,
+
+ BTRFS_SEND_A_CLONE_UUID = 20,
+ BTRFS_SEND_A_CLONE_CTRANSID = 21,
+ BTRFS_SEND_A_CLONE_PATH = 22,
+ BTRFS_SEND_A_CLONE_OFFSET = 23,
+ BTRFS_SEND_A_CLONE_LEN = 24,
+
+ BTRFS_SEND_A_MAX_V1 = 24,
+
+ /* Version 2 */
+ BTRFS_SEND_A_FALLOCATE_MODE = 25,
+
+ /*
+ * File attributes from the FS_*_FL namespace (i_flags, xflags),
+ * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored
+ * in btrfs_inode_item::flags (represented by btrfs_inode::flags and
+ * btrfs_inode::ro_flags).
+ */
+ BTRFS_SEND_A_FILEATTR = 26,
+
+ BTRFS_SEND_A_UNENCODED_FILE_LEN = 27,
+ BTRFS_SEND_A_UNENCODED_LEN = 28,
+ BTRFS_SEND_A_UNENCODED_OFFSET = 29,
+ /*
+ * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from
+ * BTRFS_SEND_C_ENCODED_WRITE.
+ */
+ BTRFS_SEND_A_COMPRESSION = 30,
+ BTRFS_SEND_A_ENCRYPTION = 31,
+ BTRFS_SEND_A_MAX_V2 = 31,
+
+ /* End */
+ BTRFS_SEND_A_MAX = 31,
};
-#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
-#ifdef __KERNEL__
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
-#endif
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 2dd8754cb990..d0cbeb7ae81c 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -9,6 +9,7 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
+#include "zoned.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -187,6 +188,37 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
*/
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+/*
+ * Calculate chunk size depending on volume type (regular or zoned).
+ */
+static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (btrfs_is_zoned(fs_info))
+ return fs_info->zone_size;
+
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ return SZ_1G;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return SZ_32M;
+
+ /* Handle BTRFS_BLOCK_GROUP_METADATA */
+ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ return SZ_1G;
+
+ return SZ_256M;
+}
+
+/*
+ * Update default chunk size.
+ */
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size)
+{
+ WRITE_ONCE(space_info->chunk_size, chunk_size);
+}
+
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@@ -208,6 +240,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
+ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
if (btrfs_is_zoned(info))
space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
@@ -263,7 +296,7 @@ out:
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
u64 bytes_readonly, u64 bytes_zone_unusable,
- struct btrfs_space_info **space_info)
+ bool active, struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
int factor;
@@ -274,6 +307,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
ASSERT(found);
spin_lock(&found->lock);
found->total_bytes += total_bytes;
+ if (active)
+ found->active_total_bytes += total_bytes;
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
@@ -337,6 +372,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
return avail;
}
+static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ /*
+ * On regular filesystem, all total_bytes are always writable. On zoned
+ * filesystem, there may be a limitation imposed by max_active_zones.
+ * For metadata allocation, we cannot finish an existing active block
+ * group to avoid a deadlock. Thus, we need to consider only the active
+ * groups to be writable for metadata space.
+ */
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return space_info->total_bytes;
+
+ return space_info->active_total_bytes;
+}
+
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
@@ -349,9 +400,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
used = btrfs_space_info_used(space_info, true);
- avail = calc_available_free_space(fs_info, space_info, flush);
+ if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
+ avail = 0;
+ else
+ avail = calc_available_free_space(fs_info, space_info, flush);
- if (used + bytes < space_info->total_bytes + avail)
+ if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
return 1;
return 0;
}
@@ -387,7 +441,7 @@ again:
ticket = list_first_entry(head, struct reserve_ticket, list);
/* Check and see if our ticket can be satisfied now. */
- if ((used + ticket->bytes <= space_info->total_bytes) ||
+ if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
@@ -671,6 +725,18 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case ALLOC_CHUNK:
case ALLOC_CHUNK_FORCE:
+ /*
+ * For metadata space on zoned filesystem, reaching here means we
+ * don't have enough space left in active_total_bytes. Try to
+ * activate a block group first, because we may have inactive
+ * block group already allocated.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
+ if (ret < 0)
+ break;
+ else if (ret == 1)
+ break;
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -681,6 +747,23 @@ static void flush_space(struct btrfs_fs_info *fs_info,
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
+
+ /*
+ * For metadata space on zoned filesystem, allocating a new chunk
+ * is not enough. We still need to activate the block * group.
+ * Active the newly allocated block group by (maybe) finishing
+ * a block group.
+ */
+ if (ret == 1) {
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
+ /*
+ * Revert to the original ret regardless we could finish
+ * one block group or not.
+ */
+ if (ret >= 0)
+ ret = 1;
+ }
+
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
@@ -718,6 +801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
{
u64 used;
u64 avail;
+ u64 total;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
@@ -732,8 +816,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
* space. If that's the case add in our overage so we make sure to put
* appropriate pressure on the flushing state machine.
*/
- if (space_info->total_bytes + avail < used)
- to_reclaim += used - (space_info->total_bytes + avail);
+ total = writable_total_bytes(fs_info, space_info);
+ if (total + avail < used)
+ to_reclaim += used - (total + avail);
return to_reclaim;
}
@@ -743,9 +828,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
{
u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
- u64 thresh = div_factor_fine(space_info->total_bytes, 90);
+ u64 total = writable_total_bytes(fs_info, space_info);
+ u64 thresh;
u64 used;
+ thresh = div_factor_fine(total, 90);
+
lockdep_assert_held(&space_info->lock);
/* If we're just plain full then async reclaim just slows us down. */
@@ -807,8 +895,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
- if (used < space_info->total_bytes)
- thresh += space_info->total_bytes - used;
+ if (used < total)
+ thresh += total - used;
thresh >>= space_info->clamp;
used = space_info->bytes_pinned;
@@ -1280,7 +1368,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
/*
* This is the priority reclaim path, so to_reclaim could be >0 still
- * because we may have only satisified the priority tickets and still
+ * because we may have only satisfied the priority tickets and still
* left non priority tickets on the list. We would then have
* to_reclaim but ->bytes == 0.
*/
@@ -1525,7 +1613,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* can_overcommit() to ensure we can overcommit to continue.
*/
if (!pending_tickets &&
- ((used + orig_bytes <= space_info->total_bytes) ||
+ ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index c096695598c1..12fd6147f92d 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -19,12 +19,16 @@ struct btrfs_space_info {
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ /* Total bytes in the space, but only accounts active block groups. */
+ u64 active_total_bytes;
u64 bytes_zone_unusable; /* total bytes that are unusable until
resetting the device zone */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
+ /* Chunk size in bytes */
+ u64 chunk_size;
/*
* Once a block group drops below this threshold (percents) we'll
@@ -122,7 +126,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
u64 bytes_readonly, u64 bytes_zone_unusable,
- struct btrfs_space_info **space_info);
+ bool active, struct btrfs_space_info **space_info);
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index f429256f56db..12455b2b41de 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
{
const unsigned long member_offset = (unsigned long)ptr + off;
- if (member_offset > eb->len) {
+ if (unlikely(member_offset + size > eb->len)) {
btrfs_warn(eb->fs_info,
- "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d",
- (unsigned long)ptr, eb->start, member_offset, size);
- return false;
- }
- if (member_offset + size > eb->len) {
- btrfs_warn(eb->fs_info,
- "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d",
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
(unsigned long)ptr, eb->start, member_offset, size);
return false;
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index a105b291444f..6fc2b77ae5c3 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -123,7 +123,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage *subpage;
/*
- * We have cases like a dummy extent buffer page, which is not mappped
+ * We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
if (page->mapping)
@@ -731,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* It should not have any subpage::writers count.
* Can be unlocked by unlock_page().
* This is the most common locked page for __extent_writepage() called
- * inside extent_write_cache_pages() or extent_write_full_page().
+ * inside extent_write_cache_pages().
* Rarer cases include the @locked_page from extent_write_locked_range().
*
* - Page locked by lock_delalloc_pages()
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6627dd7875ee..4c7089b1681b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,6 +48,7 @@
#include "block-group.h"
#include "discard.h"
#include "qgroup.h"
+#include "raid56.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -72,7 +73,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data);
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
/*
- * Characters to print to indicate error conditions or uncommon filesystem sate.
+ * Characters to print to indicate error conditions or uncommon filesystem state.
* RO is not an error.
*/
static const char fs_state_chars[] = {
@@ -1931,10 +1932,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
- new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
@@ -2246,12 +2243,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
if (type & BTRFS_BLOCK_GROUP_RAID0)
num_stripes = nr_devices;
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
- num_stripes = 2;
- else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
- num_stripes = 3;
- else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
- num_stripes = 4;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK)
+ num_stripes = rattr->ncopies;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
@@ -2275,17 +2268,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
/*
- * In order to avoid overwriting the superblock on the drive,
- * btrfs starts at an offset of at least 1MB when doing chunk
- * allocation.
- *
- * This ensures we have at least min_stripe_size free space
- * after excluding 1MB.
+ * Ensure we have at least min_stripe_size on top of the
+ * reserved space on the device.
*/
- if (avail_space <= SZ_1M + min_stripe_size)
+ if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size)
continue;
- avail_space -= SZ_1M;
+ avail_space -= BTRFS_DEVICE_RANGE_RESERVED;
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
@@ -2703,13 +2692,9 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_delayed_ref;
- err = btrfs_end_io_wq_init();
- if (err)
- goto free_prelim_ref;
-
err = btrfs_interface_init();
if (err)
- goto free_end_io_wq;
+ goto free_prelim_ref;
btrfs_print_mod_info();
@@ -2725,8 +2710,6 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
-free_end_io_wq:
- btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
@@ -2764,7 +2747,6 @@ static void __exit exit_btrfs_fs(void)
extent_state_cache_exit();
extent_io_exit();
btrfs_interface_exit();
- btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 92a1fa8e3da6..d5d0717fd09a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,6 +21,7 @@
#include "space-info.h"
#include "block-group.h"
#include "qgroup.h"
+#include "misc.h"
/*
* Structure name Path
@@ -61,6 +62,10 @@ struct raid_kobject {
.store = _store, \
}
+#define BTRFS_ATTR_W(_prefix, _name, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
+
#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
@@ -92,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+static struct kobject *get_btrfs_kobj(struct kobject *kobj);
static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
{
@@ -270,12 +276,10 @@ static umode_t btrfs_feature_visible(struct kobject *kobj,
return mode;
}
-BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD);
-BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
@@ -283,9 +287,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
-#ifdef CONFIG_BTRFS_DEBUG
-/* Remove once support for zoned allocation is feature complete */
+#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
#endif
@@ -296,17 +301,15 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
/*
* Features which depend on feature bits and may differ between each fs.
*
- * /sys/fs/btrfs/features - all available features implemeted by this version
+ * /sys/fs/btrfs/features - all available features implemented by this version
* /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
* can be changed on a mounted filesystem.
*/
static struct attribute *btrfs_supported_feature_attrs[] = {
- BTRFS_FEAT_ATTR_PTR(mixed_backref),
BTRFS_FEAT_ATTR_PTR(default_subvol),
BTRFS_FEAT_ATTR_PTR(mixed_groups),
BTRFS_FEAT_ATTR_PTR(compress_lzo),
BTRFS_FEAT_ATTR_PTR(compress_zstd),
- BTRFS_FEAT_ATTR_PTR(big_metadata),
BTRFS_FEAT_ATTR_PTR(extended_iref),
BTRFS_FEAT_ATTR_PTR(raid56),
BTRFS_FEAT_ATTR_PTR(skinny_metadata),
@@ -314,8 +317,10 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
#endif
#ifdef CONFIG_FS_VERITY
@@ -709,6 +714,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
} \
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
+static ssize_t btrfs_chunk_size_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size));
+}
+
+/*
+ * Store new chunk size in space info. Can be called on a read-only filesystem.
+ *
+ * If the new chunk size value is larger than 10% of free space it is reduced
+ * to match that limit. Alignment must be to 256M and the system chunk size
+ * cannot be set.
+ */
+static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ char *retptr;
+ u64 val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!fs_info->fs_devices)
+ return -EINVAL;
+
+ if (btrfs_is_zoned(fs_info))
+ return -EINVAL;
+
+ /* System block type must not be changed. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -EPERM;
+
+ val = memparse(buf, &retptr);
+ /* There could be trailing '\n', also catch any typos after the value */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || val == 0)
+ return -EINVAL;
+
+ val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
+
+ /* Limit stripe size to 10% of available space. */
+ val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
+
+ /* Must be multiple of 256M. */
+ val &= ~((u64)SZ_256M - 1);
+
+ /* Must be at least 256M. */
+ if (val < SZ_256M)
+ return -EINVAL;
+
+ btrfs_update_space_info_chunk_size(space_info, val);
+
+ return len;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Request chunk allocation with current chunk size.
+ */
+static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ struct btrfs_trans_handle *trans;
+ bool val;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return -EINVAL;
+
+ /*
+ * This is unsafe to be called from sysfs context and may cause
+ * unexpected problems.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_force_chunk_alloc(trans, space_info->flags);
+ btrfs_end_transaction(trans);
+
+ if (ret == 1)
+ return len;
+
+ return -ENOSPC;
+}
+BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store);
+
+#endif
+
SPACE_INFO_ATTR(flags);
SPACE_INFO_ATTR(total_bytes);
SPACE_INFO_ATTR(bytes_used);
@@ -719,6 +830,7 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -773,6 +885,10 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, chunk_size),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(space_info);
@@ -871,6 +987,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
+static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf,
+ "commits %llu\n"
+ "last_commit_ms %llu\n"
+ "max_commit_ms %llu\n"
+ "total_commit_ms %llu\n",
+ fs_info->commit_stats.commit_count,
+ div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
+}
+
+static ssize_t btrfs_commit_stats_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long val;
+ int ret;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return ret;
+ if (val)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0);
+
+ return len;
+}
+BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store);
+
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1110,6 +1268,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, generation),
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(, commit_stats),
NULL,
};
@@ -1140,6 +1299,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
return to_fs_devs(kobj)->fs_info;
}
+static struct kobject *get_btrfs_kobj(struct kobject *kobj)
+{
+ while (kobj) {
+ if (kobj->ktype == &btrfs_ktype)
+ return kobj;
+ kobj = kobj->parent;
+ }
+ return NULL;
+}
+
#define NUM_FEATURE_BITS 64
#define BTRFS_FEATURE_NAME_MAX 13
static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
@@ -2106,4 +2275,3 @@ void __cold btrfs_exit_sysfs(void)
#endif
kset_unregister(btrfs_kset);
}
-
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d8e56edd6991..cc9377cf56a3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -59,6 +59,7 @@ struct inode *btrfs_new_test_inode(void)
return NULL;
inode->i_mode = S_IFREG;
+ inode->i_ino = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 51a8b075c259..b7d181a08eab 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -47,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, nodesize);
+ path->nodes[0] = eb;
if (!eb) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 875b801ab3d7..0bec10740ad3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
+#include <linux/timekeeping.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -1831,8 +1832,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
dentry->d_name.len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime =
- current_time(parent_inode);
+ parent_inode->i_mtime = current_time(parent_inode);
+ parent_inode->i_ctime = parent_inode->i_mtime;
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2098,12 +2099,23 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
+static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+{
+ fs_info->commit_stats.commit_count++;
+ fs_info->commit_stats.last_commit_dur = interval;
+ fs_info->commit_stats.max_commit_dur =
+ max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
+ fs_info->commit_stats.total_commit_dur += interval;
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ ktime_t start_time;
+ ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
@@ -2228,6 +2240,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
}
+ /*
+ * Get the time spent on the work done by the commit thread and not
+ * the time spent waiting on a previous commit
+ */
+ start_time = ktime_get_ns();
+
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
@@ -2469,6 +2487,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trace_btrfs_transaction_commit(fs_info);
+ interval = ktime_get_ns() - start_time;
+
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2476,6 +2496,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ update_commit_stats(fs_info, interval);
+
return ret;
unlock_reloc:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 370388fadf96..dcf75a8daa20 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -171,7 +171,7 @@ again:
int index = (root->log_transid + 1) % 2;
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -194,7 +194,7 @@ again:
* writing.
*/
if (zoned && !created) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -2287,7 +2287,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_key location;
/*
- * Currenly we only log dir index keys. Even if we replay a log created
+ * Currently we only log dir index keys. Even if we replay a log created
* by an older kernel that logged both dir index and dir item keys, all
* we need to do is process the dir index keys, we (and our caller) can
* safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
@@ -3121,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -3222,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -3261,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_wake_log_root;
}
@@ -5848,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
inode_only == LOG_INODE_ALL &&
inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_unlock;
}
@@ -6562,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
bool log_dentries = false;
if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
if (btrfs_root_refs(&root->root_item) == 0) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
@@ -6665,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
if (ret)
@@ -7029,8 +7029,15 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* anyone from syncing the log until we have updated both inodes
* in the log.
*/
+ ret = join_running_log_trans(root);
+ /*
+ * At least one of the inodes was logged before, so this should
+ * not fail, but if it does, it's not serious, just bail out and
+ * mark the log for a full commit.
+ */
+ if (WARN_ON_ONCE(ret < 0))
+ goto out;
log_pinned = true;
- btrfs_pin_log_trans(root);
path = btrfs_alloc_path();
if (!path) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1620f8170629..57ab5f3b8dc7 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -12,6 +12,9 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+/* We can't use the tree log for whatever reason, force a transaction commit */
+#define BTRFS_LOG_FORCE_COMMIT (1)
+
struct btrfs_log_ctx {
int log_ret;
int log_transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9c20049d1fec..272901514b0c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -182,6 +182,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags)
return btrfs_raid_array[index].raid_name;
}
+int btrfs_nr_parity_stripes(u64 type)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
+
+ return btrfs_raid_array[index].nparity;
+}
+
/*
* Fill @buf with textual description of @bg_flags, no more than @size_buf
* bytes including terminating null byte.
@@ -238,7 +245,6 @@ out_overflow:;
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
@@ -1396,12 +1402,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
switch (device->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
- /*
- * We don't want to overwrite the superblock on the drive nor
- * any area used by the boot loader (grub for example), so we
- * make sure to start at an offset of at least 1MB.
- */
- return max_t(u64, start, SZ_1M);
+ return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
case BTRFS_CHUNK_ALLOC_ZONED:
/*
* We don't care about the starting region like regular
@@ -5071,26 +5072,16 @@ static void init_alloc_chunk_ctl_policy_regular(
struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
- u64 type = ctl->type;
+ struct btrfs_space_info *space_info;
- if (type & BTRFS_BLOCK_GROUP_DATA) {
- ctl->max_stripe_size = SZ_1G;
- ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
- } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- /* For larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
- ctl->max_stripe_size = SZ_1G;
- else
- ctl->max_stripe_size = SZ_256M;
- ctl->max_chunk_size = ctl->max_stripe_size;
- } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ctl->max_stripe_size = SZ_32M;
- ctl->max_chunk_size = 2 * ctl->max_stripe_size;
- ctl->devs_max = min_t(int, ctl->devs_max,
- BTRFS_MAX_DEVS_SYS_CHUNK);
- } else {
- BUG();
- }
+ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
+ ASSERT(space_info);
+
+ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
+ ctl->max_stripe_size = ctl->max_chunk_size;
+
+ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
+ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
/* We don't want a chunk larger than 10% of writable space */
ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
@@ -5720,7 +5711,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
- int ret;
+ enum btrfs_raid_types index;
+ int ret = 1;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
@@ -5733,10 +5725,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
map = em->map_lookup;
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
- ret = map->num_stripes;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- ret = map->sub_stripes;
+ index = btrfs_bg_flags_to_raid_index(map->type);
+
+ /* Non-RAID56, use their ncopies from btrfs_raid_array. */
+ if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ ret = btrfs_raid_array[index].ncopies;
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
@@ -5748,8 +5741,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- else
- ret = 1;
free_extent_map(em);
down_read(&fs_info->dev_replace.rwsem);
@@ -5768,6 +5759,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return len;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
@@ -5785,6 +5779,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret = 0;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return 0;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if(!WARN_ON(IS_ERR(em))) {
@@ -5917,18 +5914,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc)
kfree(bioc);
}
-/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
* Please note that, discard won't be sent to target device of device
* replace.
*/
-static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 *length_ret,
- struct btrfs_io_context **bioc_ret)
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_io_context *bioc;
+ struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5937,29 +5933,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 stripe_cnt;
u64 stripe_len;
u64 stripe_offset;
- u64 num_stripes;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
- int ret = 0;
+ int ret;
int i;
- /* Discard always returns a bioc. */
- ASSERT(bioc_ret);
-
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
- return PTR_ERR(em);
+ return ERR_CAST(em);
map = em->map_lookup;
+
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
- goto out;
- }
+ goto out_free_map;
+}
offset = logical - em->start;
length = min_t(u64, em->start + em->len - logical, length);
@@ -5985,7 +5978,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- num_stripes = 1;
+ *num_stripes = 1;
stripe_index = 0;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
@@ -5995,7 +5988,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
sub_stripes = map->sub_stripes;
factor = map->num_stripes / sub_stripes;
- num_stripes = min_t(u64, map->num_stripes,
+ *num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= sub_stripes;
@@ -6005,31 +5998,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
last_stripe *= sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = map->num_stripes;
+ *num_stripes = map->num_stripes;
} else {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
}
- bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
- if (!bioc) {
+ stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
+ if (!stripes) {
ret = -ENOMEM;
- goto out;
+ goto out_free_map;
}
- for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical =
+ for (i = 0; i < *num_stripes; i++) {
+ stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bioc->stripes[i].length = stripes_per_dev *
- map->stripe_len;
+ stripes[i].length = stripes_per_dev * map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bioc->stripes[i].length += map->stripe_len;
+ stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -6040,17 +6032,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bioc->stripes[i].length -= stripe_offset;
+ stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bioc->stripes[i].length -= stripe_end_offset;
+ stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bioc->stripes[i].length = length;
+ stripes[i].length = length;
}
stripe_index++;
@@ -6060,12 +6052,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bioc_ret = bioc;
- bioc->map_type = map->type;
- bioc->num_stripes = num_stripes;
-out:
free_extent_map(em);
- return ret;
+ return stripes;
+out_free_map:
+ free_extent_map(em);
+ return ERR_PTR(ret);
}
/*
@@ -6208,7 +6199,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->stripes + i;
new->physical = old->physical;
- new->length = old->length;
new->dev = dev_replace->tgtdev;
bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
@@ -6249,8 +6239,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
bioc->tgtdev_map[index_srcdev] = num_stripes;
@@ -6472,6 +6460,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
@@ -6479,9 +6468,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
- max_errors = nr_parity_stripes(map);
+ max_errors = btrfs_chunk_max_errors(map);
- *length = map->stripe_len;
+ /* Return the length to the full stripe end */
+ *length = min(logical + *length,
+ raid56_full_stripe_start + em->start +
+ data_stripes * stripe_len) - logical;
stripe_index = 0;
stripe_offset = 0;
} else {
@@ -6604,10 +6596,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret, int mirror_num)
{
- if (op == BTRFS_MAP_DISCARD)
- return __btrfs_map_block_for_discard(fs_info, logical,
- length, bioc_ret);
-
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
mirror_num, 0);
}
@@ -6620,77 +6608,106 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
+{
+ if (bioc->orig_bio->bi_opf & REQ_META)
+ return bioc->fs_info->endio_meta_workers;
+ return bioc->fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio =
+ container_of(work, struct btrfs_bio, end_io_work);
+
+ bio_endio(&bbio->bio);
+}
+
+static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
{
- bio->bi_private = bioc->private;
- bio->bi_end_io = bioc->end_io;
- bio_endio(bio);
+ struct bio *orig_bio = bioc->orig_bio;
+ struct btrfs_bio *bbio = btrfs_bio(orig_bio);
+
+ bbio->mirror_num = bioc->mirror_num;
+ orig_bio->bi_private = bioc->private;
+ orig_bio->bi_end_io = bioc->end_io;
+
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ orig_bio->bi_status = BLK_STS_IOERR;
+ else
+ orig_bio->bi_status = BLK_STS_OK;
+
+ if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
+ } else {
+ bio_endio(orig_bio);
+ }
btrfs_put_bioc(bioc);
}
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_io_context *bioc = bio->bi_private;
- int is_orig_bio = 0;
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
if (bio->bi_status) {
atomic_inc(&bioc->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_bio(bio)->device;
-
- ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
}
- if (bio == bioc->orig_bio)
- is_orig_bio = 1;
+ if (bio != bioc->orig_bio)
+ bio_put(bio);
btrfs_bio_counter_dec(bioc->fs_info);
-
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- if (!is_orig_bio) {
- bio_put(bio);
- bio = bioc->orig_bio;
- }
-
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- /* only send an error to the higher layers if it is
- * beyond the tolerance of the btrfs bio
- */
- if (atomic_read(&bioc->error) > bioc->max_errors) {
- bio->bi_status = BLK_STS_IOERR;
- } else {
- /*
- * this bio is actually up to date, we didn't
- * go over the max number of errors
- */
- bio->bi_status = BLK_STS_OK;
- }
-
- btrfs_end_bioc(bioc, bio);
- } else if (!is_orig_bio) {
- bio_put(bio);
- }
+ if (atomic_dec_and_test(&bioc->stripes_pending))
+ btrfs_end_bioc(bioc, true);
}
-static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
- u64 physical, struct btrfs_device *dev)
+static void submit_stripe_bio(struct btrfs_io_context *bioc,
+ struct bio *orig_bio, int dev_nr, bool clone)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
+ u64 physical = bioc->stripes[dev_nr].physical;
+ struct bio *bio;
+
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending))
+ btrfs_end_bioc(bioc, false);
+ return;
+ }
+
+ if (clone) {
+ bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set);
+ } else {
+ bio = orig_bio;
+ bio_set_dev(bio, dev->bdev);
+ btrfs_bio(bio)->device = dev;
+ }
- bio->bi_private = bioc;
- btrfs_bio(bio)->device = dev;
+ bioc->stripes[dev_nr].bioc = bioc;
+ bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
@@ -6708,8 +6725,8 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
}
}
btrfs_debug_in_rcu(fs_info,
- "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+ "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+ __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
@@ -6719,66 +6736,39 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
submit_bio(bio);
}
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- /* Should be the original bio. */
- WARN_ON(bio != bioc->orig_bio);
-
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bioc->error) > bioc->max_errors)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_status = BLK_STS_OK;
- btrfs_end_bioc(bioc, bio);
- }
-}
-
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num)
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
- struct btrfs_device *dev;
- struct bio *first_bio = bio;
u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = 0;
- u64 map_length;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
int ret;
int dev_nr;
int total_devs;
struct btrfs_io_context *bioc = NULL;
- length = bio->bi_iter.bi_size;
- map_length = length;
-
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
&map_length, &bioc, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ return;
}
total_devs = bioc->num_stripes;
- bioc->orig_bio = first_bio;
- bioc->private = first_bio->bi_private;
- bioc->end_io = first_bio->bi_end_io;
- atomic_set(&bioc->stripes_pending, bioc->num_stripes);
+ bioc->orig_bio = bio;
+ bioc->private = bio->bi_private;
+ bioc->end_io = bio->bi_end_io;
+ atomic_set(&bioc->stripes_pending, total_devs);
if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
- /* In this case, map_length has been set to the length of
- a single stripe; not the whole write */
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(bio, bioc, map_length);
- } else {
- ret = raid56_parity_recover(bio, bioc, map_length,
- mirror_num, 1);
- }
-
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ raid56_parity_write(bio, bioc);
+ else
+ raid56_parity_recover(bio, bioc, mirror_num, true);
+ return;
}
if (map_length < length) {
@@ -6789,26 +6779,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bioc->stripes[dev_nr].dev;
- if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
- &dev->dev_state) ||
- (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bioc_error(bioc, first_bio, logical);
- continue;
- }
-
- if (dev_nr < total_devs - 1) {
- bio = btrfs_bio_clone(dev->bdev, first_bio);
- } else {
- bio = first_bio;
- bio_set_dev(bio, dev->bdev);
- }
+ const bool should_clone = (dev_nr < total_devs - 1);
- submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
+ submit_stripe_bio(bioc, bio, dev_nr, should_clone);
}
btrfs_bio_counter_dec(fs_info);
- return BLK_STS_OK;
}
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
@@ -6966,11 +6941,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+u64 btrfs_calc_stripe_length(const struct extent_map *em)
{
- const int data_stripes = calc_data_stripes(type, num_stripes);
+ const struct map_lookup *map = em->map_lookup;
+ const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(chunk_len, data_stripes);
+ return div_u64(em->len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -7109,8 +7085,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->type = type;
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
- em->orig_block_len = calc_stripe_length(type, em->len,
- map->num_stripes);
+ em->orig_block_len = btrfs_calc_stripe_length(em);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7236,7 +7211,8 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = args.devid = btrfs_device_id(leaf, dev_item);
+ devid = btrfs_device_id(leaf, dev_item);
+ args.devid = devid;
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
@@ -7865,11 +7841,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
btrfs_dev_stat_inc(dev, index);
- btrfs_dev_stat_print_on_error(dev);
-}
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
-{
if (!dev->dev_stats_valid)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
@@ -8011,7 +7983,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
map = em->map_lookup;
- stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+ stripe_len = btrfs_calc_stripe_length(em);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
@@ -8021,6 +7993,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
+ /*
+ * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * space. Although kernel can handle it without problem, better to warn
+ * the users.
+ */
+ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
+ btrfs_warn(fs_info,
+ "devid %llu physical %llu len %llu inside the reserved space",
+ devid, physical_offset, physical_len);
+
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->devid == devid &&
map->stripes[i].physical == physical_offset) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6721002000ee..5639961b3626 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -355,6 +355,13 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
+ * Maximum number of sectors for a single bio to limit the size of the
+ * checksum array. This matches the number of bio_vecs per bio and thus the
+ * I/O size for buffered I/O.
+ */
+#define BTRFS_MAX_BIO_SECTORS (256)
+
+/*
* Additional info to pass along bio.
*
* Mostly for btrfs specific features like csum and mirror_num.
@@ -371,6 +378,9 @@ struct btrfs_bio {
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+ /* For read end I/O handling */
+ struct work_struct end_io_work;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
@@ -391,10 +401,36 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
}
}
+/*
+ * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
+ *
+ * bvl - struct bio_vec
+ * bbio - struct btrfs_bio
+ * iters - struct bvec_iter
+ * bio_offset - unsigned int
+ */
+#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
+ for ((iter) = (bbio)->iter, (bio_offset) = 0; \
+ (iter).bi_size && \
+ (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
+ (bio_offset) += fs_info->sectorsize, \
+ bio_advance_iter_single(&(bbio)->bio, &(iter), \
+ (fs_info)->sectorsize))
+
struct btrfs_io_stripe {
struct btrfs_device *dev;
+ union {
+ /* Block mapping */
+ u64 physical;
+ /* For the endio handler */
+ struct btrfs_io_context *bioc;
+ };
+};
+
+struct btrfs_discard_stripe {
+ struct btrfs_device *dev;
u64 physical;
- u64 length; /* only used for discard mappings */
+ u64 length;
};
/*
@@ -533,6 +569,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret);
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
@@ -541,8 +580,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num);
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
@@ -601,6 +639,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
+u64 btrfs_calc_stripe_length(const struct extent_map *em);
+int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 767a0c6c9694..b4f44662cda7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
- char *data_in;
+ char *data_in = NULL;
char *cpage_out;
int nr_pages = 0;
struct page *in_page = NULL;
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = kmap_local_page(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = kmap_local_page(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -196,9 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -207,7 +205,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -234,9 +232,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -245,7 +241,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
+
return ret;
}
@@ -287,7 +281,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- data_in = kmap(pages_in[page_in_index]);
+ data_in = kmap_local_page(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -309,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -336,13 +330,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = kmap(pages_in[page_in_index]);
+ data_in = kmap_local_page(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -355,7 +349,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
if (!ret)
zero_fill_bio(cb->orig_bio);
return ret;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index d99026df6f67..b150b07ba1a7 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -94,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
* Possible states of log buffer zones
*
* Empty[0] In use[0] Full[0]
- * Empty[1] * x 0
- * In use[1] 0 x 0
- * Full[1] 1 1 C
+ * Empty[1] * 0 1
+ * In use[1] x x 1
+ * Full[1] 0 0 C
*
* Log position:
* *: Special case, no superblock is written
@@ -415,6 +415,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
nr_sectors = bdev_nr_sectors(bdev);
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
+ /*
+ * We limit max_zone_append_size also by max_segments *
+ * PAGE_SIZE. Technically, we can have multiple pages per segment. But,
+ * since btrfs adds the pages one by one to a bio, and btrfs cannot
+ * increase the metadata reservation even if it increases the number of
+ * extents, it is safe to stick with the limit.
+ */
+ zone_info->max_zone_append_size =
+ min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
@@ -640,6 +650,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
u64 zoned_devices = 0;
u64 nr_devices = 0;
u64 zone_size = 0;
+ u64 max_zone_append_size = 0;
const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
int ret = 0;
@@ -674,6 +685,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
ret = -EINVAL;
goto out;
}
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size =
+ zone_info->max_zone_append_size;
}
nr_devices++;
}
@@ -723,7 +739,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
}
fs_info->zone_size = zone_size;
+ fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size,
+ fs_info->sectorsize);
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+ if (fs_info->max_zone_append_size < fs_info->max_extent_size)
+ fs_info->max_extent_size = fs_info->max_zone_append_size;
/*
* Check mount options here, because we might change fs_info->zoned
@@ -1829,6 +1849,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
struct map_lookup *map;
struct btrfs_device *device;
u64 physical;
@@ -1840,6 +1861,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
if (block_group->zone_is_active) {
ret = true;
@@ -1868,7 +1890,10 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* Successfully activated all the zones */
block_group->zone_is_active = 1;
+ space_info->active_total_bytes += block_group->length;
spin_unlock(&block_group->lock);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
/* For the active block group list */
btrfs_get_block_group(block_group);
@@ -1881,6 +1906,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
return ret;
}
@@ -1981,6 +2007,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* For active_bg_list */
btrfs_put_block_group(block_group);
+ clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+ wake_up_all(&fs_info->zone_finish_wait);
+
return 0;
}
@@ -2017,6 +2046,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
}
mutex_unlock(&fs_info->chunk_mutex);
+ if (!ret)
+ set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+
return ret;
}
@@ -2160,3 +2192,96 @@ out:
spin_unlock(&block_group->lock);
btrfs_put_block_group(block_group);
}
+
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_block_group *min_bg = NULL;
+ u64 min_avail = U64_MAX;
+ int ret;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs,
+ active_bg_list) {
+ u64 avail;
+
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ avail = block_group->zone_capacity - block_group->alloc_offset;
+ if (min_avail > avail) {
+ if (min_bg)
+ btrfs_put_block_group(min_bg);
+ min_bg = block_group;
+ min_avail = avail;
+ btrfs_get_block_group(min_bg);
+ }
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ if (!min_bg)
+ return 0;
+
+ ret = btrfs_zone_finish(min_bg);
+ btrfs_put_block_group(min_bg);
+
+ return ret < 0 ? ret : 1;
+}
+
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ struct btrfs_block_group *bg;
+ int index;
+
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
+
+ /* No more block groups to activate */
+ if (space_info->active_total_bytes == space_info->total_bytes)
+ return 0;
+
+ for (;;) {
+ int ret;
+ bool need_finish = false;
+
+ down_read(&space_info->groups_sem);
+ for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
+ list_for_each_entry(bg, &space_info->block_groups[index],
+ list) {
+ if (!spin_trylock(&bg->lock))
+ continue;
+ if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) {
+ spin_unlock(&bg->lock);
+ continue;
+ }
+ spin_unlock(&bg->lock);
+
+ if (btrfs_zone_activate(bg)) {
+ up_read(&space_info->groups_sem);
+ return 1;
+ }
+
+ need_finish = true;
+ }
+ }
+ up_read(&space_info->groups_sem);
+
+ if (!do_finish || !need_finish)
+ break;
+
+ ret = btrfs_zone_finish_one_bg(fs_info);
+ if (ret == 0)
+ break;
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 6b2eec99162b..e17462db3a84 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -19,6 +19,7 @@ struct btrfs_zoned_device_info {
*/
u64 zone_size;
u8 zone_size_shift;
+ u64 max_zone_append_size;
u32 nr_zones;
unsigned int max_active_zones;
atomic_t active_zones_left;
@@ -79,6 +80,9 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, bool do_finish);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -248,6 +252,20 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
u64 logical, u64 length) { }
+
+static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ return 1;
+}
+
+static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ /* Consider all the block groups are active */
+ return 0;
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 0fe31a6f6e68..35a0224d4eb7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -403,7 +403,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
@@ -415,7 +415,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -450,9 +450,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -462,7 +460,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -477,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
- kunmap(in_page);
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
-
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -510,9 +507,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -522,7 +517,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -537,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = tot_out;
out:
*out_pages = nr_pages;
- /* Cleanup */
- if (in_page) {
- kunmap(in_page);
+ if (workspace->in_buf.src) {
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
}
- if (out_page)
- kunmap(out_page);
return ret;
}
@@ -567,7 +559,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -603,14 +595,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- kunmap(pages_in[page_in_index++]);
+ kunmap_local(workspace->in_buf.src);
+ page_in_index++;
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -619,7 +612,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
zero_fill_bio(cb->orig_bio);
done:
if (workspace->in_buf.src)
- kunmap(pages_in[page_in_index]);
+ kunmap_local(workspace->in_buf.src);
return ret;
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d04bdf549efa..dccdf1551c62 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1167,6 +1167,11 @@ bdev_max_zone_append_sectors(struct block_device *bdev)
return queue_max_zone_append_sectors(bdev_get_queue(bdev));
}
+static inline unsigned int bdev_max_segments(struct block_device *bdev)
+{
+ return queue_max_segments(bdev_get_queue(bdev));
+}
+
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
int retval = 512;
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index cddb42ff0473..034b1106d022 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -8,7 +8,7 @@
#ifdef CONFIG_KMAP_LOCAL
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
-void kunmap_local_indexed(void *vaddr);
+void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
void __kmap_local_sched_in(void);
@@ -89,7 +89,7 @@ static inline void *kmap_local_pfn(unsigned long pfn)
return __kmap_local_pfn_prot(pfn, kmap_prot);
}
-static inline void __kunmap_local(void *vaddr)
+static inline void __kunmap_local(const void *vaddr)
{
kunmap_local_indexed(vaddr);
}
@@ -121,7 +121,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn)
return __kmap_local_pfn_prot(pfn, kmap_prot);
}
-static inline void __kunmap_atomic(void *addr)
+static inline void __kunmap_atomic(const void *addr)
{
kunmap_local_indexed(addr);
pagefault_enable();
@@ -197,7 +197,7 @@ static inline void *kmap_local_pfn(unsigned long pfn)
return kmap_local_page(pfn_to_page(pfn));
}
-static inline void __kunmap_local(void *addr)
+static inline void __kunmap_local(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(addr);
@@ -224,7 +224,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn)
return kmap_atomic(pfn_to_page(pfn));
}
-static inline void __kunmap_atomic(void *addr)
+static inline void __kunmap_atomic(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(addr);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 9ae94ef3e270..73df80d462dc 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -30,6 +30,8 @@ struct btrfs_qgroup;
struct extent_io_tree;
struct prelim_ref;
struct btrfs_space_info;
+struct btrfs_raid_bio;
+struct raid56_bio_trace_info;
#define show_ref_type(type) \
__print_symbolic(type, \
@@ -596,6 +598,70 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put,
TP_ARGS(inode, ordered)
);
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_range,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first_range,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_for_logging,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_split,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_dec_test_pending,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
+DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished,
+
+ TP_PROTO(const struct btrfs_inode *inode,
+ const struct btrfs_ordered_extent *ordered),
+
+ TP_ARGS(inode, ordered)
+);
+
DECLARE_EVENT_CLASS(btrfs__writepage,
TP_PROTO(const struct page *page, const struct inode *inode,
@@ -2258,6 +2324,98 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned,
TP_ARGS(fs_info, sinfo, old, diff)
);
+DECLARE_EVENT_CLASS(btrfs_raid56_bio,
+
+ TP_PROTO(const struct btrfs_raid_bio *rbio,
+ const struct bio *bio,
+ const struct raid56_bio_trace_info *trace_info),
+
+ TP_ARGS(rbio, bio, trace_info),
+
+ TP_STRUCT__entry_btrfs(
+ __field( u64, full_stripe )
+ __field( u64, physical )
+ __field( u64, devid )
+ __field( u32, offset )
+ __field( u32, len )
+ __field( u8, opf )
+ __field( u8, total_stripes )
+ __field( u8, real_stripes )
+ __field( u8, nr_data )
+ __field( u8, stripe_nr )
+ ),
+
+ TP_fast_assign_btrfs(rbio->bioc->fs_info,
+ __entry->full_stripe = rbio->bioc->raid_map[0];
+ __entry->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ __entry->len = bio->bi_iter.bi_size;
+ __entry->opf = bio_op(bio);
+ __entry->devid = trace_info->devid;
+ __entry->offset = trace_info->offset;
+ __entry->stripe_nr = trace_info->stripe_nr;
+ __entry->total_stripes = rbio->bioc->num_stripes;
+ __entry->real_stripes = rbio->real_stripes;
+ __entry->nr_data = rbio->nr_data;
+ ),
+ /*
+ * For type output, we need to output things like "DATA1"
+ * (the first data stripe), "DATA2" (the second data stripe),
+ * "PQ1" (P stripe),"PQ2" (Q stripe), "REPLACE0" (replace target device).
+ */
+ TP_printk_btrfs(
+"full_stripe=%llu devid=%lld type=%s%d offset=%d opf=0x%x physical=%llu len=%u",
+ __entry->full_stripe, __entry->devid,
+ (__entry->stripe_nr < __entry->nr_data) ? "DATA" :
+ ((__entry->stripe_nr < __entry->real_stripes) ? "PQ" :
+ "REPLACE"),
+ (__entry->stripe_nr < __entry->nr_data) ?
+ (__entry->stripe_nr + 1) :
+ ((__entry->stripe_nr < __entry->real_stripes) ?
+ (__entry->stripe_nr - __entry->nr_data + 1) : 0),
+ __entry->offset, __entry->opf, __entry->physical, __entry->len)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial,
+ TP_PROTO(const struct btrfs_raid_bio *rbio,
+ const struct bio *bio,
+ const struct raid56_bio_trace_info *trace_info),
+
+ TP_ARGS(rbio, bio, trace_info)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_write_stripe,
+ TP_PROTO(const struct btrfs_raid_bio *rbio,
+ const struct bio *bio,
+ const struct raid56_bio_trace_info *trace_info),
+
+ TP_ARGS(rbio, bio, trace_info)
+);
+
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_write_stripe,
+ TP_PROTO(const struct btrfs_raid_bio *rbio,
+ const struct bio *bio,
+ const struct raid56_bio_trace_info *trace_info),
+
+ TP_ARGS(rbio, bio, trace_info)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read,
+ TP_PROTO(const struct btrfs_raid_bio *rbio,
+ const struct bio *bio,
+ const struct raid56_bio_trace_info *trace_info),
+
+ TP_ARGS(rbio, bio, trace_info)
+);
+
+DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover,
+ TP_PROTO(const struct btrfs_raid_bio *rbio,
+ const struct bio *bio,
+ const struct raid56_bio_trace_info *trace_info),
+
+ TP_ARGS(rbio, bio, trace_info)
+);
+
#endif /* _TRACE_BTRFS_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 3d0edbe3b991..7ada84e4a3ed 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -777,11 +777,19 @@ struct btrfs_ioctl_received_subvol_args {
*/
#define BTRFS_SEND_FLAG_VERSION 0x8
+/*
+ * Send compressed data using the ENCODED_WRITE command instead of decompressing
+ * the data and sending it with the WRITE command. This requires protocol
+ * version >= 2.
+ */
+#define BTRFS_SEND_FLAG_COMPRESSED 0x10
+
#define BTRFS_SEND_FLAG_MASK \
(BTRFS_SEND_FLAG_NO_FILE_DATA | \
BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \
BTRFS_SEND_FLAG_OMIT_END_CMD | \
- BTRFS_SEND_FLAG_VERSION)
+ BTRFS_SEND_FLAG_VERSION | \
+ BTRFS_SEND_FLAG_COMPRESSED)
struct btrfs_ioctl_send_args {
__s64 send_fd; /* in */
diff --git a/mm/highmem.c b/mm/highmem.c
index 1a692997fac4..e32083e4ce0d 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -561,7 +561,7 @@ void *__kmap_local_page_prot(struct page *page, pgprot_t prot)
}
EXPORT_SYMBOL(__kmap_local_page_prot);
-void kunmap_local_indexed(void *vaddr)
+void kunmap_local_indexed(const void *vaddr)
{
unsigned long addr = (unsigned long) vaddr & PAGE_MASK;
pte_t *kmap_pte;