diff options
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 3955 |
1 files changed, 1193 insertions, 2762 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d6d48ecf823c..4dcf22e051ff 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6,13 +6,13 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/prefetch.h> -#include <linux/cleancache.h> #include <linux/fsverity.h> #include "misc.h" #include "extent_io.h" @@ -29,39 +29,29 @@ #include "subpage.h" #include "zoned.h" #include "block-group.h" +#include "compression.h" -static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set btrfs_bioset; - -static inline bool extent_state_in_tree(const struct extent_state *state) -{ - return !RB_EMPTY_NODE(&state->rb_node); -} #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(states); -static DEFINE_SPINLOCK(leak_lock); - -static inline void btrfs_leak_debug_add(spinlock_t *lock, - struct list_head *new, - struct list_head *head) +static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_add(new, head); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_add(&eb->leak_list, &fs_info->allocated_ebs); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } -static inline void btrfs_leak_debug_del(spinlock_t *lock, - struct list_head *entry) +static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_del(entry); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) @@ -76,6 +66,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) if (!fs_info->allocated_ebs.next) return; + WARN_ON(!list_empty(&fs_info->allocated_ebs)); spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, @@ -89,51 +80,22 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } - -static inline void btrfs_extent_state_leak_debug_check(void) -{ - struct extent_state *state; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", - state->start, state->end, state->state, - extent_state_in_tree(state), - refcount_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - } -} - -#define btrfs_debug_check_extent_io_range(tree, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) -static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct extent_io_tree *tree, u64 start, u64 end) -{ - struct inode *inode = tree->private_data; - u64 isize; - - if (!inode || !is_data_inode(inode)) - return; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} #else -#define btrfs_leak_debug_add(lock, new, head) do {} while (0) -#define btrfs_leak_debug_del(lock, entry) do {} while (0) -#define btrfs_extent_state_leak_debug_check() do {} while (0) -#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#define btrfs_leak_debug_add_eb(eb) do {} while (0) +#define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; +/* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + int mirror_num; + enum btrfs_compression_type compress_type; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; + btrfs_bio_end_io_t end_io_func; }; struct extent_page_data { @@ -147,92 +109,59 @@ struct extent_page_data { unsigned int sync_io:1; }; -static int add_extent_changeset(struct extent_state *state, u32 bits, - struct extent_changeset *changeset, - int set) +static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - int ret; - - if (!changeset) - return 0; - if (set && (state->state & bits) == bits) - return 0; - if (!set && (state->state & bits) == 0) - return 0; - changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(&changeset->range_changed, state->start, state->end, - GFP_ATOMIC); - return ret; -} + struct bio *bio; + struct bio_vec *bv; + struct inode *inode; + int mirror_num; -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) -{ - blk_status_t ret = 0; - struct extent_io_tree *tree = bio->bi_private; + if (!bio_ctrl->bio) + return; - bio->bi_private = NULL; + bio = bio_ctrl->bio; + bv = bio_first_bvec_all(bio); + inode = bv->bv_page->mapping->host; + mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - if (is_data_inode(tree->private_data)) - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - bio_flags); - else - ret = btrfs_submit_metadata_bio(tree->private_data, bio, - mirror_num, bio_flags); - return blk_status_to_errno(ret); -} + btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; -/* Cleanup unsubmitted bios */ -static void end_write_bio(struct extent_page_data *epd, int ret) -{ - struct bio *bio = epd->bio_ctrl.bio; + if (!is_data_inode(inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); + else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_submit_data_write_bio(inode, bio, mirror_num); + else + btrfs_submit_data_read_bio(inode, bio, mirror_num, + bio_ctrl->compress_type); - if (bio) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - epd->bio_ctrl.bio = NULL; - } + /* The bio is owned by the end_io handler now */ + bio_ctrl->bio = NULL; } /* - * Submit bio from extent page data via submit_one_bio - * - * Return 0 if everything is OK. - * Return <0 for error. + * Submit or fail the current bio in an extent_page_data structure. */ -static int __must_check flush_write_bio(struct extent_page_data *epd) +static void submit_write_bio(struct extent_page_data *epd, int ret) { - int ret = 0; struct bio *bio = epd->bio_ctrl.bio; - if (bio) { - ret = submit_one_bio(bio, 0, 0); - /* - * Clean up of epd->bio is handled by its endio function. - * And endio is either triggered by successful bio execution - * or the error handler of submit bio hook. - * So at this point, no matter what happened, we don't need - * to clean up epd->bio. - */ + if (!bio) + return; + + if (ret) { + ASSERT(ret < 0); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ epd->bio_ctrl.bio = NULL; + } else { + submit_one_bio(&epd->bio_ctrl); } - return ret; -} - -int __init extent_state_cache_init(void) -{ - extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); - if (!extent_state_cache) - return -ENOMEM; - return 0; } -int __init extent_io_init(void) +int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, @@ -240,32 +169,10 @@ int __init extent_io_init(void) if (!extent_buffer_cache) return -ENOMEM; - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - goto free_buffer_cache; - - if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) - goto free_bioset; - return 0; - -free_bioset: - bioset_exit(&btrfs_bioset); - -free_buffer_cache: - kmem_cache_destroy(extent_buffer_cache); - extent_buffer_cache = NULL; - return -ENOMEM; } -void __cold extent_state_cache_exit(void) -{ - btrfs_extent_state_leak_debug_check(); - kmem_cache_destroy(extent_state_cache); -} - -void __cold extent_io_exit(void) +void __cold extent_buffer_free_cachep(void) { /* * Make sure all delayed rcu free are flushed before we @@ -273,1222 +180,6 @@ void __cold extent_io_exit(void) */ rcu_barrier(); kmem_cache_destroy(extent_buffer_cache); - bioset_exit(&btrfs_bioset); -} - -/* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. - */ -static struct lock_class_key file_extent_tree_class; - -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) -{ - tree->fs_info = fs_info; - tree->state = RB_ROOT; - tree->dirty_bytes = 0; - spin_lock_init(&tree->lock); - tree->private_data = private_data; - tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); -} - -void extent_io_tree_release(struct extent_io_tree *tree) -{ - spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. - */ - ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); - - cond_resched_lock(&tree->lock); - } - spin_unlock(&tree->lock); -} - -static struct extent_state *alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; - - /* - * The given mask might be not appropriate for the slab allocator, - * drop the unsupported bits - */ - mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state) - return state; - state->state = 0; - state->failrec = NULL; - RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); - refcount_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); - return state; -} - -void free_extent_state(struct extent_state *state) -{ - if (!state) - return; - if (refcount_dec_and_test(&state->refs)) { - WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&leak_lock, &state->leak_list); - trace_free_extent_state(state, _RET_IP_); - kmem_cache_free(extent_state_cache, state); - } -} - -static struct rb_node *tree_insert(struct rb_root *root, - struct rb_node *search_start, - u64 offset, - struct rb_node *node, - struct rb_node ***p_in, - struct rb_node **parent_in) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - if (p_in && parent_in) { - p = *p_in; - parent = *parent_in; - goto do_insert; - } - - p = search_start ? &search_start : &root->rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset > entry->end) - p = &(*p)->rb_right; - else - return parent; - } - -do_insert: - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -/** - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * @p_ret: pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret: points to entry which would have been the parent of the entry, - * containing @offset - * - * This function returns a pointer to the entry that contains @offset byte - * address. If no such entry exists, then NULL is returned and the other - * pointer arguments to the function are filled, otherwise the found entry is - * returned and other pointers are left untouched. - */ -static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **next_ret, - struct rb_node **prev_ret, - struct rb_node ***p_ret, - struct rb_node **parent_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **n = &root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; - - while (*n) { - prev = *n; - entry = rb_entry(prev, struct tree_entry, rb_node); - prev_entry = entry; - - if (offset < entry->start) - n = &(*n)->rb_left; - else if (offset > entry->end) - n = &(*n)->rb_right; - else - return *n; - } - - if (p_ret) - *p_ret = n; - if (parent_ret) - *parent_ret = prev; - - if (next_ret) { - orig_prev = prev; - while (prev && offset > prev_entry->end) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; - } - - if (prev_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - } - return NULL; -} - -static inline struct rb_node * -tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***p_ret, - struct rb_node **parent_ret) -{ - struct rb_node *next= NULL; - struct rb_node *ret; - - ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); - if (!ret) - return next; - return ret; -} - -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) -{ - return tree_search_for_insert(tree, offset, NULL, NULL); -} - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static void merge_state(struct extent_io_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - return; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 *bits, - struct extent_changeset *changeset); - -/* - * insert an extent_state struct into the tree. 'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). - */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, u64 start, u64 end, - struct rb_node ***p, - struct rb_node **parent, - u32 *bits, struct extent_changeset *changeset) -{ - struct rb_node *node; - - if (end < start) { - btrfs_err(tree->fs_info, - "insert state: end < start %llu %llu", end, start); - WARN_ON(1); - } - state->start = start; - state->end = end; - - set_state_bits(tree, state, bits, changeset); - - node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - found->start, found->end, start, end); - return -EEXIST; - } - merge_state(tree, state); - return 0; -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. - */ -static int split_state(struct extent_io_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *node; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_split_delalloc_extent(tree->private_data, orig, split); - - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, - &prealloc->rb_node, NULL, NULL); - if (node) { - free_extent_state(prealloc); - return -EEXIST; - } - return 0; -} - -static struct extent_state *next_state(struct extent_state *state) -{ - struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; -} - -/* - * utility function to clear some bits in an extent state struct. - * it will optionally wake up anyone waiting on this state (wake == 1). - * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static struct extent_state *clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - u32 *bits, int wake, - struct extent_changeset *changeset) -{ - struct extent_state *next; - u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; - int ret; - - if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); - - ret = add_extent_changeset(state, bits_to_clear, changeset, 0); - BUG_ON(ret < 0); - state->state &= ~bits_to_clear; - if (wake) - wake_up(&state->wq); - if (state->state == 0) { - next = next_state(state); - if (extent_state_in_tree(state)) { - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - next = next_state(state); - } - return next; -} - -static struct extent_state * -alloc_extent_state_atomic(struct extent_state *prealloc) -{ - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); - - return prealloc; -} - -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) -{ - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. - * - * This takes the tree lock, and returns 0 on success and < 0 on error. - */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *cached; - struct extent_state *prealloc = NULL; - struct rb_node *node; - u64 last_end; - int err; - int clear = 0; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); - - if (bits & EXTENT_DELALLOC) - bits |= EXTENT_NORESERVE; - - if (delete) - bits |= ~EXTENT_CTLBITS; - - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - clear = 1; -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state) { - cached = *cached_state; - - if (clear) { - *cached_state = NULL; - cached_state = NULL; - } - - if (cached && extent_state_in_tree(cached) && - cached->start <= start && cached->end > start) { - if (clear) - refcount_dec(&cached->refs); - state = cached; - goto hit_next; - } - if (clear) - free_extent_state(cached); - } - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(tree, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - if (state->start > end) - goto out; - WARN_ON(state->end < start); - last_end = state->end; - - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) { - state = next_state(state); - goto next; - } - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. - */ - - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake, - changeset); - goto next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - if (wake) - wake_up(&state->wq); - - clear_state_bit(tree, prealloc, &bits, wake, changeset); - - prealloc = NULL; - goto out; - } - - state = clear_state_bit(tree, state, &bits, wake, changeset); -next: - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start <= end && state && !need_resched()) - goto hit_next; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; - -} - -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. - * The tree lock is taken by this function - */ -static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - struct extent_state *state; - struct rb_node *node; - - btrfs_debug_check_extent_io_range(tree, start, end); - - spin_lock(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(tree, start); -process_node: - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - refcount_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - if (!cond_resched_lock(&tree->lock)) { - node = rb_next(node); - goto process_node; - } - } -out: - spin_unlock(&tree->lock); -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, - u32 *bits, struct extent_changeset *changeset) -{ - u32 bits_to_set = *bits & ~EXTENT_CTLBITS; - int ret; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_set_delalloc_extent(tree->private_data, state, bits); - - if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - ret = add_extent_changeset(state, bits_to_set, changeset, 1); - BUG_ON(ret < 0); - state->state |= bits_to_set; -} - -static void cache_state_if_flags(struct extent_state *state, - struct extent_state **cached_ptr, - unsigned flags) -{ - if (cached_ptr && !(*cached_ptr)) { - if (!flags || (state->state & flags)) { - *cached_ptr = state; - refcount_inc(&state->refs); - } - } -} - -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) -{ - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); -} - -/* - * set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. - * - * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. - * - * [start, end] is inclusive This takes the tree lock. - */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - u32 exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); - - if (exclusive_bits) - ASSERT(failed_start); - else - ASSERT(failed_start == NULL); -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - if (state->state & exclusive_bits) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - - set_state_bits(tree, state, &bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - /* - * If this extent already has all the bits we want set, then - * skip it, not necessary to split it or do anything with it. - */ - if ((state->state & bits) == bits) { - start = state->end + 1; - cache_state(state, cached_state); - goto search_again; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, &bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, &bits, changeset); - cache_state(prealloc, cached_state); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; - -} - -/** - * convert_extent_bit - convert all bits in a given range from one bit to - * another - * @tree: the io tree to search - * @start: the start offset in bytes - * @end: the end offset in bytes (inclusive) - * @bits: the bits to set in this range - * @clear_bits: the bits to clear in this range - * @cached_state: state that we're going to cache - * - * This will go through and set bits for the given range. If any states exist - * already in this range they are set with the given bit and cleared of the - * clear_bits. This is only meant to be used by things that are mergeable, ie - * converting from say DELALLOC to DIRTY. This is not meant to be used with - * boundary bits like LOCK. - * - * All allocations are done with GFP_NOFS. - */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - bool first_iteration = true; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, - clear_bits); - -again: - if (!prealloc) { - /* - * Best effort, don't worry if extent state allocation fails - * here for the first iteration. We might have a cached state - * that matches exactly the target range, in which case no - * extent state allocations are needed. We'll only know this - * after locking the tree. - */ - prealloc = alloc_extent_state(GFP_NOFS); - if (!prealloc && !first_iteration) - return -ENOMEM; - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, &bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, - NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, &bits, NULL); - cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - cond_resched(); - first_iteration = false; - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; -} - -/* wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. - */ - BUG_ON(bits & EXTENT_LOCKED); - - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); -} - -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); -} - -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, GFP_NOFS, NULL); -} - -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * Don't support EXTENT_LOCKED case, same reason as - * set_record_extent_bits(). - */ - BUG_ON(bits & EXTENT_LOCKED); - - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, - changeset); -} - -/* - * either insert or lock state struct between start and end use mask to tell - * us if waiting is desired. - */ -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) -{ - int err; - u64 failed_start; - - while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); - } - return err; -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - int err; - u64 failed_start; - - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); - if (err == -EEXIST) { - if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL); - return 0; - } - return 1; } void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) @@ -1508,309 +199,20 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { + struct address_space *mapping = inode->i_mapping; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - __set_page_dirty_nobuffers(page); - account_page_redirty(page); - put_page(page); - index++; + folio = filemap_get_folio(mapping, index); + filemap_dirty_folio(mapping, folio); + folio_account_redirty(folio); + index += folio_nr_pages(folio); + folio_put(folio); } } -/* find the first state struct with 'bits' set after 'start', and - * return it. tree->lock must be held. NULL will returned if - * nothing was found after 'start' - */ -static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) -{ - struct rb_node *node; - struct extent_state *state; - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) - return state; - - node = rb_next(node); - if (!node) - break; - } -out: - return NULL; -} - -/* - * Find the first offset in the io tree with one or more @bits set. - * - * Note: If there are multiple bits set in @bits, any of them will match. - * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->end == start - 1 && extent_state_in_tree(state)) { - while ((state = next_state(state)) != NULL) { - if (state->state & bits) - goto got_it; - } - free_extent_state(*cached_state); - *cached_state = NULL; - goto out; - } - free_extent_state(*cached_state); - *cached_state = NULL; - } - - state = find_first_extent_bit_state(tree, start, bits); -got_it: - if (state) { - cache_state_if_flags(state, cached_state, 0); - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - } -out: - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find a contiguous area of bits - * - * @tree: io tree to check - * @start: offset to start the search from - * @start_ret: the first offset we found with the bits set - * @end_ret: the final contiguous range of the bits that were set - * @bits: bits to look for - * - * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges - * to set bits appropriately, and then merge them again. During this time it - * will drop the tree->lock, so use this helper if you want to find the actual - * contiguous area for given bits. We will search to the first bit we find, and - * then walk down the tree until we find a non-contiguous area. The area - * returned will be the full contiguous area with the bits set. - */ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, bits); - if (state) { - *start_ret = state->start; - *end_ret = state->end; - while ((state = next_state(state)) != NULL) { - if (state->start > (*end_ret + 1)) - break; - *end_ret = state->end; - } - ret = 0; - } - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find the first range that has @bits not set. This range could start before - * @start. - * - * @tree: the tree to search - * @start: offset at/after which the found extent should start - * @start_ret: records the beginning of the range - * @end_ret: records the end of the range (inclusive) - * @bits: the set of bits which must be unset - * - * Since unallocated range is also considered one which doesn't have the bits - * set it's possible that @end_ret contains -1, this happens in case the range - * spans (last_range_end, end of device]. In this case it's up to the caller to - * trim @end_ret to the appropriate size. - */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - struct rb_node *node, *prev = NULL, *next; - - spin_lock(&tree->lock); - - /* Find first extent with bits cleared */ - while (1) { - node = __etree_search(tree, start, &next, &prev, NULL, NULL); - if (!node && !next && !prev) { - /* - * Tree is completely empty, send full range and let - * caller deal with it - */ - *start_ret = 0; - *end_ret = -1; - goto out; - } else if (!node && !next) { - /* - * We are past the last allocated chunk, set start at - * the end of the last extent. - */ - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } else if (!node) { - node = next; - } - /* - * At this point 'node' either contains 'start' or start is - * before 'node' - */ - state = rb_entry(node, struct extent_state, rb_node); - - if (in_range(start, state->start, state->end - state->start + 1)) { - if (state->state & bits) { - /* - * |--range with bits sets--| - * | - * start - */ - start = state->end + 1; - } else { - /* - * 'start' falls within a range that doesn't - * have the bits set, so take its start as - * the beginning of the desired range - * - * |--range with bits cleared----| - * | - * start - */ - *start_ret = state->start; - break; - } - } else { - /* - * |---prev range---|---hole/unset---|---node range---| - * | - * start - * - * or - * - * |---hole/unset--||--first node--| - * 0 | - * start - */ - if (prev) { - state = rb_entry(prev, struct extent_state, - rb_node); - *start_ret = state->end + 1; - } else { - *start_ret = 0; - } - break; - } - } - - /* - * Find the longest stretch from start until an entry which has the - * bits set - */ - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && !(state->state & bits)) { - *end_ret = state->end; - } else { - *end_ret = state->start - 1; - break; - } - - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); -} - -/* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, - * - * true is returned if we find something, false if nothing was in the tree - */ -bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, - u64 *end, u64 max_bytes, - struct extent_state **cached_state) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - bool found = false; - u64 total_bytes = 0; - - spin_lock(&tree->lock); - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) { - *end = (u64)-1; - goto out; - } - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && (state->start != cur_start || - (state->state & EXTENT_BOUNDARY))) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - *start = state->start; - *cached_state = state; - refcount_inc(&state->refs); - } - found = true; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return found; -} - /* * Process one page for __process_pages_contig(). * @@ -1868,9 +270,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -1879,16 +280,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. @@ -1898,23 +300,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: @@ -1993,10 +392,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + /* The sanity tests may not set a valid fs_info. */ + u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; bool found; @@ -2060,14 +461,14 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); + lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state); + unlock_extent(tree, delalloc_start, delalloc_end, + &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); @@ -2084,210 +485,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start, end, page_ops, NULL); } -/* - * count the number of bytes in the tree that have a given bit(s) - * set. This can be fairly slow, except for EXTENT_DIRTY which is - * cached. The total number found is returned. - */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) +static int insert_failrec(struct btrfs_inode *inode, + struct io_failure_record *failrec) { - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - u64 last = 0; - int found = 0; - - if (WARN_ON(search_end <= cur_start)) - return 0; + struct rb_node *exist; - spin_lock(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) - goto out; + spin_lock(&inode->io_failure_lock); + exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, + &failrec->rb_node); + spin_unlock(&inode->io_failure_lock); - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (contig && found && state->start > last + 1) - break; - if (state->end >= cur_start && (state->state & bits) == bits) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = max(cur_start, state->start); - found = 1; - } - last = state->end; - } else if (contig && found) { - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return total_bytes; + return (exist == NULL) ? 0 : -EEXIST; } -/* - * set the private field for a given byte offset in the tree. If there isn't - * an extent_state there already, this does nothing. - */ -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) +static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) { struct rb_node *node; - struct extent_state *state; - int ret = 0; + struct io_failure_record *failrec = ERR_PTR(-ENOENT); - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->failrec = failrec; -out: - spin_unlock(&tree->lock); - return ret; -} - -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) -{ - struct rb_node *node; - struct extent_state *state; - struct io_failure_record *failrec; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - - failrec = state->failrec; -out: - spin_unlock(&tree->lock); + spin_lock(&inode->io_failure_lock); + node = rb_simple_search(&inode->io_failure_tree, start); + if (node) + failrec = rb_entry(node, struct io_failure_record, rb_node); + spin_unlock(&inode->io_failure_lock); return failrec; } -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if every extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. - */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) +static void free_io_failure(struct btrfs_inode *inode, + struct io_failure_record *rec) { - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - spin_lock(&tree->lock); - if (cached && extent_state_in_tree(cached) && cached->start <= start && - cached->end > start) - node = &cached->rb_node; - else - node = tree_search(tree, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - - if (state->end == (u64)-1) - break; - - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - spin_unlock(&tree->lock); - return bitset; -} - -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec) -{ - int ret; - int err = 0; - - set_state_failrec(failure_tree, rec->start, NULL); - ret = clear_extent_bits(failure_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret) - err = ret; - - ret = clear_extent_bits(io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED); - if (ret && !err) - err = ret; + spin_lock(&inode->io_failure_lock); + rb_erase(&rec->rb_node, &inode->io_failure_tree); + spin_unlock(&inode->io_failure_lock); kfree(rec); - return err; } /* @@ -2304,12 +541,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct bio *bio; struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; u64 map_length = 0; u64 sector; struct btrfs_io_context *bioc = NULL; - int ret; + int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); @@ -2317,8 +555,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - bio = btrfs_bio_alloc(1); - bio->bi_iter.bi_size = 0; map_length = length; /* @@ -2336,52 +572,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &map_length, &bioc, 0); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; BUG_ON(mirror_num != bioc->mirror_num); } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - bio->bi_iter.bi_sector = sector; dev = bioc->stripes[bioc->mirror_num - 1].dev; btrfs_put_bioc(bioc); + if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + ret = -EIO; + goto out_counter_dec; } - bio_set_dev(bio, dev->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(bio)) { + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { /* try to remap that extent elsewhere? */ - btrfs_bio_counter_dec(fs_info); - bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - return -EIO; + goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, rcu_str_deref(dev->name), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return 0; + return ret; } int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) @@ -2407,28 +641,36 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) return ret; } +static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == failrec->num_copies) + return cur_mirror + 1 - failrec->num_copies; + return cur_mirror + 1; +} + +static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == 1) + return failrec->num_copies; + return cur_mirror - 1; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset) +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset) { - u64 private; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + u64 ino = btrfs_ino(inode); + u64 locked_start, locked_end; struct io_failure_record *failrec; - struct extent_state *state; - int num_copies; + int mirror; int ret; - private = 0; - ret = count_range_bits(failure_tree, &private, (u64)-1, 1, - EXTENT_DIRTY, 0); - if (!ret) - return 0; - - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(inode, start); if (IS_ERR(failrec)) return 0; @@ -2437,26 +679,21 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, if (sb_rdonly(fs_info->sb)) goto out; - spin_lock(&io_tree->lock); - state = find_first_extent_bit_state(io_tree, - failrec->start, - EXTENT_LOCKED); - spin_unlock(&io_tree->lock); - - if (state && state->start <= failrec->start && - state->end >= failrec->start + failrec->len - 1) { - num_copies = btrfs_num_copies(fs_info, failrec->logical, - failrec->len); - if (num_copies > 1) { - repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, - failrec->failed_mirror); - } - } + ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, + &locked_end, EXTENT_LOCKED, NULL); + if (ret || locked_start > failrec->bytenr || + locked_end < failrec->bytenr + failrec->len - 1) + goto out; -out: - free_io_failure(failure_tree, io_tree, failrec); + mirror = failrec->this_mirror; + do { + mirror = prev_mirror(failrec, mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, mirror); + } while (mirror != failrec->failed_mirror); +out: + free_io_failure(inode, failrec); return 0; } @@ -2468,56 +705,50 @@ out: */ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) { - struct extent_io_tree *failure_tree = &inode->io_failure_tree; struct io_failure_record *failrec; - struct extent_state *state, *next; + struct rb_node *node, *next; - if (RB_EMPTY_ROOT(&failure_tree->state)) + if (RB_EMPTY_ROOT(&inode->io_failure_tree)) return; - spin_lock(&failure_tree->lock); - state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); - while (state) { - if (state->start > end) + spin_lock(&inode->io_failure_lock); + node = rb_simple_search_first(&inode->io_failure_tree, start); + while (node) { + failrec = rb_entry(node, struct io_failure_record, rb_node); + if (failrec->bytenr > end) break; - ASSERT(state->end <= end); - - next = next_state(state); - - failrec = state->failrec; - free_extent_state(state); + next = rb_next(node); + rb_erase(&failrec->rb_node, &inode->io_failure_tree); kfree(failrec); - state = next; + node = next; } - spin_unlock(&failure_tree->lock); + spin_unlock(&inode->io_failure_lock); } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start) + struct btrfs_bio *bbio, + unsigned int bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_map *em; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - u64 logical; - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(BTRFS_I(inode), start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->start, failrec->len); + failrec->logical, failrec->bytenr, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. */ - + ASSERT(failrec->this_mirror == bbio->mirror_num); + ASSERT(failrec->len == fs_info->sectorsize); return failrec; } @@ -2525,53 +756,34 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode if (!failrec) return ERR_PTR(-ENOMEM); - failrec->start = start; + RB_CLEAR_NODE(&failrec->rb_node); + failrec->bytenr = start; failrec->len = sectorsize; - failrec->this_mirror = 0; - failrec->bio_flags = 0; + failrec->failed_mirror = bbio->mirror_num; + failrec->this_mirror = bbio->mirror_num; + failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return ERR_PTR(-EIO); - } + btrfs_debug(fs_info, + "new io failure record logical %llu start %llu", + failrec->logical, start); - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { + failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); + if (failrec->num_copies == 1) { + /* + * We only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. No + * matter what the error is, it is very likely to persist. + */ + btrfs_debug(fs_info, + "cannot repair logical %llu num_copies %d", + failrec->logical, failrec->num_copies); kfree(failrec); return ERR_PTR(-EIO); } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, em->compress_type); - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) { - ret = set_state_failrec(failure_tree, start, failrec); - /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, start + sectorsize - 1, - EXTENT_DAMAGED); - } else if (ret < 0) { + ret = insert_failrec(BTRFS_I(inode), failrec); + if (ret) { kfree(failrec); return ERR_PTR(ret); } @@ -2579,90 +791,50 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, - struct io_failure_record *failrec, - int failed_mirror) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - /* The failure record should only contain one sector */ - ASSERT(failrec->len == fs_info->sectorsize); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. - */ - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - return true; -} - -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook) { + u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); + struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; - blk_status_t status; btrfs_debug(fs_info, "repair read error: read error at %llu", start); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start); + failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); - - if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { - free_io_failure(failure_tree, tree, failrec); + /* + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); + if (failrec->this_mirror == failrec->failed_mirror) { + btrfs_debug(fs_info, + "failed to repair num_copies %d this_mirror %d failed_mirror %d", + failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } - repair_bio = btrfs_bio_alloc(1); + repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, + failed_bbio->private); repair_bbio = btrfs_bio(repair_bio); - repair_bio->bi_opf = REQ_OP_READ; - repair_bio->bi_end_io = failed_bio->bi_end_io; + repair_bbio->file_offset = start; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - repair_bio->bi_private = failed_bio->bi_private; if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; @@ -2679,13 +851,13 @@ int btrfs_repair_one_sector(struct inode *inode, "repair read error: submitting new read to mirror %d", failrec->this_mirror); - status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, - failrec->bio_flags); - if (status) { - free_io_failure(failure_tree, tree, failrec); - bio_put(repair_bio); - } - return blk_status_to_errno(status); + /* + * At this point we have a bio, so any errors from submit_bio_hook() + * will be handled by the endio on the repair_bio, so we can't return an + * error here. + */ + submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); + return BLK_STS_OK; } static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) @@ -2710,26 +882,44 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_page_set_error(fs_info, page, start, len); } - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); else btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - unsigned int error_bitmap, - submit_bio_hook_t *submit_bio_hook) +static void end_sector_io(struct page *page, u64 offset, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct extent_state *cached = NULL; + + end_page_read(page, uptodate, offset, sectorsize); + if (uptodate) + set_extent_uptodate(&inode->io_tree, offset, + offset + sectorsize - 1, &cached, GFP_ATOMIC); + unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, + &cached); +} + +static void submit_data_read_repair(struct inode *inode, + struct btrfs_bio *failed_bbio, + u32 bio_offset, const struct bio_vec *bvec, + unsigned int error_bitmap) { + const unsigned int pgoff = bvec->bv_offset; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct page *page = bvec->bv_page; + const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; + const u64 end = start + bvec->bv_len - 1; const u32 sectorsize = fs_info->sectorsize; const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int error = 0; int i; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); + + /* This repair is only for data */ + ASSERT(is_data_inode(inode)); /* We're here because we had some read errors or csum mismatch */ ASSERT(error_bitmap); @@ -2738,12 +928,11 @@ static blk_status_t submit_read_repair(struct inode *inode, * We only get called on buffered IO, thus page must be mapped and bio * must not be cloned. */ - ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); /* Iterate through all the sectors in the range */ for (i = 0; i < nr_bits; i++) { const unsigned int offset = i * sectorsize; - struct extent_state *cached = NULL; bool uptodate = false; int ret; @@ -2756,10 +945,9 @@ static blk_status_t submit_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bio, - bio_offset + offset, - page, pgoff + offset, start + offset, - failed_mirror, submit_bio_hook); + ret = btrfs_repair_one_sector(inode, failed_bbio, + bio_offset + offset, page, pgoff + offset, + btrfs_submit_data_read_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2770,24 +958,12 @@ static blk_status_t submit_read_repair(struct inode *inode, continue; } /* - * Repair failed, just record the error but still continue. - * Or the remaining sectors will not be properly unlocked. + * Continue on failed repair, otherwise the remaining sectors + * will not be properly unlocked. */ - if (!error) - error = ret; next: - end_page_read(page, uptodate, start + offset, sectorsize); - if (uptodate) - set_extent_uptodate(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached); + end_sector_io(page, start + offset, uptodate); } - return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2825,8 +1001,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio) +static void end_bio_extent_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; @@ -2926,11 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - if (processed->uptodate && tree->track_uptodate) - set_extent_uptodate(tree, processed->start, processed->end, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, processed->start, processed->end, - &cached); + unlock_extent_atomic(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ @@ -2943,7 +1116,7 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { ASSERT(PageLocked(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page)); @@ -2965,7 +1138,7 @@ static struct extent_buffer *find_extent_buffer_readpage( * For regular sectorsize, we can use page->private to grab extent * buffer */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { ASSERT(PagePrivate(page) && page->private); return (struct extent_buffer *)page->private; } @@ -2990,11 +1163,10 @@ static struct extent_buffer *find_extent_buffer_readpage( * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct bio *bio) +static void end_bio_extent_readpage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; - struct btrfs_bio *bbio = btrfs_bio(bio); - struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* * The offset to the beginning of a bio, since one bio can never be @@ -3002,7 +1174,6 @@ static void end_bio_extent_readpage(struct bio *bio) */ u32 bio_offset = 0; int mirror; - int ret; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -3013,6 +1184,7 @@ static void end_bio_extent_readpage(struct bio *bio) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; unsigned int error_bitmap = (unsigned int)-1; + bool repair = false; u64 start; u64 end; u32 len; @@ -3021,8 +1193,6 @@ static void end_bio_extent_readpage(struct bio *bio) "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; /* * We always issue full-sector reads, but if some block in a @@ -3050,49 +1220,21 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); - ret = error_bitmap; + if (error_bitmap) + uptodate = false; } else { - ret = btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror); + if (btrfs_validate_metadata_buffer(bbio, + page, start, end, mirror)) + uptodate = false; } - if (ret) - uptodate = false; - else - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, - page, - btrfs_ino(BTRFS_I(inode)), 0); } - if (likely(uptodate)) - goto readpage_ok; - - if (is_data_inode(inode)) { - /* - * btrfs_submit_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), start, - end, mirror, error_bitmap, - btrfs_submit_data_bio); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } else { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); - } -readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; + btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); + /* * Zero out the remaining part if this range straddles * i_size. @@ -3109,14 +1251,44 @@ readpage_ok: zero_user_segment(page, zero_start, offset_in_page(end) + 1); } + } else if (is_data_inode(inode)) { + /* + * Only try to repair bios that actually made it to a + * device. If the bio failed to be submitted mirror + * is 0 and we need to fail it without retrying. + * + * This also includes the high level bios for compressed + * extents - these never make it to a device and repair + * is already handled on the lower compressed bio. + */ + if (mirror > 0) + repair = true; + } else { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + } + + if (repair) { + /* + * submit_data_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. + */ + submit_data_read_repair(inode, bbio, bio_offset, bvec, + error_bitmap); + } else { + /* Update page status and unlock */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); } + ASSERT(bio_offset + len > bio_offset); bio_offset += len; - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3124,61 +1296,40 @@ readpage_ok: bio_put(bio); } -/* - * Initialize the members up to but not including 'bio'. Use after allocating a - * new bio by bio_alloc_bioset as it does not initialize the bytes outside of - * 'bio' because use of __GFP_ZERO is not supported. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); -} - -/* - * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. +/** + * Populate every free slot in a provided array with pages. + * + * @nr_pages: number of pages to allocate + * @page_array: the array to fill with pages; any existing non-null entries in + * the array will be skipped * - * The bio allocation is backed by bioset and does not fail. + * Return: 0 if all pages were able to be allocated; + * -ENOMEM otherwise, and the caller is responsible for freeing all + * non-null page pointers in the array. */ -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) { - struct bio *bio; + unsigned int allocated; - ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio)); - return bio; -} + for (allocated = 0; allocated < nr_pages;) { + unsigned int last = allocated; -struct bio *btrfs_bio_clone(struct bio *bio) -{ - struct btrfs_bio *bbio; - struct bio *new; - - /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(new); - btrfs_bio_init(bbio); - bbio->iter = bio->bi_iter; - return new; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); - /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); - ASSERT(bio); + if (allocated == nr_pages) + return 0; - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio); + /* + * During this iteration, no page could be allocated, even + * though alloc_pages_bulk_array() falls back to alloc_page() + * if it could not bulk-allocate. So we must be out of memory. + */ + if (allocated == last) + return -ENOMEM; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; + memalloc_retry_wait(GFP_NOFS); + } + return 0; } /** @@ -3190,7 +1341,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) * a contiguous page to the previous one * @size: portion of page that we want to write * @pg_offset: starting offset in the page - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3202,25 +1353,50 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig; + bool contig = false; int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); - if (bio_ctrl->bio_flags != bio_flags) + if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) + + if (bio->bi_iter.bi_size == 0) { + /* We can always add a page into an empty bio. */ + contig = true; + } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { + struct bio_vec *bvec = bio_last_bvec_all(bio); + + /* + * The contig check requires the following conditions to be met: + * 1) The pages are belonging to the same inode + * This is implied by the call chain. + * + * 2) The range has adjacent logical bytenr + * + * 3) The range has adjacent file offset + * This is required for the usage of btrfs_bio->file_offset. + */ + if (bio_end_sector(bio) == sector && + page_offset(bvec->bv_page) + bvec->bv_offset + + bvec->bv_len == page_offset(page) + pg_offset) + contig = true; + } else { + /* + * For compression, all IO should have its logical bytenr + * set to the starting bytenr of the compressed extent. + */ contig = bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; + } + if (!contig) return 0; @@ -3260,7 +1436,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * The split happens for real compressed bio, which happens in * btrfs_submit_compressed_read/write(). */ - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->len_to_stripe_boundary = U32_MAX; return 0; @@ -3300,82 +1476,90 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, - unsigned int opf, - bio_end_io_t end_io_func, + blk_opf_t opf, u64 disk_bytenr, u32 offset, u64 file_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + ASSERT(bio_ctrl->end_io_func); + + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; - bio->bi_end_io = end_io_func; - bio->bi_private = &inode->io_tree; - bio->bi_write_hint = inode->vfs_inode.i_write_hint; - bio->bi_opf = opf; + bio_ctrl->compress_type = compress_type; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; - if (wbc) { - struct block_device *bdev; - bdev = fs_info->fs_devices->latest_dev->bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - } - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; + if (wbc) { + /* + * For Zone append we need the correct block_device that we are + * going to write to set in the bio to be able to respect the + * hardware limitation. Look it up here: + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *dev; + + dev = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto error; + } - device = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; + bio_set_dev(bio, dev->bdev); + } else { + /* + * Otherwise pick the last added device to support + * cgroup writeback. For multi-device file systems this + * means blk-cgroup policies have to always be set on the + * last added/replaced device. This is a bit odd but has + * been like that for a long time. + */ + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); } - - btrfs_bio(bio)->device = device; + wbc_init_bio(wbc, bio); + } else { + ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } return 0; error: bio_ctrl->bio = NULL; - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); return ret; } /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting - * @page: page to add to the bio * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @bio_ret: must be valid pointer, newly allocated bio will be stored there - * @end_io_func: end_io callback for new bio - * @mirror_num: desired mirror to read/write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compress type for current bio + * + * The will either add the page into the existing @bio_ctrl->bio, or allocate a + * new one in @bio_ctrl->bio. + * The mirror number for this IO should already be initizlied in + * @bio_ctrl->mirror_num. */ -static int submit_extent_page(unsigned int opf, +static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset, - bio_end_io_t end_io_func, - int mirror_num, - unsigned long bio_flags, + enum btrfs_compression_type compress_type, bool force_bio_submit) { int ret = 0; @@ -3386,12 +1570,11 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (force_bio_submit && bio_ctrl->bio) { - ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); - bio_ctrl->bio = NULL; - if (ret < 0) - return ret; - } + + ASSERT(bio_ctrl->end_io_func); + + if (force_bio_submit) + submit_one_bio(bio_ctrl); while (cur < pg_offset + size) { u32 offset = cur - pg_offset; @@ -3400,9 +1583,9 @@ static int submit_extent_page(unsigned int opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - end_io_func, disk_bytenr, offset, + disk_bytenr, offset, page_offset(page) + cur, - bio_flags); + compress_type); if (ret < 0) return ret; } @@ -3410,14 +1593,14 @@ static int submit_extent_page(unsigned int opf, * We must go through btrfs_bio_add_page() to ensure each * page range won't cross various boundaries. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, size - offset, pg_offset + offset, - bio_flags); + compress_type); else added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr + offset, size - offset, - pg_offset + offset, bio_flags); + pg_offset + offset, compress_type); /* Metadata page range should never be split */ if (!is_data_inode(&inode->vfs_inode)) @@ -3431,11 +1614,7 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - ret = submit_one_bio(bio_ctrl->bio, mirror_num, - bio_ctrl->bio_flags); - bio_ctrl->bio = NULL; - if (ret < 0) - return ret; + submit_one_bio(bio_ctrl); } cur += added; } @@ -3458,7 +1637,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3493,7 +1672,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); @@ -3510,7 +1689,7 @@ void clear_page_extent_mapped(struct page *page) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_detach_subpage(fs_info, page); detach_page_private(page); @@ -3535,7 +1714,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR_OR_NULL(em)) { + if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -3549,9 +1728,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start) + blk_opf_t read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3561,10 +1740,8 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 extent_offset; u64 last_byte = i_size_read(inode); u64 block_start; - u64 cur_end; struct extent_map *em; int ret = 0; - int nr = 0; size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; @@ -3572,30 +1749,21 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { - unlock_extent(tree, start, end); + unlock_extent(tree, start, end, NULL); btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); goto out; } - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - unlock_extent(tree, start, end); - unlock_page(page); - goto out; - } - } - if (page->index == last_byte >> PAGE_SHIFT) { size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, iosize); - flush_dcache_page(page); } } + bio_ctrl->end_io_func = end_bio_extent_readpage; begin_page_read(fs_info, page); while (cur <= end) { unsigned long this_bio_flag = 0; @@ -3608,35 +1776,30 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); - if (IS_ERR_OR_NULL(em)) { - unlock_extent(tree, cur, end); + if (IS_ERR(em)) { + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); + ret = PTR_ERR(em); break; } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag |= EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&this_bio_flag, - em->compress_type); - } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); - cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) + if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -3694,48 +1857,35 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct extent_state *cached = NULL; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, - EXTENT_UPTODATE, 1, NULL)) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - /* we have an inline extent but it didn't get marked up - * to date. Error out - */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - bio_ctrl, page, disk_bytenr, iosize, - pg_offset, - end_bio_extent_readpage, 0, - this_bio_flag, + bio_ctrl, disk_bytenr, page, iosize, + pg_offset, this_bio_flag, force_bio_submit); - if (!ret) { - nr++; - } else { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + if (ret) { + /* + * We have to unlock the remaining range, or the page + * will never be unlocked. + */ + unlock_extent(tree, cur, end, NULL); + end_page_read(page, false, cur, end + 1 - cur); goto out; } cur = cur + iosize; @@ -3745,6 +1895,26 @@ out: return ret; } +int btrfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. + */ + submit_one_bio(&bio_ctrl); + return ret; +} + static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3763,12 +1933,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static void update_nr_written(struct writeback_control *wbc, - unsigned long nr_written) -{ - wbc->nr_to_write -= nr_written; -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. * @@ -3868,7 +2032,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (!btrfs_is_subpage(fs_info, page)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -3911,10 +2075,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; + int saved_ret = 0; int ret = 0; int nr = 0; - u32 opf = REQ_OP_WRITE; - const unsigned int write_flags = wbc_to_write_flags(wbc); + enum req_op op = REQ_OP_WRITE; + const blk_opf_t write_flags = wbc_to_write_flags(wbc); + bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); @@ -3929,8 +2095,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, 1); + wbc->nr_to_write--; + epd->bio_ctrl.end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -3961,9 +2128,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); + has_error = true; + if (!saved_ret) + saved_ret = ret; break; } @@ -3984,7 +2154,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, iosize = min(min(em_end, end + 1), dirty_range_end) - cur; if (btrfs_use_zone_append(inode, em->block_start)) - opf = REQ_OP_ZONE_APPEND; + op = REQ_OP_ZONE_APPEND; free_extent_map(em); em = NULL; @@ -4020,13 +2190,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ btrfs_page_clear_dirty(fs_info, page, cur, iosize); - ret = submit_extent_page(opf | write_flags, wbc, - &epd->bio_ctrl, page, - disk_bytenr, iosize, + ret = submit_extent_page(op | write_flags, wbc, + &epd->bio_ctrl, disk_bytenr, + page, iosize, cur - page_offset(page), - end_bio_extent_writepage, - 0, 0, false); + 0, false); if (ret) { + has_error = true; + if (!saved_ret) + saved_ret = ret; + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) btrfs_page_clear_writeback(fs_info, page, cur, @@ -4040,8 +2213,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!ret) + if (!has_error) btrfs_page_assert_not_dirty(fs_info, page); + else + ret = saved_ret; *nr_ret = nr; return ret; } @@ -4058,6 +2233,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); @@ -4078,15 +2254,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); return 0; } - if (page->index == end_index) { + if (page->index == end_index) memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); - flush_dcache_page(page); - } ret = set_page_extent_mapped(page); if (ret < 0) { @@ -4171,9 +2345,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { - if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) - btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4193,14 +2364,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb struct extent_page_data *epd) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages, failed_page_nr; + int i, num_pages; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + submit_write_bio(epd, 0); flush = 1; btrfs_tree_lock(eb); } @@ -4210,9 +2379,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + submit_write_bio(epd, 0); flush = 1; } while (1) { @@ -4250,7 +2417,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb * Subpage metadata doesn't use page locking at all, so we can skip * the page locking. */ - if (!ret || fs_info->sectorsize < PAGE_SIZE) + if (!ret || fs_info->nodesize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4259,14 +2426,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - int err; - - err = flush_write_bio(epd); - if (err < 0) { - ret = err; - failed_page_nr = i; - goto err_unlock; - } + submit_write_bio(epd, 0); flush = 1; } lock_page(p); @@ -4274,25 +2434,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb } return ret; -err_unlock: - /* Unlock already locked pages */ - for (i = 0; i < failed_page_nr; i++) - unlock_page(eb->pages[i]); - /* - * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. - * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can - * be made and undo everything done before. - */ - btrfs_tree_lock(eb); - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - end_extent_buffer_writeback(eb); - spin_unlock(&eb->refs_lock); - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, - fs_info->dirty_metadata_batch); - btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - btrfs_tree_unlock(eb); - return ret; } static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) @@ -4403,14 +2544,15 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. */ -static void end_bio_subpage_eb_writepage(struct bio *bio) +static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->nodesize < PAGE_SIZE); ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -4460,8 +2602,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) bio_put(bio); } -static void end_bio_extent_buffer_writepage(struct bio *bio) +static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; struct extent_buffer *eb; int done; @@ -4527,7 +2670,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; int ret; @@ -4543,10 +2686,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); + epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, 0, false); + &epd->bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4562,7 +2706,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, * dirty anymore, we have submitted a page. Update nr_written in wbc. */ if (no_dirty_ebs) - update_nr_written(wbc, 1); + wbc->nr_to_write--; return ret; } @@ -4572,11 +2716,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, { u64 disk_bytenr = eb->start; int i, num_pages; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); int ret = 0; prepare_eb_write(eb); + epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4584,10 +2730,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, p, disk_bytenr, - PAGE_SIZE, 0, - end_bio_extent_buffer_writepage, - 0, 0, false); + &epd->bio_ctrl, disk_bytenr, p, + PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4598,7 +2742,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, break; } disk_bytenr += PAGE_SIZE; - update_nr_written(wbc, 1); + wbc->nr_to_write--; unlock_page(p); } @@ -4701,7 +2845,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - end_write_bio(epd, ret); + submit_write_bio(epd, ret); return ret; } @@ -4737,7 +2881,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc, epd); spin_lock(&mapping->private_lock); @@ -4790,11 +2934,11 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return ret; } if (cache) { - /* Impiles write in zoned mode */ + /* + * Implies write in zoned mode. Mark the last eb in a block group. + */ + btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); - /* Mark the last eb in a block group */ - if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) - set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); @@ -4880,10 +3024,6 @@ retry: index = 0; goto retry; } - if (ret < 0) { - end_write_bio(&epd, ret); - goto out; - } /* * If something went wrong, don't allow any metadata write bio to be * submitted. @@ -4910,14 +3050,16 @@ retry: * Now such dirty tree block will not be cleaned by any dirty * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. + * + * We can get ret > 0 from submit_extent_page() indicating how many ebs + * were submitted. Reset it to 0 to avoid false alerts for the caller. */ - if (!BTRFS_FS_ERROR(fs_info)) { - ret = flush_write_bio(&epd); - } else { + if (ret > 0) + ret = 0; + if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - end_write_bio(&epd, ret); - } -out: + submit_write_bio(&epd, ret); + btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -5020,8 +3162,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); + submit_write_bio(epd, 0); lock_page(page); } @@ -5031,10 +3172,8 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); - } + if (PageWriteback(page)) + submit_write_bio(epd, 0); wait_on_page_writeback(page); } @@ -5074,9 +3213,8 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - ret = flush_write_bio(epd); - if (!ret) - goto retry; + submit_write_bio(epd, 0); + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -5086,27 +3224,6 @@ retry: return ret; } -int extent_write_full_page(struct page *page, struct writeback_control *wbc) -{ - int ret; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = __extent_writepage(page, wbc, &epd); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - - ret = flush_write_bio(&epd); - ASSERT(ret <= 0); - return ret; -} - /* * Submit the pages in the range to bio for call sites which delalloc range has * already been ran (aka, ordered extent inserted) and all pages are still @@ -5164,10 +3281,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - if (!found_error) - ret = flush_write_bio(&epd); - else - end_write_bio(&epd, ret); + submit_write_bio(&epd, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -5192,13 +3306,8 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); + submit_write_bio(&epd, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - ret = flush_write_bio(&epd); return ret; } @@ -5220,25 +3329,21 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - - if (bio_ctrl.bio) { - if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) - return; - } + submit_one_bio(&bio_ctrl); } /* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state + * basic invalidate_folio code, this waits on any locked or writeback + * ranges corresponding to the folio, and then deletes any extent state * records from the tree */ -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset) +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset) { struct extent_state *cached_state = NULL; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; + size_t blocksize = folio->mapping->host->i_sb->s_blocksize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -5247,20 +3352,20 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, &cached_state); - wait_on_page_writeback(page); + lock_extent(tree, start, end, &cached_state); + folio_wait_writeback(folio); /* * Currently for btree io tree, only EXTENT_LOCKED is utilized, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent_cached(tree, start, end, &cached_state); + unlock_extent(tree, start, end, &cached_state); return 0; } /* - * a helper for releasepage, this tests for areas of the page that + * a helper for release_folio, this tests for areas of the page that * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ @@ -5274,15 +3379,17 @@ static int try_release_extent_state(struct extent_io_tree *tree, if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; } else { + u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS); + /* * At this point we can safely clear everything except the * locked bit, the nodatasum bit and the delalloc new bit. * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), - 0, 0, NULL, mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, + mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -5296,7 +3403,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, } /* - * a helper for releasepage. As long as there are no locked extents + * a helper for release_folio. As long as there are no locked extents * in the range corresponding to the page, both state records and extent * map records are removed */ @@ -5381,42 +3488,6 @@ next: } /* - * helper function for fiemap, which doesn't want to see any holes. - * This maps until we find something past 'last' - */ -static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, - u64 offset, u64 last) -{ - u64 sectorsize = btrfs_inode_sectorsize(inode); - struct extent_map *em; - u64 len; - - if (offset >= last) - return NULL; - - while (1) { - len = last - offset; - if (len == 0) - break; - len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR_OR_NULL(em)) - return em; - - /* if this isn't a hole return it */ - if (em->block_start != EXTENT_MAP_HOLE) - return em; - - /* this is a hole, advance to the next extent */ - offset = extent_map_end(em); - free_extent_map(em); - if (offset >= last) - break; - } - return NULL; -} - -/* * To cache previous fiemap extent * * Will be used for merging fiemap extent @@ -5445,6 +3516,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, { int ret = 0; + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + if (!cache->cached) goto assign; @@ -5468,16 +3542,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * So truly compressed (physical size smaller than logical size) * extents won't get merged with each other * - * 3) Share same flags except FIEMAP_EXTENT_LAST - * So regular extent won't get merged with prealloc extent + * 3) Share same flags */ if (cache->offset + cache->len == offset && cache->phys + cache->len == phys && - (cache->flags & ~FIEMAP_EXTENT_LAST) == - (flags & ~FIEMAP_EXTENT_LAST)) { + cache->flags == flags) { cache->len += len; - cache->flags |= flags; - goto try_submit_last; + return 0; } /* Not mergeable, need to submit cached one */ @@ -5492,13 +3563,8 @@ assign: cache->phys = phys; cache->len = len; cache->flags = flags; -try_submit_last: - if (cache->flags & FIEMAP_EXTENT_LAST) { - ret = fiemap_fill_next_extent(fieinfo, cache->offset, - cache->phys, cache->len, cache->flags); - cache->cached = false; - } - return ret; + + return 0; } /* @@ -5528,215 +3594,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { - int ret = 0; - u64 off; - u64 max = start + len; - u32 flags = 0; - u32 found_type; - u64 last; - u64 last_for_get_extent = 0; - u64 disko = 0; - u64 isize = i_size_read(&inode->vfs_inode); - struct btrfs_key found_key; - struct extent_map *em = NULL; - struct extent_state *cached_state = NULL; - struct btrfs_path *path; - struct btrfs_root *root = inode->root; - struct fiemap_cache cache = { 0 }; - struct ulist *roots; - struct ulist *tmp_ulist; - int end = 0; - u64 em_start = 0; - u64 em_len = 0; - u64 em_end = 0; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; - if (len == 0) - return -EINVAL; + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; - path = btrfs_alloc_path(); - if (!path) + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + return ret; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + + /* See the comment at fiemap_search_slot() about why we clone. */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) return -ENOMEM; - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!roots || !tmp_ulist) { - ret = -ENOMEM; - goto out_free_ulist; + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. + */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; } /* - * We can't initialize that to 'start' as this could miss extents due - * to extent item merging + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we call fiemap_fill_next_extent(), because that may cause a page + * fault when filling the user space buffer with fiemap data. */ - off = 0; - start = round_down(start, btrfs_inode_sectorsize(inode)); - len = round_up(max, btrfs_inode_sectorsize(inode)) - start; + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. + */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct btrfs_backref_shared_cache *backref_cache, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp_ulist, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; /* - * lookup the last file extent. We're not using i_size here - * because there might be preallocation past i_size + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. */ - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, - 0); - if (ret < 0) { - goto out_free_ulist; - } else { - WARN_ON(!ret); - if (ret == 1) - ret = 0; - } + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = found_key.type; - - /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(inode) || - found_type != BTRFS_EXTENT_DATA_KEY) { - /* have to trust i_size as the end */ - last = (u64)-1; - last_for_get_extent = isize; - } else { /* - * remember the start of the last extent. There are a - * bunch of different factors that go into the length of the - * extent, so its much less complex to remember where it started + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. */ - last = found_key.offset; - last_for_get_extent = last + 1; + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); } - btrfs_release_path(path); /* - * we might have some extents allocated but more delalloc past those - * extents. so, we trust isize unless the start of the last extent is - * beyond isize + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. */ - if (last < isize) { - last = (u64)-1; - last_for_get_extent = isize; + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; } - lock_extent_bits(&inode->io_tree, start, start + len - 1, - &cached_state); + return 0; +} - em = get_extent_skip_holes(inode, start, last_for_get_extent); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. + */ + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; + } + + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). + */ + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; + } + + /* + * Find the last file extent item that is not a hole (when NO_HOLES is + * not enabled). This should take at most 2 iterations in the worst + * case: we have one hole file extent item at slot 0 of a leaf and + * another hole file extent item as the last item in the previous leaf. + * This is because we merge file extent items that represent holes. + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + while (disk_bytenr == 0) { + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* No file extent items that are not holes. */ + *last_extent_end_ret = 0; + return 0; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + } + + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; +} + +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_shared_cache *backref_cache; + struct ulist *roots; + struct ulist *tmp_ulist; + u64 last_extent_end; + u64 prev_extent_end; + u64 lockstart; + u64 lockend; + bool stopped = false; + int ret; + + backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + path = btrfs_alloc_path(); + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!backref_cache || !path || !roots || !tmp_ulist) { + ret = -ENOMEM; goto out; } - while (!end) { - u64 offset_in_extent = 0; + lockstart = round_down(start, root->fs_info->sectorsize); + lockend = round_up(start + len, root->fs_info->sectorsize); + prev_extent_end = lockstart; - /* break if the extent we found is outside the range */ - if (em->start >= max || extent_map_end(em) < off) - break; + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - /* - * get_extent may return an extent that starts before our - * requested range. We have to make sure the ranges - * we return to fiemap always move forward and don't - * overlap, so adjust the offsets here - */ - em_start = max(em->start, off); + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) + goto out_unlock; + btrfs_release_path(path); + path->reada = READA_FORWARD; + ret = fiemap_search_slot(inode, path, lockstart); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { /* - * record the offset from the start of the extent - * for adjusting the disk offset below. Only do this if the - * extent isn't compressed since our in ram offset may be past - * what we have actually allocated on disk. + * No file extent item found, but we may have delalloc between + * the current offset and i_size. So check for that. */ - if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - offset_in_extent = em_start - em->start; - em_end = extent_map_end(em); - em_len = em_end - em_start; - flags = 0; - if (em->block_start < EXTENT_MAP_LAST_BYTE) - disko = em->block_start + offset_in_extent; - else - disko = 0; + ret = 0; + goto check_eof_delalloc; + } + + while (prev_extent_end < lockend) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 extent_end; + u64 extent_len; + u64 extent_offset = 0; + u64 extent_gen; + u64 disk_bytenr = 0; + u64 flags = 0; + int extent_type; + u8 compression; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); /* - * bump off for our next call to get_extent + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. */ - off = extent_map_end(em); - if (off >= max) - end = 1; - - if (em->block_start == EXTENT_MAP_LAST_BYTE) { - end = 1; - flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_INLINE) { - flags |= (FIEMAP_EXTENT_DATA_INLINE | - FIEMAP_EXTENT_NOT_ALIGNED); - } else if (em->block_start == EXTENT_MAP_DELALLOC) { - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } else if (fieinfo->fi_extents_max) { - u64 bytenr = em->block_start - - (em->start - em->orig_start); + if (extent_end <= lockstart) + goto next_item; - /* - * As btrfs supports shared space, this information - * can be exported to userspace tools via - * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 - * then we're just getting a count and we can skip the - * lookup stuff. - */ - ret = btrfs_check_shared(root, btrfs_ino(inode), - bytenr, roots, tmp_ulist); - if (ret < 0) - goto out_free; - if (ret) - flags |= FIEMAP_EXTENT_SHARED; - ret = 0; + /* We have in implicit hole (NO_HOLES feature enabled). */ + if (prev_extent_end < key.offset) { + const u64 range_end = min(key.offset, lockend) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + prev_extent_end, range_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= lockend) { + stopped = true; + break; + } } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) flags |= FIEMAP_EXTENT_ENCODED; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - free_extent_map(em); - em = NULL; - if ((em_start >= last) || em_len == (u64)-1 || - (last == (u64)-1 && isize <= em_end)) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, + disk_bytenr, extent_offset, + extent_gen, roots, tmp_ulist, + key.offset, extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(root, ino, + disk_bytenr, + extent_gen, + roots, + tmp_ulist, + backref_cache); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); } - /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; } - if (!em) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; } - ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, - em_len, flags); - if (ret) { - if (ret == 1) - ret = 0; - goto out_free; + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. */ + break; + } + cond_resched(); + } + +check_eof_delalloc: + /* + * Release (and free) the path before emitting any final entries to + * fiemap_fill_next_extent() to keep lockdep happy. This is because + * once we find no more file extent items exist, we may have a + * non-cloned leaf, and fiemap_fill_next_extent() can trigger page + * faults when copying data to the user space buffer. + */ + btrfs_free_path(path); + path = NULL; + + if (!stopped && prev_extent_end < lockend) { + ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, + 0, 0, 0, roots, tmp_ulist, + prev_extent_end, lockend - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = lockend; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; } } -out_free: - if (!ret) - ret = emit_last_fiemap_cache(fieinfo, &cache); - free_extent_map(em); -out: - unlock_extent_cached(&inode->io_tree, start, start + len - 1, - &cached_state); -out_free_ulist: + ret = emit_last_fiemap_cache(fieinfo, &cache); + +out_unlock: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); +out: + kfree(backref_cache); btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); @@ -5793,7 +4178,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag return; } - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5867,7 +4252,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); __free_extent_buffer(eb); } @@ -5884,8 +4269,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->bflags = 0; init_rwsem(&eb->lock); - btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, - &fs_info->allocated_ebs); + btrfs_leak_debug_add_eb(eb); INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); @@ -5900,9 +4284,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; - struct page *p; struct extent_buffer *new; int num_pages = num_extent_pages(src); + int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) @@ -5915,22 +4299,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + memset(new->pages, 0, sizeof(*new->pages) * num_pages); + ret = btrfs_alloc_page_array(num_pages, new->pages); + if (ret) { + btrfs_release_extent_buffer(new); + return NULL; + } + for (i = 0; i < num_pages; i++) { int ret; + struct page *p = new->pages[i]; - p = alloc_page(GFP_NOFS); - if (!p) { - btrfs_release_extent_buffer(new); - return NULL; - } ret = attach_extent_buffer_page(new, p, NULL); if (ret < 0) { - put_page(p); btrfs_release_extent_buffer(new); return NULL; } WARN_ON(PageDirty(p)); - new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } set_extent_buffer_uptodate(new); @@ -5944,31 +4329,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int num_pages; int i; + int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; num_pages = num_extent_pages(eb); + ret = btrfs_alloc_page_array(num_pages, eb->pages); + if (ret) + goto err; + for (i = 0; i < num_pages; i++) { - int ret; + struct page *p = eb->pages[i]; - eb->pages[i] = alloc_page(GFP_NOFS); - if (!eb->pages[i]) - goto err; - ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + ret = attach_extent_buffer_page(eb, p, NULL); if (ret < 0) goto err; } + set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; err: - for (; i > 0; i--) { - detach_extent_buffer_page(eb, eb->pages[i - 1]); - __free_page(eb->pages[i - 1]); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) { + detach_extent_buffer_page(eb, eb->pages[i]); + __free_page(eb->pages[i]); + } } __free_extent_buffer(eb); return NULL; @@ -5990,10 +4380,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or - * calling releasepage when the tree reference is the only reference. + * calling release_folio when the tree reference is the only reference. * * In both cases, care is taken to ensure that the extent_buffer's - * pages are not under io. However, releasepage can be concurrently + * pages are not under io. However, release_folio can be concurrently * called with creating new references, which is prone to race * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. @@ -6113,7 +4503,7 @@ static struct extent_buffer *grab_extent_buffer( * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return NULL; /* Page not yet attached to an extent buffer */ @@ -6135,6 +4525,30 @@ static struct extent_buffer *grab_extent_buffer( return NULL; } +static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +{ + if (!IS_ALIGNED(start, fs_info->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return -EINVAL; + } + + if (fs_info->nodesize < PAGE_SIZE && + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + if (fs_info->nodesize >= PAGE_SIZE && + !PAGE_ALIGNED(start)) { + btrfs_err(fs_info, + "tree block is not page aligned, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -6146,13 +4560,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *exists = NULL; struct page *p; struct address_space *mapping = fs_info->btree_inode->i_mapping; + u64 lockdep_owner = owner_root; int uptodate = 1; int ret; - if (!IS_ALIGNED(start, fs_info->sectorsize)) { - btrfs_err(fs_info, "bad tree block start %llu", start); + if (check_eb_alignment(fs_info, start)) return ERR_PTR(-EINVAL); - } #if BITS_PER_LONG == 32 if (start >= MAX_LFS_FILESIZE) { @@ -6165,14 +4578,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif - if (fs_info->sectorsize < PAGE_SIZE && - offset_in_page(start) + len > PAGE_SIZE) { - btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %lu", - start, len); - return ERR_PTR(-EINVAL); - } - eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -6180,7 +4585,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return ERR_PTR(-ENOMEM); - btrfs_set_buffer_lockdep_class(owner_root, eb, level); + + /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { @@ -6202,7 +4615,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); @@ -6246,7 +4659,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * We can't unlock the pages just yet since the extent buffer * hasn't been properly inserted in the radix tree, this - * opens a race with btree_releasepage which can free a page + * opens a race with btree_release_folio which can free a page * while we are still filling in all pages for the buffer and * we could crash. */ @@ -6278,7 +4691,7 @@ again: /* * Now it's safe to unlock the pages because any calls to - * btree_releasepage will correctly detect that a page belongs to a + * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ for (i = 0; i < num_pages; i++) @@ -6324,7 +4737,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -6344,18 +4757,16 @@ static int release_extent_buffer(struct extent_buffer *eb) void free_extent_buffer(struct extent_buffer *eb) { int refs; - int old; if (!eb) return; + refs = atomic_read(&eb->refs); while (1) { - refs = atomic_read(&eb->refs); if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs == 1)) break; - old = atomic_cmpxchg(&eb->refs, refs, refs - 1); - if (old == refs) + if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) return; } @@ -6421,7 +4832,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb) int num_pages; struct page *page; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); num_pages = num_extent_pages(eb); @@ -6453,7 +4864,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; /* * For subpage case, we can have other extent buffers in the @@ -6493,9 +4904,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (page) - btrfs_page_clear_uptodate(fs_info, page, - eb->start, eb->len); + if (!page) + continue; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + ClearPageUptodate(page); + else + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6510,7 +4930,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + SetPageUptodate(page); + else + btrfs_subpage_set_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6520,7 +4949,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6531,7 +4962,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); if (ret < 0) return ret; } @@ -6541,7 +4972,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); return ret; } @@ -6549,14 +4980,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, eb->read_mirror = 0; atomic_set(&eb->io_pages, 1); check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, - page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_extent_readpage, mirror_num, 0, - true); + ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, + eb->start, page, eb->len, + eb->start - page_offset(page), 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6565,14 +4996,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio_ctrl.bio) { - int tmp; - - tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); - bio_ctrl.bio = NULL; - if (tmp < 0) - return tmp; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6592,7 +5016,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6605,7 +5031,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); num_pages = num_extent_pages(eb); @@ -6648,10 +5074,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); /* - * It is possible for releasepage to clear the TREE_REF bit before we + * It is possible for release_folio to clear the TREE_REF bit before we * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -6663,10 +5090,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, - &bio_ctrl, page, page_offset(page), - PAGE_SIZE, 0, end_bio_extent_readpage, - mirror_num, 0, false); + err = submit_extent_page(REQ_OP_READ, NULL, + &bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0, 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6683,12 +5109,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio_ctrl.bio) { - err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); - bio_ctrl.bio = NULL; - if (err) - return err; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6851,14 +5272,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; - if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ + if (fs_info->nodesize < PAGE_SIZE) { + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } @@ -6952,7 +5383,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - if (dst->fs_info->sectorsize == PAGE_SIZE) { + if (dst->fs_info->nodesize >= PAGE_SIZE) { num_pages = num_extent_pages(dst); for (i = 0; i < num_pages; i++) copy_page(page_address(dst->pages[i]), @@ -6961,7 +5392,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, size_t src_offset = get_eb_offset_in_page(src, 0); size_t dst_offset = get_eb_offset_in_page(dst, 0); - ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + ASSERT(src->fs_info->nodesize < PAGE_SIZE); memcpy(page_address(dst->pages[0]) + dst_offset, page_address(src->pages[0]) + src_offset, src->len); @@ -7354,7 +5785,7 @@ int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* |