diff options
Diffstat (limited to 'fs/btrfs/space-info.c')
-rw-r--r-- | fs/btrfs/space-info.c | 307 |
1 files changed, 229 insertions, 78 deletions
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 48d77f360a24..f171bf875633 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,6 +9,7 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -181,6 +182,43 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) found->full = 0; } +/* + * Block groups with more than this value (percents) of unusable space will be + * scheduled for background reclaim. + */ +#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) + +/* + * Calculate chunk size depending on volume type (regular or zoned). + */ +static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) +{ + if (btrfs_is_zoned(fs_info)) + return fs_info->zone_size; + + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + return BTRFS_MAX_DATA_CHUNK_SIZE; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return SZ_32M; + + /* Handle BTRFS_BLOCK_GROUP_METADATA */ + if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) + return SZ_1G; + + return SZ_256M; +} + +/* + * Update default chunk size. + */ +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size) +{ + WRITE_ONCE(space_info->chunk_size, chunk_size); +} + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -202,6 +240,10 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + + if (btrfs_is_zoned(info)) + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -251,30 +293,36 @@ out: return ret; } -void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info) +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group) { struct btrfs_space_info *found; - int factor; + int factor, index; - factor = btrfs_bg_type_to_factor(flags); + factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, flags); + found = btrfs_find_space_info(info, block_group->flags); ASSERT(found); spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - found->bytes_zone_unusable += bytes_zone_unusable; - if (total_bytes > 0) + found->total_bytes += block_group->length; + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) + found->active_total_bytes += block_group->length; + found->disk_total += block_group->length * factor; + found->bytes_used += block_group->used; + found->disk_used += block_group->used * factor; + found->bytes_readonly += block_group->bytes_super; + found->bytes_zone_unusable += block_group->zone_unusable; + if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); spin_unlock(&found->lock); - *space_info = found; + + block_group->space_info = found; + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + down_write(&found->groups_sem); + list_add_tail(&block_group->list, &found->block_groups[index]); + up_write(&found->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -328,6 +376,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } +static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + /* + * On regular filesystem, all total_bytes are always writable. On zoned + * filesystem, there may be a limitation imposed by max_active_zones. + * For metadata allocation, we cannot finish an existing active block + * group to avoid a deadlock. Thus, we need to consider only the active + * groups to be writable for metadata space. + */ + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return space_info->total_bytes; + + return space_info->active_total_bytes; +} + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) @@ -340,9 +404,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); + if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else + avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) return 1; return 0; } @@ -378,7 +445,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); /* Check and see if our ticket can be satisfied now. */ - if ((used + ticket->bytes <= space_info->total_bytes) || + if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, @@ -409,28 +476,47 @@ do { \ spin_unlock(&__rsv->lock); \ } while (0) +static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) +{ + switch (space_info->flags) { + case BTRFS_BLOCK_GROUP_SYSTEM: + return "SYSTEM"; + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "DATA+METADATA"; + case BTRFS_BLOCK_GROUP_DATA: + return "DATA"; + case BTRFS_BLOCK_GROUP_METADATA: + return "METADATA"; + default: + return "UNKNOWN"; + } +} + +static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); +} + static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info) { + const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull", - info->flags, + btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", + flag_str, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", +"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, info->bytes_readonly, info->bytes_zone_unusable); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - } void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -442,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, spin_lock(&info->lock); __btrfs_dump_space_info(fs_info, info); + dump_global_block_rsv(fs_info); spin_unlock(&info->lock); if (!dump_block_groups) @@ -519,7 +606,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; } - trans = (struct btrfs_trans_handle *)current->journal_info; + trans = current->journal_info; /* * If we are doing more ordered than delalloc we need to just wait on @@ -617,7 +704,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, enum btrfs_flush_state state, bool for_preempt) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -662,6 +749,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: + /* + * For metadata space on zoned filesystem, reaching here means we + * don't have enough space left in active_total_bytes. Try to + * activate a block group first, because we may have inactive + * block group already allocated. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); + if (ret < 0) + break; + else if (ret == 1) + break; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -672,6 +771,23 @@ static void flush_space(struct btrfs_fs_info *fs_info, (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); + + /* + * For metadata space on zoned filesystem, allocating a new chunk + * is not enough. We still need to activate the block * group. + * Active the newly allocated block group by (maybe) finishing + * a block group. + */ + if (ret == 1) { + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + /* + * Revert to the original ret regardless we could finish + * one block group or not. + */ + if (ret >= 0) + ret = 1; + } + if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -709,6 +825,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; + u64 total; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -723,8 +840,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, * space. If that's the case add in our overage so we make sure to put * appropriate pressure on the flushing state machine. */ - if (space_info->total_bytes + avail < used) - to_reclaim += used - (space_info->total_bytes + avail); + total = writable_total_bytes(fs_info, space_info); + if (total + avail < used) + to_reclaim += used - (total + avail); return to_reclaim; } @@ -734,9 +852,14 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 90); + u64 total = writable_total_bytes(fs_info, space_info); + u64 thresh; u64 used; + thresh = div_factor_fine(total, 90); + + lockdep_assert_held(&space_info->lock); + /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) @@ -796,8 +919,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; - if (used < space_info->total_bytes) - thresh += space_info->total_bytes - used; + if (used < total) + thresh += total - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -844,6 +967,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; + if (!ticket->steal) + return false; + if (global_rsv->space_info != space_info) return false; @@ -899,8 +1025,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (!aborted && ticket->steal && - steal_from_global_rsv(fs_info, space_info, ticket)) + if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) return true; if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) @@ -1059,7 +1184,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv->reserved; if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; - spin_unlock(&space_info->lock); /* * We don't want to include the global_rsv in our calculation, @@ -1090,6 +1214,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } + spin_unlock(&space_info->lock); + /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. If it takes us down to 0, @@ -1260,18 +1386,23 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int states_nr) { u64 to_reclaim; - int flush_state; + int flush_state = 0; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); - if (!to_reclaim) { + /* + * This is the priority reclaim path, so to_reclaim could be >0 still + * because we may have only satisfied the priority tickets and still + * left non priority tickets on the list. We would then have + * to_reclaim but ->bytes == 0. + */ + if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - flush_state = 0; - do { + while (flush_state < states_nr) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, to_reclaim, states[flush_state], false); flush_state++; @@ -1280,23 +1411,49 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - } while (flush_state < states_nr); + } + + /* Attempt to steal from the global rsv if we can. */ + if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + } + + /* + * We must run try_granting_tickets here because we could be a large + * ticket in front of a smaller ticket that can now be satisfied with + * the available space. + */ + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + spin_lock(&space_info->lock); + + /* We could have been granted before we got here. */ + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + while (!space_info->full) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); } + + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, @@ -1378,25 +1535,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, break; } - spin_lock(&space_info->lock); ret = ticket->error; - if (ticket->bytes || ticket->error) { - /* - * We were a priority ticket, so we need to delete ourselves - * from the list. Because we could have other priority tickets - * behind us that require less space, run - * btrfs_try_granting_tickets() to see if their reservations can - * now be made. - */ - if (!list_empty(&ticket->list)) { - remove_ticket(space_info, ticket); - btrfs_try_granting_tickets(fs_info, space_info); - } - - if (!ret) - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); ASSERT(list_empty(&ticket->list)); /* * Check that we can't have an error set if the reservation succeeded, @@ -1438,6 +1577,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, space_info->clamp = min(space_info->clamp + 1, 8); } +static inline bool can_steal(enum btrfs_reserve_flush_enum flush) +{ + return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_EVICT); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1492,7 +1637,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * can_overcommit() to ensure we can overcommit to continue. */ if (!pending_tickets && - ((used + orig_bytes <= space_info->total_bytes) || + ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); @@ -1511,7 +1656,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); - ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); @@ -1541,7 +1686,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, &space_info->priority_tickets); } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; /* * We will do the space reservation dance during log replay, * which means we won't have fs_info->fs_root set, so don't do @@ -1567,7 +1711,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, /** * Trye to reserve metadata bytes from the block_rsv's space * - * @root: the root we're allocating for + * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1579,22 +1723,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. */ -int btrfs_reserve_metadata_bytes(struct btrfs_root *root, +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", block_rsv->space_info->flags, @@ -1624,7 +1760,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || - flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || + flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); @@ -1636,3 +1773,17 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, } return ret; } + +/* Dump all the space infos when we abort a transaction due to ENOSPC. */ +__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + btrfs_info(fs_info, "dumping space info:"); + list_for_each_entry(space_info, &fs_info->space_info, list) { + spin_lock(&space_info->lock); + __btrfs_dump_space_info(fs_info, space_info); + spin_unlock(&space_info->lock); + } + dump_global_block_rsv(fs_info); +} |