aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/space-info.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/space-info.c')
-rw-r--r--fs/btrfs/space-info.c307
1 files changed, 229 insertions, 78 deletions
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 48d77f360a24..f171bf875633 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -9,6 +9,7 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
+#include "zoned.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -181,6 +182,43 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
found->full = 0;
}
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
+/*
+ * Calculate chunk size depending on volume type (regular or zoned).
+ */
+static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (btrfs_is_zoned(fs_info))
+ return fs_info->zone_size;
+
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ return BTRFS_MAX_DATA_CHUNK_SIZE;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return SZ_32M;
+
+ /* Handle BTRFS_BLOCK_GROUP_METADATA */
+ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ return SZ_1G;
+
+ return SZ_256M;
+}
+
+/*
+ * Update default chunk size.
+ */
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size)
+{
+ WRITE_ONCE(space_info->chunk_size, chunk_size);
+}
+
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@@ -202,6 +240,10 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
+ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
@@ -251,30 +293,36 @@ out:
return ret;
}
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly, u64 bytes_zone_unusable,
- struct btrfs_space_info **space_info)
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
{
struct btrfs_space_info *found;
- int factor;
+ int factor, index;
- factor = btrfs_bg_type_to_factor(flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, flags);
+ found = btrfs_find_space_info(info, block_group->flags);
ASSERT(found);
spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- found->bytes_zone_unusable += bytes_zone_unusable;
- if (total_bytes > 0)
+ found->total_bytes += block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ found->active_total_bytes += block_group->length;
+ found->disk_total += block_group->length * factor;
+ found->bytes_used += block_group->used;
+ found->disk_used += block_group->used * factor;
+ found->bytes_readonly += block_group->bytes_super;
+ found->bytes_zone_unusable += block_group->zone_unusable;
+ if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
spin_unlock(&found->lock);
- *space_info = found;
+
+ block_group->space_info = found;
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ down_write(&found->groups_sem);
+ list_add_tail(&block_group->list, &found->block_groups[index]);
+ up_write(&found->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -328,6 +376,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
return avail;
}
+static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ /*
+ * On regular filesystem, all total_bytes are always writable. On zoned
+ * filesystem, there may be a limitation imposed by max_active_zones.
+ * For metadata allocation, we cannot finish an existing active block
+ * group to avoid a deadlock. Thus, we need to consider only the active
+ * groups to be writable for metadata space.
+ */
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return space_info->total_bytes;
+
+ return space_info->active_total_bytes;
+}
+
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
@@ -340,9 +404,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
used = btrfs_space_info_used(space_info, true);
- avail = calc_available_free_space(fs_info, space_info, flush);
+ if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
+ avail = 0;
+ else
+ avail = calc_available_free_space(fs_info, space_info, flush);
- if (used + bytes < space_info->total_bytes + avail)
+ if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
return 1;
return 0;
}
@@ -378,7 +445,7 @@ again:
ticket = list_first_entry(head, struct reserve_ticket, list);
/* Check and see if our ticket can be satisfied now. */
- if ((used + ticket->bytes <= space_info->total_bytes) ||
+ if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
@@ -409,28 +476,47 @@ do { \
spin_unlock(&__rsv->lock); \
} while (0)
+static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
+{
+ switch (space_info->flags) {
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "SYSTEM";
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "DATA+METADATA";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "DATA";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "METADATA";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info)
{
+ const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
- info->flags,
+ btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
+ flag_str,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
+"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
info->bytes_readonly, info->bytes_zone_unusable);
-
- DUMP_BLOCK_RSV(fs_info, global_block_rsv);
- DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
- DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
-
}
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -442,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
spin_lock(&info->lock);
__btrfs_dump_space_info(fs_info, info);
+ dump_global_block_rsv(fs_info);
spin_unlock(&info->lock);
if (!dump_block_groups)
@@ -519,7 +606,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
}
- trans = (struct btrfs_trans_handle *)current->journal_info;
+ trans = current->journal_info;
/*
* If we are doing more ordered than delalloc we need to just wait on
@@ -617,7 +704,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
enum btrfs_flush_state state, bool for_preempt)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int nr;
int ret = 0;
@@ -662,6 +749,18 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case ALLOC_CHUNK:
case ALLOC_CHUNK_FORCE:
+ /*
+ * For metadata space on zoned filesystem, reaching here means we
+ * don't have enough space left in active_total_bytes. Try to
+ * activate a block group first, because we may have inactive
+ * block group already allocated.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
+ if (ret < 0)
+ break;
+ else if (ret == 1)
+ break;
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -672,6 +771,23 @@ static void flush_space(struct btrfs_fs_info *fs_info,
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
+
+ /*
+ * For metadata space on zoned filesystem, allocating a new chunk
+ * is not enough. We still need to activate the block * group.
+ * Active the newly allocated block group by (maybe) finishing
+ * a block group.
+ */
+ if (ret == 1) {
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
+ /*
+ * Revert to the original ret regardless we could finish
+ * one block group or not.
+ */
+ if (ret >= 0)
+ ret = 1;
+ }
+
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
@@ -709,6 +825,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
{
u64 used;
u64 avail;
+ u64 total;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
@@ -723,8 +840,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
* space. If that's the case add in our overage so we make sure to put
* appropriate pressure on the flushing state machine.
*/
- if (space_info->total_bytes + avail < used)
- to_reclaim += used - (space_info->total_bytes + avail);
+ total = writable_total_bytes(fs_info, space_info);
+ if (total + avail < used)
+ to_reclaim += used - (total + avail);
return to_reclaim;
}
@@ -734,9 +852,14 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
{
u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
- u64 thresh = div_factor_fine(space_info->total_bytes, 90);
+ u64 total = writable_total_bytes(fs_info, space_info);
+ u64 thresh;
u64 used;
+ thresh = div_factor_fine(total, 90);
+
+ lockdep_assert_held(&space_info->lock);
+
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved +
global_rsv_size) >= thresh)
@@ -796,8 +919,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
- if (used < space_info->total_bytes)
- thresh += space_info->total_bytes - used;
+ if (used < total)
+ thresh += total - used;
thresh >>= space_info->clamp;
used = space_info->bytes_pinned;
@@ -844,6 +967,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
+ if (!ticket->steal)
+ return false;
+
if (global_rsv->space_info != space_info)
return false;
@@ -899,8 +1025,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (!aborted && ticket->steal &&
- steal_from_global_rsv(fs_info, space_info, ticket))
+ if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
return true;
if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
@@ -1059,7 +1184,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
trans_rsv->reserved;
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
- spin_unlock(&space_info->lock);
/*
* We don't want to include the global_rsv in our calculation,
@@ -1090,6 +1214,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
flush = FLUSH_DELAYED_REFS_NR;
}
+ spin_unlock(&space_info->lock);
+
/*
* We don't want to reclaim everything, just a portion, so scale
* down the to_reclaim by 1/4. If it takes us down to 0,
@@ -1260,18 +1386,23 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int states_nr)
{
u64 to_reclaim;
- int flush_state;
+ int flush_state = 0;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
- if (!to_reclaim) {
+ /*
+ * This is the priority reclaim path, so to_reclaim could be >0 still
+ * because we may have only satisfied the priority tickets and still
+ * left non priority tickets on the list. We would then have
+ * to_reclaim but ->bytes == 0.
+ */
+ if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
- flush_state = 0;
- do {
+ while (flush_state < states_nr) {
+ spin_unlock(&space_info->lock);
flush_space(fs_info, space_info, to_reclaim, states[flush_state],
false);
flush_state++;
@@ -1280,23 +1411,49 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
- } while (flush_state < states_nr);
+ }
+
+ /* Attempt to steal from the global rsv if we can. */
+ if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ }
+
+ /*
+ * We must run try_granting_tickets here because we could be a large
+ * ticket in front of a smaller ticket that can now be satisfied with
+ * the available space.
+ */
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
}
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
+ spin_lock(&space_info->lock);
+
+ /* We could have been granted before we got here. */
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
while (!space_info->full) {
+ spin_unlock(&space_info->lock);
flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
}
+
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
}
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
@@ -1378,25 +1535,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
break;
}
- spin_lock(&space_info->lock);
ret = ticket->error;
- if (ticket->bytes || ticket->error) {
- /*
- * We were a priority ticket, so we need to delete ourselves
- * from the list. Because we could have other priority tickets
- * behind us that require less space, run
- * btrfs_try_granting_tickets() to see if their reservations can
- * now be made.
- */
- if (!list_empty(&ticket->list)) {
- remove_ticket(space_info, ticket);
- btrfs_try_granting_tickets(fs_info, space_info);
- }
-
- if (!ret)
- ret = -ENOSPC;
- }
- spin_unlock(&space_info->lock);
ASSERT(list_empty(&ticket->list));
/*
* Check that we can't have an error set if the reservation succeeded,
@@ -1438,6 +1577,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
space_info->clamp = min(space_info->clamp + 1, 8);
}
+static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_EVICT);
+}
+
/**
* Try to reserve bytes from the block_rsv's space
*
@@ -1492,7 +1637,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* can_overcommit() to ensure we can overcommit to continue.
*/
if (!pending_tickets &&
- ((used + orig_bytes <= space_info->total_bytes) ||
+ ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
@@ -1511,7 +1656,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
- ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ ticket.steal = can_steal(flush);
if (trace_btrfs_reserve_ticket_enabled())
start_ns = ktime_get_ns();
@@ -1541,7 +1686,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
&space_info->priority_tickets);
}
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- used += orig_bytes;
/*
* We will do the space reservation dance during log replay,
* which means we won't have fs_info->fs_root set, so don't do
@@ -1567,7 +1711,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
/**
* Trye to reserve metadata bytes from the block_rsv's space
*
- * @root: the root we're allocating for
+ * @fs_info: the filesystem
* @block_rsv: block_rsv we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
@@ -1579,22 +1723,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
- if (ret == -ENOSPC &&
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
- if (block_rsv != global_rsv &&
- !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
- ret = 0;
- }
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
@@ -1624,7 +1760,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
- flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+ flush == BTRFS_RESERVE_NO_FLUSH);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
@@ -1636,3 +1773,17 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
}
return ret;
}
+
+/* Dump all the space infos when we abort a transaction due to ENOSPC. */
+__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ btrfs_info(fs_info, "dumping space info:");
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ spin_lock(&space_info->lock);
+ __btrfs_dump_space_info(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+ }
+ dump_global_block_rsv(fs_info);
+}