Merge tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba: "Features: - subpage: - support for PAGE_SIZE > 4K (previously only 64K) - make it work with raid56 - repair super block num_devices automatically if it does not match the number of device items - defrag can convert inline extents to regular extents, up to now inline files were skipped but the setting of mount option max_inline could affect the decision logic - zoned: - minimal accepted zone size is explicitly set to 4MiB - make zone reclaim less aggressive and don't reclaim if there are enough free zones - add per-profile sysfs tunable of the reclaim threshold - allow automatic block group reclaim for non-zoned filesystems, with sysfs tunables - tree-checker: new check, compare extent buffer owner against owner rootid Performance: - avoid blocking on space reservation when doing nowait direct io writes (+7% throughput for reads and writes) - NOCOW write throughput improvement due to refined locking (+3%) - send: reduce pressure to page cache by dropping extent pages right after they're processed Core: - convert all radix trees to xarray - add iterators for b-tree node items - support printk message index - user bulk page allocation for extent buffers - switch to bio_alloc API, use on-stack bios where convenient, other bio cleanups - use rw lock for block groups to favor concurrent reads - simplify workques, don't allocate high priority threads for all normal queues as we need only one - refactor scrub, process chunks based on their constraints and similarity - allocate direct io structures on stack and pass around only pointers, avoids allocation and reduces potential error handling Fixes: - fix count of reserved transaction items for various inode operations - fix deadlock between concurrent dio writes when low on free data space - fix a few cases when zones need to be finished VFS, iomap: - add helper to check if sb write has started (usable for assertions) - new helper iomap_dio_alloc_bio, export iomap_dio_bio_end_io" * tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (173 commits) btrfs: zoned: introduce a minimal zone size 4M and reject mount btrfs: allow defrag to convert inline extents to regular extents btrfs: add "0x" prefix for unsupported optional features btrfs: do not account twice for inode ref when reserving metadata units btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer btrfs: send: avoid trashing the page cache btrfs: send: keep the current inode open while processing it btrfs: allocate the btrfs_dio_private as part of the iomap dio bio btrfs: move struct btrfs_dio_private to inode.c btrfs: remove the disk_bytenr in struct btrfs_dio_private btrfs: allocate dio_data on stack iomap: add per-iomap_iter private data iomap: allow the file system to provide a bio_set for direct I/O btrfs: add a btrfs_dio_rw wrapper btrfs: zoned: zone finish unused block group btrfs: zoned: properly finish block group on metadata write btrfs: zoned: finish block group when there are no more allocatable bytes left btrfs: zoned: consolidate zone finish functions btrfs: zoned: introduce btrfs_zoned_bg_is_full btrfs: improve error reporting in lookup_inline_extent_backref ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2022-05-24 18:52:35 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2022-05-24 18:52:35 -0700
commit: bd1b7c1384ec15294ee45bf3add7b7036e146dad (patch)
tree: 5b8efc004782d52f8697b2831bdcce9c9a884988 /fs/btrfs/zoned.c
parent: Merge tag 'zonefs-5.19-rc1-fix' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs (diff)
parent: btrfs: zoned: introduce a minimal zone size 4M and reject mount (diff)
download: linux-dev-bd1b7c1384ec15294ee45bf3add7b7036e146dad.tar.xz
linux-dev-bd1b7c1384ec15294ee45bf3add7b7036e146dad.zip
1 files changed, 142 insertions, 75 deletions
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 29b54fd9c128..11237a913bee 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -51,11 +51,13 @@
 #define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)
 
 /*
- * Maximum supported zone size. Currently, SMR disks have a zone size of
- * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
- * expect the zone size to become larger than 8GiB in the near future.
+ * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+ * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+ * We do not expect the zone size to become larger than 8GiB or smaller than
+ * 4MiB in the near future.
  */
 #define BTRFS_MAX_ZONE_SIZE		SZ_8G
+#define BTRFS_MIN_ZONE_SIZE		SZ_4M
 
 #define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
 
@@ -401,6 +403,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
 		ret = -EINVAL;
 		goto out;
+	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+		btrfs_err_in_rcu(fs_info,
+		"zoned: %s: zone size %llu smaller than supported minimum %u",
+				 rcu_str_deref(device->name),
+				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+		ret = -EINVAL;
+		goto out;
 	}
 
 	nr_sectors = bdev_nr_sectors(bdev);
@@ -1835,7 +1844,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 	}
 
 	/* No space left */
-	if (block_group->alloc_offset == block_group->zone_capacity) {
+	if (btrfs_zoned_bg_is_full(block_group)) {
 		ret = false;
 		goto out_unlock;
 	}
@@ -1872,20 +1881,14 @@ out_unlock:
 	return ret;
 }
 
-int btrfs_zone_finish(struct btrfs_block_group *block_group)
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct map_lookup *map;
-	struct btrfs_device *device;
-	u64 physical;
+	bool need_zone_finish;
 	int ret = 0;
 	int i;
 
-	if (!btrfs_is_zoned(fs_info))
-		return 0;
-
-	map = block_group->physical_map;
-
 	spin_lock(&block_group->lock);
 	if (!block_group->zone_is_active) {
 		spin_unlock(&block_group->lock);
@@ -1895,40 +1898,56 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
 	/* Check if we have unwritten allocated space */
 	if ((block_group->flags &
 	     (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
-	    block_group->alloc_offset > block_group->meta_write_pointer) {
+	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
 		spin_unlock(&block_group->lock);
 		return -EAGAIN;
 	}
-	spin_unlock(&block_group->lock);
-
-	ret = btrfs_inc_block_group_ro(block_group, false);
-	if (ret)
-		return ret;
-
-	/* Ensure all writes in this block group finish */
-	btrfs_wait_block_group_reservations(block_group);
-	/* No need to wait for NOCOW writers. Zoned mode does not allow that. */
-	btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
-				 block_group->length);
-
-	spin_lock(&block_group->lock);
 
 	/*
-	 * Bail out if someone already deactivated the block group, or
-	 * allocated space is left in the block group.
+	 * If we are sure that the block group is full (= no more room left for
+	 * new allocation) and the IO for the last usable block is completed, we
+	 * don't need to wait for the other IOs. This holds because we ensure
+	 * the sequential IO submissions using the ZONE_APPEND command for data
+	 * and block_group->meta_write_pointer for metadata.
 	 */
-	if (!block_group->zone_is_active) {
+	if (!fully_written) {
 		spin_unlock(&block_group->lock);
-		btrfs_dec_block_group_ro(block_group);
-		return 0;
-	}
 
-	if (block_group->reserved) {
-		spin_unlock(&block_group->lock);
-		btrfs_dec_block_group_ro(block_group);
-		return -EAGAIN;
+		ret = btrfs_inc_block_group_ro(block_group, false);
+		if (ret)
+			return ret;
+
+		/* Ensure all writes in this block group finish */
+		btrfs_wait_block_group_reservations(block_group);
+		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
+		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+					 block_group->length);
+
+		spin_lock(&block_group->lock);
+
+		/*
+		 * Bail out if someone already deactivated the block group, or
+		 * allocated space is left in the block group.
+		 */
+		if (!block_group->zone_is_active) {
+			spin_unlock(&block_group->lock);
+			btrfs_dec_block_group_ro(block_group);
+			return 0;
+		}
+
+		if (block_group->reserved) {
+			spin_unlock(&block_group->lock);
+			btrfs_dec_block_group_ro(block_group);
+			return -EAGAIN;
+		}
 	}
 
+	/*
+	 * The block group is not fully allocated, so not fully written yet. We
+	 * need to send ZONE_FINISH command to free up an active zone.
+	 */
+	need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
+
 	block_group->zone_is_active = 0;
 	block_group->alloc_offset = block_group->zone_capacity;
 	block_group->free_space_ctl->free_space = 0;
@@ -1936,24 +1955,29 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
 	btrfs_clear_data_reloc_bg(block_group);
 	spin_unlock(&block_group->lock);
 
+	map = block_group->physical_map;
 	for (i = 0; i < map->num_stripes; i++) {
-		device = map->stripes[i].dev;
-		physical = map->stripes[i].physical;
+		struct btrfs_device *device = map->stripes[i].dev;
+		const u64 physical = map->stripes[i].physical;
 
 		if (device->zone_info->max_active_zones == 0)
 			continue;
 
-		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
-				       physical >> SECTOR_SHIFT,
-				       device->zone_info->zone_size >> SECTOR_SHIFT,
-				       GFP_NOFS);
+		if (need_zone_finish) {
+			ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+					       physical >> SECTOR_SHIFT,
+					       device->zone_info->zone_size >> SECTOR_SHIFT,
+					       GFP_NOFS);
 
-		if (ret)
-			return ret;
+			if (ret)
+				return ret;
+		}
 
 		btrfs_dev_clear_active_zone(device, physical);
 	}
-	btrfs_dec_block_group_ro(block_group);
+
+	if (!fully_written)
+		btrfs_dec_block_group_ro(block_group);
 
 	spin_lock(&fs_info->zone_active_bgs_lock);
 	ASSERT(!list_empty(&block_group->active_bg_list));
@@ -1966,6 +1990,14 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
 	return 0;
 }
 
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+	if (!btrfs_is_zoned(block_group->fs_info))
+		return 0;
+
+	return do_zone_finish(block_group, false);
+}
+
 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
 {
 	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
@@ -1997,9 +2029,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
 {
 	struct btrfs_block_group *block_group;
-	struct map_lookup *map;
-	struct btrfs_device *device;
-	u64 physical;
+	u64 min_alloc_bytes;
 
 	if (!btrfs_is_zoned(fs_info))
 		return;
@@ -2007,42 +2037,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
 	block_group = btrfs_lookup_block_group(fs_info, logical);
 	ASSERT(block_group);
 
-	if (logical + length < block_group->start + block_group->zone_capacity)
-		goto out;
-
-	spin_lock(&block_group->lock);
+	/* No MIXED_BG on zoned btrfs. */
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+		min_alloc_bytes = fs_info->sectorsize;
+	else
+		min_alloc_bytes = fs_info->nodesize;
 
-	if (!block_group->zone_is_active) {
-		spin_unlock(&block_group->lock);
+	/* Bail out if we can allocate more data from this block group. */
+	if (logical + length + min_alloc_bytes <=
+	    block_group->start + block_group->zone_capacity)
 		goto out;
-	}
 
-	block_group->zone_is_active = 0;
-	/* We should have consumed all the free space */
-	ASSERT(block_group->alloc_offset == block_group->zone_capacity);
-	ASSERT(block_group->free_space_ctl->free_space == 0);
-	btrfs_clear_treelog_bg(block_group);
-	btrfs_clear_data_reloc_bg(block_group);
-	spin_unlock(&block_group->lock);
+	do_zone_finish(block_group, true);
 
-	map = block_group->physical_map;
-	device = map->stripes[0].dev;
-	physical = map->stripes[0].physical;
+out:
+	btrfs_put_block_group(block_group);
+}
 
-	if (!device->zone_info->max_active_zones)
-		goto out;
+static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+{
+	struct btrfs_block_group *bg =
+		container_of(work, struct btrfs_block_group, zone_finish_work);
 
-	btrfs_dev_clear_active_zone(device, physical);
+	wait_on_extent_buffer_writeback(bg->last_eb);
+	free_extent_buffer(bg->last_eb);
+	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+	btrfs_put_block_group(bg);
+}
 
-	spin_lock(&fs_info->zone_active_bgs_lock);
-	ASSERT(!list_empty(&block_group->active_bg_list));
-	list_del_init(&block_group->active_bg_list);
-	spin_unlock(&fs_info->zone_active_bgs_lock);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+				   struct extent_buffer *eb)
+{
+	if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+		return;
 
-	btrfs_put_block_group(block_group);
+	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+			  bg->start);
+		return;
+	}
 
-out:
-	btrfs_put_block_group(block_group);
+	/* For the work */
+	btrfs_get_block_group(bg);
+	atomic_inc(&eb->refs);
+	bg->last_eb = eb;
+	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+	queue_work(system_unbound_wq, &bg->zone_finish_work);
 }
 
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2072,3 +2112,30 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 }
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	u64 used = 0;
+	u64 total = 0;
+	u64 factor;
+
+	ASSERT(btrfs_is_zoned(fs_info));
+
+	if (fs_info->bg_reclaim_threshold == 0)
+		return false;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		if (!device->bdev)
+			continue;
+
+		total += device->disk_total_bytes;
+		used += device->bytes_used;
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	factor = div64_u64(used * 100, total);
+	return factor >= fs_info->bg_reclaim_threshold;
+}
author	Linus Torvalds <torvalds@linux-foundation.org>	2022-05-24 18:52:35 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2022-05-24 18:52:35 -0700
commit	bd1b7c1384ec15294ee45bf3add7b7036e146dad (patch)
tree	5b8efc004782d52f8697b2831bdcce9c9a884988 /fs/btrfs/zoned.c
parent	Merge tag 'zonefs-5.19-rc1-fix' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs (diff)
parent	btrfs: zoned: introduce a minimal zone size 4M and reject mount (diff)
download	linux-dev-bd1b7c1384ec15294ee45bf3add7b7036e146dad.tar.xz linux-dev-bd1b7c1384ec15294ee45bf3add7b7036e146dad.zip