Diffstat (limited to 'fs/btrfs/zoned.c')
-rw-r--r-- | fs/btrfs/zoned.c | 797
1 file changed, 555 insertions(+), 242 deletions(-)
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f559d517c7c4..1912abf6d020 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -51,11 +51,13 @@ #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) /* - * Maximum supported zone size. Currently, SMR disks have a zone size of - * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not - * expect the zone size to become larger than 8GiB in the near future. + * Minimum / maximum supported zone size. Currently, SMR disks have a zone + * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. + * We do not expect the zone size to become larger than 8GiB or smaller than + * 4MiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define BTRFS_MIN_ZONE_SIZE SZ_4M #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) @@ -92,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * Possible states of log buffer zones * * Empty[0] In use[0] Full[0] - * Empty[1] * x 0 - * In use[1] 0 x 0 - * Full[1] 1 1 C + * Empty[1] * 0 1 + * In use[1] x x 1 + * Full[1] 0 0 C * * Log position: * *: Special case, no superblock is written @@ -350,7 +352,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; @@ -402,15 +403,41 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; + } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu smaller than supported minimum %u", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); + ret = -EINVAL; + goto out; } nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + /* + * We limit max_zone_append_size also by max_segments * + * PAGE_SIZE. Technically, we can have multiple pages per segment. But, + * since btrfs adds the pages one by one to a bio, and btrfs cannot + * increase the metadata reservation even if it increases the number of + * extents, it is safe to stick with the limit. + * + * With the zoned emulation, we can have non-zoned device on the zoned + * mode. In this case, we don't have a valid max zone append size. So, + * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 
+ */ + if (bdev_is_zoned(bdev)) { + zone_info->max_zone_append_size = min_t(u64, + (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + } else { + zone_info->max_zone_append_size = + (u64)bdev_max_segments(bdev) << PAGE_SHIFT; + } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = queue_max_active_zones(queue); + max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err_in_rcu(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", @@ -612,6 +639,46 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) device->zone_info = NULL; } +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev) +{ + struct btrfs_zoned_device_info *zone_info; + + zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return NULL; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) + goto out; + + bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones, + zone_info->nr_zones); + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) + goto out; + + bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones, + zone_info->nr_zones); + + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) + goto out; + + bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones, + zone_info->nr_zones); + zone_info->zone_cache = NULL; + + return zone_info; + +out: + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->active_zones); + kfree(zone_info); + return NULL; +} + int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { @@ -625,75 +692,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (device->bdev && + bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found: %pg", + device->bdev); + return -EINVAL; + } + } + + return 0; +} + int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 zoned_devices = 0; - u64 nr_devices = 0; u64 zone_size = 0; - const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); - int ret = 0; + u64 max_zone_append_size = 0; + int ret; - /* Count zoned devices */ - list_for_each_entry(device, &fs_devices->devices, dev_list) { - enum blk_zoned_model model; + /* + * Host-Managed devices can't be used without the ZONED flag. With the + * ZONED all devices can be used, using zone emulation if required. + */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return btrfs_check_for_zoned_device(fs_info); + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!device->bdev) continue; - model = bdev_zoned_model(device->bdev); - /* - * A Host-Managed zoned device must be used as a zoned device. - * A Host-Aware zoned device and a non-zoned devices can be - * treated as a zoned device, if ZONED flag is enabled in the - * superblock. 
- */ - if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned) || - (model == BLK_ZONED_NONE && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info = - device->zone_info; - - zone_info = device->zone_info; - zoned_devices++; - if (!zone_size) { - zone_size = zone_info->zone_size; - } else if (zone_info->zone_size != zone_size) { - btrfs_err(fs_info, + if (!zone_size) { + zone_size = zone_info->zone_size; + } else if (zone_info->zone_size != zone_size) { + btrfs_err(fs_info, "zoned: unequal block device zone sizes: have %llu found %llu", - device->zone_info->zone_size, - zone_size); - ret = -EINVAL; - goto out; - } + zone_info->zone_size, zone_size); + return -EINVAL; } - nr_devices++; - } - - if (!zoned_devices && !incompat_zoned) - goto out; - - if (!zoned_devices && incompat_zoned) { - /* No zoned block device found on ZONED filesystem */ - btrfs_err(fs_info, - "zoned: no zoned devices found on a zoned filesystem"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices && !incompat_zoned) { - btrfs_err(fs_info, - "zoned: mode not enabled but zoned device found"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices != nr_devices) { - btrfs_err(fs_info, - "zoned: cannot mix zoned and regular devices"); - ret = -EINVAL; - goto out; + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = zone_info->max_zone_append_size; } /* @@ -705,18 +752,20 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "zoned: zone size %llu not aligned to stripe %u", zone_size, BTRFS_STRIPE_LEN); - ret = -EINVAL; - goto out; + return -EINVAL; } if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { btrfs_err(fs_info, "zoned: mixed block groups not supported"); - ret = -EINVAL; - goto out; + return -EINVAL; } fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; /* * Check mount options here, because we might change fs_info->zoned @@ -724,11 +773,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) */ ret = btrfs_check_mountopts_zoned(fs_info); if (ret) - goto out; + return ret; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); -out: - return ret; + return 0; } int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) @@ -1151,7 +1199,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) * offset. */ static int calculate_alloc_pointer(struct btrfs_block_group *cache, - u64 *offset_ret) + u64 *offset_ret, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; @@ -1161,6 +1209,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, int ret; u64 length; + /* + * Avoid tree lookups for a new block group, there's no use for it. + * It must always be 0. + * + * Also, we have a lock chain of extent buffer lock -> chunk mutex. + * For new a block group, this function is called from + * btrfs_make_block_group() which is already taking the chunk mutex. + * Thus, we cannot call calculate_alloc_pointer() which takes extent + * buffer locks to avoid deadlock. 
+ */ + if (new) { + *offset_ret = 0; + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1215,12 +1278,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; - u64 physical = 0; int ret; int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; u64 *caps = NULL; + u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1264,6 +1327,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); + if (!physical) { + ret = -ENOMEM; + goto out; + } + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); if (!active) { ret = -ENOMEM; @@ -1277,19 +1346,26 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int dev_replace_is_ongoing = 0; device = map->stripes[i].dev; - physical = map->stripes[i].physical; + physical[i] = map->stripes[i].physical; if (device->bdev == NULL) { alloc_offsets[i] = WP_MISSING_DEV; continue; } - is_sequential = btrfs_dev_is_sequential(device, physical); + is_sequential = btrfs_dev_is_sequential(device, physical[i]); if (is_sequential) num_sequential++; else num_conventional++; + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); + if (!is_sequential) { alloc_offsets[i] = WP_CONVENTIONAL; continue; @@ -1299,21 +1375,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * This zone will be used for allocation, so mark this zone * non-empty. */ - btrfs_dev_clear_zone_empty(device, physical); + btrfs_dev_clear_zone_empty(device, physical[i]); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); up_read(&dev_replace->rwsem); /* * The group is mapped to a sequential zone. Get the zone write * pointer to determine the allocation offset within the zone. */ - WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical, &zone); + ret = btrfs_get_dev_zone(device, physical[i], &zone); memalloc_nofs_restore(nofs_flag); if (ret == -EIO || ret == -EOPNOTSUPP) { ret = 0; @@ -1339,7 +1415,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) case BLK_ZONE_COND_READONLY: btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical >> device->zone_info->zone_size_shift, + physical[i] >> device->zone_info->zone_size_shift, rcu_str_deref(device->name), device->devid); alloc_offsets[i] = WP_MISSING_DEV; break; @@ -1356,45 +1432,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) __set_bit(i, active); break; } - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); } if (num_sequential > 0) cache->seq_zone = true; if (num_conventional > 0) { - /* - * Avoid calling calculate_alloc_pointer() for new BG. It - * is no use for new BG. It must be always 0. 
- * - * Also, we have a lock chain of extent buffer lock -> - * chunk mutex. For new BG, this function is called from - * btrfs_make_block_group() which is already taking the - * chunk mutex. Thus, we cannot call - * calculate_alloc_pointer() which takes extent buffer - * locks to avoid deadlock. - */ - /* Zone capacity is always zone size in emulation */ cache->zone_capacity = cache->length; - if (new) { - cache->alloc_offset = 0; - goto out; - } - ret = calculate_alloc_pointer(cache, &last_alloc); - if (ret || map->num_stripes == num_conventional) { - if (!ret) - cache->alloc_offset = last_alloc; - else - btrfs_err(fs_info, + ret = calculate_alloc_pointer(cache, &last_alloc, new); + if (ret) { + btrfs_err(fs_info, "zoned: failed to determine allocation offset of bg %llu", - cache->start); + cache->start); + goto out; + } else if (map->num_stripes == num_conventional) { + cache->alloc_offset = last_alloc; + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } } @@ -1404,15 +1458,54 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (alloc_offsets[0] == WP_MISSING_DEV) { btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", - physical); + physical[0]); ret = -EIO; goto out; } cache->alloc_offset = alloc_offsets[0]; cache->zone_capacity = caps[0]; - cache->zone_is_active = test_bit(0, active); + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); break; case BTRFS_BLOCK_GROUP_DUP: + if (map->type & BTRFS_BLOCK_GROUP_DATA) { + btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); + ret = -EINVAL; + goto out; + } + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[0]); + ret = -EIO; + goto out; + } + if (alloc_offsets[1] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[1]); + ret = -EIO; + goto out; + } + if (alloc_offsets[0] != alloc_offsets[1]) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + ret = -EIO; + goto out; + } + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(cache)) { + ret = -EIO; + goto out; + } + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &cache->runtime_flags); + } + cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = min(caps[0], caps[1]); + break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID0: case BTRFS_BLOCK_GROUP_RAID10: @@ -1426,13 +1519,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - if (cache->zone_is_active) { - btrfs_get_block_group(cache); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } - out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1457,14 +1543,21 @@ out: ret = -EIO; } - if (!ret) + if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; - - if (ret) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + } else { kfree(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); + kfree(physical); kfree(caps); 
kfree(alloc_offsets); free_extent_map(em); @@ -1485,7 +1578,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; @@ -1685,12 +1777,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc); if (ret || !bioc || mapped_length < PAGE_SIZE) { - btrfs_put_bioc(bioc); - return -EIO; + ret = -EIO; + goto out_put_bioc; } - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) - return -EINVAL; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EINVAL; + goto out_put_bioc; + } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; @@ -1709,7 +1803,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, break; } memalloc_nofs_restore(nofs_flag); - +out_put_bioc: + btrfs_put_bioc(bioc); return ret; } @@ -1759,7 +1854,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, map = em->map_lookup; /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); device = map->stripes[0].dev; free_extent_map(em); @@ -1777,52 +1871,56 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; bool ret; + int i; if (!btrfs_is_zoned(block_group->fs_info)) return true; map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return true; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); - - if (block_group->zone_is_active) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; goto out_unlock; } /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { + if (btrfs_zoned_bg_is_full(block_group)) { ret = false; goto out_unlock; } - if (!btrfs_dev_set_active_zone(device, physical)) { - /* Cannot activate the zone */ - ret = false; - goto out_unlock; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + + if (device->zone_info->max_active_zones == 0) + continue; + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } } /* Successfully activated all the zones */ - block_group->zone_is_active = 1; - + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); + space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(list_empty(&block_group->active_bg_list)); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -1830,115 +1928,169 @@ bool btrfs_zone_activate(struct btrfs_block_group 
*block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); return ret; } -int btrfs_zone_finish(struct btrfs_block_group *block_group) +static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; - struct btrfs_device *device; - u64 physical; - int ret = 0; - - if (!btrfs_is_zoned(fs_info)) - return 0; + const u64 end = block_group->start + block_group->length; + struct radix_tree_iter iter; + struct extent_buffer *eb; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, + block_group->start >> fs_info->sectorsize_bits) { + eb = radix_tree_deref_slot(slot); + if (!eb) + continue; + if (radix_tree_deref_retry(eb)) { + slot = radix_tree_iter_retry(&iter); + continue; + } - map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); + if (eb->start < block_group->start) + continue; + if (eb->start >= end) + break; - device = map->stripes[0].dev; - physical = map->stripes[0].physical; + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + wait_on_extent_buffer_writeback(eb); + rcu_read_lock(); + } + rcu_read_unlock(); +} - if (device->zone_info->max_active_zones == 0) - return 0; +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + const bool is_metadata = (block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + int ret = 0; + int i; spin_lock(&block_group->lock); - if (!block_group->zone_is_active) { + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } /* Check if we have unwritten allocated space */ - if ((block_group->flags & - (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && - block_group->alloc_offset > block_group->meta_write_pointer) { + if (is_metadata && + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; } - spin_unlock(&block_group->lock); - - ret = btrfs_inc_block_group_ro(block_group, false); - if (ret) - return ret; - - /* Ensure all writes in this block group finish */ - btrfs_wait_block_group_reservations(block_group); - /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, - block_group->length); - - spin_lock(&block_group->lock); /* - * Bail out if someone already deactivated the block group, or - * allocated space is left in the block group. + * If we are sure that the block group is full (= no more room left for + * new allocation) and the IO for the last usable block is completed, we + * don't need to wait for the other IOs. This holds because we ensure + * the sequential IO submissions using the ZONE_APPEND command for data + * and block_group->meta_write_pointer for metadata. 
*/ - if (!block_group->zone_is_active) { + if (!fully_written) { spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return 0; - } - if (block_group->reserved) { - spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return -EAGAIN; + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + /* Wait for extent buffers to be written. */ + if (is_metadata) + wait_eb_writebacks(block_group); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } } - block_group->zone_is_active = 0; + clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); - btrfs_dec_block_group_ro(block_group); + map = block_group->physical_map; + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + const u64 physical = map->stripes[i].physical; - if (!ret) { - btrfs_dev_clear_active_zone(device, physical); + if (device->zone_info->max_active_zones == 0) + continue; - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); - /* For active_bg_list */ - btrfs_put_block_group(block_group); + if (ret) + return ret; + + btrfs_dev_clear_active_zone(device, physical); } - return ret; + if (!fully_written) + btrfs_dec_block_group_ro(block_group); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + + clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + + return 0; +} + +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + if (!btrfs_is_zoned(block_group->fs_info)) + return 0; + + return do_zone_finish(block_group, false); } bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { + struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; - if (!btrfs_is_zoned(fs_devices->fs_info)) + if (!btrfs_is_zoned(fs_info)) return true; - /* Non-single profiles are not supported yet */ - ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); - /* Check if there is a device with active zones left */ - 
mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + mutex_lock(&fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; if (!device->bdev) @@ -1950,7 +2102,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } } - mutex_unlock(&fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); + + if (!ret) + set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return ret; } @@ -1958,9 +2113,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; - struct map_lookup *map; - struct btrfs_device *device; - u64 physical; + u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) return; @@ -1968,42 +2121,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len block_group = btrfs_lookup_block_group(fs_info, logical); ASSERT(block_group); - if (logical + length < block_group->start + block_group->zone_capacity) - goto out; - - spin_lock(&block_group->lock); + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + min_alloc_bytes = fs_info->sectorsize; + else + min_alloc_bytes = fs_info->nodesize; - if (!block_group->zone_is_active) { - spin_unlock(&block_group->lock); + /* Bail out if we can allocate more data from this block group. */ + if (logical + length + min_alloc_bytes <= + block_group->start + block_group->zone_capacity) goto out; - } - block_group->zone_is_active = 0; - /* We should have consumed all the free space */ - ASSERT(block_group->alloc_offset == block_group->zone_capacity); - ASSERT(block_group->free_space_ctl->free_space == 0); - btrfs_clear_treelog_bg(block_group); - btrfs_clear_data_reloc_bg(block_group); - spin_unlock(&block_group->lock); + do_zone_finish(block_group, true); - map = block_group->physical_map; - device = map->stripes[0].dev; - physical = map->stripes[0].physical; +out: + btrfs_put_block_group(block_group); +} - if (!device->zone_info->max_active_zones) - goto out; +static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +{ + struct btrfs_block_group *bg = + container_of(work, struct btrfs_block_group, zone_finish_work); - btrfs_dev_clear_active_zone(device, physical); + wait_on_extent_buffer_writeback(bg->last_eb); + free_extent_buffer(bg->last_eb); + btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + btrfs_put_block_group(bg); +} - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) +{ + if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + return; - btrfs_put_block_group(block_group); + if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { + btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", + bg->start); + return; + } -out: - btrfs_put_block_group(block_group); + /* For the work */ + btrfs_get_block_group(bg); + atomic_inc(&eb->refs); + bg->last_eb = eb; + INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); + queue_work(system_unbound_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group 
*bg) @@ -2033,3 +2196,153 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); } + +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 used = 0; + u64 total = 0; + u64 factor; + + ASSERT(btrfs_is_zoned(fs_info)); + + if (fs_info->bg_reclaim_threshold == 0) + return false; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + total += device->disk_total_bytes; + used += device->bytes_used; + } + mutex_unlock(&fs_devices->device_list_mutex); + + factor = div64_u64(used * 100, total); + return factor >= fs_info->bg_reclaim_threshold; +} + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. */ + clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags); + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} + +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_block_group *min_bg = NULL; + u64 min_avail = U64_MAX; + int ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + u64 avail; + + spin_lock(&block_group->lock); + if (block_group->reserved || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; + } + + avail = block_group->zone_capacity - block_group->alloc_offset; + if (min_avail > avail) { + if (min_bg) + btrfs_put_block_group(min_bg); + min_bg = block_group; + min_avail = avail; + btrfs_get_block_group(min_bg); + } + spin_unlock(&block_group->lock); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + + if (!min_bg) + return 0; + + ret = btrfs_zone_finish(min_bg); + btrfs_put_block_group(min_bg); + + return ret < 0 ? 
ret : 1; +} + +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + struct btrfs_block_group *bg; + int index; + + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + /* No more block groups to activate */ + if (space_info->active_total_bytes == space_info->total_bytes) + return 0; + + for (;;) { + int ret; + bool need_finish = false; + + down_read(&space_info->groups_sem); + for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { + list_for_each_entry(bg, &space_info->block_groups[index], + list) { + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_zoned_bg_is_full(bg) || + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &bg->runtime_flags)) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + + if (btrfs_zone_activate(bg)) { + up_read(&space_info->groups_sem); + return 1; + } + + need_finish = true; + } + } + up_read(&space_info->groups_sem); + + if (!do_finish || !need_finish) + break; + + ret = btrfs_zone_finish_one_bg(fs_info); + if (ret == 0) + break; + if (ret < 0) + return ret; + } + + return 0; +} |