diff options
Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r-- | fs/btrfs/ioctl.c | 1203 |
1 files changed, 884 insertions, 319 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2b84846ed934..d5dd8bed1488 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/fsverity.h> +#include <linux/sched/xacct.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -88,6 +89,24 @@ struct btrfs_ioctl_send_args_32 { #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ struct btrfs_ioctl_send_args_32) + +struct btrfs_ioctl_encoded_io_args_32 { + compat_uptr_t iov; + compat_ulong_t iovcnt; + __s64 offset; + __u64 flags; + __u64 len; + __u64 unencoded_len; + __u64 unencoded_offset; + __u32 compression; + __u32 encryption; + __u8 reserved[64]; +}; + +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) +#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) #endif /* Mask out flags that are inappropriate for the given type of inode. */ @@ -387,6 +406,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, * * Compatibility: * - the same type is already running + * - when trying to add a device and balance has been paused * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller * must check the condition first that would allow none -> @type */ @@ -394,7 +414,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type) + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) return true; spin_unlock(&fs_info->super_lock); @@ -414,10 +436,31 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) { - struct inode *inode = file_inode(file); + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} +static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +{ return put_user(inode->i_generation, arg); } @@ -425,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; - struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; @@ -455,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { - if (!device->bdev) + if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) continue; - q = bdev_get_queue(device->bdev); - if (blk_queue_discard(q)) { - num_devices++; - minlen = min_t(u64, q->limits.discard_granularity, - minlen); - } + num_devices++; + minlen = min_t(u64, bdev_discard_granularity(device->bdev), + minlen); } rcu_read_unlock(); @@ -501,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) return 1; } +/* + * Calculate the number of transaction items to reserve for creating a subvolume + * or snapshot, not including the inode, directory entries, or parent directory. + */ +static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +{ + /* + * 1 to add root block + * 1 to add root item + * 1 to add root ref + * 1 to add root backref + * 1 to add UUID item + * 1 to add qgroup info + * 1 to add qgroup limit + * + * Ideally the last two would only be accounted if qgroups are enabled, + * but that can change between now and the time we would insert them. + */ + unsigned int num_items = 7; + + if (inherit) { + /* 2 to add qgroup relations for each inherited qgroup */ + num_items += 2 * inherit->num_qgroups; + } + return num_items; +} + static noinline int create_subvol(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, - const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -516,12 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec64 cur_time = current_time(dir); - struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .subvol = true, + }; + unsigned int trans_num_items; int ret; - int err; - dev_t anon_dev = 0; + dev_t anon_dev; u64 objectid; - u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -529,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto fail_free; - - ret = get_anon_bdev(&anon_dev); - if (ret < 0) - goto fail_free; + goto out_root_item; /* * Don't create subvolume whose level is not zero. Or qgroup will be @@ -541,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; - goto fail_free; + goto out_root_item; } + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto out_root_item; + + new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + if (!new_inode_args.inode) { + ret = -ENOMEM; + goto out_anon_dev; + } + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) + goto out_inode; + trans_num_items += create_subvol_num_items(inherit); + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); - /* - * The same as the snapshot creation, please see the comment - * of create_snapshot(). - */ - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + trans_num_items, false); if (ret) - goto fail_free; + goto out_new_inode_args; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv); - goto fail_free; + goto out_new_inode_args; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) - goto fail; + goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); - goto fail; + goto out; } btrfs_mark_buffer_dirty(leaf); @@ -617,100 +692,70 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the * filesystem in an inconsistent state (an extent item in the - * extent tree without backreferences). Also no need to have - * the tree block locked since it is not in any tree at this - * point, so no other task can find it and use it. + * extent tree with a backreference for a root that does not + * exists). */ - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_tree_lock(leaf); + btrfs_clean_tree_block(leaf); + btrfs_tree_unlock(leaf); + btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); - goto fail; + goto out; } free_extent_buffer(leaf); leaf = NULL; - key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { - free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - /* Freeing will be done in btrfs_put_root() of new_root */ + /* anon_dev is owned by new_root now. */ anon_dev = 0; + BTRFS_I(new_inode_args.inode)->root = new_root; + /* ... and new_root is owned by new_inode_args.inode now. */ ret = btrfs_record_root_in_trans(trans, new_root); if (ret) { - btrfs_put_root(new_root); btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); - btrfs_put_root(new_root); - if (ret) { - /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, ret); - goto fail; - } - - /* - * insert the directory item - */ - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, - BTRFS_FT_DIR, index); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_uuid_tree_add(trans, root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, - btrfs_ino(BTRFS_I(dir)), index, name, namelen); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_uuid_tree_add(trans, root_item->uuid, - BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) - btrfs_abort_transaction(trans, ret); + d_instantiate_new(dentry, new_inode_args.inode); + new_inode_args.inode = NULL; -fail: - kfree(root_item); +out: trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; - - if (!ret) { - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) - return PTR_ERR(inode); - d_instantiate(dentry, inode); - } - return ret; - -fail_free: + if (ret) + btrfs_end_transaction(trans); + else + ret = btrfs_commit_transaction(trans); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + iput(new_inode_args.inode); +out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); +out_root_item: kfree(root_item); return ret; } @@ -722,9 +767,17 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; + unsigned int trans_num_items; struct btrfs_trans_handle *trans; int ret; + /* We do not support snapshotting right now. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_warn(fs_info, + "extent tree v2 doesn't support snapshotting yet"); + return -EOPNOTSUPP; + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -752,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - parent dir inode - * 2 - dir entries - * 1 - root item - * 2 - root ref/backref - * 1 - root of snapshot - * 1 - UUID item + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item */ + trans_num_items = create_subvol_num_items(inherit) + 3; ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, 8, - false); + &pending_snapshot->block_rsv, + trans_num_items, false); if (ret) goto free_pending; @@ -777,10 +828,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto fail; } - spin_lock(&fs_info->trans_lock); - list_add(&pending_snapshot->list, - &trans->transaction->pending_snapshots); - spin_unlock(&fs_info->trans_lock); + trans->pending_snapshot = pending_snapshot; ret = btrfs_commit_transaction(trans); if (ret) @@ -934,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); + error = create_subvol(mnt_userns, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -987,8 +1035,155 @@ out: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. + */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. + */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - bool locked) + u64 newer_than, bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -1003,16 +1198,30 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + if (!em) { struct extent_state *cached = NULL; u64 end = start + sectorsize - 1; /* get the big lock and read metadata off disk */ if (!locked) - lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + lock_extent(io_tree, start, end, &cached); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) - unlock_extent_cached(io_tree, start, end, &cached); + unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1021,23 +1230,52 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return fs_info->max_extent_size; +} + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - bool locked) + u32 extent_thresh, u64 newer_than, bool locked) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *next; - bool ret = true; + bool ret = false; /* this is the last extent */ if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len, locked); + /* + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. + */ + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - ret = false; - else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > SZ_128K && next->block_len > SZ_128K)) - ret = false; + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(fs_info, em)) + goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + ret = true; +out: free_extent_map(next); return ret; } @@ -1095,10 +1333,10 @@ again: while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); if (!ordered) break; @@ -1122,7 +1360,7 @@ again: * make it uptodate. */ if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping || !PagePrivate(page)) { unlock_page(page); @@ -1161,8 +1399,11 @@ struct defrag_target_range { static int defrag_collect_targets(struct btrfs_inode *inode, u64 start, u64 len, u32 extent_thresh, u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list) + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1172,12 +1413,25 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool next_mergeable = true; u64 range_len; - em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, + newer_than, locked); if (!em) break; - /* Skip hole/inline/preallocated extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE || + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) goto next; @@ -1185,6 +1439,39 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->generation < newer_than) goto next; + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + /* * For do_compress case, we want to compress all valid file * extents, thus no @extent_thresh or mergeable check. @@ -1193,11 +1480,27 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (em->len >= extent_thresh) + if (range_len >= extent_thresh) + goto next; + + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(fs_info, em)) goto next; + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. + */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - locked); + extent_thresh, newer_than, locked); if (!next_mergeable) { struct defrag_target_range *last; @@ -1214,6 +1517,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } add: + last_is_target = true; range_len = min(extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into @@ -1257,10 +1561,22 @@ next: kfree(entry); } } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } return ret; } #define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); /* * Defrag one contiguous target range. @@ -1300,7 +1616,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, return ret; clear_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, cached_state); + EXTENT_DEFRAG, cached_state); set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); /* Update the page status */ @@ -1315,7 +1631,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, } static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress) + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) { struct extent_state *cached_state = NULL; struct defrag_target_range *entry; @@ -1349,9 +1666,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, wait_on_page_writeback(pages[i]); /* Lock the pages range */ - lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. @@ -1361,7 +1678,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, */ ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, true, - &target_list); + &target_list, last_scanned_ret); if (ret < 0) goto unlock_extent; @@ -1377,9 +1694,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); free_pages: for (i = 0; i < nr_pages; i++) { if (pages[i]) { @@ -1396,7 +1713,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, unsigned long *sectors_defragged, - unsigned long max_sectors) + unsigned long max_sectors, + u64 *last_scanned_ret) { const u32 sectorsize = inode->root->fs_info->sectorsize; struct defrag_target_range *entry; @@ -1404,24 +1722,34 @@ static int defrag_one_cluster(struct btrfs_inode *inode, LIST_HEAD(target_list); int ret; - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, false, - &target_list); + &target_list, NULL); if (ret < 0) goto out; list_for_each_entry(entry, &target_list, list) { u32 range_len = entry->len; - /* Reached the limit */ - if (max_sectors && max_sectors == *sectors_defragged) + /* Reached or beyond the limit */ + if (max_sectors && *sectors_defragged >= max_sectors) { + ret = 1; break; + } if (max_sectors) range_len = min_t(u32, range_len, (max_sectors - *sectors_defragged) * sectorsize); + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + if (ra) page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, @@ -1434,16 +1762,20 @@ static int defrag_one_cluster(struct btrfs_inode *inode, * accounting. */ ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress); + extent_thresh, newer_than, do_compress, + last_scanned_ret); if (ret < 0) break; - *sectors_defragged += range_len; + *sectors_defragged += range_len >> + inode->root->fs_info->sectorsize_bits; } out: list_for_each_entry_safe(entry, tmp, &target_list, list) { list_del_init(&entry->list); kfree(entry); } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); return ret; } @@ -1456,6 +1788,12 @@ out: * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode * will be defragged. + * + * Return <0 for error. + * Return >=0 for the number of sectors defragged, and range->start will be updated + * to indicate the file offset where next defrag should be started at. + * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without + * defragging all the range). */ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, @@ -1471,6 +1809,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; if (isize == 0) return 0; @@ -1490,12 +1829,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (range->start + range->len > range->start) { /* Got a specific range */ - last_byte = min(isize, range->start + range->len) - 1; + last_byte = min(isize, range->start + range->len); } else { /* Defrag until file end */ - last_byte = isize - 1; + last_byte = isize; } + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + /* * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so @@ -1508,15 +1851,23 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, file_ra_state_init(ra, inode->i_mapping); } - /* Align the range */ - cur = round_down(range->start, fs_info->sectorsize); - last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + /* + * Make writeback start from the beginning of the range, so that the + * defrag range can be written sequentially. + */ + start_index = cur >> PAGE_SHIFT; + if (start_index < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = start_index; while (cur < last_byte) { + const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; u64 cluster_end; - /* The cluster size 256K should always be page aligned */ - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; + } /* We want the cluster end at page boundary when possible */ cluster_end = (((cur >> PAGE_SHIFT) + @@ -1537,16 +1888,30 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, BTRFS_I(inode)->defrag_compress = compress_type; ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, - §ors_defragged, max_to_defrag); + newer_than, do_compress, §ors_defragged, + max_to_defrag, &last_scanned); + + if (sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; - cur = cluster_end + 1; + cur = max(cluster_end + 1, last_scanned); + if (ret > 0) { + ret = 0; + break; + } + cond_resched(); } if (ra_allocated) kfree(ra); + /* + * Update range.start for autodefrag, this will indicate where to start + * in next run. + */ + range->start = cur; if (sectors_defragged) { /* * We have defragged some sectors, for compression case they @@ -1915,10 +2280,9 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct file *file, +static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -2082,7 +2446,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, for (i = slot; i < nritems; i++) { item_off = btrfs_item_ptr_offset(leaf, i); - item_len = btrfs_item_size_nr(leaf, i); + item_len = btrfs_item_size(leaf, i); btrfs_item_key_to_cpu(leaf, key, i); if (!key_in_sk(key, sk)) @@ -2223,7 +2587,12 @@ static noinline int search_ioctl(struct inode *inode, while (1) { ret = -EFAULT; - if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) + /* + * Ensure that the whole user buffer is faulted in at sub-page + * granularity, otherwise the loop may live-lock. + */ + if (fault_in_subpage_writeable(ubuf + sk_offset, + *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -2248,26 +2617,22 @@ err: return ret; } -static noinline int btrfs_ioctl_tree_search(struct file *file, - void __user *argp) +static noinline int btrfs_ioctl_tree_search(struct inode *inode, + void __user *argp) { - struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; - struct inode *inode; int ret; size_t buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - uargs = (struct btrfs_ioctl_search_args __user *)argp; - if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; buf_size = sizeof(uargs->buf); - inode = file_inode(file); ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); /* @@ -2282,12 +2647,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct file *file, +static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { - struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; - struct inode *inode; int ret; size_t buf_size; const size_t buf_limit = SZ_16M; @@ -2296,7 +2660,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, return -EPERM; /* copy search header and buffer size */ - uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; if (copy_from_user(&args, uarg, sizeof(args))) return -EFAULT; @@ -2306,7 +2669,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, if (buf_size > buf_limit) buf_size = buf_limit; - inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) @@ -2536,7 +2898,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, btrfs_item_key_to_cpu(leaf, &key, slot); item_off = btrfs_item_ptr_offset(leaf, slot); - item_len = btrfs_item_size_nr(leaf, slot); + item_len = btrfs_item_size(leaf, slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { @@ -2557,25 +2919,22 @@ out: return ret; } -static noinline int btrfs_ioctl_ino_lookup(struct file *file, +static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_ino_lookup_args *args; - struct inode *inode; int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); - inode = file_inode(file); - /* * Unprivileged query to obtain the containing subvolume root id. The * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) - args->treeid = BTRFS_I(inode)->root->root_key.objectid; + args->treeid = root->root_key.objectid; if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2587,7 +2946,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, goto out; } - ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + ret = btrfs_search_path_in_tree(root->fs_info, args->treeid, args->objectid, args->name); @@ -2643,7 +3002,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) } /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ -static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) { struct btrfs_ioctl_get_subvol_info_args *subvol_info; struct btrfs_fs_info *fs_info; @@ -2655,7 +3014,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) struct extent_buffer *leaf; unsigned long item_off; unsigned long item_len; - struct inode *inode; int slot; int ret = 0; @@ -2669,7 +3027,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) return -ENOMEM; } - inode = file_inode(file); fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ @@ -2738,7 +3095,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(struct btrfs_root_ref); - item_len = btrfs_item_size_nr(leaf, slot) + item_len = btrfs_item_size(leaf, slot) - sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, subvol_info->name, item_off, item_len); @@ -2763,15 +3120,14 @@ out_free: * Return ROOT_REF information of the subvolume containing this inode * except the subvolume name. */ -static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, + void __user *argp) { struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; struct btrfs_root_ref *rref; - struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; - struct inode *inode; u64 objectid; int slot; int ret; @@ -2787,15 +3143,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) return PTR_ERR(rootrefs); } - inode = file_inode(file); - root = BTRFS_I(inode)->root->fs_info->tree_root; - objectid = BTRFS_I(inode)->root->root_key.objectid; - + objectid = root->root_key.objectid; key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; found = 0; + root = root->fs_info->tree_root; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; @@ -2875,6 +3229,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int err = 0; bool destroy_parent = false; + /* We don't support snapshots with extent tree v2 yet. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } + if (destroy_v2) { vol_args2 = memdup_user(arg, sizeof(*vol_args2)); if (IS_ERR(vol_args2)) @@ -3058,10 +3419,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); - if (!err) { - fsnotify_rmdir(dir, dentry); - d_delete(dentry); - } + if (!err) + d_delete_notify(dir, dentry); out_dput: dput(dentry); @@ -3146,13 +3505,30 @@ out: static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; + bool restore_op = false; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); + return -EINVAL; + } + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { + if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + /* + * We can do the device add because we have a paused balanced, + * change the exclusive op type and remember we should bring + * back the paused balance + */ + fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD; + btrfs_exclop_start_unlock(fs_info); + restore_op = true; + } vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { @@ -3168,7 +3544,10 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - btrfs_exclop_finish(fs_info); + if (restore_op) + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + else + btrfs_exclop_finish(fs_info); return ret; } @@ -3247,7 +3626,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct block_device *bdev = NULL; fmode_t mode; int ret; - bool cancel; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3620,7 +3999,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, { struct btrfs_trans_handle *trans; u64 transid; - int ret; trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -3632,11 +4010,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = btrfs_commit_transaction_async(trans); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + btrfs_commit_transaction_async(trans); out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) @@ -3667,6 +4041,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + return -EINVAL; + } + sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); @@ -3766,6 +4145,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); + return -EINVAL; + } + p = memdup_user(arg, sizeof(*p)); if (IS_ERR(p)) return PTR_ERR(p); @@ -3862,26 +4246,6 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct btrfs_data_container *inodes = ctx; - const size_t c = 3 * sizeof(u64); - - if (inodes->bytes_left >= c) { - inodes->bytes_left -= c; - inodes->val[inodes->elem_cnt] = inum; - inodes->val[inodes->elem_cnt + 1] = offset; - inodes->val[inodes->elem_cnt + 2] = root; - inodes->elem_cnt += 3; - } else { - inodes->bytes_missing += c - inodes->bytes_left; - inodes->bytes_left = 0; - inodes->elem_missed += 3; - } - - return 0; -} - static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { @@ -3931,7 +4295,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, ignore_offset); + inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -3974,19 +4338,81 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } +/** + * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as + * required. + * + * @fs_info: the filesystem + * @excl_acquired: ptr to boolean value which is set to false in case balance + * is being resumed + * + * Return 0 on success in which case both fs_info::balance is acquired as well + * as exclusive ops are blocked. In case of failure return an error code. + */ +static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) +{ + int ret; + + /* + * Exclusive operation is locked. Three possibilities: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + while (1) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + *excl_acquired = true; + mutex_lock(&fs_info->balance_mutex); + return 0; + } + + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* This is either (2) or (3) */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (2) */ + ret = -EINPROGRESS; + goto out_failure; + + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to + * continue, we'll reexamine the status again. + */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (3) */ + *excl_acquired = false; + return 0; + } + } + } else { + /* This is (1) */ + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_failure; + } + + mutex_unlock(&fs_info->balance_mutex); + } + +out_failure: + mutex_unlock(&fs_info->balance_mutex); + *excl_acquired = false; + return ret; +} + static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; - bool need_unlock; /* for mut. excl. ops lock */ + bool need_unlock = true; int ret; - if (!arg) - btrfs_warn(fs_info, - "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3994,105 +4420,55 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; -again: - if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - mutex_lock(&fs_info->balance_mutex); - need_unlock = true; - goto locked; + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + bargs = NULL; + goto out; } - /* - * mut. excl. ops lock is locked. Three possibilities: - * (1) some other op is running - * (2) balance is running - * (3) balance is paused -- special case (think resume) - */ - mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - /* this is either (2) or (3) */ - if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); - /* - * Lock released to allow other waiters to continue, - * we'll reexamine the status again. - */ - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl && - !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - /* this is (3) */ - need_unlock = false; - goto locked; - } - - mutex_unlock(&fs_info->balance_mutex); - goto again; - } else { - /* this is (2) */ - mutex_unlock(&fs_info->balance_mutex); - ret = -EINPROGRESS; - goto out; - } - } else { - /* this is (1) */ - mutex_unlock(&fs_info->balance_mutex); - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = btrfs_try_lock_balance(fs_info, &need_unlock); + if (ret) goto out; - } -locked: + lockdep_assert_held(&fs_info->balance_mutex); - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; goto out_unlock; } - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); + goto do_balance; + } - goto do_balance; - } - } else { - bargs = NULL; + if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_unlock; } if (fs_info->balance_ctl) { ret = -EINPROGRESS; - goto out_bargs; + goto out_unlock; } bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; - goto out_bargs; - } - - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + goto out_unlock; } - if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { - ret = -EINVAL; - goto out_bctl; - } + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + bctl->flags = bargs->flags; do_balance: /* * Ownership of bctl and exclusive operation goes to btrfs_balance. @@ -4105,21 +4481,19 @@ do_balance: ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; - if ((ret == 0 || ret == -ECANCELED) && arg) { + if (ret == 0 || ret == -ECANCELED) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } -out_bctl: kfree(bctl); -out_bargs: - kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); + kfree(bargs); return ret; } @@ -4826,7 +5200,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4856,11 +5230,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(file, arg); + ret = btrfs_ioctl_send(inode, arg); kfree(arg); return ret; } +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, + bool compat) +{ + struct btrfs_ioctl_encoded_io_args args = { 0 }; + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, + flags); + size_t copy_end; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, + flags); + if (copy_from_user(&args32, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + if (args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_iov; + } + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + ret = btrfs_encoded_read(&kiocb, &iter, &args); + if (ret >= 0) { + fsnotify_access(file); + if (copy_to_user(argp + copy_end, + (char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) + ret = -EFAULT; + } + +out_iov: + kfree(iov); +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, argp, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; + args.len = args32.len; + args.unencoded_len = args32.unencoded_len; + args.unencoded_offset = args32.unencoded_offset; + args.compression = args32.compression; + args.encryption = args32.encryption; + memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); +#else + return -ENOTTY; +#endif + } else { + if (copy_from_user(&args, argp, sizeof(args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (args.flags != 0) + goto out_acct; + if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) + goto out_acct; + if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (args.unencoded_offset > args.unencoded_len) + goto out_acct; + if (args.len > args.unencoded_len - args.unencoded_offset) + goto out_acct; + + ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + file_start_write(file); + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_end_write; + } + pos = args.offset; + ret = rw_verify_area(WRITE, file, &pos, args.len); + if (ret < 0) + goto out_end_write; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0); + if (ret) + goto out_end_write; + kiocb.ki_pos = pos; + + ret = btrfs_do_write_iter(&kiocb, &iter, &args); + if (ret > 0) + fsnotify_modify(file); + +out_end_write: + file_end_write(file); + kfree(iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4871,7 +5428,7 @@ long btrfs_ioctl(struct file *file, unsigned int switch (cmd) { case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); + return btrfs_ioctl_getversion(inode, argp); case FS_IOC_GETFSLABEL: return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: @@ -4891,7 +5448,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(file, argp); + return btrfs_ioctl_subvol_getflags(inode, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -4912,14 +5469,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); - case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(file, argp); + return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(file, argp); + return btrfs_ioctl_tree_search_v2(inode, argp); case BTRFS_IOC_INO_LOOKUP: - return btrfs_ioctl_ino_lookup(file, argp); + return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: @@ -4966,10 +5521,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(file, argp, false); + return _btrfs_ioctl_send(inode, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(file, argp, true); + return _btrfs_ioctl_send(inode, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -4996,15 +5551,25 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: - return btrfs_ioctl_get_subvol_info(file, argp); + return btrfs_ioctl_get_subvol_info(inode, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: - return btrfs_ioctl_get_subvol_rootref(file, argp); + return btrfs_ioctl_get_subvol_rootref(root, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); case FS_IOC_ENABLE_VERITY: return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case BTRFS_IOC_ENCODED_READ: + return btrfs_ioctl_encoded_read(file, argp, false); + case BTRFS_IOC_ENCODED_WRITE: + return btrfs_ioctl_encoded_write(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: + return btrfs_ioctl_encoded_read(file, argp, true); + case BTRFS_IOC_ENCODED_WRITE_32: + return btrfs_ioctl_encoded_write(file, argp, true); +#endif } return -ENOTTY; |