Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r--   fs/btrfs/ioctl.c   1013
1 file changed, 489 insertions(+), 524 deletions(-)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc61813213d8..fb8cc9642ac4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "subpage.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 {
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
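
The tail of the compat struct shrinks from four __u64s to a __u32 version field plus 28 reserved bytes, so the packed layout keeps its 32-byte size and the computed BTRFS_IOC_SEND_32 ioctl number is unchanged. A minimal sketch of that size invariant, using illustrative stand-in types rather than the kernel definitions:

#include <stdint.h>

struct old_tail { uint64_t reserved[4]; } __attribute__((__packed__));
struct new_tail { uint32_t version; uint8_t reserved[28]; } __attribute__((__packed__));

/* 4 * 8 == 4 + 28: same size, so the UAPI ABI and ioctl number hold. */
_Static_assert(sizeof(struct old_tail) == sizeof(struct new_tail),
	       "send args tail must stay 32 bytes");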
@@ -985,129 +987,32 @@ out:
return ret;
}
-/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
- */
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
-{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 end;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
- read_unlock(&em_tree->lock);
-
- if (em) {
- end = extent_map_end(em);
- free_extent_map(em);
- if (end - offset > thresh)
- return 0;
- }
- /* if we already have a nice delalloc here, just stop */
- thresh /= 2;
- end = count_range_bits(io_tree, &offset, offset + thresh,
- thresh, EXTENT_DELALLOC, 1);
- if (end >= thresh)
- return 0;
- return 1;
-}
-
-/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
- *
- * This is used by the defragging code to find new and small
- * extents
- */
-static int find_new_extents(struct btrfs_root *root,
- struct inode *inode, u64 newer_than,
- u64 *off, u32 thresh)
-{
- struct btrfs_path *path;
- struct btrfs_key min_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- int type;
- int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- min_key.objectid = ino;
- min_key.type = BTRFS_EXTENT_DATA_KEY;
- min_key.offset = *off;
-
- while (1) {
- ret = btrfs_search_forward(root, &min_key, path, newer_than);
- if (ret != 0)
- goto none;
-process_slot:
- if (min_key.objectid != ino)
- goto none;
- if (min_key.type != BTRFS_EXTENT_DATA_KEY)
- goto none;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG &&
- btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
- check_defrag_in_cache(inode, min_key.offset, thresh)) {
- *off = min_key.offset;
- btrfs_free_path(path);
- return 0;
- }
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
- goto process_slot;
- }
-
- if (min_key.offset == (u64)-1)
- goto none;
-
- min_key.offset++;
- btrfs_release_path(path);
- }
-none:
- btrfs_free_path(path);
- return -ENOENT;
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_SIZE;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ u64 end = start + sectorsize - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- unlock_extent_cached(io_tree, start, end, &cached);
+ if (!locked)
+ lock_extent_bits(io_tree, start, end, &cached);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ if (!locked)
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
return em;
}
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ bool locked)
{
struct extent_map *next;
bool ret = true;
@@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len);
+ next = defrag_lookup_extent(inode, em->start + em->len, locked);
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
@@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end,
- int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared; we don't wait for writeback on each page here.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
{
- struct extent_map *em;
- int ret = 1;
- bool next_mergeable = true;
- bool prev_mergeable = true;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
/*
- * make sure that once we start defragging an extent, we keep on
- * defragging it
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
*/
- if (start < *defrag_end)
- return 1;
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
- *skip = 0;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
- em = defrag_lookup_extent(inode, start);
- if (!em)
- return 0;
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
- /* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- ret = 0;
- goto out;
- }
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
- if (!*defrag_end)
- prev_mergeable = false;
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need to check whether it
+ * was released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
- next_mergeable = defrag_check_next_extent(inode, em);
- /*
- * we hit a real extent, if it is big or the next extent is not a
- * real extent, don't bother defragging it
- */
- if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
- ret = 0;
-out:
/*
- * last_len ends up being a counter of how many bytes we've defragged.
- * every time we choose not to defrag an extent, we reset *last_len
- * so that the next tiny extent will force a defrag.
- *
- * The end result of this is that tiny extents before a single big
- * extent will force at least part of that big extent to be defragged.
+ * Now the page range has no ordered extents anymore. Read the page to
+ * make it uptodate.
*/
- if (ret) {
- *defrag_end = extent_map_end(em);
- } else {
- *last_len = 0;
- *skip = extent_map_end(em);
- *defrag_end = 0;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
}
-
- free_extent_map(em);
- return ret;
+ return page;
}
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
/*
- * it doesn't do much good to defrag one or two pages
- * at a time. This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
*
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: whether the extent range is already locked
+ * @target_list: list of target file extents
*/
-static int cluster_pages_for_defrag(struct inode *inode,
- struct page **pages,
- unsigned long start_index,
- unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list)
{
- unsigned long file_end;
- u64 isize = i_size_read(inode);
- u64 page_start;
- u64 page_end;
- u64 page_cnt;
- u64 start = (u64)start_index << PAGE_SHIFT;
- u64 search_start;
- int ret;
- int i;
- int i_done;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_io_tree *tree;
- struct extent_changeset *data_reserved = NULL;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ u64 cur = start;
+ int ret = 0;
- file_end = (isize - 1) >> PAGE_SHIFT;
- if (!isize || start_index > file_end)
- return 0;
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
- page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ if (!em)
+ break;
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start, page_cnt << PAGE_SHIFT);
- if (ret)
- return ret;
- i_done = 0;
- tree = &BTRFS_I(inode)->io_tree;
+ /* Skip hole/inline/preallocated extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
- /* step one, lock all the pages */
- for (i = 0; i < page_cnt; i++) {
- struct page *page;
-again:
- page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
- if (!page)
- break;
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- break;
+ /*
+ * For the do_compress case we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (em->len >= extent_thresh)
+ goto next;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
}
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- while (1) {
- lock_extent_bits(tree, page_start, page_end,
- &cached_state);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
- page_start);
- unlock_extent_cached(tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * we unlocked the page above, so we need check if
- * it was released or not.
- */
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+add:
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * the last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
}
+ /* Fall through to allocate a new entry */
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- ret = -EIO;
- break;
- }
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
}
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
+ }
+ return ret;
+}
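
A minimal user-space model of the merge rule above, with hypothetical extent offsets; it demonstrates only the last->start + last->len == cur adjacency check, not the real kernel list handling:

#include <stdio.h>

int main(void)
{
	struct { unsigned long long start, len; } targets[8];
	/* Three hypothetical 16K extents; the third is not adjacent. */
	unsigned long long starts[] = { 0, 16384, 65536 };
	unsigned long long lens[] = { 16384, 16384, 16384 };
	int n = 0;

	for (int i = 0; i < 3; i++) {
		/* Mergeable with the last target: just enlarge it. */
		if (n && targets[n - 1].start + targets[n - 1].len == starts[i]) {
			targets[n - 1].len += lens[i];
			continue;
		}
		/* Otherwise start a new target range. */
		targets[n].start = starts[i];
		targets[n].len = lens[i];
		n++;
	}
	for (int i = 0; i < n; i++)	/* prints [0,32768) and [65536,81920) */
		printf("target %d: [%llu, %llu)\n", i, targets[i].start,
		       targets[i].start + targets[i].len);
	return 0;
}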
+
+#define CLUSTER_SIZE (SZ_256K)
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
- pages[i] = page;
- i_done++;
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
}
- if (!i_done || ret)
- goto out;
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- goto out;
+ return ret;
+}
- /*
- * so now we have a nice long stream of locked
- * and up to date pages, lets wait on them
- */
- for (i = 0; i < i_done; i++)
- wait_on_page_writeback(pages[i]);
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
- page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
- lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+ /* Lock the pages range */
+ lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
- * When defragmenting we skip ranges that have holes or inline extents,
- * (check should_defrag_range()), to avoid unnecessary IO and wasting
- * space. At btrfs_defrag_file(), we check if a range should be defragged
- * before locking the inode and then, if it should, we trigger a sync
- * page cache readahead - we lock the inode only after that to avoid
- * blocking for too long other tasks that possibly want to operate on
- * other file ranges. But before we were able to get the inode lock,
- * some other task may have punched a hole in the range, or we may have
- * now an inline extent, in which case we should not defrag. So check
- * for that here, where we have the inode and the range locked, and bail
- * out if that happened.
+ * Now we have a consistent view of the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * This time we already hold the extent lock, so pass @locked = true
+ * to avoid relocking the extent range and deadlocking.
*/
- search_start = page_start;
- while (search_start < page_end) {
- struct extent_map *em;
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list);
+ if (ret < 0)
+ goto unlock_extent;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
- page_end - search_start);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock_range;
- }
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- /* Ok, 0 means we did not defrag anything */
- ret = 0;
- goto out_unlock_range;
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
}
- search_start = extent_map_end(em);
- free_extent_map(em);
}
+ kfree(pages);
+ return ret;
+}
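
The index arithmetic above sizes the pages array; a worked example, assuming 4K pages and a hypothetical 256K range starting at offset 1M:

#include <assert.h>

#define EXAMPLE_PAGE_SHIFT 12	/* assume 4K pages */

int main(void)
{
	unsigned long long start = 1048576, len = 262144;	/* 1M, 256K */
	unsigned long start_index = start >> EXAMPLE_PAGE_SHIFT;	/* 256 */
	unsigned long last_index =
		(start + len - 1) >> EXAMPLE_PAGE_SHIFT;		/* 319 */

	assert(last_index - start_index + 1 == 64);	/* 64 pages to prepare */
	return 0;
}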
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
- if (i_done != page_cnt) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, (page_cnt - i_done) << PAGE_SHIFT, true);
- }
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list);
+ if (ret < 0)
+ goto out;
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
- set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state);
+ /* Reached the limit */
+ if (max_sectors && max_sectors == *sectors_defragged)
+ break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
- for (i = 0; i < i_done; i++) {
- clear_page_dirty_for_io(pages[i]);
- ClearPageChecked(pages[i]);
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * We may not defrag any range here if holes were punched before
+ * we locked the pages. That's fine; it only affects the
+ * @sectors_defragged accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len;
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
- return i_done;
-
-out_unlock_range:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
out:
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
return ret;
-
}
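
How defrag_one_cluster() clamps each target against the remaining sector budget; a sketch with assumed numbers (4K sectorsize, a 100-sector max_sectors, 90 sectors already defragged):

#include <stdio.h>

int main(void)
{
	unsigned long max_sectors = 100, sectors_defragged = 90;
	unsigned int sectorsize = 4096;
	unsigned int range_len = 262144;	/* a 256K target range */
	unsigned int budget = (max_sectors - sectors_defragged) * sectorsize;

	if (range_len > budget)
		range_len = budget;	/* clamp to the 10 remaining sectors */
	printf("defrag %u bytes this round\n", range_len);	/* 40960 */
	return 0;
}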
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NULL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged; if 0, the whole
+ * inode will be defragged.
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file_ra_state *ra = NULL;
- unsigned long last_index;
+ unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
- u64 last_len = 0;
- u64 skip = 0;
- u64 defrag_end = 0;
- u64 newer_off = range->start;
- unsigned long i;
- unsigned long ra_index = 0;
- int ret;
- int defrag_count = 0;
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
- unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)SZ_128K - 1);
- struct page **pages = NULL;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
if (isize == 0)
return 0;
@@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (extent_thresh == 0)
extent_thresh = SZ_256K;
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len) - 1;
+ } else {
+ /* Defrag until file end */
+ last_byte = isize - 1;
+ }
+
/*
- * If we were not given a file, allocate a readahead context. As
+ * If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
- if (!file) {
+ if (!ra) {
+ ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
- } else {
- ra = &file->f_ra;
- }
-
- pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_ra;
- }
-
- /* find the last page to defrag */
- if (range->start + range->len > range->start) {
- last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_SHIFT;
- } else {
- last_index = (isize - 1) >> PAGE_SHIFT;
- }
-
- if (newer_than) {
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- /*
- * we always align our defrag to help keep
- * the extents in the file evenly spaced
- */
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else
- goto out_ra;
- } else {
- i = range->start >> PAGE_SHIFT;
}
- if (!max_to_defrag)
- max_to_defrag = last_index - i + 1;
-
- /*
- * make writeback starts from i, so the defrag range can be
- * written sequentially.
- */
- if (i < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = i;
-
- while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
- /*
- * make sure we stop running if someone unmounts
- * the FS
- */
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- break;
- if (btrfs_defrag_cancelled(fs_info)) {
- btrfs_debug(fs_info, "defrag_file cancelled");
- ret = -EAGAIN;
- goto error;
- }
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
- if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
- extent_thresh, &last_len, &skip,
- &defrag_end, do_compress)){
- unsigned long next;
- /*
- * the should_defrag function tells us how much to skip
- * bump our counter by the suggested amount
- */
- next = DIV_ROUND_UP(skip, PAGE_SIZE);
- i = max(i + 1, next);
- continue;
- }
+ while (cur < last_byte) {
+ u64 cluster_end;
- if (!newer_than) {
- cluster = (PAGE_ALIGN(defrag_end) >>
- PAGE_SHIFT) - i;
- cluster = min(cluster, max_cluster);
- } else {
- cluster = max_cluster;
- }
+ /* The 256K cluster size should always be page aligned */
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
- if (i + cluster > ra_index) {
- ra_index = max(i, ra_index);
- if (ra)
- page_cache_sync_readahead(inode->i_mapping, ra,
- file, ra_index, cluster);
- ra_index += cluster;
- }
+ /* We want the cluster end at a page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
- } else {
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ btrfs_inode_unlock(inode, 0);
+ break;
}
- if (ret < 0) {
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
btrfs_inode_unlock(inode, 0);
- goto out_ra;
+ break;
}
-
- defrag_count += ret;
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress,
+ &sectors_defragged, max_to_defrag);
btrfs_inode_unlock(inode, 0);
-
- if (newer_than) {
- if (newer_off == (u64)-1)
- break;
-
- if (ret > 0)
- i += ret;
-
- newer_off = max(newer_off + 1,
- (u64)i << PAGE_SHIFT);
-
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else {
- break;
- }
- } else {
- if (ret > 0) {
- i += ret;
- last_len += ret << PAGE_SHIFT;
- } else {
- i++;
- last_len = 0;
- }
- }
+ if (ret < 0)
+ break;
+ cur = cluster_end + 1;
}
- ret = defrag_count;
-error:
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
+ if (ra_allocated)
+ kfree(ra);
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors; in the compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
}
-
- if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- }
-
-out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
}
- if (!file)
- kfree(ra);
- kfree(pages);
return ret;
}
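
For reference, a minimal user-space caller that reaches this entry point via BTRFS_IOC_DEFRAG_RANGE; the field names follow the UAPI struct in <linux/btrfs.h>, and error handling is abbreviated:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0)
		return 1;
	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;			/* defrag the whole file */
	range.extent_thresh = 256 * 1024;	/* the 256K default threshold */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO;
	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");
	close(fd);
	return 0;
}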
@@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
@@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
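
The BTRFS_DEV_LOOKUP_ARGS() pattern replaces the old positional (devid, uuid, fsid) parameters of btrfs_find_device() with an args struct whose unset fields act as wildcards. A user-space model of that matching semantic, assuming the struct introduced in volumes.h by the same series:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Model of struct btrfs_dev_lookup_args: unset fields match anything. */
struct dev_lookup_args {
	uint64_t devid;		/* (uint64_t)-1 means "any devid" */
	const uint8_t *uuid;	/* NULL means "any uuid" */
};

struct dev { uint64_t devid; uint8_t uuid[16]; };

static bool dev_matches(const struct dev *d, const struct dev_lookup_args *a)
{
	if (a->devid != (uint64_t)-1 && a->devid != d->devid)
		return false;
	if (a->uuid && memcmp(a->uuid, d->uuid, 16) != 0)
		return false;
	return true;
}

int main(void)
{
	struct dev d = { .devid = 2 };
	struct dev_lookup_args args = { .devid = (uint64_t)-1 };

	args.devid = 2;		/* constrain the match, as in the hunk above */
	return dev_matches(&d, &args) ? 0 : 1;
}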
@@ -1730,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (!strcmp(sizestr, "max"))
- new_size = device->bdev->bd_inode->i_size;
+ new_size = bdev_nr_bytes(device->bdev);
else {
if (sizestr[0] == '-') {
mod = -1;
@@ -1771,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_finish;
}
- if (new_size > device->bdev->bd_inode->i_size) {
+ if (new_size > bdev_nr_bytes(device->bdev)) {
ret = -EFBIG;
goto out_finish;
}
@@ -2261,9 +2222,8 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = fault_in_pages_writeable(ubuf + sk_offset,
- *buf_size - sk_offset);
- if (ret)
+ ret = -EFAULT;
+ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
break;
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
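
Note the semantic change here: fault_in_writeable() returns the number of bytes it could not fault in (0 on success), whereas fault_in_pages_writeable() returned an errno, hence the explicit ret = -EFAULT. A kernel-style sketch (not runnable user-space code) of the probe-then-retry idiom the new helper is designed for, with do_copy_out() as a hypothetical copying step:

while (1) {
	if (fault_in_writeable(ubuf, size))
		return -EFAULT;		/* nonzero: bytes left unfaulted */
	ret = do_copy_out(ubuf, size);	/* hypothetical copy-out step */
	if (ret != -EFAULT)
		break;			/* success, or a non-fault error */
}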
@@ -3136,12 +3096,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- /* Subpage defrag will be supported in later commits */
- if (root->fs_info->sectorsize < PAGE_SIZE) {
- ret = -ENOTTY;
- goto out;
- }
-
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -3176,7 +3130,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), file,
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -3220,6 +3174,7 @@ out:
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
@@ -3231,35 +3186,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto err_drop;
+ goto out;
}
if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
+
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
- strcmp("cancel", vol_args->name) == 0)
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret)
- goto out;
- /* Exclusive operation is now claimed */
+ goto err_drop;
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
- else
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
btrfs_exclop_finish(fs_info);
@@ -3271,17 +3230,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
-out:
- kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
@@ -3293,32 +3254,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto out_drop_write;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- cancel = (strcmp("cancel", vol_args->name) == 0);
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
- kfree(vol_args);
-out_drop_write:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
@@ -3379,22 +3346,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
- char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
+ args.devid = di_args->devid;
if (!btrfs_is_empty_uuid(di_args->uuid))
- s_uuid = di_args->uuid;
+ args.uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL);
-
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
ret = -ENODEV;
goto out;
@@ -4430,7 +4396,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_quota_rescan_args qsa = {0};
- int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4441,9 +4406,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
}
if (copy_to_user(arg, &qsa, sizeof(qsa)))
- ret = -EFAULT;
+ return -EFAULT;
- return ret;
+ return 0;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,