aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c200
1 files changed, 127 insertions, 73 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c3f386b7cc0b..56032c518b26 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -368,6 +368,7 @@ struct async_chunk {
u64 end;
unsigned int write_flags;
struct list_head extents;
+ struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
atomic_t *pending;
};
@@ -474,6 +475,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
+ u64 i_size;
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -488,7 +490,19 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, i_size_read(inode), end + 1);
+ /*
+ * We need to save i_size before now because it could change in between
+ * us evaluating the size and assigning it. This is because we lock and
+ * unlock the page in truncate and fallocate, and then modify the i_size
+ * later on.
+ *
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
+ * does that for us.
+ */
+ barrier();
+ i_size = i_size_read(inode);
+ barrier();
+ actual_end = min_t(u64, i_size, end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -699,10 +713,12 @@ cleanup_and_bail_uncompressed:
* to our extent and set things up for the async work queue to run
* cow_file_range to do the normal delalloc dance.
*/
- if (page_offset(async_chunk->locked_page) >= start &&
- page_offset(async_chunk->locked_page) <= end)
+ if (async_chunk->locked_page &&
+ (page_offset(async_chunk->locked_page) >= start &&
+ page_offset(async_chunk->locked_page)) <= end) {
__set_page_dirty_nobuffers(async_chunk->locked_page);
/* unlocked later on in the async handlers */
+ }
if (redirty)
extent_range_redirty_for_io(inode, start, end);
@@ -782,7 +798,7 @@ retry:
async_extent->start +
async_extent->ram_size - 1,
WB_SYNC_ALL);
- else if (ret)
+ else if (ret && async_chunk->locked_page)
unlock_page(async_chunk->locked_page);
kfree(async_extent);
cond_resched();
@@ -865,7 +881,8 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages,
- async_chunk->write_flags)) {
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
@@ -1183,6 +1200,8 @@ static noinline void async_cow_free(struct btrfs_work *work)
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
/*
* Since the pointer to 'pending' is at the beginning of the array of
* async_chunk's, freeing it ensures the whole array has been freed.
@@ -1191,12 +1210,14 @@ static noinline void async_cow_free(struct btrfs_work *work)
kvfree(async_chunk->pending);
}
-static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+static int cow_file_range_async(struct inode *inode,
+ struct writeback_control *wbc,
+ struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- unsigned int write_flags)
+ unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
unsigned long nr_pages;
@@ -1205,6 +1226,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
int i;
bool should_compress;
unsigned nofs_flag;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
@@ -1251,14 +1273,45 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
- async_chunk[i].locked_page = locked_page;
async_chunk[i].write_flags = write_flags;
INIT_LIST_HEAD(&async_chunk[i].extents);
- btrfs_init_work(&async_chunk[i].work,
- btrfs_delalloc_helper,
- async_cow_start, async_cow_submit,
- async_cow_free);
+ /*
+ * The locked_page comes all the way from writepage and its
+ * the original page we were actually given. As we spread
+ * this large delalloc region across multiple async_chunk
+ * structs, only the first struct needs a pointer to locked_page
+ *
+ * This way we don't need racey decisions about who is supposed
+ * to unlock it.
+ */
+ if (locked_page) {
+ /*
+ * Depending on the compressibility, the pages might or
+ * might not go through async. We want all of them to
+ * be accounted against wbc once. Let's do it here
+ * before the paths diverge. wbc accounting is used
+ * only for foreign writeback detection and doesn't
+ * need full accuracy. Just account the whole thing
+ * against the first page.
+ */
+ wbc_account_cgroup_owner(wbc, locked_page,
+ cur_end - start);
+ async_chunk[i].locked_page = locked_page;
+ locked_page = NULL;
+ } else {
+ async_chunk[i].locked_page = NULL;
+ }
+
+ if (blkcg_css != blkcg_root_css) {
+ css_get(blkcg_css);
+ async_chunk[i].blkcg_css = blkcg_css;
+ } else {
+ async_chunk[i].blkcg_css = NULL;
+ }
+
+ btrfs_init_work(&async_chunk[i].work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -1684,7 +1737,6 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
{
int ret;
int force_cow = need_force_cow(inode, start, end);
- unsigned int write_flags = wbc_to_write_flags(wbc);
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1699,9 +1751,8 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags);
- ret = cow_file_range_async(inode, locked_page, start, end,
- page_started, nr_written,
- write_flags);
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
+ page_started, nr_written);
}
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
@@ -2097,7 +2148,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
}
mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
out:
if (ret) {
@@ -2201,12 +2252,16 @@ again:
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
ClearPageChecked(page);
- goto out;
+ goto out_reserved;
}
ClearPageChecked(page);
set_page_dirty(page);
+out_reserved:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ if (ret)
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ PAGE_SIZE, true);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -2247,8 +2302,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
SetPageChecked(page);
get_page(page);
- btrfs_init_work(&fixup->work, btrfs_fixup_helper,
- btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EBUSY;
@@ -2662,7 +2716,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
return 0;
@@ -2986,7 +3040,7 @@ out_kfree:
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
@@ -3014,7 +3068,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
int compress_type = 0;
int ret = 0;
u64 logical_len = ordered_extent->len;
- bool nolock;
+ bool freespace_inode;
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
@@ -3025,7 +3079,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
clear_new_delalloc_bytes = true;
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
+ freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
@@ -3056,8 +3110,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3091,8 +3145,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
EXTENT_DEFRAG, 0, 0, &cached_state);
}
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3241,7 +3295,6 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -3250,16 +3303,12 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
- btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
- NULL);
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
btrfs_queue_work(wq, &ordered_extent->work);
}
@@ -3518,7 +3567,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &found_key, root);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
@@ -5140,7 +5189,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
@@ -5737,12 +5785,14 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
return inode;
}
-/* Get an inode object given its location and corresponding root.
- * Returns in *is_new if the inode was read from disk
+/*
+ * Get an inode object given its location and corresponding root.
+ * Path can be preallocated to prevent recursing back to iget through
+ * allocator. NULL is also valid but may require an additional allocation
+ * later.
*/
struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new,
- struct btrfs_path *path)
+ struct btrfs_root *root, struct btrfs_path *path)
{
struct inode *inode;
@@ -5757,8 +5807,6 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
if (!ret) {
inode_tree_add(inode);
unlock_new_inode(inode);
- if (new)
- *new = 1;
} else {
iget_failed(inode);
/*
@@ -5776,9 +5824,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
}
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new)
+ struct btrfs_root *root)
{
- return btrfs_iget_path(s, location, root, new, NULL);
+ return btrfs_iget_path(s, location, root, NULL);
}
static struct inode *new_simple_dir(struct super_block *s,
@@ -5844,7 +5892,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, &location, root, NULL);
+ inode = btrfs_iget(dir->i_sb, &location, root);
if (IS_ERR(inode))
return inode;
@@ -5869,7 +5917,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
- inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
+ inode = btrfs_iget(dir->i_sb, &location, sub_root);
}
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -6918,8 +6966,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
- if (em)
- em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
if (em) {
@@ -6935,7 +6981,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
err = -ENOMEM;
goto out;
}
- em->bdev = fs_info->fs_devices->latest_bdev;
em->start = EXTENT_MAP_HOLE;
em->orig_start = EXTENT_MAP_HOLE;
em->len = (u64)-1;
@@ -7194,7 +7239,6 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
err = -ENOMEM;
goto out;
}
- em->bdev = NULL;
ASSERT(hole_em);
/*
@@ -7554,7 +7598,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
{
struct extent_map_tree *em_tree;
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
@@ -7572,7 +7615,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
em->len = len;
em->block_len = block_len;
em->block_start = block_start;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
@@ -7611,6 +7653,8 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em,
struct inode *inode,
u64 start, u64 len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
if (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return -ENOENT;
@@ -7620,7 +7664,7 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em,
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
set_buffer_mapped(bh_result);
return 0;
@@ -7703,7 +7747,7 @@ skip_cow:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
set_buffer_mapped(bh_result);
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -7845,7 +7889,7 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
if (ret)
return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
return ret;
}
@@ -8198,18 +8242,14 @@ static void __endio_write_update_ordered(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
@@ -8217,9 +8257,8 @@ static void __endio_write_update_ordered(struct inode *inode,
&ordered_offset,
ordered_bytes,
uptodate)) {
- btrfs_init_work(&ordered->work, func,
- finish_ordered_fn,
- NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
+ NULL);
btrfs_queue_work(wq, &ordered->work);
}
/*
@@ -8376,7 +8415,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto err;
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0, 0);
+ ret = btrfs_map_bio(fs_info, bio, 0);
err:
return ret;
}
@@ -9308,7 +9347,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->io_failure_tree.track_uptodate = true;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
- mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
@@ -9537,6 +9575,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
@@ -9731,6 +9772,18 @@ out_fail:
commit_transaction = true;
}
if (commit_transaction) {
+ /*
+ * We may have set commit_transaction when logging the new name
+ * in the destination root, in which case we left the source
+ * root context in the list of log contextes. So make sure we
+ * remove it to avoid invalid memory accesses, since the context
+ * was allocated in our stack frame.
+ */
+ if (sync_log_root) {
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx_root.list);
+ mutex_unlock(&root->log_mutex);
+ }
ret = btrfs_commit_transaction(trans);
} else {
int ret2;
@@ -9744,6 +9797,9 @@ out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
+ ASSERT(list_empty(&ctx_root.list));
+ ASSERT(list_empty(&ctx_dest.list));
+
return ret;
}
@@ -10088,8 +10144,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
- btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -10422,7 +10477,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
@@ -10778,7 +10832,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
u64 len = isize - start;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);