aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/fs/btrfs/extent_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--fs/btrfs/extent_io.c204
1 files changed, 147 insertions, 57 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c0f202741e09..39e45b8a5031 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -35,42 +35,54 @@ static inline bool extent_state_in_tree(const struct extent_state *state)
}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(buffers);
static LIST_HEAD(states);
-
static DEFINE_SPINLOCK(leak_lock);
-static inline
-void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+static inline void btrfs_leak_debug_add(spinlock_t *lock,
+ struct list_head *new,
+ struct list_head *head)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_add(new, head);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline
-void btrfs_leak_debug_del(struct list_head *entry)
+static inline void btrfs_leak_debug_del(spinlock_t *lock,
+ struct list_head *entry)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_del(entry);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline void btrfs_extent_buffer_leak_debug_check(void)
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
struct extent_buffer *eb;
+ unsigned long flags;
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
+ /*
+ * If we didn't get into open_ctree our allocated_ebs will not be
+ * initialized, so just skip this.
+ */
+ if (!fs_info->allocated_ebs.next)
+ return;
+
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ while (!list_empty(&fs_info->allocated_ebs)) {
+ eb = list_first_entry(&fs_info->allocated_ebs,
+ struct extent_buffer, leak_list);
+ pr_err(
+ "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_header_owner(eb));
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
static inline void btrfs_extent_state_leak_debug_check(void)
@@ -107,9 +119,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
}
}
#else
-#define btrfs_leak_debug_add(new, head) do {} while (0)
-#define btrfs_leak_debug_del(entry) do {} while (0)
-#define btrfs_extent_buffer_leak_debug_check() do {} while (0)
+#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
+#define btrfs_leak_debug_del(lock, entry) do {} while (0)
#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
@@ -122,7 +133,6 @@ struct tree_entry {
struct extent_page_data {
struct bio *bio;
- struct extent_io_tree *tree;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -246,8 +256,6 @@ void __cold extent_state_cache_exit(void)
void __cold extent_io_exit(void)
{
- btrfs_extent_buffer_leak_debug_check();
-
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
@@ -257,6 +265,15 @@ void __cold extent_io_exit(void)
bioset_exit(&btrfs_bioset);
}
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
@@ -268,6 +285,8 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
void extent_io_tree_release(struct extent_io_tree *tree)
@@ -314,7 +333,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->state = 0;
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&state->leak_list, &states);
+ btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
@@ -327,7 +346,7 @@ void free_extent_state(struct extent_state *state)
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&state->leak_list);
+ btrfs_leak_debug_del(&leak_lock, &state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
@@ -1053,6 +1072,16 @@ hit_next:
goto out;
}
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
@@ -1568,6 +1597,43 @@ out:
}
/**
+ * find_contiguous_extent_bit: find a contiguous area of bits
+ * @tree - io tree to check
+ * @start - offset to start the search from
+ * @start_ret - the first offset we found with the bits set
+ * @end_ret - the final contiguous range of the bits that were set
+ * @bits - bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/**
* find_first_clear_extent_bit - find the first range that has @bits not set.
* This range could start before @start.
*
@@ -2926,7 +2992,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
- * @tree: tree so we can call our merge_bio hook
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
* @pg_offset: offset of the new bio or to check whether we are adding
@@ -2939,7 +3004,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
*/
-static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
+static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
struct page *page, u64 offset,
size_t size, unsigned long pg_offset,
@@ -2954,6 +3019,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct bio *bio;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
+ struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
ASSERT(bio_ret);
@@ -3062,8 +3128,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct extent_io_tree *tree,
- struct page *page,
+static int __do_readpage(struct page *page,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
@@ -3086,6 +3151,7 @@ static int __do_readpage(struct extent_io_tree *tree,
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
set_page_extent_mapped(page);
@@ -3242,7 +3308,7 @@ static int __do_readpage(struct extent_io_tree *tree,
continue;
}
- ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
+ ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
end_bio_extent_readpage, mirror_num,
@@ -3269,8 +3335,7 @@ out:
return ret;
}
-static inline void contiguous_readpages(struct extent_io_tree *tree,
- struct page *pages[], int nr_pages,
+static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
struct bio **bio,
@@ -3280,17 +3345,16 @@ static inline void contiguous_readpages(struct extent_io_tree *tree,
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
+ __do_readpage(pages[index], btrfs_get_extent, em_cached,
bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct extent_io_tree *tree,
- struct page *page,
+static int __extent_read_full_page(struct page *page,
get_extent_t *get_extent,
struct bio **bio, int mirror_num,
unsigned long *bio_flags,
@@ -3301,21 +3365,21 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
int ret;
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
+ ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
bio_flags, read_flags, NULL);
return ret;
}
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
+int extent_read_full_page(struct page *page, get_extent_t *get_extent,
+ int mirror_num)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
int ret;
- ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
+ ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
&bio_flags, 0);
if (bio)
ret = submit_one_bio(bio, mirror_num, bio_flags);
@@ -3423,7 +3487,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
unsigned long nr_written,
int *nr_ret)
{
- struct extent_io_tree *tree = epd->tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
@@ -3509,7 +3573,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
page, offset, iosize, pg_offset,
&epd->bio,
end_bio_extent_writepage,
@@ -3830,8 +3894,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
u32 nritems;
int i, num_pages;
@@ -3864,7 +3926,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
p, offset, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
@@ -3897,14 +3959,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
struct extent_buffer *eb, *prev_eb = NULL;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -4018,7 +4079,39 @@ retry:
end_write_bio(&epd, ret);
return ret;
}
- ret = flush_write_bio(&epd);
+ /*
+ * If something went wrong, don't allow any metadata write bio to be
+ * submitted.
+ *
+ * This would prevent use-after-free if we had dirty pages not
+ * cleaned up, which can still happen by fuzzed images.
+ *
+ * - Bad extent tree
+ * Allowing existing tree block to be allocated for other trees.
+ *
+ * - Log tree operations
+ * Exiting tree blocks get allocated to log tree, bumps its
+ * generation, then get cleaned in tree re-balance.
+ * Such tree block will not be written back, since it's clean,
+ * thus no WRITTEN flag set.
+ * And after log writes back, this tree block is not traced by
+ * any dirty extent_io_tree.
+ *
+ * - Offending tree block gets re-dirtied from its original owner
+ * Since it has bumped generation, no WRITTEN flag, it can be
+ * reused without COWing. This tree block will not be traced
+ * by btrfs_transaction::dirty_pages.
+ *
+ * Now such dirty tree block will not be cleaned by any dirty
+ * extent io tree. Thus we don't want to submit such wild eb
+ * if the fs already has error.
+ */
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ ret = flush_write_bio(&epd);
+ } else {
+ ret = -EUCLEAN;
+ end_write_bio(&epd, ret);
+ }
return ret;
}
@@ -4190,7 +4283,6 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
int ret;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(page->mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4212,14 +4304,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4263,7 +4353,6 @@ int extent_writepages(struct address_space *mapping,
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4285,7 +4374,6 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
unsigned long bio_flags = 0;
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
@@ -4312,7 +4400,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
- contiguous_readpages(tree, pagepool, nr, contig_start,
+ contiguous_readpages(pagepool, nr, contig_start,
contig_end, &em_cached, &bio, &bio_flags,
&prev_em_start);
}
@@ -4796,7 +4884,6 @@ out_free_ulist:
static void __free_extent_buffer(struct extent_buffer *eb)
{
- btrfs_leak_debug_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
@@ -4862,6 +4949,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
__free_extent_buffer(eb);
}
@@ -4883,7 +4971,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
- btrfs_leak_debug_add(&eb->leak_list, &buffers);
+ btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
+ &fs_info->allocated_ebs);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
@@ -5230,6 +5319,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
}
static int release_extent_buffer(struct extent_buffer *eb)
+ __releases(&eb->refs_lock)
{
lockdep_assert_held(&eb->refs_lock);
@@ -5248,6 +5338,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5405,7 +5496,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
- struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -5453,7 +5543,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(tree, page,
+ err = __extent_read_full_page(page,
btree_get_extent, &bio,
mirror_num, &bio_flags,
REQ_META);