about | summary | refs | log | tree | commit | diff | stats
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- fs/btrfs/disk-io.c 226
1 files changed, 150 insertions, 76 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 21f34ad0d411..07b5e6f7df67 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,10 +31,10 @@
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
+#include <linux/crc32c.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
@@ -110,7 +110,7 @@ int __init btrfs_end_io_wq_init(void)
return 0;
}
-void btrfs_end_io_wq_exit(void)
+void __cold btrfs_end_io_wq_exit(void)
{
kmem_cache_destroy(btrfs_end_io_wq_cache);
}
@@ -124,8 +124,8 @@ struct async_submit_bio {
void *private_data;
struct btrfs_fs_info *fs_info;
struct bio *bio;
- extent_submit_bio_hook_t *submit_bio_start;
- extent_submit_bio_hook_t *submit_bio_done;
+ extent_submit_bio_start_t *submit_bio_start;
+ extent_submit_bio_done_t *submit_bio_done;
int mirror_num;
unsigned long bio_flags;
/*
@@ -270,7 +270,7 @@ out:
u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
{
- return btrfs_crc32c(seed, data, len);
+ return crc32c(seed, data, len);
}
void btrfs_csum_final(u32 crc, u8 *result)
@@ -403,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
u32 crc = ~(u32)0;
- const int csum_size = sizeof(crc);
- char result[csum_size];
+ char result[sizeof(crc)];
/*
* The super_block structure does not span the whole
@@ -415,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
- if (memcmp(raw_disk_sb, result, csum_size))
+ if (memcmp(raw_disk_sb, result, sizeof(result)))
ret = 1;
}
@@ -428,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
return ret;
}
+/*
+ * Check that a tree block that was just read matches what the caller expects,
+ * by comparing its header level and, optionally, the key in its first slot.
+ *
+ * @fs_info:   filesystem the block belongs to, used for error reporting
+ * @eb:        extent buffer holding the tree block to verify
+ * @level:     expected level, always compared against the header level
+ * @first_key: expected key of slot 0, the key check is skipped if NULL
+ *
+ * A mismatch is always reported through btrfs_err() so that the failure shows
+ * up in the log on every build; CONFIG_BTRFS_DEBUG builds additionally
+ * trigger a WARN_ON().
+ *
+ * Return 0 if everything matches, -EIO on a level mismatch, or the non-zero
+ * result of btrfs_comp_cpu_keys() on a first key mismatch.
+ */
+static int verify_level_key(struct btrfs_fs_info *fs_info,
+			    struct extent_buffer *eb, int level,
+			    struct btrfs_key *first_key)
+{
+	int found_level;
+	struct btrfs_key found_key;
+	int ret;
+
+	found_level = btrfs_header_level(eb);
+	if (found_level != level) {
+		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+		btrfs_err(fs_info,
+"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
+			  eb->start, level, found_level);
+		return -EIO;
+	}
+
+	if (!first_key)
+		return 0;
+
+	/* Level 0 blocks are read with the item accessor, others as nodes. */
+	if (found_level)
+		btrfs_node_key_to_cpu(eb, &found_key, 0);
+	else
+		btrfs_item_key_to_cpu(eb, &found_key, 0);
+	ret = btrfs_comp_cpu_keys(first_key, &found_key);
+	if (ret) {
+		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+		btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
+			  eb->start, first_key->objectid, first_key->type,
+			  first_key->offset, found_key.objectid,
+			  found_key.type, found_key.offset);
+	}
+	return ret;
+}
+
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
+ *
+ * @parent_transid: expected transid, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key of first slot, skip check if NULL
*/
static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
- u64 parent_transid)
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct extent_io_tree *io_tree;
int failed = 0;
@@ -449,11 +494,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
mirror_num);
if (!ret) {
- if (!verify_parent_transid(io_tree, eb,
+ if (verify_parent_transid(io_tree, eb,
parent_transid, 0))
- break;
- else
ret = -EIO;
+ else if (verify_level_key(fs_info, eb, level,
+ first_key))
+ ret = -EUCLEAN;
+ else
+ break;
}
/*
@@ -461,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
* there is no reason to read the other copies, they won't be
* any less wrong.
*/
- if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) ||
+ ret == -EUCLEAN)
break;
num_copies = btrfs_num_copies(fs_info,
@@ -602,12 +651,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
- if (found_level > 0 && btrfs_check_node(root, eb))
+ if (found_level > 0 && btrfs_check_node(fs_info, eb))
ret = -EIO;
if (!ret)
@@ -710,14 +759,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
return 0;
}
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
-{
- unsigned long limit = min_t(unsigned long,
- info->thread_pool_size,
- info->fs_devices->open_devices);
- return 256 * limit;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -725,7 +766,6 @@ static void run_one_async_start(struct btrfs_work *work)
async = container_of(work, struct async_submit_bio, work);
ret = async->submit_bio_start(async->private_data, async->bio,
- async->mirror_num, async->bio_flags,
async->bio_offset);
if (ret)
async->status = ret;
@@ -744,8 +784,7 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
- async->bio_flags, async->bio_offset);
+ async->submit_bio_done(async->private_data, async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -759,8 +798,8 @@ static void run_one_async_free(struct btrfs_work *work)
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset, void *private_data,
- extent_submit_bio_hook_t *submit_bio_start,
- extent_submit_bio_hook_t *submit_bio_done)
+ extent_submit_bio_start_t *submit_bio_start,
+ extent_submit_bio_done_t *submit_bio_done)
{
struct async_submit_bio *async;
@@ -807,8 +846,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
+static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
/*
@@ -818,9 +856,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio
return btree_csum_one_bio(bio);
}
-static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num)
{
struct inode *inode = private_data;
blk_status_t ret;
@@ -879,8 +916,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
*/
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
bio_offset, private_data,
- __btree_submit_bio_start,
- __btree_submit_bio_done);
+ btree_submit_bio_start,
+ btree_submit_bio_done);
}
if (ret)
@@ -1062,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
buf->start, buf->start + buf->len - 1);
}
+/*
+ * Read tree block at logical address @bytenr and perform various basic but
+ * critical verifications.
+ *
+ * @parent_transid: expected transid of this tree block, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key in slot 0, skip check if NULL
+ */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid)
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
@@ -1072,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+ ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ level, first_key);
if (ret) {
free_extent_buffer(buf);
return ERR_PTR(ret);
@@ -1108,7 +1155,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
@@ -1160,6 +1207,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
spin_lock_init(&root->accounting_lock);
spin_lock_init(&root->log_extents_lock[0]);
spin_lock_init(&root->log_extents_lock[1]);
+ spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
@@ -1176,7 +1224,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->orphan_inodes, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
- atomic64_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1401,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_path *path;
u64 generation;
int ret;
+ int level;
path = btrfs_alloc_path();
if (!path)
@@ -1423,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
+ level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
- generation);
+ generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
goto find_fail;
@@ -1808,12 +1857,10 @@ sleep:
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
&fs_info->fs_state)))
btrfs_cleanup_transaction(fs_info);
- set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
cannot_commit))
- schedule_timeout(delay);
- __set_current_state(TASK_RUNNING);
+ schedule_timeout_interruptible(delay);
} while (!kthread_should_stop());
return 0;
}
@@ -2183,7 +2230,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
- int max_active = fs_info->thread_pool_size;
+ u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
@@ -2276,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
+ int level = btrfs_super_log_root_level(disk_super);
if (fs_devices->rw_devices == 0) {
btrfs_warn(fs_info, "log replay required on RO media");
@@ -2289,7 +2337,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(fs_info, bytenr,
- fs_info->generation + 1);
+ fs_info->generation + 1,
+ level, NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2334,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.offset = 0;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->extent_root = root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
btrfs_init_devices_late(fs_info);
location.objectid = BTRFS_CSUM_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->csum_root = root;
@@ -2367,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (IS_ERR(root)) {
ret = PTR_ERR(root);
if (ret != -ENOENT)
- return ret;
+ goto out;
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->uuid_root = root;
@@ -2376,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->free_space_root = root;
}
return 0;
+out:
+ btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
+ location.objectid, ret);
+ return ret;
}
int open_ctree(struct super_block *sb,
@@ -2404,8 +2465,8 @@ int open_ctree(struct super_block *sb,
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
- int max_active;
int clear_free_space_tree = 0;
+ int level;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
@@ -2447,6 +2508,8 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
+ INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
+ spin_lock_init(&fs_info->pending_raid_kobjs_lock);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
@@ -2713,8 +2776,6 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- max_active = fs_info->thread_pool_size;
-
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -2741,12 +2802,13 @@ int open_ctree(struct super_block *sb,
}
generation = btrfs_super_chunk_root_generation(disk_super);
+ level = btrfs_super_chunk_root_level(disk_super);
__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
- generation);
+ generation, level, NULL);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
btrfs_err(fs_info, "failed to read chunk root");
@@ -2768,10 +2830,10 @@ int open_ctree(struct super_block *sb,
}
/*
- * keep the device that is marked to be the target device for the
- * dev_replace procedure
+ * Keep the devid that is marked to be the target device for the
+ * device replace procedure
*/
- btrfs_close_extra_devices(fs_devices, 0);
+ btrfs_free_extra_devids(fs_devices, 0);
if (!fs_devices->latest_bdev) {
btrfs_err(fs_info, "failed to read devices");
@@ -2780,10 +2842,11 @@ int open_ctree(struct super_block *sb,
retry_root_backup:
generation = btrfs_super_generation(disk_super);
+ level = btrfs_super_root_level(disk_super);
tree_root->node = read_tree_block(fs_info,
btrfs_super_root(disk_super),
- generation);
+ generation, level, NULL);
if (IS_ERR(tree_root->node) ||
!extent_buffer_uptodate(tree_root->node)) {
btrfs_warn(fs_info, "failed to read tree root");
@@ -2834,7 +2897,7 @@ retry_root_backup:
goto fail_block_groups;
}
- btrfs_close_extra_devices(fs_devices, 1);
+ btrfs_free_extra_devids(fs_devices, 1);
ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
if (ret) {
@@ -2953,6 +3016,7 @@ retry_root_backup:
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
+ btrfs_warn(fs_info, "failed to read fs tree: %d", err);
goto fail_qgroup;
}
@@ -3290,6 +3354,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
struct buffer_head *bh;
int i;
int errors = 0;
+ bool primary_failed = false;
u64 bytenr;
if (max_mirrors == 0)
@@ -3306,11 +3371,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
errors++;
+ if (i == 0)
+ primary_failed = true;
continue;
}
wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
+ if (!buffer_uptodate(bh)) {
errors++;
+ if (i == 0)
+ primary_failed = true;
+ }
/* drop our reference */
brelse(bh);
@@ -3319,6 +3389,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
brelse(bh);
}
+ /* log error, force error return */
+ if (primary_failed) {
+ btrfs_err(device->fs_info, "error writing primary super block to device %llu",
+ device->devid);
+ return -1;
+ }
+
return errors < i ? 0 : -1;
}
@@ -3851,7 +3928,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
* So here we should only check item pointers, not item data.
*/
if (btrfs_header_level(buf) == 0 &&
- btrfs_check_leaf_relaxed(root, buf)) {
+ btrfs_check_leaf_relaxed(fs_info, buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
@@ -3890,12 +3967,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+ return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ level, first_key);
}
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
@@ -4314,11 +4393,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
dirty_list);
- if (!cache) {
- btrfs_err(fs_info, "orphan block group dirty_bgs list");
- spin_unlock(&cur_trans->dirty_bgs_lock);
- return;
- }
if (!list_empty(&cache->io_list)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4338,14 +4412,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
spin_unlock(&cur_trans->dirty_bgs_lock);
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
while (!list_empty(&cur_trans->io_bgs)) {
cache = list_first_entry(&cur_trans->io_bgs,
struct btrfs_block_group_cache,
io_list);
- if (!cache) {
- btrfs_err(fs_info, "orphan block group on io_bgs list");
- return;
- }
list_del_init(&cache->io_list);
spin_lock(&cache->lock);