about | summary | refs | log | tree | commit | diff | stats
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2017-05-10 08:33:17 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2017-05-10 08:33:17 -0700
commit 1176032cb12bb89ad558a3e57e82f2f25b817eff (patch)
tree bebe6ba9058795fa0e92344b1fb07355e018a3fc /fs/btrfs/volumes.c
parent Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/ide (diff)
parent btrfs: fix the gfp_mask for the reada_zones radix tree (diff)
download linux-dev-1176032cb12bb89ad558a3e57e82f2f25b817eff.tar.xz
download linux-dev-1176032cb12bb89ad558a3e57e82f2f25b817eff.zip
Merge branch 'for-linus-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "This has fixes and cleanups Dave Sterba collected for the merge
  window. The biggest functional fixes are between btrfs raid5/6 and
  scrub, and raid5/6 and device replacement. Some of our pending qgroup
  fixes are included as well while I bash on the rest in testing. We
  also have the usual set of cleanups, including one that makes
  __btrfs_map_block() much more maintainable, and conversions from
  atomic_t to refcount_t"

* 'for-linus-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (71 commits)
  btrfs: fix the gfp_mask for the reada_zones radix tree
  Btrfs: fix reported number of inode blocks
  Btrfs: send, fix file hole not being preserved due to inline extent
  Btrfs: fix extent map leak during fallocate error path
  Btrfs: fix incorrect space accounting after failure to insert inline extent
  Btrfs: fix invalid attempt to free reserved space on failure to cow range
  btrfs: Handle delalloc error correctly to avoid ordered extent hang
  btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
  btrfs: check if the device is flush capable
  btrfs: delete unused member nobarriers
  btrfs: scrub: Fix RAID56 recovery race condition
  btrfs: scrub: Introduce full stripe lock for RAID56
  btrfs: Use ktime_get_real_ts for root ctime
  Btrfs: handle only applicable errors returned by btrfs_get_extent
  btrfs: qgroup: Fix qgroup corruption caused by inode_cache mount option
  btrfs: use q which is already obtained from bdev_get_queue
  Btrfs: switch to div64_u64 if with a u64 divisor
  Btrfs: update scrub_parity to use u64 stripe_len
  Btrfs: enable repair during read for raid56 profile
  btrfs: use clear_page where appropriate
  ...
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- fs/btrfs/volumes.c 854
1 files changed, 451 insertions, 403 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ab8a66d852f9..017b67daa3bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+ enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret,
+ int mirror_num, int need_raid_map);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -1008,14 +1013,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
+ if (!blk_queue_nonrot(q))
+ fs_devices->rotating = 1;
device->bdev = bdev;
device->in_fs_metadata = 0;
device->mode = flags;
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
- fs_devices->rotating = 1;
-
fs_devices->open_devices++;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -2417,7 +2421,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->free_chunk_space += device->total_bytes;
spin_unlock(&fs_info->free_chunk_lock);
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
tmp = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2795,10 +2799,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
return ret;
}
+static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ logical, length);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info,
+ "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
+ logical, length, em->start, em->start + em->len);
+ free_extent_map(em);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* callers are responsible for dropping em's ref. */
+ return em;
+}
+
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
u64 dev_extent_len = 0;
@@ -2806,23 +2838,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em_tree = &fs_info->mapping_tree.map_tree;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset) {
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- if (em)
- free_extent_map(em);
- return -EINVAL;
+ return PTR_ERR(em);
}
map = em->map_lookup;
mutex_lock(&fs_info->chunk_mutex);
@@ -3736,7 +3760,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
if (ret)
btrfs_handle_fs_error(fs_info, ret, NULL);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
/* Non-zero return value signifies invalidity */
@@ -3755,6 +3779,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
int ret;
@@ -3851,11 +3876,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
- btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ /* if we're not converting, the target field is uninitialized */
+ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+ data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->data.target : fs_info->avail_data_alloc_bits;
+ if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
btrfs_warn(fs_info,
"metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
- bctl->meta.target, bctl->data.target);
+ meta_target, data_target);
}
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
@@ -3910,7 +3940,7 @@ out:
__cancel_balance(fs_info);
else {
kfree(bctl);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
return ret;
}
@@ -4000,7 +4030,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+ WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -4785,7 +4815,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = div_u64(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- stripe_size = div_u64(stripe_size, raid_stripe_len);
+ stripe_size = div64_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -4833,7 +4863,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ret = add_extent_mapping(em_tree, em, 0);
if (!ret) {
list_add_tail(&em->list, &trans->transaction->pending_chunks);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
}
write_unlock(&em_tree->lock);
if (ret) {
@@ -4888,7 +4918,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
@@ -4897,24 +4926,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
- em_tree = &fs_info->mapping_tree.map_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %Lu len %Lu",
- chunk_offset, chunk_size);
- return -EINVAL;
- }
-
- if (em->start != chunk_offset || em->len != chunk_size) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu",
- chunk_offset, chunk_size, em->start, em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
@@ -5055,15 +5069,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int readonly = 0;
int miss_ndevs = 0;
int i;
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- read_unlock(&map_tree->map_tree.lock);
- if (!em)
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em))
return 1;
map = em->map_lookup;
@@ -5117,34 +5128,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
-
- /*
- * We could return errors for these cases, but that could get ugly and
- * we'd probably do the same thing which is just not do anything else
- * and exit, so return 1 so the callers don't try to use other copies.
- */
- if (!em) {
- btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
- logical+len);
- return 1;
- }
-
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu",
- logical, logical+len, em->start,
- em->start + em->len);
- free_extent_map(em);
+ em = get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(em))
+ /*
+ * We could return errors for these cases, but that could get
+ * ugly and we'd probably do the same thing which is just not do
+ * anything else and exit, so return 1 so the callers don't try
+ * to use other copies.
+ */
return 1;
- }
map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
@@ -5160,7 +5156,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
free_extent_map(em);
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
+ fs_info->dev_replace.tgtdev)
ret++;
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -5173,15 +5170,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
unsigned long len = fs_info->sectorsize;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
@@ -5189,20 +5182,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len, int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
@@ -5295,25 +5284,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
- atomic_set(&bbio->refs, 1);
+ refcount_set(&bbio->refs, 1);
return bbio;
}
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
- WARN_ON(!atomic_read(&bbio->refs));
- atomic_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bbio->refs));
+ refcount_inc(&bbio->refs);
}
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
if (!bbio)
return;
- if (atomic_dec_and_test(&bbio->refs))
+ if (refcount_dec_and_test(&bbio->refs))
kfree(bbio);
}
+/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
+/*
+ * Please note that, discard won't be sent to target device of device
+ * replace.
+ */
+static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ struct btrfs_bio **bbio_ret)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_bio *bbio;
+ u64 offset;
+ u64 stripe_nr;
+ u64 stripe_nr_end;
+ u64 stripe_end_offset;
+ u64 stripe_cnt;
+ u64 stripe_len;
+ u64 stripe_offset;
+ u64 num_stripes;
+ u32 stripe_index;
+ u32 factor = 0;
+ u32 sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+ u32 last_stripe = 0;
+ int ret = 0;
+ int i;
+
+ /* discard always return a bbio */
+ ASSERT(bbio_ret);
+
+ em = get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* we don't discard raid56 yet */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ offset = logical - em->start;
+ length = min_t(u64, em->len - offset, length);
+
+ stripe_len = map->stripe_len;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ stripe_nr = div64_u64(offset, stripe_len);
+
+ /* stripe_offset is the offset of this block in its stripe */
+ stripe_offset = offset - stripe_nr * stripe_len;
+
+ stripe_nr_end = round_up(offset + length, map->stripe_len);
+ stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+ stripe_cnt = stripe_nr_end - stripe_nr;
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + length);
+ /*
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
+ */
+ num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ num_stripes = min_t(u64, map->num_stripes,
+ sub_stripes * stripe_cnt);
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index *= sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_cnt, factor,
+ &remaining_stripes);
+ div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+ last_stripe *= sub_stripes;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ num_stripes = map->num_stripes;
+ } else {
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ }
+
+ bbio = alloc_btrfs_bio(num_stripes, 0);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+
+ /*
+ * Special for the first stripe and
+ * the last stripe:
+ *
+ * |-------|...|-------|
+ * |----------|
+ * off end_off
+ */
+ if (i < sub_stripes)
+ bbio->stripes[i].length -=
+ stripe_offset;
+
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ sub_stripes - 1))
+ bbio->stripes[i].length -=
+ stripe_end_offset;
+
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
+ } else {
+ bbio->stripes[i].length = length;
+ }
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+
+ *bbio_ret = bbio;
+ bbio->map_type = map->type;
+ bbio->num_stripes = num_stripes;
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * In dev-replace case, for repair case (that's the only case where the mirror
+ * is selected explicitly when calling btrfs_map_block), blocks left of the
+ * left cursor can also be read from the target drive.
+ *
+ * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
+ * array of stripes.
+ * For READ, it also needs to be supported using the same mirror number.
+ *
+ * If the requested block is not left of the left cursor, EIO is returned. This
+ * can happen because btrfs_num_copies() returns one more in the dev-replace
+ * case.
+ */
+static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ u64 srcdev_devid, int *mirror_num,
+ u64 *physical)
+{
+ struct btrfs_bio *bbio = NULL;
+ int num_stripes;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+ int i;
+ int ret = 0;
+
+ ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ logical, &length, &bbio, 0, 0);
+ if (ret) {
+ ASSERT(bbio == NULL);
+ return ret;
+ }
+
+ num_stripes = bbio->num_stripes;
+ if (*mirror_num > num_stripes) {
+ /*
+ * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
+ * that means that the requested area is not left of the left
+ * cursor
+ */
+ btrfs_put_bbio(bbio);
+ return -EIO;
+ }
+
+ /*
+ * process the rest of the function using the mirror_num of the source
+ * drive. Therefore look it up first. At the end, patch the device
+ * pointer to the one of the target drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid != srcdev_devid)
+ continue;
+
+ /*
+ * In case of DUP, in order to keep it simple, only add the
+ * mirror with the lowest physical address
+ */
+ if (found &&
+ physical_of_found <= bbio->stripes[i].physical)
+ continue;
+
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+
+ btrfs_put_bbio(bbio);
+
+ ASSERT(found);
+ if (!found)
+ return -EIO;
+
+ *mirror_num = index_srcdev + 1;
+ *physical = physical_of_found;
+ return ret;
+}
+
+static void handle_ops_on_dev_replace(enum btrfs_map_op op,
+ struct btrfs_bio **bbio_ret,
+ struct btrfs_dev_replace *dev_replace,
+ int *num_stripes_ret, int *max_errors_ret)
+{
+ struct btrfs_bio *bbio = *bbio_ret;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int tgtdev_indexes = 0;
+ int num_stripes = *num_stripes_ret;
+ int max_errors = *max_errors_ret;
+ int i;
+
+ if (op == BTRFS_MAP_WRITE) {
+ int index_where_to_add;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk to
+ * the new disk takes place at run time while the filesystem is
+ * mounted writable, the regular write operations to the old
+ * disk have to be duplicated to go to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that
+ * the write to the old disk is already set up in the stripes
+ * array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_bio_stripe *new =
+ bbio->stripes + index_where_to_add;
+ struct btrfs_bio_stripe *old =
+ bbio->stripes + i;
+
+ new->physical = old->physical;
+ new->length = old->length;
+ new->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[i] = index_where_to_add;
+ index_where_to_add++;
+ max_errors++;
+ tgtdev_indexes++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can also
+ * be used to read data in case it is needed to repair a corrupt
+ * block elsewhere. This is possible if the requested area is
+ * left of the left cursor. In this area, the target drive is a
+ * full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it simple,
+ * only add the mirror with the lowest physical
+ * address
+ */
+ if (found &&
+ physical_of_found <=
+ bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+ }
+ if (found) {
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
+
+ tgtdev_indexes++;
+ num_stripes++;
+ }
+ }
+
+ *num_stripes_ret = num_stripes;
+ *max_errors_ret = max_errors;
+ bbio->num_tgtdevs = tgtdev_indexes;
+ *bbio_ret = bbio;
+}
+
+static bool need_full_stripe(enum btrfs_map_op op)
+{
+ return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
@@ -5322,14 +5639,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
- u64 stripe_end_offset;
u64 stripe_nr;
- u64 stripe_nr_orig;
- u64 stripe_nr_end;
u64 stripe_len;
u32 stripe_index;
int i;
@@ -5345,23 +5657,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, *length);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu len %llu",
- logical, *length);
- return -EINVAL;
- }
+ if (op == BTRFS_MAP_DISCARD)
+ return __btrfs_map_block_for_discard(fs_info, logical,
+ *length, bbio_ret);
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu, found %Lu-%Lu",
- logical, em->start, em->start + em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
offset = logical - em->start;
@@ -5400,14 +5702,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
raid56_full_stripe_start *= full_stripe_len;
}
- if (op == BTRFS_MAP_DISCARD) {
- /* we don't discard raid56 yet */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- *length = min_t(u64, em->len - offset, *length);
- } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
u64 max_len;
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
@@ -5438,105 +5733,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
btrfs_dev_replace_set_lock_blocking(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
- /*
- * in dev-replace case, for repair case (that's the only
- * case where the mirror is selected explicitly when
- * calling btrfs_map_block), blocks left of the left cursor
- * can also be read from the target drive.
- * For REQ_GET_READ_MIRRORS, the target drive is added as
- * the last one to the array of stripes. For READ, it also
- * needs to be supported using the same mirror number.
- * If the requested block is not left of the left cursor,
- * EIO is returned. This can happen because btrfs_num_copies()
- * returns one more in the dev-replace case.
- */
- u64 tmp_length = *length;
- struct btrfs_bio *tmp_bbio = NULL;
- int tmp_num_stripes;
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0, 0);
- if (ret) {
- WARN_ON(tmp_bbio != NULL);
- goto out;
- }
-
- tmp_num_stripes = tmp_bbio->num_stripes;
- if (mirror_num > tmp_num_stripes) {
- /*
- * BTRFS_MAP_GET_READ_MIRRORS does not contain this
- * mirror, that means that the requested area
- * is not left of the left cursor
- */
- ret = -EIO;
- btrfs_put_bbio(tmp_bbio);
- goto out;
- }
-
- /*
- * process the rest of the function using the mirror_num
- * of the source drive. Therefore look it up first.
- * At the end, patch the device pointer to the one of the
- * target drive.
- */
- for (i = 0; i < tmp_num_stripes; i++) {
- if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
- continue;
-
- /*
- * In case of DUP, in order to keep it simple, only add
- * the mirror with the lowest physical address
- */
- if (found &&
- physical_of_found <= tmp_bbio->stripes[i].physical)
- continue;
-
- index_srcdev = i;
- found = 1;
- physical_of_found = tmp_bbio->stripes[i].physical;
- }
-
- btrfs_put_bbio(tmp_bbio);
-
- if (!found) {
- WARN_ON(1);
- ret = -EIO;
+ !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
+ ret = get_extra_mirror_from_replace(fs_info, logical, *length,
+ dev_replace->srcdev->devid,
+ &mirror_num,
+ &physical_to_patch_in_first_stripe);
+ if (ret)
goto out;
- }
-
- mirror_num = index_srcdev + 1;
- patch_the_first_stripe_for_dev_replace = 1;
- physical_to_patch_in_first_stripe = physical_of_found;
+ else
+ patch_the_first_stripe_for_dev_replace = 1;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
num_stripes = 1;
stripe_index = 0;
- stripe_nr_orig = stripe_nr;
- stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
- stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
- stripe_end_offset = stripe_nr_end * map->stripe_len -
- (offset + *length);
-
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->num_stripes,
- stripe_nr_end - stripe_nr_orig);
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS)
+ if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -5549,8 +5767,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS) {
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5566,10 +5783,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->sub_stripes;
- else if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->sub_stripes *
- (stripe_nr_end - stripe_nr_orig),
- map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
@@ -5587,7 +5800,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
(op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
- stripe_nr = div_u64(raid56_full_stripe_start,
+ stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
@@ -5612,8 +5825,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1)
+ if ((op != BTRFS_MAP_WRITE &&
+ op != BTRFS_MAP_GET_READ_MIRRORS) &&
+ mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -5635,8 +5849,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
num_alloc_stripes = num_stripes;
- if (dev_replace_is_ongoing) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
+ if (op == BTRFS_MAP_WRITE)
num_alloc_stripes <<= 1;
if (op == BTRFS_MAP_GET_READ_MIRRORS)
num_alloc_stripes++;
@@ -5648,14 +5862,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
/* build raid_map */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- need_raid_map &&
- ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) ||
- mirror_num > 1)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ (need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
unsigned rot;
@@ -5679,173 +5891,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
RAID6_Q_STRIPE;
}
- if (op == BTRFS_MAP_DISCARD) {
- u32 factor = 0;
- u32 sub_stripes = 0;
- u64 stripes_per_dev = 0;
- u32 remaining_stripes = 0;
- u32 last_stripe = 0;
- if (map->type &
- (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- sub_stripes = 1;
- else
- sub_stripes = map->sub_stripes;
-
- factor = map->num_stripes / sub_stripes;
- stripes_per_dev = div_u64_rem(stripe_nr_end -
- stripe_nr_orig,
- factor,
- &remaining_stripes);
- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
- last_stripe *= sub_stripes;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
- map->stripe_len;
-
- if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
-
- /*
- * Special for the first stripe and
- * the last stripe:
- *
- * |-------|...|-------|
- * |----------|
- * off end_off
- */
- if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
-
- if (stripe_index >= last_stripe &&
- stripe_index <= (last_stripe +
- sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
-
- if (i == sub_stripes - 1)
- stripe_offset = 0;
- } else
- bbio->stripes[i].length = *length;
-
- stripe_index++;
- if (stripe_index == map->num_stripes) {
- /* This could only happen for RAID0/10 */
- stripe_index = 0;
- stripe_nr++;
- }
- }
- } else {
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
- }
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
}
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
sort_parity_stripes(bbio, num_stripes);
- tgtdev_indexes = 0;
- if (dev_replace_is_ongoing &&
- (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) &&
- dev_replace->tgtdev != NULL) {
- int index_where_to_add;
- u64 srcdev_devid = dev_replace->srcdev->devid;
-
- /*
- * duplicate the write operations while the dev replace
- * procedure is running. Since the copying of the old disk
- * to the new disk takes place at run time while the
- * filesystem is mounted writable, the regular write
- * operations to the old disk have to be duplicated to go
- * to the new disk as well.
- * Note that device->missing is handled by the caller, and
- * that the write to the old disk is already set up in the
- * stripes array.
- */
- index_where_to_add = num_stripes;
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
-
- new->physical = old->physical;
- new->length = old->length;
- new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
- index_where_to_add++;
- max_errors++;
- tgtdev_indexes++;
- }
- }
- num_stripes = index_where_to_add;
- } else if (dev_replace_is_ongoing &&
- op == BTRFS_MAP_GET_READ_MIRRORS &&
- dev_replace->tgtdev != NULL) {
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- /*
- * During the dev-replace procedure, the target drive can
- * also be used to read data in case it is needed to repair
- * a corrupt block elsewhere. This is possible if the
- * requested area is left of the left cursor. In this area,
- * the target drive is a full copy of the source drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it
- * simple, only add the mirror with the
- * lowest physical address
- */
- if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found = bbio->stripes[i].physical;
- }
- }
- if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
-
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
-
- tgtdev_indexes++;
- num_stripes++;
- }
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ need_full_stripe(op)) {
+ handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
+ &max_errors);
}
*bbio_ret = bbio;
@@ -5853,7 +5919,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
- bbio->num_tgtdevs = tgtdev_indexes;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5886,19 +5951,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num,
- int need_raid_map)
+ struct btrfs_bio **bbio_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
- mirror_num, need_raid_map);
+ return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
@@ -5908,24 +5969,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 rmap_len;
int i, j, nr = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_start, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_err(fs_info, "couldn't find em for chunk %Lu",
- chunk_start);
+ em = get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
return -EIO;
- }
- if (em->start != chunk_start) {
- btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu",
- em->start, chunk_start);
- free_extent_map(em);
- return -EIO;
- }
map = em->map_lookup;
-
length = em->len;
rmap_len = map->stripe_len;
@@ -5949,7 +5997,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
continue;
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;