diff options
Diffstat (limited to 'drivers/md')
36 files changed, 623 insertions, 670 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 30ba3573626c..b7e2d9666614 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -463,6 +463,15 @@ config DM_MULTIPATH_HST If unsure, say N. +config DM_MULTIPATH_IOA + tristate "I/O Path Selector based on CPU submission" + depends on DM_MULTIPATH + help + This path selector selects the path based on the CPU the IO is + executed on and the CPU to path mapping setup at path addition time. + + If unsure, say N. + config DM_DELAY tristate "I/O delaying target" depends on BLK_DEV_DM @@ -530,11 +539,22 @@ config DM_VERITY_VERIFY_ROOTHASH_SIG bool "Verity data device root hash signature verification support" depends on DM_VERITY select SYSTEM_DATA_VERIFICATION - help + help Add ability for dm-verity device to be validated if the pre-generated tree of cryptographic checksums passed has a pkcs#7 signature file that can validate the roothash of the tree. + By default, rely on the builtin trusted keyring. + + If unsure, say N. + +config DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING + bool "Verity data device root hash signature verification with secondary keyring" + depends on DM_VERITY_VERIFY_ROOTHASH_SIG + depends on SECONDARY_TRUSTED_KEYRING + help + Rely on the secondary trusted keyring to verify dm-verity signatures. + If unsure, say N. config DM_VERITY_FEC diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 6d3e234dc46a..ef7ddc27685c 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -7,23 +7,28 @@ dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \ dm-rq.o dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-historical-service-time-y += dm-ps-historical-service-time.o +dm-io-affinity-y += dm-ps-io-affinity.o +dm-queue-length-y += dm-ps-queue-length.o +dm-round-robin-y += dm-ps-round-robin.o +dm-service-time-y += dm-ps-service-time.o dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o -dm-log-userspace-y \ - += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-log-userspace-y += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-background-tracker.o -dm-cache-smq-y += dm-cache-policy-smq.o +dm-cache-smq-y += dm-cache-policy-smq.o dm-ebs-y += dm-ebs-target.o dm-era-y += dm-era-target.o dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o +dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o + md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o -dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o linear-y += md-linear.o multipath-y += md-multipath.o faulty-y += md-faulty.o @@ -59,14 +64,15 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o +obj-$(CONFIG_DM_MULTIPATH_IOA) += dm-io-affinity.o obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o -obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ +obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o -obj-$(CONFIG_DM_RAID) += dm-raid.o -obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o +obj-$(CONFIG_DM_RAID) += dm-raid.o +obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 214326383145..85b1f2a9b72d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -475,7 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; - struct hd_struct *part; + struct block_device *part; unsigned long start_time; struct btree_op op; @@ -1073,7 +1073,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; - struct hd_struct *part; + struct block_device *part; }; static void detached_dev_end_io(struct bio *bio) @@ -1230,8 +1230,9 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, if (dc->io_disable) return -EIO; - - return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); + if (!dc->bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg); } void bch_cached_dev_request_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46a00134a36a..0e06d721cd8e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1114,9 +1114,6 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); - struct closure cl; - - closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count)); @@ -1130,12 +1127,6 @@ static void cached_dev_detach_finish(struct work_struct *w) dc->writeback_thread = NULL; } - memset(&dc->sb.set_uuid, 0, 16); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); - - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - mutex_lock(&bch_register_lock); calc_cached_dev_sectors(dc->disk.c); @@ -1408,7 +1399,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, dc->bdev, &bcache_cached_ops); if (ret) return ret; @@ -1447,8 +1438,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; err = "error creating kobject"; - if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, - "bcache")) + if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache")) goto err; if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; @@ -2342,9 +2332,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; } - if (kobject_add(&ca->kobj, - &part_to_dev(bdev->bd_part)->kobj, - "bcache")) { + if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) { err = "error calling kobject_add"; ret = -ENOMEM; goto out; @@ -2383,38 +2371,38 @@ kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); -static bool bch_is_open_backing(struct block_device *bdev) +static bool bch_is_open_backing(dev_t dev) { struct cache_set *c, *tc; struct cached_dev *dc, *t; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) list_for_each_entry_safe(dc, t, &c->cached_devs, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; list_for_each_entry_safe(dc, t, &uncached_devices, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; return false; } -static bool bch_is_open_cache(struct block_device *bdev) +static bool bch_is_open_cache(dev_t dev) { struct cache_set *c, *tc; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { struct cache *ca = c->cache; - if (ca->bdev == bdev) + if (ca->bdev->bd_dev == dev) return true; } return false; } -static bool bch_is_open(struct block_device *bdev) +static bool bch_is_open(dev_t dev) { - return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); + return bch_is_open_cache(dev) || bch_is_open_backing(dev); } struct async_reg_args { @@ -2538,9 +2526,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, sb); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { - bdev = lookup_bdev(strim(path)); + dev_t dev; + mutex_lock(&bch_register_lock); - if (!IS_ERR(bdev) && bch_is_open(bdev)) + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) err = "device already registered"; else err = "device busy"; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3c74996978da..a129e4d2707c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -705,6 +705,15 @@ static int bch_writeback_thread(void *arg) * bch_cached_dev_detach(). */ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + struct closure cl; + + closure_init_stack(&cl); + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + up_write(&dc->writeback_lock); break; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9644424591da..541c45027cc8 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -712,10 +712,6 @@ static bool block_size_is_power_of_two(struct cache *cache) return cache->sectors_per_block_shift >= 0; } -/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ -#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 -__always_inline -#endif static dm_block_t block_div(dm_block_t b, uint32_t n) { do_div(b, n); @@ -2844,7 +2840,6 @@ static void cache_postsuspend(struct dm_target *ti) static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, bool dirty, uint32_t hint, bool hint_valid) { - int r; struct cache *cache = context; if (dirty) { @@ -2853,11 +2848,7 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, } else clear_bit(from_cblock(cblock), cache->dirty_bitset); - r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); - if (r) - return r; - - return 0; + return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); } /* diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index d522093cb39d..086d293c2b03 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -96,19 +96,12 @@ struct mapped_device { */ struct workqueue_struct *wq; - /* - * freeze/thaw support require holding onto a super block - */ - struct super_block *frozen_sb; - /* forced geometry settings */ struct hd_geometry geometry; /* kobject and completion */ struct dm_kobject_holder kobj_holder; - struct block_device *bdev; - struct dm_stats stats; /* for blk-mq request-based DM support */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 392337f16ecf..5f9f9b3a226d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1090,16 +1090,16 @@ static const struct crypt_iv_operations crypt_iv_tcw_ops = { .post = crypt_iv_tcw_post }; -static struct crypt_iv_operations crypt_iv_random_ops = { +static const struct crypt_iv_operations crypt_iv_random_ops = { .generator = crypt_iv_random_gen }; -static struct crypt_iv_operations crypt_iv_eboiv_ops = { +static const struct crypt_iv_operations crypt_iv_eboiv_ops = { .ctr = crypt_iv_eboiv_ctr, .generator = crypt_iv_eboiv_gen }; -static struct crypt_iv_operations crypt_iv_elephant_ops = { +static const struct crypt_iv_operations crypt_iv_elephant_ops = { .ctr = crypt_iv_elephant_ctr, .dtr = crypt_iv_elephant_dtr, .init = crypt_iv_elephant_init, @@ -3166,11 +3166,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + cc->crypt_queue = alloc_workqueue("kcryptd-%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1, devname); else - cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + cc->crypt_queue = alloc_workqueue("kcryptd-%s", + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | + WQ_UNBOUND | WQ_SYSFS, num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index cb85610527c2..55bcfb74f51f 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -86,7 +86,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bv else ba = dm_bufio_new(ec->bufio, block, &b); - if (unlikely(IS_ERR(ba))) { + if (IS_ERR(ba)) { /* * Carry on with next buffer, if any, to issue all possible * data but return error. diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3fc3757def55..5a7a1b90e671 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3462,7 +3462,7 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, int r; if (a->alg_string) { - *hash = crypto_alloc_shash(a->alg_string, 0, 0); + *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); if (IS_ERR(*hash)) { *error = error_alg; r = PTR_ERR(*hash); @@ -3519,7 +3519,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) struct journal_completion comp; comp.ic = ic; - ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0); + ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); if (IS_ERR(ic->journal_crypt)) { *error = "Invalid journal cipher"; r = PTR_ERR(ic->journal_crypt); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index cd0478d44058..5e306bba4375 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1600,6 +1600,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para if (!argc) { DMWARN("Empty message received."); + r = -EINVAL; goto out_argv; } diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c index 186f91e2752c..186f91e2752c 100644 --- a/drivers/md/dm-historical-service-time.c +++ b/drivers/md/dm-ps-historical-service-time.c diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c new file mode 100644 index 000000000000..077655cd4fae --- /dev/null +++ b/drivers/md/dm-ps-io-affinity.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Oracle Corporation + * + * Module Author: Mike Christie + */ +#include "dm-path-selector.h" + +#include <linux/device-mapper.h> +#include <linux/module.h> + +#define DM_MSG_PREFIX "multipath io-affinity" + +struct path_info { + struct dm_path *path; + cpumask_var_t cpumask; + refcount_t refcount; + bool failed; +}; + +struct selector { + struct path_info **path_map; + cpumask_var_t path_mask; + atomic_t map_misses; +}; + +static void ioa_free_path(struct selector *s, unsigned int cpu) +{ + struct path_info *pi = s->path_map[cpu]; + + if (!pi) + return; + + if (refcount_dec_and_test(&pi->refcount)) { + cpumask_clear_cpu(cpu, s->path_mask); + free_cpumask_var(pi->cpumask); + kfree(pi); + + s->path_map[cpu] = NULL; + } +} + +static int ioa_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL; + unsigned int cpu; + int ret; + + if (argc != 1) { + *error = "io-affinity ps: invalid number of arguments"; + return -EINVAL; + } + + pi = kzalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "io-affinity ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + path->pscontext = pi; + refcount_set(&pi->refcount, 1); + + if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) { + *error = "io-affinity ps: Error allocating cpumask context"; + ret = -ENOMEM; + goto free_pi; + } + + ret = cpumask_parse(argv[0], pi->cpumask); + if (ret) { + *error = "io-affinity ps: invalid cpumask"; + ret = -EINVAL; + goto free_mask; + } + + for_each_cpu(cpu, pi->cpumask) { + if (cpu >= nr_cpu_ids) { + DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u", + cpu, nr_cpu_ids); + break; + } + + if (s->path_map[cpu]) { + DMWARN("CPU mapping for %u exists. Ignoring.", cpu); + continue; + } + + cpumask_set_cpu(cpu, s->path_mask); + s->path_map[cpu] = pi; + refcount_inc(&pi->refcount); + continue; + } + + if (refcount_dec_and_test(&pi->refcount)) { + *error = "io-affinity ps: No new/valid CPU mapping found"; + ret = -EINVAL; + goto free_mask; + } + + return 0; + +free_mask: + free_cpumask_var(pi->cpumask); +free_pi: + kfree(pi); + return ret; +} + +static int ioa_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *), + GFP_KERNEL); + if (!s->path_map) + goto free_selector; + + if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL)) + goto free_map; + + atomic_set(&s->map_misses, 0); + ps->context = s; + return 0; + +free_map: + kfree(s->path_map); +free_selector: + kfree(s); + return -ENOMEM; +} + +static void ioa_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + unsigned cpu; + + for_each_cpu(cpu, s->path_mask) + ioa_free_path(s, cpu); + + free_cpumask_var(s->path_mask); + kfree(s->path_map); + kfree(s); + + ps->context = NULL; +} + +static int ioa_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct selector *s = ps->context; + struct path_info *pi; + int sz = 0; + + if (!path) { + DMEMIT("0 "); + return sz; + } + + switch(type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&s->map_misses)); + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask)); + break; + } + + return sz; +} + +static void ioa_fail_path(struct path_selector *ps, struct dm_path *p) +{ + struct path_info *pi = p->pscontext; + + pi->failed = true; +} + +static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p) +{ + struct path_info *pi = p->pscontext; + + pi->failed = false; + return 0; +} + +static struct dm_path *ioa_select_path(struct path_selector *ps, + size_t nr_bytes) +{ + unsigned int cpu, node; + struct selector *s = ps->context; + const struct cpumask *cpumask; + struct path_info *pi; + int i; + + cpu = get_cpu(); + + pi = s->path_map[cpu]; + if (pi && !pi->failed) + goto done; + + /* + * Perf is not optimal, but we at least try the local node then just + * try not to fail. + */ + if (!pi) + atomic_inc(&s->map_misses); + + node = cpu_to_node(cpu); + cpumask = cpumask_of_node(node); + for_each_cpu(i, cpumask) { + pi = s->path_map[i]; + if (pi && !pi->failed) + goto done; + } + + for_each_cpu(i, s->path_mask) { + pi = s->path_map[i]; + if (pi && !pi->failed) + goto done; + } + pi = NULL; + +done: + put_cpu(); + return pi ? pi->path : NULL; +} + +static struct path_selector_type ioa_ps = { + .name = "io-affinity", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ioa_create, + .destroy = ioa_destroy, + .status = ioa_status, + .add_path = ioa_add_path, + .fail_path = ioa_fail_path, + .reinstate_path = ioa_reinstate_path, + .select_path = ioa_select_path, +}; + +static int __init dm_ioa_init(void) +{ + int ret = dm_register_path_selector(&ioa_ps); + + if (ret < 0) + DMERR("register failed %d", ret); + return ret; +} + +static void __exit dm_ioa_exit(void) +{ + int ret = dm_unregister_path_selector(&ioa_ps); + + if (ret < 0) + DMERR("unregister failed %d", ret); +} + +module_init(dm_ioa_init); +module_exit(dm_ioa_exit); + +MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on"); +MODULE_AUTHOR("Mike Christie <michael.christie@oracle.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-ps-queue-length.c index 5fd018d18418..5fd018d18418 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-ps-queue-length.c diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-ps-round-robin.c index bdbb7e6e8212..bdbb7e6e8212 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-ps-round-robin.c diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-ps-service-time.c index 9cfda665e9eb..9cfda665e9eb 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-ps-service-time.c diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9c1f7c4de65b..23c38777e8f6 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -700,8 +700,7 @@ static void rs_set_capacity(struct raid_set *rs) { struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); - set_capacity(gendisk, rs->md.array_sectors); - revalidate_disk_size(gendisk, true); + set_capacity_and_notify(gendisk, rs->md.array_sectors); } /* @@ -3728,6 +3727,15 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, chunk_size_bytes); blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); + + /* + * RAID1 and RAID10 personalities require bio splitting, + * RAID0/4/5/6 don't and process large discard bios properly. + */ + if (rs_is_raid1(rs) || rs_is_raid10(rs)) { + limits->discard_granularity = chunk_size_bytes; + limits->max_discard_sectors = rs->md.chunk_sectors; + } } static void raid_postsuspend(struct dm_target *ti) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 729a72ec30cc..13b4385f4d5a 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -397,7 +397,7 @@ static int map_request(struct dm_rq_target_io *tio) } /* The target has remapped the I/O so dispatch it */ - trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + trace_block_rq_remap(clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 151d022b032d..df359d33cda8 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -496,7 +496,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", .version = {1, 6, 0}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index bff4c7fa1cd2..262e2b0fd975 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -550,6 +550,7 @@ static int switch_iterate_devices(struct dm_target *ti, static struct target_type switch_target = { .name = "switch", .version = {1, 1, 0}, + .features = DM_TARGET_NOWAIT, .module = THIS_MODULE, .ctr = switch_ctr, .dtr = switch_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ce543b761be7..188f41287f18 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -18,7 +18,6 @@ #include <linux/mutex.h> #include <linux/delay.h> #include <linux/atomic.h> -#include <linux/lcm.h> #include <linux/blk-mq.h> #include <linux/mount.h> #include <linux/dax.h> @@ -348,16 +347,9 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, dev_t dm_get_dev_t(const char *path) { dev_t dev; - struct block_device *bdev; - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) + if (lookup_bdev(path, &dev)) dev = name_to_dev_t(path); - else { - dev = bdev->bd_dev; - bdput(bdev); - } - return dev; } EXPORT_SYMBOL_GPL(dm_get_dev_t); @@ -1247,12 +1239,6 @@ void dm_table_event_callback(struct dm_table *t, void dm_table_event(struct dm_table *t) { - /* - * You can no longer call dm_table_event() from interrupt - * context, use a bottom half instead. - */ - BUG_ON(in_interrupt()); - mutex_lock(&_event_lock); if (t->event_fn) t->event_fn(t->event_context); @@ -1455,10 +1441,6 @@ int dm_calculate_queue_limits(struct dm_table *table, zone_sectors = ti_limits.chunk_sectors; } - /* Stack chunk_sectors if target-specific splitting is required */ - if (ti->max_io_len) - ti_limits.chunk_sectors = lcm_not_zero(ti->max_io_len, - ti_limits.chunk_sectors); /* Set I/O hints portion of queue limits */ if (ti->type->io_hints) ti->type->io_hints(ti, &ti_limits); diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index e673dacf6418..7357c1bd5863 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -178,6 +178,7 @@ static void unstripe_io_hints(struct dm_target *ti, static struct target_type unstripe_target = { .name = "unstriped", .version = {1, 1, 0}, + .features = DM_TARGET_NOWAIT, .module = THIS_MODULE, .ctr = unstripe_ctr, .dtr = unstripe_dtr, diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index f74982dcbea0..6b8e5bdd8526 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -538,6 +538,15 @@ static int verity_verify_io(struct dm_verity_io *io) } /* + * Skip verity work in response to I/O error when system is shutting down. + */ +static inline bool verity_is_system_shutting_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +/* * End one "io" structure with a given error. */ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) @@ -564,7 +573,8 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + if (bio->bi_status && + (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { verity_finish_io(io, bio->bi_status); return; } diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index 614e43db93aa..29385dc470d5 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -119,8 +119,13 @@ int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, } ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data, - sig_len, NULL, VERIFYING_UNSPECIFIED_SIGNATURE, - NULL, NULL); + sig_len, +#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_SECONDARY_KEYRING + VERIFY_USE_SECONDARY_KEYRING, +#else + NULL, +#endif + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); return ret; } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 9ae4ce7df95c..d5223a0e5cc5 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -319,7 +319,7 @@ err1: #else static int persistent_memory_claim(struct dm_writecache *wc) { - BUG(); + return -EOPNOTSUPP; } #endif @@ -2041,7 +2041,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) struct wc_memory_superblock s; static struct dm_arg _args[] = { - {0, 10, "Invalid number of feature args"}, + {0, 16, "Invalid number of feature args"}, }; as.argc = argc; @@ -2479,6 +2479,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args += 2; if (wc->autocommit_time_set) extra_args += 2; + if (wc->max_age != MAX_AGE_UNSPECIFIED) + extra_args += 2; if (wc->cleaner) extra_args++; if (wc->writeback_fua_set) diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index b65ca8dcfbdc..faa1dbffc8b4 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -59,6 +59,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) static struct target_type zero_target = { .name = "zero", .version = {1, 1, 0}, + .features = DM_TARGET_NOWAIT, .module = THIS_MODULE, .ctr = zero_ctr, .map = zero_map, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c18fc2548518..b3c3c8b4cb42 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -476,8 +476,10 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, return -EAGAIN; map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + if (!map) { + ret = -EIO; + goto out; + } do { struct dm_target *tgt; @@ -507,7 +509,6 @@ out: static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) - __acquires(md->io_barrier) { struct dm_target *tgt; struct dm_table *map; @@ -541,7 +542,6 @@ retry: } static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) - __releases(md->io_barrier) { dm_put_live_table(md, srcu_idx); } @@ -570,7 +570,10 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, } } - r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!bdev->bd_disk->fops->ioctl) + r = -ENOTTY; + else + r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); out: dm_unprepare_ioctl(md, srcu_idx); return r; @@ -1037,15 +1040,18 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector) sector_t max_len; /* - * Does the target need to split even further? - * - q->limits.chunk_sectors reflects ti->max_io_len so - * blk_max_size_offset() provides required splitting. - * - blk_max_size_offset() also respects q->limits.max_sectors + * Does the target need to split IO even further? + * - varied (per target) IO splitting is a tenet of DM; this + * explains why stacked chunk_sectors based splitting via + * blk_max_size_offset() isn't possible here. So pass in + * ti->max_io_len to override stacked chunk_sectors. */ - max_len = blk_max_size_offset(ti->table->md->queue, - target_offset); - if (len > max_len) - len = max_len; + if (ti->max_io_len) { + max_len = blk_max_size_offset(ti->table->md->queue, + target_offset, ti->max_io_len); + if (len > max_len) + len = max_len; + } return len; } @@ -1196,11 +1202,9 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, * ->zero_page_range() is mandatory dax operation. If we are * here, something is wrong. */ - dm_put_live_table(md, srcu_idx); goto out; } ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); - out: dm_put_live_table(md, srcu_idx); @@ -1273,8 +1277,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(clone->bi_disk->queue, clone, - bio_dev(io->orig_bio), sector); + trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector); ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: @@ -1419,18 +1422,12 @@ static int __send_empty_flush(struct clone_info *ci) */ bio_init(&flush_bio, NULL, 0); flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + flush_bio.bi_disk = ci->io->md->disk; + bio_associate_blkg(&flush_bio); + ci->bio = &flush_bio; ci->sector_count = 0; - /* - * Empty flush uses a statically initialized bio, as the base for - * cloning. However, blkg association requires that a bdev is - * associated with a gendisk, which doesn't happen until the bdev is - * opened. So, blkg association is done at issue time of the flush - * rather than when the device is created in alloc_dev(). - */ - bio_set_dev(ci->bio, ci->io->md->bdev); - BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); @@ -1589,7 +1586,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, ci.sector_count = bio_sectors(bio); while (ci.sector_count && !error) { error = __split_and_process_non_flush(&ci); - if (current->bio_list && ci.sector_count && !error) { + if (ci.sector_count && !error) { /* * Remainder must be passed to submit_bio_noacct() * so that it gets handled *after* bios already submitted @@ -1610,12 +1607,12 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * (by eliminating DM's splitting and just using bio_split) */ part_stat_lock(); - __dm_part_stat_sub(&dm_disk(md)->part0, + __dm_part_stat_sub(dm_disk(md)->part0, sectors[op_stat_group(bio_op(bio))], ci.sector_count); part_stat_unlock(); bio_chain(b, bio); - trace_block_split(md->queue, b, bio->bi_iter.bi_sector); + trace_block_split(b, bio->bi_iter.bi_sector); ret = submit_bio_noacct(bio); break; } @@ -1747,11 +1744,6 @@ static void cleanup_mapped_device(struct mapped_device *md) cleanup_srcu_struct(&md->io_barrier); - if (md->bdev) { - bdput(md->bdev); - md->bdev = NULL; - } - mutex_destroy(&md->suspend_lock); mutex_destroy(&md->type_lock); mutex_destroy(&md->table_devices_lock); @@ -1843,10 +1835,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->wq) goto bad; - md->bdev = bdget_disk(md->disk, 0); - if (!md->bdev) - goto bad; - dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ @@ -1971,8 +1959,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, if (size != dm_get_size(md)) memset(&md->geometry, 0, sizeof(md->geometry)); - set_capacity(md->disk, size); - bd_set_nr_sectors(md->bdev, size); + set_capacity_and_notify(md->disk, size); dm_table_event_callback(t, event_callback, md); @@ -2255,7 +2242,7 @@ EXPORT_SYMBOL_GPL(dm_put); static bool md_in_flight_bios(struct mapped_device *md) { int cpu; - struct hd_struct *part = &dm_disk(md)->part0; + struct block_device *part = dm_disk(md)->part0; long sum = 0; for_each_possible_cpu(cpu) { @@ -2390,27 +2377,19 @@ static int lock_fs(struct mapped_device *md) { int r; - WARN_ON(md->frozen_sb); - - md->frozen_sb = freeze_bdev(md->bdev); - if (IS_ERR(md->frozen_sb)) { - r = PTR_ERR(md->frozen_sb); - md->frozen_sb = NULL; - return r; - } - - set_bit(DMF_FROZEN, &md->flags); + WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - return 0; + r = freeze_bdev(md->disk->part0); + if (!r) + set_bit(DMF_FROZEN, &md->flags); + return r; } static void unlock_fs(struct mapped_device *md) { if (!test_bit(DMF_FROZEN, &md->flags)) return; - - thaw_bdev(md->bdev, md->frozen_sb); - md->frozen_sb = NULL; + thaw_bdev(md->disk->part0); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 4aaf4820b6f6..7fbd41e156c9 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -581,8 +581,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) process_metadata_update(mddev, msg); break; case CHANGE_CAPACITY: - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); break; case RESYNCING: set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); @@ -664,9 +663,27 @@ out: * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. */ -static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) +static int lock_token(struct md_cluster_info *cinfo) { - int error, set_bit = 0; + int error; + + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", + __func__, __LINE__, error); + } else { + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); + } + return error; +} + +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. + */ +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) +{ + int rv, set_bit = 0; struct mddev *mddev = cinfo->mddev; /* @@ -677,34 +694,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)) { - error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - WARN_ON_ONCE(error); + WARN_ON_ONCE(rv); md_wakeup_thread(mddev->thread); set_bit = 1; } - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); - if (set_bit) - clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - if (error) - pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", - __func__, __LINE__, error); - - /* Lock the receive sequence */ - mutex_lock(&cinfo->recv_mutex); - return error; -} - -/* lock_comm() - * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. - */ -static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) -{ wait_event(cinfo->wait, !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); - - return lock_token(cinfo, mddev_locked); + rv = lock_token(cinfo); + if (set_bit) + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return rv; } static void unlock_comm(struct md_cluster_info *cinfo) @@ -784,9 +786,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, { int ret; - lock_comm(cinfo, mddev_locked); - ret = __sendmsg(cinfo, cmsg); - unlock_comm(cinfo); + ret = lock_comm(cinfo, mddev_locked); + if (!ret) { + ret = __sendmsg(cinfo, cmsg); + unlock_comm(cinfo); + } return ret; } @@ -1061,7 +1065,7 @@ static int metadata_update_start(struct mddev *mddev) return 0; } - ret = lock_token(cinfo, 1); + ret = lock_token(cinfo); clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); return ret; } @@ -1255,7 +1259,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) int raid_slot = -1; md_update_sb(mddev, 1); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) { + pr_err("%s: lock_comm failed\n", __func__); + return; + } memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); @@ -1296,13 +1303,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) if (ret) pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n", __func__, __LINE__); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } else { /* revert to previous sectors */ ret = mddev->pers->resize(mddev, old_dev_sectors); - if (!ret) - revalidate_disk_size(mddev->gendisk, true); ret = __sendmsg(cinfo, &cmsg); if (ret) pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", @@ -1407,7 +1411,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) + return -EAGAIN; ret = __sendmsg(cinfo, &cmsg); if (ret) { unlock_comm(cinfo); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5ab22069b5be..68cac7d19278 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -200,9 +200,8 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) "copied raid_disks doesn't match mddev->raid_disks"); rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); - revalidate_disk_size(mddev->gendisk, true); kfree_rcu(oldconf, rcu); return 0; } @@ -258,8 +257,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); } else { if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, - bio, disk_devt(mddev->gendisk), + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index 98bac4f304ae..ca409428b4fc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -464,7 +464,7 @@ struct md_io { bio_end_io_t *orig_bi_end_io; void *orig_bi_private; unsigned long start_time; - struct hd_struct *part; + struct block_device *part; }; static void md_end_io(struct bio *bio) @@ -639,7 +639,7 @@ static void md_submit_flush_data(struct work_struct *ws) * could wait for this and below md_handle_request could wait for those * bios because of suspend check */ - mddev->last_flush = mddev->start_flush; + mddev->prev_flush_start = mddev->start_flush; mddev->flush_bio = NULL; wake_up(&mddev->sb_wait); @@ -660,13 +660,17 @@ static void md_submit_flush_data(struct work_struct *ws) */ bool md_flush_request(struct mddev *mddev, struct bio *bio) { - ktime_t start = ktime_get_boottime(); + ktime_t req_start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); + /* flush requests wait until ongoing flush completes, + * hence coalescing all the pending requests. + */ wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio || - ktime_after(mddev->last_flush, start), + ktime_before(req_start, mddev->prev_flush_start), mddev->lock); - if (!ktime_after(mddev->last_flush, start)) { + /* new request after previous flush is completed */ + if (ktime_after(req_start, mddev->prev_flush_start)) { WARN_ON(mddev->flush_bio); mddev->flush_bio = bio; bio = NULL; @@ -2414,7 +2418,6 @@ EXPORT_SYMBOL(md_integrity_add_rdev); static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; - struct kobject *ko; int err; /* prevent duplicates */ @@ -2477,9 +2480,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; - ko = &part_to_dev(rdev->bdev->bd_part)->kobj; /* failure here is OK */ - err = sysfs_create_link(&rdev->kobj, ko, "block"); + err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); rdev->sysfs_unack_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); @@ -5355,10 +5357,9 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) if (!err) { mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); - } + if (mddev->pers) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); } mddev_unlock(mddev); return err ?: len; @@ -5765,11 +5766,12 @@ static int md_alloc(dev_t dev, char *name) return error; } -static struct kobject *md_probe(dev_t dev, int *part, void *data) +static void md_probe(dev_t dev) { + if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) + return; if (create_on_open) md_alloc(dev, NULL); - return NULL; } static int add_named_array(const char *val, const struct kernel_param *kp) @@ -6107,8 +6109,7 @@ int do_md_run(struct mddev *mddev) md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); @@ -6423,10 +6424,9 @@ static int do_md_stop(struct mddev *mddev, int mode, if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); - set_capacity(disk, 0); + set_capacity_and_notify(disk, 0); mutex_unlock(&mddev->open_mutex); mddev->changed = 1; - revalidate_disk_size(disk, true); if (mddev->ro) mddev->ro = 0; @@ -6535,7 +6535,7 @@ static void autorun_devices(int part) break; } - md_probe(dev, NULL, NULL); + md_probe(dev); mddev = mddev_find(dev); if (!mddev || !mddev->gendisk) { if (mddev) @@ -6948,8 +6948,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); + if (mddev_is_clustered(mddev)) { + if (md_cluster_ops->remove_disk(mddev, rdev)) + goto busy; + } md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -7257,8 +7259,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) if (mddev_is_clustered(mddev)) md_cluster_ops->update_size(mddev, old_dev_sectors); else if (mddev->queue) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); } } return rv; @@ -7278,6 +7280,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return -EINVAL; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || mddev->reshape_position != MaxSector) return -EBUSY; @@ -7480,7 +7483,6 @@ static inline bool md_ioctl_valid(unsigned int cmd) { switch (cmd) { case ADD_NEW_DISK: - case BLKROSET: case GET_ARRAY_INFO: case GET_BITMAP_FILE: case GET_DISK_INFO: @@ -7507,7 +7509,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, int err = 0; void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; - int ro; bool did_set_md_closing = false; if (!md_ioctl_valid(cmd)) @@ -7590,8 +7591,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } - WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); - set_bit(MD_CLOSING, &mddev->flags); + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto out; + } did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); @@ -7687,35 +7691,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto unlock; } break; - - case BLKROSET: - if (get_user(ro, (int __user *)(arg))) { - err = -EFAULT; - goto unlock; - } - err = -EINVAL; - - /* if the bdev is going readonly the value of mddev->ro - * does not matter, no writes are coming - */ - if (ro) - goto unlock; - - /* are we are already prepared for writes? */ - if (mddev->ro != 1) - goto unlock; - - /* transitioning to readauto need only happen for - * arrays that call md_write_start - */ - if (mddev->pers) { - err = restart_array(mddev); - if (err == 0) { - mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } - } - goto unlock; } /* @@ -7809,6 +7784,36 @@ static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif /* CONFIG_COMPAT */ +static int md_set_read_only(struct block_device *bdev, bool ro) +{ + struct mddev *mddev = bdev->bd_disk->private_data; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + + if (!mddev->raid_disks && !mddev->external) { + err = -ENODEV; + goto out_unlock; + } + + /* + * Transitioning to read-auto need only happen for arrays that call + * md_write_start and which are not ready for writes yet. + */ + if (!ro && mddev->ro == 1 && mddev->pers) { + err = restart_array(mddev); + if (err) + goto out_unlock; + mddev->ro = 2; + } + +out_unlock: + mddev_unlock(mddev); + return err; +} + static int md_open(struct block_device *bdev, fmode_t mode) { /* @@ -7886,6 +7891,7 @@ const struct block_device_operations md_fops = #endif .getgeo = md_getgeo, .check_events = md_check_events, + .set_read_only = md_set_read_only, }; static int md_thread(void *arg) @@ -8445,7 +8451,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and @@ -8582,26 +8588,6 @@ void md_write_end(struct mddev *mddev) EXPORT_SYMBOL(md_write_end); -/* This is used by raid0 and raid10 */ -void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, - struct bio *bio, sector_t start, sector_t size) -{ - struct bio *discard_bio = NULL; - - if (__blkdev_issue_discard(rdev->bdev, start, size, - GFP_NOIO, 0, &discard_bio) || !discard_bio) - return; - - bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rdev->bdev), - discard_bio, disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); - submit_bio_noacct(discard_bio); -} -EXPORT_SYMBOL(md_submit_discard_bio); - /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before @@ -9035,10 +9021,9 @@ void md_do_sync(struct md_thread *thread) mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); - if (!mddev_is_clustered(mddev)) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); - } + if (!mddev_is_clustered(mddev)) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); } spin_lock(&mddev->lock); @@ -9567,18 +9552,15 @@ static int __init md_init(void) if (!md_rdev_misc_wq) goto err_rdev_misc_wq; - if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) + ret = __register_blkdev(MD_MAJOR, "md", md_probe); + if (ret < 0) goto err_md; - if ((ret = register_blkdev(0, "mdp")) < 0) + ret = __register_blkdev(0, "mdp", md_probe); + if (ret < 0) goto err_mdp; mdp_major = ret; - blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, - md_probe, NULL, NULL); - blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, - md_probe, NULL, NULL); - register_reboot_notifier(&md_notifier); raid_table_header = register_sysctl_table(raid_root_table); @@ -9662,8 +9644,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } } - if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) - update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { + ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (ret) + pr_warn("md: updating array disks failed. %d\n", ret); + } /* * Since mddev->delta_disks has already updated in update_raid_disks, @@ -9845,9 +9830,6 @@ static __exit void md_exit(void) struct list_head *tmp; int delay = 1; - blk_unregister_region(MKDEV(MD_MAJOR,0), 512); - blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); - unregister_blkdev(MD_MAJOR,"md"); unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); diff --git a/drivers/md/md.h b/drivers/md/md.h index ccfb69868c2e..34070ab30a8a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -495,9 +495,9 @@ struct mddev { */ struct bio *flush_bio; atomic_t flush_pending; - ktime_t start_flush, last_flush; /* last_flush is when the last completed - * flush was started. - */ + ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed + * flush was started. + */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; @@ -713,8 +713,6 @@ extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); -extern void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, - struct bio *bio, sector_t start, sector_t size); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f44177593a5..67f157f2525d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -477,6 +477,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; + struct bio *discard_bio = NULL; struct md_rdev *rdev; if (disk < start_disk_index) @@ -499,9 +500,18 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) rdev = conf->devlist[(zone - conf->strip_zone) * conf->strip_zone[0].nb_dev + disk]; - md_submit_discard_bio(mddev, rdev, bio, + if (__blkdev_issue_discard(rdev->bdev, dev_start + zone->dev_start + rdev->data_offset, - dev_end - dev_start); + dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || + !discard_bio) + continue; + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(discard_bio, + disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); } bio_endio(bio); } @@ -571,8 +581,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev->data_offset; if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, bio, - disk_devt(mddev->gendisk), bio_sector); + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 960d854c07f8..c0347997f6ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1305,8 +1305,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, - disk_devt(mddev->gendisk), r1_bio->sector); + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), + r1_bio->sector); submit_bio_noacct(read_bio); } @@ -1517,8 +1517,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&r1_bio->remaining); if (mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, disk_devt(mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)conf->mirrors[i].rdev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b7bca6703df8..c5d88ef6a45c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio) static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); + int size = offsetof(struct r10bio, devs[conf->copies]); /* allocate a r10bio with room for raid_disks entries in the * bios array */ @@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) { int i; - for (i = 0; i < conf->geo.raid_disks; i++) { + for (i = 0; i < conf->copies; i++) { struct bio **bio = & r10_bio->devs[i].bio; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, int slot; int repl = 0; - for (slot = 0; slot < conf->geo.raid_disks; slot++) { + for (slot = 0; slot < conf->copies; slot++) { if (r10_bio->devs[slot].bio == bio) break; if (r10_bio->devs[slot].repl_bio == bio) { @@ -336,6 +336,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, } } + BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); if (slotp) @@ -1127,7 +1128,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - if (r10_bio->devs[slot].rdev) { + if (slot >= 0 && r10_bio->devs[slot].rdev) { /* * This is an error retry, but we cannot * safely dereference the rdev in the r10_bio, @@ -1200,8 +1201,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r10_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, - read_bio, disk_devt(mddev->gendisk), + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), r10_bio->sector); submit_bio_noacct(read_bio); return; @@ -1250,8 +1250,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_private = r10_bio; if (conf->mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, disk_devt(conf->mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)rdev; @@ -1275,75 +1274,12 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } -static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) -{ - int i; - struct r10conf *conf = mddev->private; - struct md_rdev *blocked_rdev; - -retry_wait: - blocked_rdev = NULL; - rcu_read_lock(); - for (i = 0; i < conf->copies; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[i].replacement); - if (rdev == rrdev) - rrdev = NULL; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } - - if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; - int is_bad; - - /* Discard request doesn't care the write result - * so it doesn't need to wait blocked disk here. - */ - if (!r10_bio->sectors) - continue; - - is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } - } - } - rcu_read_unlock(); - - if (unlikely(blocked_rdev)) { - /* Have to wait for this device to get unblocked, then retry */ - allow_barrier(conf); - raid10_log(conf->mddev, "%s wait rdev %d blocked", - __func__, blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_wait; - } -} - static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int i; + struct md_rdev *blocked_rdev; sector_t sectors; int max_sectors; @@ -1401,9 +1337,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); - - wait_blocked_dev(mddev, r10_bio); - +retry_write: + blocked_rdev = NULL; rcu_read_lock(); max_sectors = r10_bio->sectors; @@ -1414,6 +1349,16 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, conf->mirrors[d].replacement); if (rdev == rrdev) rrdev = NULL; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1434,6 +1379,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */ bad_sectors -= (dev_sector - first_bad); @@ -1469,6 +1423,35 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + int j; + int d; + + for (j = 0; j < i; j++) { + if (r10_bio->devs[j].bio) { + d = r10_bio->devs[j].devnum; + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + if (r10_bio->devs[j].repl_bio) { + struct md_rdev *rdev; + d = r10_bio->devs[j].devnum; + rdev = conf->mirrors[d].replacement; + if (!rdev) { + /* Race with remove_disk */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + rdev_dec_pending(rdev, mddev); + } + } + allow_barrier(conf); + raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; @@ -1508,7 +1491,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); + r10_bio->read_slot = -1; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -1516,296 +1500,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } -static struct bio *raid10_split_bio(struct r10conf *conf, - struct bio *bio, sector_t sectors, bool want_first) -{ - struct bio *split; - - split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, bio); - allow_barrier(conf); - if (want_first) { - submit_bio_noacct(bio); - bio = split; - } else - submit_bio_noacct(split); - wait_barrier(conf); - - return bio; -} - -static void raid_end_discard_bio(struct r10bio *r10bio) -{ - struct r10conf *conf = r10bio->mddev->private; - struct r10bio *first_r10bio; - - while (atomic_dec_and_test(&r10bio->remaining)) { - - allow_barrier(conf); - - if (!test_bit(R10BIO_Discard, &r10bio->state)) { - first_r10bio = (struct r10bio *)r10bio->master_bio; - free_r10bio(r10bio); - r10bio = first_r10bio; - } else { - md_write_end(r10bio->mddev); - bio_endio(r10bio->master_bio); - free_r10bio(r10bio); - break; - } - } -} - -static void raid10_end_discard_request(struct bio *bio) -{ - struct r10bio *r10_bio = bio->bi_private; - struct r10conf *conf = r10_bio->mddev->private; - struct md_rdev *rdev = NULL; - int dev; - int slot, repl; - - /* - * We don't care the return value of discard bio - */ - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - set_bit(R10BIO_Uptodate, &r10_bio->state); - - dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - if (repl) - rdev = conf->mirrors[dev].replacement; - if (!rdev) { - /* raid10_remove_disk uses smp_mb to make sure rdev is set to - * replacement before setting replacement to NULL. It can read - * rdev first without barrier protect even replacment is NULL - */ - smp_rmb(); - rdev = conf->mirrors[dev].rdev; - } - - raid_end_discard_bio(r10_bio); - rdev_dec_pending(rdev, conf->mddev); -} - -/* There are some limitations to handle discard bio - * 1st, the discard size is bigger than stripe_size*2. - * 2st, if the discard bio spans reshape progress, we use the old way to - * handle discard bio - */ -static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) -{ - struct r10conf *conf = mddev->private; - struct geom *geo = &conf->geo; - struct r10bio *r10_bio, *first_r10bio; - int far_copies = geo->far_copies; - bool first_copy = true; - - int disk; - sector_t chunk; - unsigned int stripe_size; - sector_t split_size; - - sector_t bio_start, bio_end; - sector_t first_stripe_index, last_stripe_index; - sector_t start_disk_offset; - unsigned int start_disk_index; - sector_t end_disk_offset; - unsigned int end_disk_index; - unsigned int remainder; - - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - return -EAGAIN; - - wait_barrier(conf); - - /* Check reshape again to avoid reshape happens after checking - * MD_RECOVERY_RESHAPE and before wait_barrier - */ - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - goto out; - - stripe_size = geo->raid_disks << geo->chunk_shift; - bio_start = bio->bi_iter.bi_sector; - bio_end = bio_end_sector(bio); - - /* Maybe one discard bio is smaller than strip size or across one stripe - * and discard region is larger than one stripe size. For far offset layout, - * if the discard region is not aligned with stripe size, there is hole - * when we submit discard bio to member disk. For simplicity, we only - * handle discard bio which discard region is bigger than stripe_size*2 - */ - if (bio_sectors(bio) < stripe_size*2) - goto out; - - /* For far and far offset layout, if bio is not aligned with stripe size, - * it splits the part that is not aligned with strip size. - */ - div_u64_rem(bio_start, stripe_size, &remainder); - if ((far_copies > 1) && remainder) { - split_size = stripe_size - remainder; - bio = raid10_split_bio(conf, bio, split_size, false); - } - div_u64_rem(bio_end, stripe_size, &remainder); - if ((far_copies > 1) && remainder) { - split_size = bio_sectors(bio) - remainder; - bio = raid10_split_bio(conf, bio, split_size, true); - } - - bio_start = bio->bi_iter.bi_sector; - bio_end = bio_end_sector(bio); - - /* raid10 uses chunk as the unit to store data. It's similar like raid0. - * One stripe contains the chunks from all member disk (one chunk from - * one disk at the same HBA address). For layout detail, see 'man md 4' - */ - chunk = bio_start >> geo->chunk_shift; - chunk *= geo->near_copies; - first_stripe_index = chunk; - start_disk_index = sector_div(first_stripe_index, geo->raid_disks); - if (geo->far_offset) - first_stripe_index *= geo->far_copies; - start_disk_offset = (bio_start & geo->chunk_mask) + - (first_stripe_index << geo->chunk_shift); - - chunk = bio_end >> geo->chunk_shift; - chunk *= geo->near_copies; - last_stripe_index = chunk; - end_disk_index = sector_div(last_stripe_index, geo->raid_disks); - if (geo->far_offset) - last_stripe_index *= geo->far_copies; - end_disk_offset = (bio_end & geo->chunk_mask) + - (last_stripe_index << geo->chunk_shift); - -retry_discard: - r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); - r10_bio->mddev = mddev; - r10_bio->state = 0; - r10_bio->sectors = 0; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); - wait_blocked_dev(mddev, r10_bio); - - /* For far layout it needs more than one r10bio to cover all regions. - * Inspired by raid10_sync_request, we can use the first r10bio->master_bio - * to record the discard bio. Other r10bio->master_bio record the first - * r10bio. The first r10bio only release after all other r10bios finish. - * The discard bio returns only first r10bio finishes - */ - if (first_copy) { - r10_bio->master_bio = bio; - set_bit(R10BIO_Discard, &r10_bio->state); - first_copy = false; - first_r10bio = r10_bio; - } else - r10_bio->master_bio = (struct bio *)first_r10bio; - - rcu_read_lock(); - for (disk = 0; disk < geo->raid_disks; disk++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); - - r10_bio->devs[disk].bio = NULL; - r10_bio->devs[disk].repl_bio = NULL; - - if (rdev && (test_bit(Faulty, &rdev->flags))) - rdev = NULL; - if (rrdev && (test_bit(Faulty, &rrdev->flags))) - rrdev = NULL; - if (!rdev && !rrdev) - continue; - - if (rdev) { - r10_bio->devs[disk].bio = bio; - atomic_inc(&rdev->nr_pending); - } - if (rrdev) { - r10_bio->devs[disk].repl_bio = bio; - atomic_inc(&rrdev->nr_pending); - } - } - rcu_read_unlock(); - - atomic_set(&r10_bio->remaining, 1); - for (disk = 0; disk < geo->raid_disks; disk++) { - sector_t dev_start, dev_end; - struct bio *mbio, *rbio = NULL; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); - - /* - * Now start to calculate the start and end address for each disk. - * The space between dev_start and dev_end is the discard region. - * - * For dev_start, it needs to consider three conditions: - * 1st, the disk is before start_disk, you can imagine the disk in - * the next stripe. So the dev_start is the start address of next - * stripe. - * 2st, the disk is after start_disk, it means the disk is at the - * same stripe of first disk - * 3st, the first disk itself, we can use start_disk_offset directly - */ - if (disk < start_disk_index) - dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > start_disk_index) - dev_start = first_stripe_index * mddev->chunk_sectors; - else - dev_start = start_disk_offset; - - if (disk < end_disk_index) - dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > end_disk_index) - dev_end = last_stripe_index * mddev->chunk_sectors; - else - dev_end = end_disk_offset; - - /* It only handles discard bio which size is >= stripe size, so - * dev_end > dev_start all the time - */ - if (r10_bio->devs[disk].bio) { - mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); - mbio->bi_end_io = raid10_end_discard_request; - mbio->bi_private = r10_bio; - r10_bio->devs[disk].bio = mbio; - r10_bio->devs[disk].devnum = disk; - atomic_inc(&r10_bio->remaining); - md_submit_discard_bio(mddev, rdev, mbio, - dev_start + choose_data_offset(r10_bio, rdev), - dev_end - dev_start); - bio_endio(mbio); - } - if (r10_bio->devs[disk].repl_bio) { - rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); - rbio->bi_end_io = raid10_end_discard_request; - rbio->bi_private = r10_bio; - r10_bio->devs[disk].repl_bio = rbio; - r10_bio->devs[disk].devnum = disk; - atomic_inc(&r10_bio->remaining); - md_submit_discard_bio(mddev, rrdev, rbio, - dev_start + choose_data_offset(r10_bio, rrdev), - dev_end - dev_start); - bio_endio(rbio); - } - } - - if (!geo->far_offset && --far_copies) { - first_stripe_index += geo->stride >> geo->chunk_shift; - start_disk_offset += geo->stride; - last_stripe_index += geo->stride >> geo->chunk_shift; - end_disk_offset += geo->stride; - atomic_inc(&first_r10bio->remaining); - raid_end_discard_bio(r10_bio); - wait_barrier(conf); - goto retry_discard; - } - - raid_end_discard_bio(r10_bio); - - return 0; -out: - allow_barrier(conf); - return -EAGAIN; -} - static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; @@ -1820,10 +1514,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) if (!md_write_start(mddev, bio)) return false; - if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) - if (!raid10_handle_discard(mddev, bio)) - return true; - /* * If this request crosses a chunk boundary, we need to split * it. @@ -4063,7 +3753,7 @@ static int raid10_run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, - UINT_MAX); + mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1461fd55311b..79cd2b7d3128 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -179,6 +179,5 @@ enum r10bio_state { R10BIO_Previous, /* failfast devices did receive failfast requests. */ R10BIO_FailFast, - R10BIO_Discard, }; #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 39343479ac2a..3a90cc0e43ca 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1222,9 +1222,9 @@ again: set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); if (conf->mddev->gendisk) - trace_block_bio_remap(bi->bi_disk->queue, - bi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(bi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else @@ -1272,9 +1272,9 @@ again: if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; if (conf->mddev->gendisk) - trace_block_bio_remap(rbi->bi_disk->queue, - rbi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(rbi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else @@ -5468,8 +5468,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); if (mddev->gendisk) - trace_block_bio_remap(align_bi->bi_disk->queue, - align_bi, disk_devt(mddev->gendisk), + trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bi); return 1; |