Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                      |    6
-rw-r--r--  drivers/md/Makefile                     |    1
-rw-r--r--  drivers/md/bitmap.c                     |   11
-rw-r--r--  drivers/md/bitmap.h                     |    2
-rw-r--r--  drivers/md/dm-crypt.c                   |   28
-rw-r--r--  drivers/md/dm-flakey.c                  |  212
-rw-r--r--  drivers/md/dm-io.c                      |    2
-rw-r--r--  drivers/md/dm-ioctl.c                   |   46
-rw-r--r--  drivers/md/dm-kcopyd.c                  |   55
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c  |    2
-rw-r--r--  drivers/md/dm-log.c                     |   10
-rw-r--r--  drivers/md/dm-mpath.c                   |   39
-rw-r--r--  drivers/md/dm-raid.c                    |    8
-rw-r--r--  drivers/md/dm-raid1.c                   |    2
-rw-r--r--  drivers/md/dm-region-hash.c             |    2
-rw-r--r--  drivers/md/dm-snap.c                    |    2
-rw-r--r--  drivers/md/dm-stripe.c                  |   23
-rw-r--r--  drivers/md/dm-table.c                   |  139
-rw-r--r--  drivers/md/dm.c                         |   52
-rw-r--r--  drivers/md/dm.h                         |    2
-rw-r--r--  drivers/md/faulty.c                     |    2
-rw-r--r--  drivers/md/linear.c                     |   21
-rw-r--r--  drivers/md/md.c                         |  197
-rw-r--r--  drivers/md/md.h                         |   32
-rw-r--r--  drivers/md/multipath.c                  |   39
-rw-r--r--  drivers/md/raid0.c                      |   61
-rw-r--r--  drivers/md/raid1.c                      |   94
-rw-r--r--  drivers/md/raid10.c                     |  113
-rw-r--r--  drivers/md/raid10.h                     |    4
-rw-r--r--  drivers/md/raid5.c                      |  167
-rw-r--r--  drivers/md/raid5.h                      |    4
31 files changed, 716 insertions(+), 662 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 98d9ec85e0eb..8420129fc5ee 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -327,4 +327,10 @@ config DM_UEVENT
---help---
Generate udev events for DM events.
+config DM_FLAKEY
+ tristate "Flakey target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ A target that intermittently fails I/O for debugging purposes.
+
endif # MD
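
For reference (not part of the patch): with this entry in place the new target is built by setting CONFIG_DM_FLAKEY=m (or =y), which, as declared above, requires CONFIG_BLK_DEV_DM and CONFIG_EXPERIMENTAL to be enabled.
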
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d0138606c2e8..448838b1f92a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
+obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a35320fb59f..5c9362792f1d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -347,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
atomic_inc(&bitmap->pending_writes);
set_buffer_locked(bh);
set_buffer_mapped(bh);
- submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh);
+ submit_bh(WRITE | REQ_SYNC, bh);
bh = bh->b_this_page;
}
@@ -854,7 +854,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
if (bitmap->flags & BITMAP_HOSTENDIAN)
set_bit(bit, kaddr);
else
- ext2_set_bit(bit, kaddr);
+ __test_and_set_bit_le(bit, kaddr);
kunmap_atomic(kaddr, KM_USER0);
PRINTK("set file bit %lu page %lu\n", bit, page->index);
}
@@ -1050,7 +1050,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
if (bitmap->flags & BITMAP_HOSTENDIAN)
b = test_bit(bit, paddr);
else
- b = ext2_test_bit(bit, paddr);
+ b = test_bit_le(bit, paddr);
kunmap_atomic(paddr, KM_USER0);
if (b) {
/* if the disk bit is set, set the memory bit */
@@ -1226,7 +1226,7 @@ void bitmap_daemon_work(mddev_t *mddev)
clear_bit(file_page_offset(bitmap, j),
paddr);
else
- ext2_clear_bit(file_page_offset(bitmap, j),
+ __test_and_clear_bit_le(file_page_offset(bitmap, j),
paddr);
kunmap_atomic(paddr, KM_USER0);
} else
@@ -1339,8 +1339,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
prepare_to_wait(&bitmap->overflow_wait, &__wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&bitmap->lock);
- md_unplug(bitmap->mddev);
- schedule();
+ io_schedule();
finish_wait(&bitmap->overflow_wait, &__wait);
continue;
}
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 931a7a7c3796..d0aeaf46d932 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -45,7 +45,7 @@
*
* The counter counts pending write requests, plus the on-disk bit.
* When the counter is '1' and the resync bits are clear, the on-disk
- * bit can be cleared aswell, thus setting the counter to 0.
+ * bit can be cleared as well, thus setting the counter to 0.
* When we set a bit, or in the counter (to start a write), if the fields is
* 0, we first set the disk bit and set the counter to 1.
*
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4e054bd91664..c8827ffd85bb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -991,11 +991,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
clone->bi_destructor = dm_crypt_bio_destructor;
}
-static void kcryptd_unplug(struct crypt_config *cc)
-{
- blk_unplug(bdev_get_queue(cc->dev->bdev));
-}
-
static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
{
struct crypt_config *cc = io->target->private;
@@ -1008,10 +1003,8 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
* one in order to decrypt the whole bio data *afterwards*.
*/
clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
- if (!clone) {
- kcryptd_unplug(cc);
+ if (!clone)
return 1;
- }
crypt_inc_pending(io);
@@ -1331,20 +1324,29 @@ static int crypt_setkey_allcpus(struct crypt_config *cc)
static int crypt_set_key(struct crypt_config *cc, char *key)
{
+ int r = -EINVAL;
+ int key_string_len = strlen(key);
+
/* The key size may not be changed. */
- if (cc->key_size != (strlen(key) >> 1))
- return -EINVAL;
+ if (cc->key_size != (key_string_len >> 1))
+ goto out;
/* Hyphen (which gives a key_size of zero) means there is no key. */
if (!cc->key_size && strcmp(key, "-"))
- return -EINVAL;
+ goto out;
if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
- return -EINVAL;
+ goto out;
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- return crypt_setkey_allcpus(cc);
+ r = crypt_setkey_allcpus(cc);
+
+out:
+ /* Hex key string not needed after here, so wipe it. */
+ memset(key, '0', key_string_len);
+
+ return r;
}
static int crypt_wipe_key(struct crypt_config *cc)
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
new file mode 100644
index 000000000000..ea790623c30b
--- /dev/null
+++ b/drivers/md/dm-flakey.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2003 Sistina Software (UK) Limited.
+ * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "flakey"
+
+/*
+ * Flakey: Used for testing only, simulates intermittent,
+ * catastrophic device failure.
+ */
+struct flakey_c {
+ struct dm_dev *dev;
+ unsigned long start_time;
+ sector_t start;
+ unsigned up_interval;
+ unsigned down_interval;
+};
+
+/*
+ * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
+ */
+static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct flakey_c *fc;
+ unsigned long long tmp;
+
+ if (argc != 4) {
+ ti->error = "dm-flakey: Invalid argument count";
+ return -EINVAL;
+ }
+
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ if (!fc) {
+ ti->error = "dm-flakey: Cannot allocate flakey context";
+ return -ENOMEM;
+ }
+ fc->start_time = jiffies;
+
+ if (sscanf(argv[1], "%llu", &tmp) != 1) {
+ ti->error = "dm-flakey: Invalid device sector";
+ goto bad;
+ }
+ fc->start = tmp;
+
+ if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
+ ti->error = "dm-flakey: Invalid up interval";
+ goto bad;
+ }
+
+ if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
+ ti->error = "dm-flakey: Invalid down interval";
+ goto bad;
+ }
+
+ if (!(fc->up_interval + fc->down_interval)) {
+ ti->error = "dm-flakey: Total (up + down) interval is zero";
+ goto bad;
+ }
+
+ if (fc->up_interval + fc->down_interval < fc->up_interval) {
+ ti->error = "dm-flakey: Interval overflow";
+ goto bad;
+ }
+
+ if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
+ ti->error = "dm-flakey: Device lookup failed";
+ goto bad;
+ }
+
+ ti->num_flush_requests = 1;
+ ti->private = fc;
+ return 0;
+
+bad:
+ kfree(fc);
+ return -EINVAL;
+}
+
+static void flakey_dtr(struct dm_target *ti)
+{
+ struct flakey_c *fc = ti->private;
+
+ dm_put_device(ti, fc->dev);
+ kfree(fc);
+}
+
+static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
+{
+ struct flakey_c *fc = ti->private;
+
+ return fc->start + (bi_sector - ti->begin);
+}
+
+static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
+{
+ struct flakey_c *fc = ti->private;
+
+ bio->bi_bdev = fc->dev->bdev;
+ if (bio_sectors(bio))
+ bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
+}
+
+static int flakey_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ struct flakey_c *fc = ti->private;
+ unsigned elapsed;
+
+ /* Are we alive ? */
+ elapsed = (jiffies - fc->start_time) / HZ;
+ if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
+ return -EIO;
+
+ flakey_map_bio(ti, bio);
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static int flakey_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned int maxlen)
+{
+ struct flakey_c *fc = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
+ (unsigned long long)fc->start, fc->up_interval,
+ fc->down_interval);
+ break;
+ }
+ return 0;
+}
+
+static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
+{
+ struct flakey_c *fc = ti->private;
+
+ return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
+}
+
+static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct flakey_c *fc = ti->private;
+ struct request_queue *q = bdev_get_queue(fc->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = fc->dev->bdev;
+ bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+ struct flakey_c *fc = ti->private;
+
+ return fn(ti, fc->dev, fc->start, ti->len, data);
+}
+
+static struct target_type flakey_target = {
+ .name = "flakey",
+ .version = {1, 1, 0},
+ .module = THIS_MODULE,
+ .ctr = flakey_ctr,
+ .dtr = flakey_dtr,
+ .map = flakey_map,
+ .status = flakey_status,
+ .ioctl = flakey_ioctl,
+ .merge = flakey_merge,
+ .iterate_devices = flakey_iterate_devices,
+};
+
+static int __init dm_flakey_init(void)
+{
+ int r = dm_register_target(&flakey_target);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_flakey_exit(void)
+{
+ dm_unregister_target(&flakey_target);
+}
+
+/* Module hooks */
+module_init(dm_flakey_init);
+module_exit(dm_flakey_exit);
+
+MODULE_DESCRIPTION(DM_NAME " flakey target");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
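
For illustration only (the device name and size below are invented, not part of the patch): with the constructor arguments <dev_path> <offset> <up interval> <down interval> parsed above, a table line such as

	0 409600 flakey /dev/sdb1 0 30 5

maps the first 409600 sectors of /dev/sdb1 at offset 0 and, following the arithmetic in flakey_map(), services I/O normally for 30 seconds, then fails every bio with -EIO for 5 seconds, repeating on a 35-second cycle. It could be loaded with something like: dmsetup create flaky0 --table "0 409600 flakey /dev/sdb1 0 30 5".
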
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 136d4f71a116..76a5af00a26b 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -352,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
BUG_ON(num_regions > DM_IO_MAX_REGIONS);
if (sync)
- rw |= REQ_SYNC | REQ_UNPLUG;
+ rw |= REQ_SYNC;
/*
* For multiple regions we need to be careful to rewind
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 6d12775a1061..4cacdad2270a 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1501,14 +1501,10 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
return r;
}
-static void free_params(struct dm_ioctl *param)
-{
- vfree(param);
-}
-
static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
{
struct dm_ioctl tmp, *dmi;
+ int secure_data;
if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
return -EFAULT;
@@ -1516,17 +1512,30 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
return -EINVAL;
+ secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
+
dmi = vmalloc(tmp.data_size);
- if (!dmi)
+ if (!dmi) {
+ if (secure_data && clear_user(user, tmp.data_size))
+ return -EFAULT;
return -ENOMEM;
-
- if (copy_from_user(dmi, user, tmp.data_size)) {
- vfree(dmi);
- return -EFAULT;
}
+ if (copy_from_user(dmi, user, tmp.data_size))
+ goto bad;
+
+ /* Wipe the user buffer so we do not return it to userspace */
+ if (secure_data && clear_user(user, tmp.data_size))
+ goto bad;
+
*param = dmi;
return 0;
+
+bad:
+ if (secure_data)
+ memset(dmi, 0, tmp.data_size);
+ vfree(dmi);
+ return -EFAULT;
}
static int validate_params(uint cmd, struct dm_ioctl *param)
@@ -1534,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
/* Always clear this flag */
param->flags &= ~DM_BUFFER_FULL_FLAG;
param->flags &= ~DM_UEVENT_GENERATED_FLAG;
+ param->flags &= ~DM_SECURE_DATA_FLAG;
/* Ignores parameters */
if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1561,10 +1571,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
{
int r = 0;
+ int wipe_buffer;
unsigned int cmd;
struct dm_ioctl *uninitialized_var(param);
ioctl_fn fn = NULL;
- size_t param_size;
+ size_t input_param_size;
/* only root can play with this */
if (!capable(CAP_SYS_ADMIN))
@@ -1611,13 +1622,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
if (r)
return r;
+ input_param_size = param->data_size;
+ wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
+
r = validate_params(cmd, param);
if (r)
goto out;
- param_size = param->data_size;
param->data_size = sizeof(*param);
- r = fn(param, param_size);
+ r = fn(param, input_param_size);
/*
* Copy the results back to userland.
@@ -1625,8 +1638,11 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
if (!r && copy_to_user(user, param, param->data_size))
r = -EFAULT;
- out:
- free_params(param);
+out:
+ if (wipe_buffer)
+ memset(param, 0, input_param_size);
+
+ vfree(param);
return r;
}
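
A minimal userspace sketch (not part of the patch; the usual dm_ioctl setup such as the version[] fields, data_size and the opened control fd is elided) of how a caller opts in to the new buffer wiping:

	struct dm_ioctl *dmi;                   /* ioctl buffer holding, e.g., a crypt key */
	/* ... fill in version[], data_size, name and the table parameters ... */
	dmi->flags |= DM_SECURE_DATA_FLAG;      /* request wiping of both copies */
	ioctl(control_fd, DM_TABLE_LOAD, dmi);  /* control_fd: /dev/mapper/control */

With the flag set, copy_params() clears the user buffer as soon as it has been copied in (or if the copy fails), and ctl_ioctl() memsets the kernel copy before vfree(), so key material does not linger in either copy.
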
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 924f5f0084c2..1bb73a13ca40 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,13 +37,6 @@ struct dm_kcopyd_client {
unsigned int nr_pages;
unsigned int nr_free_pages;
- /*
- * Block devices to unplug.
- * Non-NULL pointer means that a block device has some pending requests
- * and needs to be unplugged.
- */
- struct block_device *unplug[2];
-
struct dm_io_client *io_client;
wait_queue_head_t destroyq;
@@ -315,31 +308,6 @@ static int run_complete_job(struct kcopyd_job *job)
return 0;
}
-/*
- * Unplug the block device at the specified index.
- */
-static void unplug(struct dm_kcopyd_client *kc, int rw)
-{
- if (kc->unplug[rw] != NULL) {
- blk_unplug(bdev_get_queue(kc->unplug[rw]));
- kc->unplug[rw] = NULL;
- }
-}
-
-/*
- * Prepare block device unplug. If there's another device
- * to be unplugged at the same array index, we unplug that
- * device first.
- */
-static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
- struct block_device *bdev)
-{
- if (likely(kc->unplug[rw] == bdev))
- return;
- unplug(kc, rw);
- kc->unplug[rw] = bdev;
-}
-
static void complete_io(unsigned long error, void *context)
{
struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -386,16 +354,10 @@ static int run_io_job(struct kcopyd_job *job)
.client = job->kc->io_client,
};
- if (job->rw == READ) {
+ if (job->rw == READ)
r = dm_io(&io_req, 1, &job->source, NULL);
- prepare_unplug(job->kc, READ, job->source.bdev);
- } else {
- if (job->num_dests > 1)
- io_req.bi_rw |= REQ_UNPLUG;
+ else
r = dm_io(&io_req, job->num_dests, job->dests, NULL);
- if (!(io_req.bi_rw & REQ_UNPLUG))
- prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
- }
return r;
}
@@ -466,6 +428,7 @@ static void do_work(struct work_struct *work)
{
struct dm_kcopyd_client *kc = container_of(work,
struct dm_kcopyd_client, kcopyd_work);
+ struct blk_plug plug;
/*
* The order that these are called is *very* important.
@@ -473,18 +436,12 @@ static void do_work(struct work_struct *work)
* Pages jobs when successful will jump onto the io jobs
* list. io jobs call wake when they complete and it all
* starts again.
- *
- * Note that io_jobs add block devices to the unplug array,
- * this array is cleared with "unplug" calls. It is thus
- * forbidden to run complete_jobs after io_jobs and before
- * unplug because the block device could be destroyed in
- * job completion callback.
*/
+ blk_start_plug(&plug);
process_jobs(&kc->complete_jobs, kc, run_complete_job);
process_jobs(&kc->pages_jobs, kc, run_pages_job);
process_jobs(&kc->io_jobs, kc, run_io_job);
- unplug(kc, READ);
- unplug(kc, WRITE);
+ blk_finish_plug(&plug);
}
/*
@@ -665,8 +622,6 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
- memset(kc->unplug, 0, sizeof(kc->unplug));
-
kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
if (!kc->job_pool)
goto bad_slab;
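
The kcopyd changes above drop the driver's own unplug[] bookkeeping in favour of the block layer's on-stack plugging. The pattern being adopted is roughly (a sketch, not part of the patch):

	struct blk_plug plug;

	blk_start_plug(&plug);    /* start batching I/O submitted by this task */
	/* ... issue a batch of requests, e.g. the dm_io() calls in run_io_job() ... */
	blk_finish_plug(&plug);   /* flush everything queued while the plug was held */

I/O submitted between the two calls is collected on a per-task list and dispatched together, which is why the explicit per-device unplug() calls can go away.
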
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 049eaf12aaab..1f23e048f077 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -134,7 +134,7 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
{
struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
- if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN))
+ if (!cap_raised(current_cap(), CAP_SYS_ADMIN))
return;
spin_lock(&receiving_list_lock);
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6951536ea29c..a1f321889676 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -251,20 +251,20 @@ struct log_c {
*/
static inline int log_test_bit(uint32_t *bs, unsigned bit)
{
- return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
+ return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
}
static inline void log_set_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
- ext2_set_bit(bit, (unsigned long *) bs);
+ __test_and_set_bit_le(bit, (unsigned long *) bs);
l->touched_cleaned = 1;
}
static inline void log_clear_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
- ext2_clear_bit(bit, (unsigned long *) bs);
+ __test_and_clear_bit_le(bit, (unsigned long *) bs);
l->touched_dirtied = 1;
}
@@ -543,7 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
return -EINVAL;
}
- r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev);
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
if (r)
return r;
@@ -740,7 +740,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
return 0;
do {
- *region = ext2_find_next_zero_bit(
+ *region = find_next_zero_bit_le(
(unsigned long *) lc->sync_bits,
lc->region_count,
lc->sync_search);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index b82d28819e2a..a550a057d991 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -844,8 +844,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
{
/* target parameters */
static struct param _params[] = {
- {1, 1024, "invalid number of priority groups"},
- {1, 1024, "invalid initial priority group number"},
+ {0, 1024, "invalid number of priority groups"},
+ {0, 1024, "invalid initial priority group number"},
};
int r;
@@ -879,6 +879,13 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
if (r)
goto bad;
+ if ((!m->nr_priority_groups && next_pg_num) ||
+ (m->nr_priority_groups && !next_pg_num)) {
+ ti->error = "invalid initial priority group";
+ r = -EINVAL;
+ goto bad;
+ }
+
/* parse the priority groups */
while (as.argc) {
struct priority_group *pg;
@@ -1065,7 +1072,7 @@ out:
static int action_dev(struct multipath *m, struct dm_dev *dev,
action_fn action)
{
- int r = 0;
+ int r = -EINVAL;
struct pgpath *pgpath;
struct priority_group *pg;
@@ -1283,24 +1290,22 @@ static int do_end_io(struct multipath *m, struct request *clone,
if (!error && !clone->errors)
return 0; /* I/O complete */
- if (error == -EOPNOTSUPP)
- return error;
-
- if (clone->cmd_flags & REQ_DISCARD)
- /*
- * Pass all discard request failures up.
- * FIXME: only fail_path if the discard failed due to a
- * transport problem. This requires precise understanding
- * of the underlying failure (e.g. the SCSI sense).
- */
+ if (error == -EOPNOTSUPP || error == -EREMOTEIO)
return error;
if (mpio->pgpath)
fail_path(mpio->pgpath);
spin_lock_irqsave(&m->lock, flags);
- if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
- r = -EIO;
+ if (!m->nr_valid_paths) {
+ if (!m->queue_if_no_path) {
+ if (!__must_push_back(m))
+ r = -EIO;
+ } else {
+ if (error == -EBADE)
+ r = error;
+ }
+ }
spin_unlock_irqrestore(&m->lock, flags);
return r;
@@ -1417,7 +1422,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
else if (m->current_pg)
pg_num = m->current_pg->pg_num;
else
- pg_num = 1;
+ pg_num = (m->nr_priority_groups ? 1 : 0);
DMEMIT("%u ", pg_num);
@@ -1671,7 +1676,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b9e1e15ef11c..e5d8904fc8f6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -390,13 +390,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
return md_raid5_congested(&rs->md, bits);
}
-static void raid_unplug(struct dm_target_callbacks *cb)
-{
- struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
-
- md_raid5_unplug_device(rs->md.private);
-}
-
/*
* Construct a RAID4/5/6 mapping:
* Args:
@@ -487,7 +480,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
rs->callbacks.congested_fn = raid_is_congested;
- rs->callbacks.unplug_fn = raid_unplug;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
return 0;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index dee326775c60..976ad4688afc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -842,8 +842,6 @@ static void do_mirror(struct work_struct *work)
do_reads(ms, &reads);
do_writes(ms, &writes);
do_failures(ms, &failures);
-
- dm_table_unplug_all(ms->ti->table);
}
/*-----------------------------------------------------------------
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index dad011aed0c9..7771ed212182 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
/*
* Possible cases:
* 1) DM_RH_DIRTY
- * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed
+ * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed
* 3) DM_RH_RECOVERING: flushing pending writes
* Either case, the region should have not been connected to list.
*/
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index fdde53cd12b7..a2d330942cb2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1080,7 +1080,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
argv++;
argc--;
- r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow);
+ r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
if (r) {
ti->error = "Cannot get COW device";
goto bad_cow;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index dddfa14f2982..3d80cf0c152d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -396,9 +396,29 @@ static void stripe_io_hints(struct dm_target *ti,
blk_limits_io_opt(limits, chunk_size * sc->stripes);
}
+static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct stripe_c *sc = ti->private;
+ sector_t bvm_sector = bvm->bi_sector;
+ uint32_t stripe;
+ struct request_queue *q;
+
+ stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
+
+ q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = sc->stripe[stripe].dev->bdev;
+ bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector;
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
static struct target_type stripe_target = {
.name = "striped",
- .version = {1, 3, 1},
+ .version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
@@ -407,6 +427,7 @@ static struct target_type stripe_target = {
.status = stripe_status,
.iterate_devices = stripe_iterate_devices,
.io_hints = stripe_io_hints,
+ .merge = stripe_merge,
};
int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 38e4eb1bb965..cb8380c9767f 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -55,6 +55,7 @@ struct dm_table {
struct dm_target *targets;
unsigned discards_supported:1;
+ unsigned integrity_supported:1;
/*
* Indicates the rw permissions for the new logical
@@ -859,7 +860,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t)
return -EINVAL;
}
- t->mempools = dm_alloc_md_mempools(type);
+ t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
if (!t->mempools)
return -ENOMEM;
@@ -926,18 +927,80 @@ static int dm_table_build_index(struct dm_table *t)
}
/*
+ * Get a disk whose integrity profile reflects the table's profile.
+ * If %match_all is true, all devices' profiles must match.
+ * If %match_all is false, all devices must at least have an
+ * allocated integrity profile; but uninitialized is ok.
+ * Returns NULL if integrity support was inconsistent or unavailable.
+ */
+static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
+ bool match_all)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd = NULL;
+ struct gendisk *prev_disk = NULL, *template_disk = NULL;
+
+ list_for_each_entry(dd, devices, list) {
+ template_disk = dd->dm_dev.bdev->bd_disk;
+ if (!blk_get_integrity(template_disk))
+ goto no_integrity;
+ if (!match_all && !blk_integrity_is_initialized(template_disk))
+ continue; /* skip uninitialized profiles */
+ else if (prev_disk &&
+ blk_integrity_compare(prev_disk, template_disk) < 0)
+ goto no_integrity;
+ prev_disk = template_disk;
+ }
+
+ return template_disk;
+
+no_integrity:
+ if (prev_disk)
+ DMWARN("%s: integrity not set: %s and %s profile mismatch",
+ dm_device_name(t->md),
+ prev_disk->disk_name,
+ template_disk->disk_name);
+ return NULL;
+}
+
+/*
* Register the mapped device for blk_integrity support if
- * the underlying devices support it.
+ * the underlying devices have an integrity profile. But all devices
+ * may not have matching profiles (checking all devices isn't reliable
+ * during table load because this table may use other DM device(s) which
+ * must be resumed before they will have an initialized integrity profile).
+ * Stacked DM devices force a two-stage integrity profile validation:
+ * 1 - during load, validate all initialized integrity profiles match
+ * 2 - during resume, validate all integrity profiles match
*/
static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
{
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_dev_internal *dd;
+ struct gendisk *template_disk = NULL;
- list_for_each_entry(dd, devices, list)
- if (bdev_get_integrity(dd->dm_dev.bdev))
- return blk_integrity_register(dm_disk(md), NULL);
+ template_disk = dm_table_get_integrity_disk(t, false);
+ if (!template_disk)
+ return 0;
+ if (!blk_integrity_is_initialized(dm_disk(md))) {
+ t->integrity_supported = 1;
+ return blk_integrity_register(dm_disk(md), NULL);
+ }
+
+ /*
+ * If the DM device already has an initialized integrity
+ * profile the new profile should not conflict.
+ */
+ if (blk_integrity_is_initialized(template_disk) &&
+ blk_integrity_compare(dm_disk(md), template_disk) < 0) {
+ DMWARN("%s: conflict with existing integrity profile: "
+ "%s profile mismatch",
+ dm_device_name(t->md),
+ template_disk->disk_name);
+ return 1;
+ }
+
+ /* Preserve existing initialized integrity profile */
+ t->integrity_supported = 1;
return 0;
}
@@ -1091,41 +1154,27 @@ combine_limits:
/*
* Set the integrity profile for this device if all devices used have
- * matching profiles.
+ * matching profiles. We're quite deep in the resume path but still
+ * don't know if all devices (particularly DM devices this device
+ * may be stacked on) have matching profiles. Even if the profiles
+ * don't match we have no way to fail (to resume) at this point.
*/
static void dm_table_set_integrity(struct dm_table *t)
{
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_dev_internal *prev = NULL, *dd = NULL;
+ struct gendisk *template_disk = NULL;
if (!blk_get_integrity(dm_disk(t->md)))
return;
- list_for_each_entry(dd, devices, list) {
- if (prev &&
- blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
- dd->dm_dev.bdev->bd_disk) < 0) {
- DMWARN("%s: integrity not set: %s and %s mismatch",
- dm_device_name(t->md),
- prev->dm_dev.bdev->bd_disk->disk_name,
- dd->dm_dev.bdev->bd_disk->disk_name);
- goto no_integrity;
- }
- prev = dd;
+ template_disk = dm_table_get_integrity_disk(t, true);
+ if (!template_disk &&
+ blk_integrity_is_initialized(dm_disk(t->md))) {
+ DMWARN("%s: device no longer has a valid integrity profile",
+ dm_device_name(t->md));
+ return;
}
-
- if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
- goto no_integrity;
-
blk_integrity_register(dm_disk(t->md),
- bdev_get_integrity(prev->dm_dev.bdev));
-
- return;
-
-no_integrity:
- blk_integrity_register(dm_disk(t->md), NULL);
-
- return;
+ blk_get_integrity(template_disk));
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1275,29 +1324,6 @@ int dm_table_any_busy_target(struct dm_table *t)
return 0;
}
-void dm_table_unplug_all(struct dm_table *t)
-{
- struct dm_dev_internal *dd;
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_target_callbacks *cb;
-
- list_for_each_entry(dd, devices, list) {
- struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
- char b[BDEVNAME_SIZE];
-
- if (likely(q))
- blk_unplug(q);
- else
- DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s",
- dm_device_name(t->md),
- bdevname(dd->dm_dev.bdev, b));
- }
-
- list_for_each_entry(cb, &t->target_callbacks, list)
- if (cb->unplug_fn)
- cb->unplug_fn(cb);
-}
-
struct mapped_device *dm_table_get_md(struct dm_table *t)
{
return t->md;
@@ -1345,4 +1371,3 @@ EXPORT_SYMBOL(dm_table_get_mode);
EXPORT_SYMBOL(dm_table_get_md);
EXPORT_SYMBOL(dm_table_put);
EXPORT_SYMBOL(dm_table_get);
-EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eaa3af0e0632..0cf68b478878 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -477,7 +477,8 @@ static void start_io_acct(struct dm_io *io)
cpu = part_stat_lock();
part_round_stats(cpu, &dm_disk(md)->part0);
part_stat_unlock();
- dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
+ atomic_set(&dm_disk(md)->part0.in_flight[rw],
+ atomic_inc_return(&md->pending[rw]));
}
static void end_io_acct(struct dm_io *io)
@@ -497,8 +498,8 @@ static void end_io_acct(struct dm_io *io)
* After this is decremented the bio must not be touched if it is
* a flush.
*/
- dm_disk(md)->part0.in_flight[rw] = pending =
- atomic_dec_return(&md->pending[rw]);
+ pending = atomic_dec_return(&md->pending[rw]);
+ atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
pending += atomic_read(&md->pending[rw^0x1]);
/* nudge anyone waiting on suspend queue */
@@ -807,8 +808,6 @@ void dm_requeue_unmapped_request(struct request *clone)
dm_unprep_request(rq);
spin_lock_irqsave(q->queue_lock, flags);
- if (elv_queue_empty(q))
- blk_plug_device(q);
blk_requeue_request(q, rq);
spin_unlock_irqrestore(q->queue_lock, flags);
@@ -1613,10 +1612,10 @@ static void dm_request_fn(struct request_queue *q)
* number of in-flight I/Os after the queue is stopped in
* dm_suspend().
*/
- while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+ while (!blk_queue_stopped(q)) {
rq = blk_peek_request(q);
if (!rq)
- goto plug_and_out;
+ goto delay_and_out;
/* always use block 0 to find the target for flushes for now */
pos = 0;
@@ -1627,7 +1626,7 @@ static void dm_request_fn(struct request_queue *q)
BUG_ON(!dm_target_is_valid(ti));
if (ti->type->busy && ti->type->busy(ti))
- goto plug_and_out;
+ goto delay_and_out;
blk_start_request(rq);
clone = rq->special;
@@ -1647,11 +1646,8 @@ requeued:
BUG_ON(!irqs_disabled());
spin_lock(q->queue_lock);
-plug_and_out:
- if (!elv_queue_empty(q))
- /* Some requests still remain, retry later */
- blk_plug_device(q);
-
+delay_and_out:
+ blk_delay_queue(q, HZ / 10);
out:
dm_table_put(map);
@@ -1680,20 +1676,6 @@ static int dm_lld_busy(struct request_queue *q)
return r;
}
-static void dm_unplug_all(struct request_queue *q)
-{
- struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_live_table(md);
-
- if (map) {
- if (dm_request_based(md))
- generic_unplug_device(q);
-
- dm_table_unplug_all(map);
- dm_table_put(map);
- }
-}
-
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r = bdi_bits;
@@ -1817,7 +1799,6 @@ static void dm_init_md_queue(struct mapped_device *md)
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
- md->queue->unplug_fn = dm_unplug_all;
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
}
@@ -2263,8 +2244,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
int r = 0;
DECLARE_WAITQUEUE(wait, current);
- dm_unplug_all(md->queue);
-
add_wait_queue(&md->wait, &wait);
while (1) {
@@ -2539,7 +2518,6 @@ int dm_resume(struct mapped_device *md)
clear_bit(DMF_SUSPENDED, &md->flags);
- dm_table_unplug_all(map);
r = 0;
out:
dm_table_put(map);
@@ -2643,9 +2621,10 @@ int dm_noflush_suspending(struct dm_target *ti)
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
{
struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+ unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
if (!pools)
return NULL;
@@ -2662,13 +2641,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
if (!pools->tio_pool)
goto free_io_pool_and_out;
- pools->bs = (type == DM_TYPE_BIO_BASED) ?
- bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+ pools->bs = bioset_create(pool_size, 0);
if (!pools->bs)
goto free_tio_pool_and_out;
+ if (integrity && bioset_integrity_create(pools->bs, pool_size))
+ goto free_bioset_and_out;
+
return pools;
+free_bioset_and_out:
+ bioset_free(pools->bs);
+
free_tio_pool_and_out:
mempool_destroy(pools->tio_pool);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0c2dd5f4af76..1aaf16746da8 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void);
/*
* Mempool operations
*/
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
void dm_free_md_mempools(struct dm_md_mempools *pools);
#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 339fdc670751..23078dabb6df 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -30,7 +30,7 @@
*
* Different modes can be active at a time, but only
* one can be set at array creation. Others can be added later.
- * A mode can be one-shot or recurrent with the recurrance being
+ * A mode can be one-shot or recurrent with the recurrence being
* once in every N requests.
* The bottom 5 bits of the "layout" indicate the mode. The
* remainder indicate a period, or 0 for one-shot.
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 8a2f767f26d8..abfb59a61ede 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
return maxsectors << 9;
}
-static void linear_unplug(struct request_queue *q)
-{
- mddev_t *mddev = q->queuedata;
- linear_conf_t *conf;
- int i;
-
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
-
- for (i=0; i < mddev->raid_disks; i++) {
- struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
- blk_unplug(r_queue);
- }
- rcu_read_unlock();
-}
-
static int linear_congested(void *data, int bits)
{
mddev_t *mddev = data;
@@ -216,7 +200,6 @@ static int linear_run (mddev_t *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
- mddev->queue->queue_lock = &mddev->queue->__queue_lock;
conf = linear_conf(mddev, mddev->raid_disks);
if (!conf)
@@ -225,11 +208,9 @@ static int linear_run (mddev_t *mddev)
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
- mddev->queue->unplug_fn = linear_unplug;
mddev->queue->backing_dev_info.congested_fn = linear_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
- md_integrity_register(mddev);
- return 0;
+ return md_integrity_register(mddev);
}
static void free_conf(struct rcu_head *head)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b76cfc89e1b5..6e853c61d87e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -287,6 +287,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
mddev_t *mddev = q->queuedata;
int rv;
int cpu;
+ unsigned int sectors;
if (mddev == NULL || mddev->pers == NULL
|| !mddev->ready) {
@@ -311,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
atomic_inc(&mddev->active_io);
rcu_read_unlock();
+ /*
+ * save the sectors now since our bio can
+ * go away inside make_request
+ */
+ sectors = bio_sectors(bio);
rv = mddev->pers->make_request(mddev, bio);
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
- bio_sectors(bio));
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
part_stat_unlock();
if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
@@ -442,48 +447,59 @@ EXPORT_SYMBOL(md_flush_request);
/* Support for plugging.
* This mirrors the plugging support in request_queue, but does not
- * require having a whole queue
+ * require having a whole queue or request structures.
+ * We allocate an md_plug_cb for each md device and each thread it gets
+ * plugged on. This links to the plug_cnt count in the mddev, where we
+ * keep the number of outstanding plugs, so other code can see if a
+ * plug is active.
*/
-static void plugger_work(struct work_struct *work)
-{
- struct plug_handle *plug =
- container_of(work, struct plug_handle, unplug_work);
- plug->unplug_fn(plug);
-}
-static void plugger_timeout(unsigned long data)
-{
- struct plug_handle *plug = (void *)data;
- kblockd_schedule_work(NULL, &plug->unplug_work);
-}
-void plugger_init(struct plug_handle *plug,
- void (*unplug_fn)(struct plug_handle *))
-{
- plug->unplug_flag = 0;
- plug->unplug_fn = unplug_fn;
- init_timer(&plug->unplug_timer);
- plug->unplug_timer.function = plugger_timeout;
- plug->unplug_timer.data = (unsigned long)plug;
- INIT_WORK(&plug->unplug_work, plugger_work);
-}
-EXPORT_SYMBOL_GPL(plugger_init);
+struct md_plug_cb {
+ struct blk_plug_cb cb;
+ mddev_t *mddev;
+};
-void plugger_set_plug(struct plug_handle *plug)
+static void plugger_unplug(struct blk_plug_cb *cb)
{
- if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag))
- mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1);
+ struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
+ if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
+ md_wakeup_thread(mdcb->mddev->thread);
+ kfree(mdcb);
}
-EXPORT_SYMBOL_GPL(plugger_set_plug);
-int plugger_remove_plug(struct plug_handle *plug)
+/* Check that an unplug wakeup will come shortly.
+ * If not, the caller should wake up the md thread immediately.
+ */
+int mddev_check_plugged(mddev_t *mddev)
{
- if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) {
- del_timer(&plug->unplug_timer);
- return 1;
- } else
+ struct blk_plug *plug = current->plug;
+ struct md_plug_cb *mdcb;
+
+ if (!plug)
return 0;
-}
-EXPORT_SYMBOL_GPL(plugger_remove_plug);
+ list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
+ if (mdcb->cb.callback == plugger_unplug &&
+ mdcb->mddev == mddev) {
+ /* Already on the list, move to top */
+ if (mdcb != list_first_entry(&plug->cb_list,
+ struct md_plug_cb,
+ cb.list))
+ list_move(&mdcb->cb.list, &plug->cb_list);
+ return 1;
+ }
+ }
+ /* Not currently on the callback list */
+ mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
+ if (!mdcb)
+ return 0;
+
+ mdcb->mddev = mddev;
+ mdcb->cb.callback = plugger_unplug;
+ atomic_inc(&mddev->plug_cnt);
+ list_add(&mdcb->cb.list, &plug->cb_list);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(mddev_check_plugged);
static inline mddev_t *mddev_get(mddev_t *mddev)
{
@@ -533,6 +549,7 @@ void mddev_init(mddev_t *mddev)
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
+ atomic_set(&mddev->plug_cnt, 0);
spin_lock_init(&mddev->write_lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
@@ -548,6 +565,9 @@ static mddev_t * mddev_find(dev_t unit)
{
mddev_t *mddev, *new = NULL;
+ if (unit && MAJOR(unit) != MD_MAJOR)
+ unit &= ~((1<<MdpMinorShift)-1);
+
retry:
spin_lock(&all_mddevs_lock);
@@ -772,8 +792,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
bio->bi_end_io = super_written;
atomic_inc(&mddev->pending_writes);
- submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
- bio);
+ submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
}
void md_super_wait(mddev_t *mddev)
@@ -801,7 +820,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
struct completion event;
int ret;
- rw |= REQ_SYNC | REQ_UNPLUG;
+ rw |= REQ_SYNC;
bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
rdev->meta_bdev : rdev->bdev;
@@ -1770,12 +1789,6 @@ int md_integrity_register(mddev_t *mddev)
continue;
if (rdev->raid_disk < 0)
continue;
- /*
- * If at least one rdev is not integrity capable, we can not
- * enable data integrity for the md device.
- */
- if (!bdev_get_integrity(rdev->bdev))
- return -EINVAL;
if (!reference) {
/* Use the first rdev as the reference */
reference = rdev;
@@ -1786,6 +1799,8 @@ int md_integrity_register(mddev_t *mddev)
rdev->bdev->bd_disk) < 0)
return -EINVAL;
}
+ if (!reference || !bdev_get_integrity(reference->bdev))
+ return 0;
/*
* All component devices are integrity capable and have matching
* profiles, register the common profile for the md device.
@@ -1796,8 +1811,12 @@ int md_integrity_register(mddev_t *mddev)
mdname(mddev));
return -EINVAL;
}
- printk(KERN_NOTICE "md: data integrity on %s enabled\n",
- mdname(mddev));
+ printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+ if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+ printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
return 0;
}
EXPORT_SYMBOL(md_integrity_register);
@@ -1947,8 +1966,6 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
__bdevname(dev, b));
return PTR_ERR(bdev);
}
- if (!shared)
- set_bit(AllReserved, &rdev->flags);
rdev->bdev = bdev;
return err;
}
@@ -2465,6 +2482,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (rdev->raid_disk != -1)
return -EBUSY;
+ if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
+ return -EBUSY;
+
if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL;
@@ -2610,12 +2630,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
mddev_lock(mddev);
list_for_each_entry(rdev2, &mddev->disks, same_set)
- if (test_bit(AllReserved, &rdev2->flags) ||
- (rdev->bdev == rdev2->bdev &&
- rdev != rdev2 &&
- overlaps(rdev->data_offset, rdev->sectors,
- rdev2->data_offset,
- rdev2->sectors))) {
+ if (rdev->bdev == rdev2->bdev &&
+ rdev != rdev2 &&
+ overlaps(rdev->data_offset, rdev->sectors,
+ rdev2->data_offset,
+ rdev2->sectors)) {
overlap = 1;
break;
}
@@ -4133,10 +4152,10 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
}
mddev->array_sectors = sectors;
- set_capacity(mddev->gendisk, mddev->array_sectors);
- if (mddev->pers)
+ if (mddev->pers) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
-
+ }
return len;
}
@@ -4619,6 +4638,7 @@ static int do_md_run(mddev_t *mddev)
}
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
out:
return err;
@@ -4707,6 +4727,7 @@ static void md_clean(mddev_t *mddev)
mddev->sync_speed_min = mddev->sync_speed_max = 0;
mddev->recovery = 0;
mddev->in_sync = 0;
+ mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
mddev->bitmap_info.offset = 0;
@@ -4714,7 +4735,6 @@ static void md_clean(mddev_t *mddev)
mddev->bitmap_info.chunksize = 0;
mddev->bitmap_info.daemon_sleep = 0;
mddev->bitmap_info.max_write_behind = 0;
- mddev->plug = NULL;
}
static void __md_stop_writes(mddev_t *mddev)
@@ -4807,7 +4827,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
__md_stop_writes(mddev);
md_stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
- mddev->queue->unplug_fn = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
/* tell userspace to handle 'inactive' */
@@ -4822,6 +4841,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
set_capacity(disk, 0);
mutex_unlock(&mddev->open_mutex);
+ mddev->changed = 1;
revalidate_disk(disk);
if (mddev->ro)
@@ -5578,6 +5598,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
mddev->delta_disks = raid_disks - mddev->raid_disks;
rv = mddev->pers->check_reshape(mddev);
+ if (rv < 0)
+ mddev->delta_disks = 0;
return rv;
}
@@ -6004,7 +6026,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex);
- check_disk_size_change(mddev->gendisk, bdev);
+ check_disk_change(bdev);
out:
return err;
}
@@ -6019,6 +6041,21 @@ static int md_release(struct gendisk *disk, fmode_t mode)
return 0;
}
+
+static int md_media_changed(struct gendisk *disk)
+{
+ mddev_t *mddev = disk->private_data;
+
+ return mddev->changed;
+}
+
+static int md_revalidate(struct gendisk *disk)
+{
+ mddev_t *mddev = disk->private_data;
+
+ mddev->changed = 0;
+ return 0;
+}
static const struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
@@ -6029,6 +6066,8 @@ static const struct block_device_operations md_fops =
.compat_ioctl = md_compat_ioctl,
#endif
.getgeo = md_getgeo,
+ .media_changed = md_media_changed,
+ .revalidate_disk= md_revalidate,
};
static int md_thread(void * arg)
@@ -6238,7 +6277,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
* rt is a sector_t, so could be 32bit or 64bit.
* So we divide before multiply in case it is 32bit and close
* to the limit.
- * We scale the divisor (db) by 32 to avoid loosing precision
+ * We scale the divisor (db) by 32 to avoid losing precision
* near the end of resync when the number of remaining sectors
* is close to 'db'.
* We then divide rt by 32 after multiplying by db to compensate.
@@ -6660,14 +6699,6 @@ int md_allow_write(mddev_t *mddev)
}
EXPORT_SYMBOL_GPL(md_allow_write);
-void md_unplug(mddev_t *mddev)
-{
- if (mddev->queue)
- blk_unplug(mddev->queue);
- if (mddev->plug)
- mddev->plug->unplug_fn(mddev->plug);
-}
-
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
void md_do_sync(mddev_t *mddev)
@@ -6846,7 +6877,6 @@ void md_do_sync(mddev_t *mddev)
>= mddev->resync_max - mddev->curr_resync_completed
)) {
/* time to update curr_resync_completed */
- md_unplug(mddev);
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed = j;
@@ -6922,7 +6952,6 @@ void md_do_sync(mddev_t *mddev)
* about not overloading the IO subsystem. (things like an
* e2fsck being done on the RAID array should execute fast)
*/
- md_unplug(mddev);
cond_resched();
currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6941,8 +6970,6 @@ void md_do_sync(mddev_t *mddev)
* this also signals 'finished resyncing' to md_stop
*/
out:
- md_unplug(mddev);
-
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
/* tell personality that we are finished */
@@ -6985,9 +7012,6 @@ void md_do_sync(mddev_t *mddev)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
mddev->curr_resync = 0;
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
- mddev->curr_resync_completed = 0;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
wake_up(&resync_wait);
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -7028,7 +7052,7 @@ static int remove_and_add_spares(mddev_t *mddev)
}
}
- if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
+ if (mddev->degraded && !mddev->recovery_disabled) {
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
@@ -7151,7 +7175,20 @@ void md_check_recovery(mddev_t *mddev)
/* Only thing we do on a ro array is remove
* failed devices.
*/
- remove_and_add_spares(mddev);
+ mdk_rdev_t *rdev;
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Blocked, &rdev->flags) &&
+ test_bit(Faulty, &rdev->flags) &&
+ atomic_read(&rdev->nr_pending)==0) {
+ if (mddev->pers->hot_remove_disk(
+ mddev, rdev->raid_disk)==0) {
+ char nm[20];
+ sprintf(nm,"rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = -1;
+ }
+ }
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
@@ -7321,7 +7358,7 @@ static int __init md_init(void)
{
int ret = -ENOMEM;
- md_wq = alloc_workqueue("md", WQ_RESCUER, 0);
+ md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
if (!md_wq)
goto err_wq;
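
How a personality's make_request path is expected to use the new helper (a sketch modelled on the raid1/raid5 callers elsewhere in this series, not code from md.c itself):

	int plugged = mddev_check_plugged(mddev); /* queue plugger_unplug() on current->plug */

	/* ... add the write to conf->pending_bio_list ... */

	if (!plugged)
		md_wakeup_thread(mddev->thread);  /* no plug active, kick the md thread now */

When the task's plug is flushed, plugger_unplug() decrements mddev->plug_cnt and wakes the md thread, so bios queued while plugged are handled as one batch.
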
diff --git a/drivers/md/md.h b/drivers/md/md.h
index eec517ced31a..0b1fd3f1d85b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,26 +29,6 @@
typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;
-/* generic plugging support - like that provided with request_queue,
- * but does not require a request_queue
- */
-struct plug_handle {
- void (*unplug_fn)(struct plug_handle *);
- struct timer_list unplug_timer;
- struct work_struct unplug_work;
- unsigned long unplug_flag;
-};
-#define PLUGGED_FLAG 1
-void plugger_init(struct plug_handle *plug,
- void (*unplug_fn)(struct plug_handle *));
-void plugger_set_plug(struct plug_handle *plug);
-int plugger_remove_plug(struct plug_handle *plug);
-static inline void plugger_flush(struct plug_handle *plug)
-{
- del_timer_sync(&plug->unplug_timer);
- cancel_work_sync(&plug->unplug_work);
-}
-
/*
* MD's 'extended' device
*/
@@ -93,10 +73,8 @@ struct mdk_rdev_s
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
-#define AllReserved 6 /* If whole device is reserved for
- * one array */
#define AutoDetected 7 /* added by auto-detect */
-#define Blocked 8 /* An error occured on an externally
+#define Blocked 8 /* An error occurred on an externally
* managed array, don't allow writes
* until it is cleared */
wait_queue_head_t blocked_wait;
@@ -201,6 +179,9 @@ struct mddev_s
int delta_disks, new_level, new_layout;
int new_chunk_sectors;
+ atomic_t plug_cnt; /* If device is expecting
+ * more bios soon.
+ */
struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
sector_t curr_resync; /* last block scheduled */
@@ -276,6 +257,8 @@ struct mddev_s
atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
+ int changed; /* True if we might need to
+ * reread partition info */
int degraded; /* whether md should consider
* adding a spare
*/
@@ -336,7 +319,6 @@ struct mddev_s
struct list_head all_mddevs;
struct attribute_group *to_remove;
- struct plug_handle *plug; /* if used by personality */
struct bio_set *bio_set;
@@ -516,7 +498,6 @@ extern int md_integrity_register(mddev_t *mddev);
extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void restore_bitmap_write_access(struct file *file);
-extern void md_unplug(mddev_t *mddev);
extern void mddev_init(mddev_t *mddev);
extern int md_run(mddev_t *mddev);
@@ -530,4 +511,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
mddev_t *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
mddev_t *mddev);
+extern int mddev_check_plugged(mddev_t *mddev);
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 6d7ddf32ef2e..c35890990985 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,36 +106,6 @@ static void multipath_end_request(struct bio *bio, int error)
rdev_dec_pending(rdev, conf->mddev);
}
-static void unplug_slaves(mddev_t *mddev)
-{
- multipath_conf_t *conf = mddev->private;
- int i;
-
- rcu_read_lock();
- for (i=0; i<mddev->raid_disks; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)
- && atomic_read(&rdev->nr_pending)) {
- struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
-
- blk_unplug(r_queue);
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
-}
-
-static void multipath_unplug(struct request_queue *q)
-{
- unplug_slaves(q->queuedata);
-}
-
-
static int multipath_make_request(mddev_t *mddev, struct bio * bio)
{
multipath_conf_t *conf = mddev->private;
@@ -345,7 +315,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
p->rdev = rdev;
goto abort;
}
- md_integrity_register(mddev);
+ err = md_integrity_register(mddev);
}
abort:
@@ -435,7 +405,6 @@ static int multipath_run (mddev_t *mddev)
* bookkeeping area. [whatever we allocate in multipath_run(),
* should be freed in multipath_stop()]
*/
- mddev->queue->queue_lock = &mddev->queue->__queue_lock;
conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
mddev->private = conf;
@@ -518,10 +487,12 @@ static int multipath_run (mddev_t *mddev)
*/
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
- mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
- md_integrity_register(mddev);
+
+ if (md_integrity_register(mddev))
+ goto out_free_conf;
+
return 0;
out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a39f4c355e55..e86bf3682e1e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,21 +25,6 @@
#include "raid0.h"
#include "raid5.h"
-static void raid0_unplug(struct request_queue *q)
-{
- mddev_t *mddev = q->queuedata;
- raid0_conf_t *conf = mddev->private;
- mdk_rdev_t **devlist = conf->devlist;
- int raid_disks = conf->strip_zone[0].nb_dev;
- int i;
-
- for (i=0; i < raid_disks; i++) {
- struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
-
- blk_unplug(r_queue);
- }
-}
-
static int raid0_congested(void *data, int bits)
{
mddev_t *mddev = data;
@@ -179,6 +164,14 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
rdev1->new_raid_disk = j;
}
+ if (mddev->level == 1) {
+ /* taking over a raid1 array -
+ * we have only one active disk
+ */
+ j = 0;
+ rdev1->new_raid_disk = j;
+ }
+
if (j < 0 || j >= mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
"aborting!\n", mdname(mddev), j);
@@ -264,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
mdname(mddev),
(unsigned long long)smallest->sectors);
}
- mddev->queue->unplug_fn = raid0_unplug;
mddev->queue->backing_dev_info.congested_fn = raid0_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
@@ -353,7 +345,6 @@ static int raid0_run(mddev_t *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
- mddev->queue->queue_lock = &mddev->queue->__queue_lock;
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
@@ -388,8 +379,7 @@ static int raid0_run(mddev_t *mddev)
blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
- md_integrity_register(mddev);
- return 0;
+ return md_integrity_register(mddev);
}
static int raid0_stop(mddev_t *mddev)
@@ -644,12 +634,39 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
return priv_conf;
}
+static void *raid0_takeover_raid1(mddev_t *mddev)
+{
+ raid0_conf_t *priv_conf;
+
+ /* Check layout:
+ * - (N - 1) mirror drives must already be faulty
+ */
+ if ((mddev->raid_disks - 1) != mddev->degraded) {
+ printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Set new parameters */
+ mddev->new_level = 0;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */
+ mddev->delta_disks = 1 - mddev->raid_disks;
+ mddev->raid_disks = 1;
+ /* make sure it will not be marked as dirty */
+ mddev->recovery_cp = MaxSector;
+
+ create_strip_zones(mddev, &priv_conf);
+ return priv_conf;
+}
+
static void *raid0_takeover(mddev_t *mddev)
{
/* raid0 can take over:
* raid4 - if all data disks are active.
* raid5 - providing it is Raid4 layout and one disk is faulty
* raid10 - assuming we have all necessary active disks
+ * raid1 - with (N - 1) mirror drives faulty
*/
if (mddev->level == 4)
return raid0_takeover_raid45(mddev);
@@ -665,6 +682,12 @@ static void *raid0_takeover(mddev_t *mddev)
if (mddev->level == 10)
return raid0_takeover_raid10(mddev);
+ if (mddev->level == 1)
+ return raid0_takeover_raid1(mddev);
+
+ printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+ mddev->level);
+
return ERR_PTR(-EINVAL);
}
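The new raid0_takeover_raid1() path only accepts a raid1 array whose redundancy is already gone, and defaults the chunk size to 128 sectors, i.e. 128 * 512 bytes = 64 KiB. A small illustrative helper for the precondition it enforces, using the same mddev fields as the patch:

	/* Sketch only: all but one mirror must already be Faulty, so exactly
	 * one data-bearing disk remains to become the raid0 member. */
	static int raid1_is_reducible_to_raid0(mddev_t *mddev)
	{
		return (mddev->raid_disks - 1) == mddev->degraded;
	}

In practice such a takeover would typically be requested through mdadm's level-change interface (for example, mdadm --grow /dev/md0 --level=0 on a degraded two-disk raid1); that invocation is an illustration of the intended use, not part of this patch.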
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a23ffa397ba9..2b7a7ff401dc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
#define NR_RAID1_BIOS 256
-static void unplug_slaves(mddev_t *mddev);
-
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
- r1bio_t *r1_bio;
int size = offsetof(r1bio_t, bios[pi->raid_disks]);
/* allocate a r1bio with room for raid_disks entries in the bios array */
- r1_bio = kzalloc(size, gfp_flags);
- if (!r1_bio && pi->mddev)
- unplug_slaves(pi->mddev);
-
- return r1_bio;
+ return kzalloc(size, gfp_flags);
}
static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,10 +84,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
int i, j;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
- if (!r1_bio) {
- unplug_slaves(pi->mddev);
+ if (!r1_bio)
return NULL;
- }
/*
* Allocate bios : 1 for reading, n-1 for writing
@@ -520,37 +511,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
return new_disk;
}
-static void unplug_slaves(mddev_t *mddev)
-{
- conf_t *conf = mddev->private;
- int i;
-
- rcu_read_lock();
- for (i=0; i<mddev->raid_disks; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
- struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
-
- blk_unplug(r_queue);
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
-}
-
-static void raid1_unplug(struct request_queue *q)
-{
- mddev_t *mddev = q->queuedata;
-
- unplug_slaves(mddev);
- md_wakeup_thread(mddev->thread);
-}
-
static int raid1_congested(void *data, int bits)
{
mddev_t *mddev = data;
@@ -580,20 +540,16 @@ static int raid1_congested(void *data, int bits)
}
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
{
/* Any writes that have been queued but are awaiting
* bitmap updates get flushed here.
- * We return 1 if any requests were actually submitted.
*/
- int rv = 0;
-
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) {
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
- blk_remove_plug(conf->mddev->queue);
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to
* disk before proceeding w/ I/O */
@@ -605,10 +561,8 @@ static int flush_pending_writes(conf_t *conf)
generic_make_request(bio);
bio = next;
}
- rv = 1;
} else
spin_unlock_irq(&conf->device_lock);
- return rv;
}
/* Barriers....
@@ -640,8 +594,7 @@ static void raise_barrier(conf_t *conf)
/* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
- conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ conf->resync_lock, );
/* block any new IO from starting */
conf->barrier++;
@@ -649,8 +602,7 @@ static void raise_barrier(conf_t *conf)
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ conf->resync_lock, );
spin_unlock_irq(&conf->resync_lock);
}
@@ -672,7 +624,7 @@ static void wait_barrier(conf_t *conf)
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -708,8 +660,7 @@ static void freeze_array(conf_t *conf)
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- ({ flush_pending_writes(conf);
- raid1_unplug(conf->mddev->queue); }));
+ flush_pending_writes(conf));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(conf_t *conf)
@@ -771,6 +722,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev;
+ int plugged;
/*
* Register the new request and wait if the reconstruction
@@ -862,6 +814,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
+ plugged = mddev_check_plugged(mddev);
+
disks = conf->raid_disks;
retry_write:
blocked_rdev = NULL;
@@ -959,7 +913,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
atomic_inc(&r1_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
- blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
@@ -968,7 +921,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
- if (do_sync)
+ if (do_sync || !bitmap || !plugged)
md_wakeup_thread(mddev->thread);
return 0;
@@ -1175,7 +1128,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
p->rdev = rdev;
goto abort;
}
- md_integrity_register(mddev);
+ err = md_integrity_register(mddev);
}
abort:
@@ -1558,15 +1511,17 @@ static void raid1d(mddev_t *mddev)
unsigned long flags;
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
- int unplug=0;
mdk_rdev_t *rdev;
+ struct blk_plug plug;
md_check_recovery(mddev);
-
+
+ blk_start_plug(&plug);
for (;;) {
char b[BDEVNAME_SIZE];
- unplug += flush_pending_writes(conf);
+ if (atomic_read(&mddev->plug_cnt) == 0)
+ flush_pending_writes(conf);
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head)) {
@@ -1580,10 +1535,9 @@ static void raid1d(mddev_t *mddev)
mddev = r1_bio->mddev;
conf = mddev->private;
- if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+ if (test_bit(R1BIO_IsSync, &r1_bio->state))
sync_request_write(mddev, r1_bio);
- unplug = 1;
- } else {
+ else {
int disk;
/* we got a read error. Maybe the drive is bad. Maybe just
@@ -1633,14 +1587,12 @@ static void raid1d(mddev_t *mddev)
bio->bi_end_io = raid1_end_read_request;
bio->bi_rw = READ | do_sync;
bio->bi_private = r1_bio;
- unplug = 1;
generic_make_request(bio);
}
}
cond_resched();
}
- if (unplug)
- unplug_slaves(mddev);
+ blk_finish_plug(&plug);
}
@@ -2021,7 +1973,6 @@ static int run(mddev_t *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- mddev->queue->queue_lock = &conf->device_lock;
list_for_each_entry(rdev, &mddev->disks, same_set) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -2064,11 +2015,9 @@ static int run(mddev_t *mddev)
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
- mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
- md_integrity_register(mddev);
- return 0;
+ return md_integrity_register(mddev);
}
static int stop(mddev_t *mddev)
@@ -2090,7 +2039,6 @@ static int stop(mddev_t *mddev)
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
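The raid1.c hunks above show the core of this series: the per-queue unplug_fn/unplug_slaves machinery is gone, and the daemon instead brackets its submission loop with on-stack, per-task plugging. A stripped-down sketch of that shape, not the full raid1d():

	/* Illustrative only: the new plugging pattern used by the md daemons. */
	static void example_raid_daemon(mddev_t *mddev)
	{
		conf_t *conf = mddev->private;
		struct blk_plug plug;

		blk_start_plug(&plug);			/* batch everything submitted below */
		if (atomic_read(&mddev->plug_cnt) == 0)
			flush_pending_writes(conf);	/* bitmap first, then the queued writes */
		/* ... walk conf->retry_list, resubmit or retry failed bios ... */
		blk_finish_plug(&plug);			/* hand the whole batch to the block layer */
	}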
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 69b659544390..8e9462626ec5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
*
* RAID-10 support for md.
*
- * Base on code in raid1.c. See raid1.c for futher copyright information.
+ * Base on code in raid1.c. See raid1.c for further copyright information.
*
*
* This program is free software; you can redistribute it and/or modify
@@ -57,23 +57,16 @@
*/
#define NR_RAID10_BIOS 256
-static void unplug_slaves(mddev_t *mddev);
-
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
conf_t *conf = data;
- r10bio_t *r10_bio;
int size = offsetof(struct r10bio_s, devs[conf->copies]);
/* allocate a r10bio with room for raid_disks entries in the bios array */
- r10_bio = kzalloc(size, gfp_flags);
- if (!r10_bio && conf->mddev)
- unplug_slaves(conf->mddev);
-
- return r10_bio;
+ return kzalloc(size, gfp_flags);
}
static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
int nalloc;
r10_bio = r10bio_pool_alloc(gfp_flags, conf);
- if (!r10_bio) {
- unplug_slaves(conf->mddev);
+ if (!r10_bio)
return NULL;
- }
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
nalloc = conf->copies; /* resync */
@@ -349,14 +340,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
/*
* RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
* parameters: near_copies and far_copies.
* near_copies * far_copies must be <= raid_disks.
* Normally one of these will be 1.
* If both are 1, we get raid0.
* If near_copies == raid_disks, we get raid1.
*
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
* first chunk, followed by near_copies copies of the next chunk and
* so on.
* If far_copies > 1, then after 1/far_copies of the array has been assigned
@@ -597,37 +588,6 @@ rb_out:
return disk;
}
-static void unplug_slaves(mddev_t *mddev)
-{
- conf_t *conf = mddev->private;
- int i;
-
- rcu_read_lock();
- for (i=0; i < conf->raid_disks; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
- struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
-
- blk_unplug(r_queue);
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
- mddev_t *mddev = q->queuedata;
-
- unplug_slaves(q->queuedata);
- md_wakeup_thread(mddev->thread);
-}
-
static int raid10_congested(void *data, int bits)
{
mddev_t *mddev = data;
@@ -649,20 +609,16 @@ static int raid10_congested(void *data, int bits)
return ret;
}
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
{
/* Any writes that have been queued but are awaiting
* bitmap updates get flushed here.
- * We return 1 if any requests were actually submitted.
*/
- int rv = 0;
-
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) {
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
- blk_remove_plug(conf->mddev->queue);
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to disk
* before proceeding w/ I/O */
@@ -674,11 +630,10 @@ static int flush_pending_writes(conf_t *conf)
generic_make_request(bio);
bio = next;
}
- rv = 1;
} else
spin_unlock_irq(&conf->device_lock);
- return rv;
}
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -708,17 +663,15 @@ static void raise_barrier(conf_t *conf, int force)
/* Wait until no block IO is waiting (unless 'force') */
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
- conf->resync_lock,
- raid10_unplug(conf->mddev->queue));
+ conf->resync_lock, );
/* block any new IO from starting */
conf->barrier++;
- /* No wait for all pending IO to complete */
+ /* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock,
- raid10_unplug(conf->mddev->queue));
+ conf->resync_lock, );
spin_unlock_irq(&conf->resync_lock);
}
@@ -739,7 +692,7 @@ static void wait_barrier(conf_t *conf)
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
conf->resync_lock,
- raid10_unplug(conf->mddev->queue));
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -775,8 +728,8 @@ static void freeze_array(conf_t *conf)
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- ({ flush_pending_writes(conf);
- raid10_unplug(conf->mddev->queue); }));
+ flush_pending_writes(conf));
+
spin_unlock_irq(&conf->resync_lock);
}
@@ -803,6 +756,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
unsigned long flags;
mdk_rdev_t *blocked_rdev;
+ int plugged;
if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bio);
@@ -911,6 +865,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
+ plugged = mddev_check_plugged(mddev);
+
raid10_find_phys(conf, r10_bio);
retry_write:
blocked_rdev = NULL;
@@ -971,7 +927,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
atomic_inc(&r10_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
- blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
@@ -988,9 +943,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
- if (do_sync)
+ if (do_sync || !mddev->bitmap || !plugged)
md_wakeup_thread(mddev->thread);
-
return 0;
}
@@ -1230,7 +1184,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
p->rdev = rdev;
goto abort;
}
- md_integrity_register(mddev);
+ err = md_integrity_register(mddev);
}
abort:
@@ -1681,15 +1635,16 @@ static void raid10d(mddev_t *mddev)
unsigned long flags;
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
- int unplug=0;
mdk_rdev_t *rdev;
+ struct blk_plug plug;
md_check_recovery(mddev);
+ blk_start_plug(&plug);
for (;;) {
char b[BDEVNAME_SIZE];
- unplug += flush_pending_writes(conf);
+ flush_pending_writes(conf);
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head)) {
@@ -1703,13 +1658,11 @@ static void raid10d(mddev_t *mddev)
mddev = r10_bio->mddev;
conf = mddev->private;
- if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+ if (test_bit(R10BIO_IsSync, &r10_bio->state))
sync_request_write(mddev, r10_bio);
- unplug = 1;
- } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+ else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
recovery_request_write(mddev, r10_bio);
- unplug = 1;
- } else {
+ else {
int mirror;
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
@@ -1756,14 +1709,12 @@ static void raid10d(mddev_t *mddev)
bio->bi_rw = READ | do_sync;
bio->bi_private = r10_bio;
bio->bi_end_io = raid10_end_read_request;
- unplug = 1;
generic_make_request(bio);
}
}
cond_resched();
}
- if (unplug)
- unplug_slaves(mddev);
+ blk_finish_plug(&plug);
}
@@ -2304,8 +2255,6 @@ static int run(mddev_t *mddev)
if (!conf)
goto out;
- mddev->queue->queue_lock = &conf->device_lock;
-
mddev->thread = conf->thread;
conf->thread = NULL;
@@ -2376,7 +2325,6 @@ static int run(mddev_t *mddev)
md_set_array_sectors(mddev, size);
mddev->resync_max_sectors = size;
- mddev->queue->unplug_fn = raid10_unplug;
mddev->queue->backing_dev_info.congested_fn = raid10_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
@@ -2394,7 +2342,10 @@ static int run(mddev_t *mddev)
if (conf->near_copies < conf->raid_disks)
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
- md_integrity_register(mddev);
+
+ if (md_integrity_register(mddev))
+ goto out_free_conf;
+
return 0;
out_free_conf:
@@ -2463,11 +2414,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
mddev->recovery_cp = MaxSector;
conf = setup_conf(mddev);
- if (!IS_ERR(conf))
+ if (!IS_ERR(conf)) {
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0)
rdev->new_raid_disk = rdev->raid_disk * 2;
-
+ conf->barrier = 1;
+ }
+
return conf;
}
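On the submission side (the make_request() hunks in raid1.c above and raid10.c here), blk_plug_device() is gone; the caller instead records whether an mddev-level plug is active and only wakes the daemon when nothing will flush the queued writes later. A condensed sketch of that pairing, assuming the names introduced by the patch:

	/* Illustrative only: queue a mirrored write for the daemon and decide
	 * whether it needs an immediate wakeup. */
	static void example_queue_write(mddev_t *mddev, conf_t *conf,
					struct bio *mbio, int do_sync)
	{
		unsigned long flags;
		int plugged = mddev_check_plugged(mddev);	/* is someone batching? */

		spin_lock_irqsave(&conf->device_lock, flags);
		bio_list_add(&conf->pending_bio_list, mbio);	/* deferred to the daemon */
		spin_unlock_irqrestore(&conf->device_lock, flags);

		if (do_sync || !mddev->bitmap || !plugged)
			md_wakeup_thread(mddev->thread);	/* nobody will unplug for us */
	}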
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2316ac2e8e21..944b1104d3b4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -17,8 +17,8 @@ struct r10_private_data_s {
spinlock_t device_lock;
/* geometry */
- int near_copies; /* number of copies layed out raid0 style */
- int far_copies; /* number of copies layed out
+ int near_copies; /* number of copies laid out raid0 style */
+ int far_copies; /* number of copies laid out
* at large strides across drives
*/
int far_offset; /* far_copies are offset by 1 stripe
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5044babfcda0..f301e6ae220c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -27,12 +27,12 @@
*
* We group bitmap updates into batches. Each batch has a number.
* We may write out several batches at once, but that isn't very important.
- * conf->bm_write is the number of the last batch successfully written.
- * conf->bm_flush is the number of the last batch that was closed to
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
* new additions.
* When we discover that we will need to write to any block in a stripe
* (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * the number of the batch it will be in. This is seq_flush+1.
* When we are ready to do a write, if that batch hasn't been written yet,
* we plug the array and queue the stripe for later.
* When an unplug happens, we increment bm_flush, thus closing the current
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state)) {
+ if (test_bit(STRIPE_DELAYED, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- plugger_set_plug(&conf->plug);
- } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- sh->bm_seq - conf->seq_write > 0) {
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
- plugger_set_plug(&conf->plug);
- } else {
+ else {
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
@@ -433,8 +431,6 @@ static int has_failed(raid5_conf_t *conf)
return 0;
}
-static void unplug_slaves(mddev_t *mddev);
-
static struct stripe_head *
get_active_stripe(raid5_conf_t *conf, sector_t sector,
int previous, int noblock, int noquiesce)
@@ -463,8 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
conf->device_lock,
- md_raid5_unplug_device(conf)
- );
+ );
conf->inactive_blocked = 0;
} else
init_stripe(sh, sector, previous);
@@ -1473,8 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(&conf->inactive_list),
conf->device_lock,
- unplug_slaves(conf->mddev)
- );
+ );
osh = get_free_stripe(conf);
spin_unlock_irq(&conf->device_lock);
atomic_set(&nsh->count, 1);
@@ -3627,8 +3621,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
atomic_inc(&conf->preread_active_stripes);
list_add_tail(&sh->lru, &conf->hold_list);
}
- } else
- plugger_set_plug(&conf->plug);
+ }
}
static void activate_bit_delay(raid5_conf_t *conf)
@@ -3645,60 +3638,6 @@ static void activate_bit_delay(raid5_conf_t *conf)
}
}
-static void unplug_slaves(mddev_t *mddev)
-{
- raid5_conf_t *conf = mddev->private;
- int i;
- int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
- rcu_read_lock();
- for (i = 0; i < devs; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
- struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
-
- blk_unplug(r_queue);
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
-}
-
-void md_raid5_unplug_device(raid5_conf_t *conf)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&conf->device_lock, flags);
-
- if (plugger_remove_plug(&conf->plug)) {
- conf->seq_flush++;
- raid5_activate_delayed(conf);
- }
- md_wakeup_thread(conf->mddev->thread);
-
- spin_unlock_irqrestore(&conf->device_lock, flags);
-
- unplug_slaves(conf->mddev);
-}
-EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
-
-static void raid5_unplug(struct plug_handle *plug)
-{
- raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
- md_raid5_unplug_device(conf);
-}
-
-static void raid5_unplug_queue(struct request_queue *q)
-{
- mddev_t *mddev = q->queuedata;
- md_raid5_unplug_device(mddev->private);
-}
-
int md_raid5_congested(mddev_t *mddev, int bits)
{
raid5_conf_t *conf = mddev->private;
@@ -3988,6 +3927,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
int remaining;
+ int plugged;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bi);
@@ -4006,6 +3946,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ plugged = mddev_check_plugged(mddev);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
int disks, data_disks;
@@ -4100,7 +4041,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
* add failed due to overlap. Flush everything
* and wait a while
*/
- md_raid5_unplug_device(conf);
+ md_wakeup_thread(mddev->thread);
release_stripe(sh);
schedule();
goto retry;
@@ -4120,6 +4061,9 @@ static int make_request(mddev_t *mddev, struct bio * bi)
}
}
+ if (!plugged)
+ md_wakeup_thread(mddev->thread);
+
spin_lock_irq(&conf->device_lock);
remaining = raid5_dec_bi_phys_segments(bi);
spin_unlock_irq(&conf->device_lock);
@@ -4365,7 +4309,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
- unplug_slaves(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
@@ -4522,24 +4465,30 @@ static void raid5d(mddev_t *mddev)
struct stripe_head *sh;
raid5_conf_t *conf = mddev->private;
int handled;
+ struct blk_plug plug;
pr_debug("+++ raid5d active\n");
md_check_recovery(mddev);
+ blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
- if (conf->seq_flush != conf->seq_write) {
- int seq = conf->seq_flush;
+ if (atomic_read(&mddev->plug_cnt) == 0 &&
+ !list_empty(&conf->bitmap_list)) {
+ /* Now is a good time to flush some bitmap updates */
+ conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
bitmap_unplug(mddev->bitmap);
spin_lock_irq(&conf->device_lock);
- conf->seq_write = seq;
+ conf->seq_write = conf->seq_flush;
activate_bit_delay(conf);
}
+ if (atomic_read(&mddev->plug_cnt) == 0)
+ raid5_activate_delayed(conf);
while ((bio = remove_bio_from_retry(conf))) {
int ok;
@@ -4569,7 +4518,7 @@ static void raid5d(mddev_t *mddev)
spin_unlock_irq(&conf->device_lock);
async_tx_issue_pending_all();
- unplug_slaves(mddev);
+ blk_finish_plug(&plug);
pr_debug("--- raid5d inactive\n");
}
@@ -5186,8 +5135,6 @@ static int run(mddev_t *mddev)
mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
- plugger_init(&conf->plug, raid5_unplug);
- mddev->plug = &conf->plug;
if (mddev->queue) {
int chunk_size;
/* read-ahead size must cover two whole stripes, which
@@ -5205,7 +5152,6 @@ static int run(mddev_t *mddev)
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = raid5_congested;
mddev->queue->queue_lock = &conf->device_lock;
- mddev->queue->unplug_fn = raid5_unplug_queue;
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
@@ -5238,7 +5184,6 @@ static int stop(mddev_t *mddev)
mddev->thread = NULL;
if (mddev->queue)
mddev->queue->backing_dev_info.congested_fn = NULL;
- plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
free_conf(conf);
mddev->private = NULL;
mddev->to_remove = &raid5_attrs_group;
@@ -5517,7 +5462,6 @@ static int raid5_start_reshape(mddev_t *mddev)
raid5_conf_t *conf = mddev->private;
mdk_rdev_t *rdev;
int spares = 0;
- int added_devices = 0;
unsigned long flags;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -5527,8 +5471,8 @@ static int raid5_start_reshape(mddev_t *mddev)
return -ENOSPC;
list_for_each_entry(rdev, &mddev->disks, same_set)
- if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks)
- && !test_bit(Faulty, &rdev->flags))
+ if (!test_bit(In_sync, &rdev->flags)
+ && !test_bit(Faulty, &rdev->flags))
spares++;
if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5571,34 +5515,35 @@ static int raid5_start_reshape(mddev_t *mddev)
* to correctly record the "partially reconstructed" state of
* such devices during the reshape and confusion could result.
*/
- if (mddev->delta_disks >= 0)
- list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk < 0 &&
- !test_bit(Faulty, &rdev->flags)) {
- if (raid5_add_disk(mddev, rdev) == 0) {
- char nm[20];
- if (rdev->raid_disk >= conf->previous_raid_disks) {
- set_bit(In_sync, &rdev->flags);
- added_devices++;
- } else
- rdev->recovery_offset = 0;
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&mddev->kobj,
- &rdev->kobj, nm))
- /* Failure here is OK */;
- } else
- break;
- } else if (rdev->raid_disk >= conf->previous_raid_disks
- && !test_bit(Faulty, &rdev->flags)) {
- /* This is a spare that was manually added */
- set_bit(In_sync, &rdev->flags);
- added_devices++;
- }
+ if (mddev->delta_disks >= 0) {
+ int added_devices = 0;
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk < 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ if (raid5_add_disk(mddev, rdev) == 0) {
+ char nm[20];
+ if (rdev->raid_disk
+ >= conf->previous_raid_disks) {
+ set_bit(In_sync, &rdev->flags);
+ added_devices++;
+ } else
+ rdev->recovery_offset = 0;
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ if (sysfs_create_link(&mddev->kobj,
+ &rdev->kobj, nm))
+ /* Failure here is OK */;
+ }
+ } else if (rdev->raid_disk >= conf->previous_raid_disks
+ && !test_bit(Faulty, &rdev->flags)) {
+ /* This is a spare that was manually added */
+ set_bit(In_sync, &rdev->flags);
+ added_devices++;
+ }
- /* When a reshape changes the number of devices, ->degraded
- * is measured against the larger of the pre and post number of
- * devices.*/
- if (mddev->delta_disks > 0) {
+ /* When a reshape changes the number of devices,
+ * ->degraded is measured against the larger of the
+ * pre and post number of devices.
+ */
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
- added_devices;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2ace0582b409..3ca77a2613ba 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -400,8 +400,6 @@ struct raid5_private_data {
* Cleared when a sync completes.
*/
- struct plug_handle plug;
-
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
@@ -503,6 +501,6 @@ static inline int algorithm_is_DDF(int layout)
}
extern int md_raid5_congested(mddev_t *mddev, int bits);
-extern void md_raid5_unplug_device(raid5_conf_t *conf);
+extern void md_raid5_kick_device(raid5_conf_t *conf);
extern int raid5_set_cache_size(mddev_t *mddev, int size);
#endif
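raid5 follows the same scheme with one extra gate: raid5d only closes bitmap batches and releases delayed stripes while no submitter currently holds an mddev plug, so write batching is preserved. A reduced sketch of that gate, assuming the conf fields shown in the raid5.c hunks above:

	/* Illustrative only: the plug-aware part of raid5d()'s main loop. */
	static void example_release_delayed(mddev_t *mddev, raid5_conf_t *conf)
	{
		spin_lock_irq(&conf->device_lock);
		if (atomic_read(&mddev->plug_cnt) == 0 &&
		    !list_empty(&conf->bitmap_list)) {
			conf->seq_flush++;			/* close the current bitmap batch */
			spin_unlock_irq(&conf->device_lock);
			bitmap_unplug(mddev->bitmap);		/* bitmap reaches disk before the data */
			spin_lock_irq(&conf->device_lock);
			conf->seq_write = conf->seq_flush;
			activate_bit_delay(conf);
		}
		if (atomic_read(&mddev->plug_cnt) == 0)
			raid5_activate_delayed(conf);		/* release preread-delayed stripes */
		spin_unlock_irq(&conf->device_lock);
	}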