aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2019-06-20 23:37:20 -0600
committerJens Axboe <axboe@kernel.dk>2019-06-20 23:37:20 -0600
commit8d54094ef430d89cea75ef586183f05a7c9a2b22 (patch)
tree7c6c9932d861743623cf9d74d287a7042be71799
parentf2fs: use block layer helper for show_bio_op macro (diff)
parentmd: add bitmap_abort label in md_run (diff)
downloadlinux-rng-8d54094ef430d89cea75ef586183f05a7c9a2b22.tar.xz
linux-rng-8d54094ef430d89cea75ef586183f05a7c9a2b22.zip
Merge branch 'md-next' of https://github.com/liu-song-6/linux into for-5.3/block
Pull MD changes from Song. * 'md-next' of https://github.com/liu-song-6/linux: md: add bitmap_abort label in md_run md-bitmap: create and destroy wb_info_pool with the change of bitmap md-bitmap: create and destroy wb_info_pool with the change of backlog md: introduce mddev_create/destroy_wb_pool for the change of member device md/raid1: fix potential data inconsistency issue with write behind device
-rw-r--r--drivers/md/md-bitmap.c20
-rw-r--r--drivers/md/md.c116
-rw-r--r--drivers/md/md.h23
-rw-r--r--drivers/md/raid1.c68
4 files changed, 218 insertions, 9 deletions
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c01d41198f5e..b092c7b5282f 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return;
md_bitmap_wait_behind_writes(mddev);
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev)
sector_t start = 0;
sector_t sector = 0;
struct bitmap *bitmap = mddev->bitmap;
+ struct md_rdev *rdev;
if (!bitmap)
goto out;
+ rdev_for_each(rdev, mddev)
+ mddev_create_wb_pool(mddev, rdev, true);
+
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2462,12 +2468,26 @@ static ssize_t
backlog_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long backlog;
+ unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
int rv = kstrtoul(buf, 10, &backlog);
if (rv)
return rv;
if (backlog > COUNTER_MAX)
return -EINVAL;
mddev->bitmap_info.max_write_behind = backlog;
+ if (!backlog && mddev->wb_info_pool) {
+ /* wb_info_pool is not needed if backlog is zero */
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
+ } else if (backlog && !mddev->wb_info_pool) {
+ /* wb_info_pool is needed since backlog is not zero */
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ mddev_create_wb_pool(mddev, rdev, false);
+ }
+ if (old_mwb != backlog)
+ md_bitmap_update_sb(mddev->bitmap);
return len;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1f37a1adc926..692fc365e73c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -37,6 +37,7 @@
*/
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
@@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int rdev_init_wb(struct md_rdev *rdev)
+{
+ if (rdev->bdev->bd_queue->nr_hw_queues == 1)
+ return 0;
+
+ spin_lock_init(&rdev->wb_list_lock);
+ INIT_LIST_HEAD(&rdev->wb_list);
+ init_waitqueue_head(&rdev->wb_io_wait);
+ set_bit(WBCollisionCheck, &rdev->flags);
+
+ return 1;
+}
+
+/*
+ * Create wb_info_pool if rdev is the first multi-queue device flaged
+ * with writemostly, also write-behind mode is enabled.
+ */
+void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
+{
+ if (mddev->bitmap_info.max_write_behind == 0)
+ return;
+
+ if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
+ return;
+
+ if (mddev->wb_info_pool == NULL) {
+ unsigned int noio_flag;
+
+ if (!is_suspend)
+ mddev_suspend(mddev);
+ noio_flag = memalloc_noio_save();
+ mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ memalloc_noio_restore(noio_flag);
+ if (!mddev->wb_info_pool)
+ pr_err("can't alloc memory pool for writemostly\n");
+ if (!is_suspend)
+ mddev_resume(mddev);
+ }
+}
+EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
+
+/*
+ * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
+ */
+static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
+ return;
+
+ if (mddev->wb_info_pool) {
+ struct md_rdev *temp;
+ int num = 0;
+
+ /*
+ * Check if other rdevs need wb_info_pool.
+ */
+ rdev_for_each(temp, mddev)
+ if (temp != rdev &&
+ test_bit(WBCollisionCheck, &temp->flags))
+ num++;
+ if (!num) {
+ mddev_suspend(rdev->mddev);
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
+ mddev_resume(rdev->mddev);
+ }
+ }
+}
+
static struct ctl_table_header *raid_table_header;
static struct ctl_table raid_table[] = {
@@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
rdev->mddev = mddev;
pr_debug("md: bind<%s>\n", b);
+ if (mddev->raid_disks)
+ mddev_create_wb_pool(mddev, rdev, false);
+
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
+ mddev_create_wb_pool(rdev->mddev, rdev, false);
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "blocked")) {
@@ -5588,15 +5666,28 @@ int md_run(struct mddev *mddev)
mddev->bitmap = bitmap;
}
- if (err) {
- mddev_detach(mddev);
- if (mddev->private)
- pers->free(mddev, mddev->private);
- mddev->private = NULL;
- module_put(pers->owner);
- md_bitmap_destroy(mddev);
- goto abort;
+ if (err)
+ goto bitmap_abort;
+
+ if (mddev->bitmap_info.max_write_behind > 0) {
+ bool creat_pool = false;
+
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(WriteMostly, &rdev->flags) &&
+ rdev_init_wb(rdev))
+ creat_pool = true;
+ }
+ if (creat_pool && mddev->wb_info_pool == NULL) {
+ mddev->wb_info_pool =
+ mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ if (!mddev->wb_info_pool) {
+ err = -ENOMEM;
+ goto bitmap_abort;
+ }
+ }
}
+
if (mddev->queue) {
bool nonrot = true;
@@ -5657,6 +5748,13 @@ int md_run(struct mddev *mddev)
sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
+bitmap_abort:
+ mddev_detach(mddev);
+ if (mddev->private)
+ pers->free(mddev, mddev->private);
+ mddev->private = NULL;
+ module_put(pers->owner);
+ md_bitmap_destroy(mddev);
abort:
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
@@ -5825,6 +5923,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
}
void md_stop_writes(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7c930c091193..10f98200e2f8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -109,6 +109,14 @@ struct md_rdev {
* for reporting to userspace and storing
* in superblock.
*/
+
+ /*
+ * The members for check collision of write behind IOs.
+ */
+ struct list_head wb_list;
+ spinlock_t wb_list_lock;
+ wait_queue_head_t wb_io_wait;
+
struct work_struct del_work; /* used for delayed sysfs removal */
struct kernfs_node *sysfs_state; /* handle for 'state'
@@ -193,6 +201,10 @@ enum flag_bits {
* it didn't fail, so don't use FailFast
* any more for metadata
*/
+ WBCollisionCheck, /*
+ * multiqueue device should check if there
+ * is collision between write behind bios.
+ */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -245,6 +257,14 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
+#define NR_WB_INFOS 8
+/* record current range of write behind IOs */
+struct wb_info {
+ sector_t lo;
+ sector_t hi;
+ struct list_head list;
+};
+
struct mddev {
void *private;
struct md_personality *pers;
@@ -461,6 +481,7 @@ struct mddev {
*/
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
+ mempool_t *wb_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
@@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a7860b5f33f2..3d44da663797 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c"
+static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+ struct wb_info *wi, *temp_wi;
+ unsigned long flags;
+ int ret = 0;
+ struct mddev *mddev = rdev->mddev;
+
+ wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
+
+ spin_lock_irqsave(&rdev->wb_list_lock, flags);
+ list_for_each_entry(temp_wi, &rdev->wb_list, list) {
+ /* collision happened */
+ if (hi > temp_wi->lo && lo < temp_wi->hi) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ if (!ret) {
+ wi->lo = lo;
+ wi->hi = hi;
+ list_add(&wi->list, &rdev->wb_list);
+ } else
+ mempool_free(wi, mddev->wb_info_pool);
+ spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+
+ return ret;
+}
+
+static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+ struct wb_info *wi;
+ unsigned long flags;
+ int found = 0;
+ struct mddev *mddev = rdev->mddev;
+
+ spin_lock_irqsave(&rdev->wb_list_lock, flags);
+ list_for_each_entry(wi, &rdev->wb_list, list)
+ if (hi == wi->hi && lo == wi->lo) {
+ list_del(&wi->list);
+ mempool_free(wi, mddev->wb_info_pool);
+ found = 1;
+ break;
+ }
+
+ if (!found)
+ WARN_ON("The write behind IO is not recorded\n");
+ spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+ wake_up(&rdev->wb_io_wait);
+}
+
/*
* for resync bio, r1bio pointer can be retrieved from the per-bio
* 'struct resync_pages'.
@@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
+ if (test_bit(WBCollisionCheck, &rdev->flags)) {
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+ remove_wb(rdev, lo, hi);
+ }
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
- if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ if (test_bit(WBCollisionCheck, &rdev->flags)) {
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+ wait_event(rdev->wb_io_wait,
+ check_and_add_wb(rdev, lo, hi) == 0);
+ }
+ if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}