aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-03-01 10:08:47 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2018-03-01 10:08:47 -0800
commit7e30309968c18b504320275d527914272fbe1a1c (patch)
treee5806f11034b5b6a7c1d755dd391342fd430326b
parentMerge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/pmladek/printk (diff)
parentmd/raid1: fix NULL pointer dereference (diff)
downloadlinux-dev-7e30309968c18b504320275d527914272fbe1a1c.tar.xz
linux-dev-7e30309968c18b504320275d527914272fbe1a1c.zip
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD bugfixes from Shaohua Li: - fix raid5-ppl flush request handling hang from Artur - fix a potential deadlock in raid5/10 reshape from BingJing - fix a deadlock for dm-raid from Heinz - fix two md-cluster of raid10 from Lidong and Guoqing - fix a NULL deference problem in device removal from Neil - fix a NULL deference problem in raid1/raid10 in specific condition from Yufen - other cleanup and fixes * 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: md/raid1: fix NULL pointer dereference md: fix a potential deadlock of raid5/raid10 reshape md-cluster: choose correct label when clustered layout is not supported md: raid5: avoid string overflow warning raid5-ppl: fix handling flush requests md raid10: fix NULL deference in handle_write_completed() md: only allow remove_and_add_spares when no sync_thread running. md: document lifetime of internal rdev pointer. md: fix md_write_start() deadlock w/o metadata devices MD: Free bioset when md_run fails raid10: change the size of resync window for clustered raid md-multipath: Use seq_putc() in multipath_status() md/raid1: Fix trailing semicolon md/raid5: simplify uninitialization of shrinker
-rw-r--r--drivers/md/md-multipath.c2
-rw-r--r--drivers/md/md.c53
-rw-r--r--drivers/md/md.h2
-rw-r--r--drivers/md/raid1.c11
-rw-r--r--drivers/md/raid1.h12
-rw-r--r--drivers/md/raid10.c18
-rw-r--r--drivers/md/raid10.h13
-rw-r--r--drivers/md/raid5-log.h3
-rw-r--r--drivers/md/raid5-ppl.c10
-rw-r--r--drivers/md/raid5.c19
-rw-r--r--drivers/md/raid5.h12
11 files changed, 124 insertions, 31 deletions
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index e40065bdbfc8..0a7e99d62c69 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -157,7 +157,7 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
rcu_read_unlock();
- seq_printf (seq, "]");
+ seq_putc(seq, ']');
}
static int multipath_congested(struct mddev *mddev, int bits)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index bc67ab6844f0..254e44e44668 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -801,6 +801,9 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio;
int ff = 0;
+ if (!page)
+ return;
+
if (test_bit(Faulty, &rdev->flags))
return;
@@ -5452,6 +5455,7 @@ int md_run(struct mddev *mddev)
* the only valid external interface is through the md
* device.
*/
+ mddev->has_superblocks = false;
rdev_for_each(rdev, mddev) {
if (test_bit(Faulty, &rdev->flags))
continue;
@@ -5465,6 +5469,9 @@ int md_run(struct mddev *mddev)
set_disk_ro(mddev->gendisk, 1);
}
+ if (rdev->sb_page)
+ mddev->has_superblocks = true;
+
/* perform some consistency tests on the device.
* We don't want the data to overlap the metadata,
* Internal Bitmap issues have been handled elsewhere.
@@ -5497,8 +5504,10 @@ int md_run(struct mddev *mddev)
}
if (mddev->sync_set == NULL) {
mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!mddev->sync_set)
- return -ENOMEM;
+ if (!mddev->sync_set) {
+ err = -ENOMEM;
+ goto abort;
+ }
}
spin_lock(&pers_lock);
@@ -5511,7 +5520,8 @@ int md_run(struct mddev *mddev)
else
pr_warn("md: personality for level %s is not loaded!\n",
mddev->clevel);
- return -EINVAL;
+ err = -EINVAL;
+ goto abort;
}
spin_unlock(&pers_lock);
if (mddev->level != pers->level) {
@@ -5524,7 +5534,8 @@ int md_run(struct mddev *mddev)
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
module_put(pers->owner);
- return -EINVAL;
+ err = -EINVAL;
+ goto abort;
}
if (pers->sync_request) {
@@ -5593,7 +5604,7 @@ int md_run(struct mddev *mddev)
mddev->private = NULL;
module_put(pers->owner);
bitmap_destroy(mddev);
- return err;
+ goto abort;
}
if (mddev->queue) {
bool nonrot = true;
@@ -5655,6 +5666,18 @@ int md_run(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_action);
sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
+
+abort:
+ if (mddev->bio_set) {
+ bioset_free(mddev->bio_set);
+ mddev->bio_set = NULL;
+ }
+ if (mddev->sync_set) {
+ bioset_free(mddev->sync_set);
+ mddev->sync_set = NULL;
+ }
+
+ return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -8049,6 +8072,7 @@ EXPORT_SYMBOL(md_done_sync);
bool md_write_start(struct mddev *mddev, struct bio *bi)
{
int did_change = 0;
+
if (bio_data_dir(bi) != WRITE)
return true;
@@ -8081,6 +8105,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
rcu_read_unlock();
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
+ if (!mddev->has_superblocks)
+ return true;
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
mddev->suspended);
@@ -8543,6 +8569,19 @@ void md_do_sync(struct md_thread *thread)
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ mddev->delta_disks > 0 &&
+ mddev->pers->finish_reshape &&
+ mddev->pers->size &&
+ mddev->queue) {
+ mddev_lock_nointr(mddev);
+ md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
+ mddev_unlock(mddev);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
+
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
@@ -8569,6 +8608,10 @@ static int remove_and_add_spares(struct mddev *mddev,
int removed = 0;
bool remove_some = false;
+ if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ /* Mustn't remove devices when resync thread is running */
+ return 0;
+
rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) &&
rdev->raid_disk >= 0 &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 58cd20a5e85e..fbc925cce810 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -468,6 +468,8 @@ struct mddev {
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
+
+ bool has_superblocks:1;
};
enum recovery_flags {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f978eddc7a21..fe872dc6712e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1809,6 +1809,17 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct md_rdev *repl =
conf->mirrors[conf->raid_disks + number].rdev;
freeze_array(conf, 0);
+ if (atomic_read(&repl->nr_pending)) {
+ /* It means that some queued IO of retry_list
+ * hold repl. Thus, we cannot set replacement
+ * as NULL, avoiding rdev NULL pointer
+ * dereference in sync_request_write and
+ * handle_write_finished.
+ */
+ err = -EBUSY;
+ unfreeze_array(conf);
+ goto abort;
+ }
clear_bit(Replacement, &repl->flags);
p->rdev = repl;
conf->mirrors[conf->raid_disks + number].rdev = NULL;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c7294e7557e0..eb84bc68e2fd 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -26,6 +26,18 @@
#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
#define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS)
+/* Note: raid1_info.rdev can be set to NULL asynchronously by raid1_remove_disk.
+ * There are three safe ways to access raid1_info.rdev.
+ * 1/ when holding mddev->reconfig_mutex
+ * 2/ when resync/recovery is known to be happening - i.e. in code that is
+ * called as part of performing resync/recovery.
+ * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
+ * and if it is non-NULL, increment rdev->nr_pending before dropping the
+ * RCU lock.
+ * When .rdev is set to NULL, the nr_pending count checked again and if it has
+ * been incremented, the pointer is put back in .rdev.
+ */
+
struct raid1_info {
struct md_rdev *rdev;
sector_t head_position;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 99c9207899a7..c5e6c60fc0d4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -141,7 +141,7 @@ static void r10bio_pool_free(void *r10_bio, void *data)
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
-#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
/*
@@ -2655,7 +2655,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
rdev = conf->mirrors[dev].rdev;
- if (r10_bio->devs[m].bio == NULL)
+ if (r10_bio->devs[m].bio == NULL ||
+ r10_bio->devs[m].bio->bi_end_io == NULL)
continue;
if (!r10_bio->devs[m].bio->bi_status) {
rdev_clear_badblocks(
@@ -2670,7 +2671,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
md_error(conf->mddev, rdev);
}
rdev = conf->mirrors[dev].replacement;
- if (r10_bio->devs[m].repl_bio == NULL)
+ if (r10_bio->devs[m].repl_bio == NULL ||
+ r10_bio->devs[m].repl_bio->bi_end_io == NULL)
continue;
if (!r10_bio->devs[m].repl_bio->bi_status) {
@@ -3782,7 +3784,7 @@ static int raid10_run(struct mddev *mddev)
if (fc > 1 || fo > 0) {
pr_err("only near layout is supported by clustered"
" raid10\n");
- goto out;
+ goto out_free_conf;
}
}
@@ -4830,17 +4832,11 @@ static void raid10_finish_reshape(struct mddev *mddev)
return;
if (mddev->delta_disks > 0) {
- sector_t size = raid10_size(mddev, 0, 0);
- md_set_array_sectors(mddev, size);
if (mddev->recovery_cp > mddev->resync_max_sectors) {
mddev->recovery_cp = mddev->resync_max_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
- mddev->resync_max_sectors = size;
- if (mddev->queue) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
- }
+ mddev->resync_max_sectors = mddev->array_sectors;
} else {
int d;
rcu_read_lock();
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index db2ac22ac1b4..e2e8840de9bf 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,6 +2,19 @@
#ifndef _RAID10_H
#define _RAID10_H
+/* Note: raid10_info.rdev can be set to NULL asynchronously by
+ * raid10_remove_disk.
+ * There are three safe ways to access raid10_info.rdev.
+ * 1/ when holding mddev->reconfig_mutex
+ * 2/ when resync/recovery/reshape is known to be happening - i.e. in code
+ * that is called as part of performing resync/recovery/reshape.
+ * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
+ * and if it is non-NULL, increment rdev->nr_pending before dropping the
+ * RCU lock.
+ * When .rdev is set to NULL, the nr_pending count checked again and if it has
+ * been incremented, the pointer is put back in .rdev.
+ */
+
struct raid10_info {
struct md_rdev *rdev, *replacement;
sector_t head_position;
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 0c76bcedfc1c..a001808a2b77 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -44,6 +44,7 @@ extern void ppl_write_stripe_run(struct r5conf *conf);
extern void ppl_stripe_write_finished(struct stripe_head *sh);
extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
extern void ppl_quiesce(struct r5conf *conf, int quiesce);
+extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
static inline bool raid5_has_ppl(struct r5conf *conf)
{
@@ -104,7 +105,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
if (conf->log)
ret = r5l_handle_flush_request(conf->log, bio);
else if (raid5_has_ppl(conf))
- ret = 0;
+ ret = ppl_handle_flush_request(conf->log, bio);
return ret;
}
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 2764c2290062..42890a08375b 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -693,6 +693,16 @@ void ppl_quiesce(struct r5conf *conf, int quiesce)
}
}
+int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
+{
+ if (bio->bi_iter.bi_size == 0) {
+ bio_endio(bio);
+ return 0;
+ }
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ return -EAGAIN;
+}
+
void ppl_stripe_write_finished(struct stripe_head *sh)
{
struct ppl_io_unit *io;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 50d01144b805..b5d2601483e3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2196,15 +2196,16 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
static int grow_stripes(struct r5conf *conf, int num)
{
struct kmem_cache *sc;
+ size_t namelen = sizeof(conf->cache_name[0]);
int devs = max(conf->raid_disks, conf->previous_raid_disks);
if (conf->mddev->gendisk)
- sprintf(conf->cache_name[0],
+ snprintf(conf->cache_name[0], namelen,
"raid%d-%s", conf->level, mdname(conf->mddev));
else
- sprintf(conf->cache_name[0],
+ snprintf(conf->cache_name[0], namelen,
"raid%d-%p", conf->level, conf->mddev);
- sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
+ snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
conf->active_name = 0;
sc = kmem_cache_create(conf->cache_name[conf->active_name],
@@ -6764,9 +6765,7 @@ static void free_conf(struct r5conf *conf)
log_exit(conf);
- if (conf->shrinker.nr_deferred)
- unregister_shrinker(&conf->shrinker);
-
+ unregister_shrinker(&conf->shrinker);
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
@@ -8001,13 +8000,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- if (mddev->delta_disks > 0) {
- md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
- if (mddev->queue) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
- }
- } else {
+ if (mddev->delta_disks <= 0) {
int d;
spin_lock_irq(&conf->device_lock);
mddev->degraded = raid5_calc_degraded(conf);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2e6123825095..3f8da26032ac 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -450,6 +450,18 @@ enum {
* HANDLE gets cleared if stripe_handle leaves nothing locked.
*/
+/* Note: disk_info.rdev can be set to NULL asynchronously by raid5_remove_disk.
+ * There are three safe ways to access disk_info.rdev.
+ * 1/ when holding mddev->reconfig_mutex
+ * 2/ when resync/recovery/reshape is known to be happening - i.e. in code that
+ * is called as part of performing resync/recovery/reshape.
+ * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
+ * and if it is non-NULL, increment rdev->nr_pending before dropping the RCU
+ * lock.
+ * When .rdev is set to NULL, the nr_pending count checked again and if
+ * it has been incremented, the pointer is put back in .rdev.
+ */
+
struct disk_info {
struct md_rdev *rdev, *replacement;
struct page *extra_page; /* extra page to use in prexor */