From c31df25f20e35add6a453328c61eca15434fae18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 May 2015 23:34:20 -0700 Subject: md/raid10: make sync_request_write() call bio_copy_data() Refactor sync_request_write() of md/raid10 to use bio_copy_data() instead of open coding bio_vec iterations. Cc: Christoph Hellwig Cc: Neil Brown Cc: linux-raid@vger.kernel.org Reviewed-by: Christoph Hellwig Acked-by: NeilBrown Signed-off-by: Kent Overstreet [dpark: add more description in commit message] Signed-off-by: Dongsu Park Signed-off-by: Ming Lin Signed-off-by: NeilBrown --- drivers/md/raid10.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f55c3f35b746..03f460a1de60 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2099,17 +2099,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; - - for (j=0; j < vcnt ; j++) { - tbio->bi_io_vec[j].bv_offset = 0; - tbio->bi_io_vec[j].bv_len = PAGE_SIZE; - - memcpy(page_address(tbio->bi_io_vec[j].bv_page), - page_address(fbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } tbio->bi_end_io = end_sync_write; + bio_copy_data(tbio, fbio); + d = r10_bio->devs[i].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); @@ -2124,17 +2117,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * that are active */ for (i = 0; i < conf->copies; i++) { - int j, d; + int d; tbio = r10_bio->devs[i].repl_bio; if (!tbio || !tbio->bi_end_io) continue; if (r10_bio->devs[i].bio->bi_end_io != end_sync_write && r10_bio->devs[i].bio != fbio) - for (j = 0; j < vcnt; j++) - memcpy(page_address(tbio->bi_io_vec[j].bv_page), - page_address(fbio->bi_io_vec[j].bv_page), - PAGE_SIZE); + bio_copy_data(tbio, fbio); d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].replacement->bdev, -- cgit v1.2.3-59-g8ed1b From 4c9309c0cce96ea93be638dd243616e56e4bfe7d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 16 May 2015 14:02:38 +0300 Subject: md: convert to kstrto*() Convert away from deprecated simple_strto*() functions. Add "fit into sector_t" checks. Signed-off-by: Alexey Dobriyan Signed-off-by: NeilBrown --- drivers/md/md.c | 149 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 81 insertions(+), 68 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 4dbed4a67aaf..b6bbcf0cc430 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2630,13 +2630,14 @@ errors_show(struct md_rdev *rdev, char *page) static ssize_t errors_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&rdev->corrected_errors, n); - return len; - } - return -EINVAL; + unsigned int n; + int rv; + + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + atomic_set(&rdev->corrected_errors, n); + return len; } static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); @@ -2653,13 +2654,16 @@ slot_show(struct md_rdev *rdev, char *page) static ssize_t slot_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; + int slot; int err; - int slot = simple_strtoul(buf, &e, 10); + if (strncmp(buf, "none", 4)==0) slot = -1; - else if (e==buf || (*e && *e!= '\n')) - return -EINVAL; + else { + err = kstrtouint(buf, 10, (unsigned int *)&slot); + if (err < 0) + return err; + } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating @@ -3544,12 +3548,12 @@ layout_show(struct mddev *mddev, char *page) static ssize_t layout_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned int n; int err; - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; err = mddev_lock(mddev); if (err) return err; @@ -3593,12 +3597,12 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; + unsigned int n; int err; - unsigned long n = simple_strtoul(buf, &e, 10); - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; err = mddev_lock(mddev); if (err) @@ -3645,12 +3649,12 @@ chunk_size_show(struct mddev *mddev, char *page) static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { + unsigned long n; int err; - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtoul(buf, 10, &n); + if (err < 0) + return err; err = mddev_lock(mddev); if (err) @@ -3688,19 +3692,24 @@ resync_start_show(struct mddev *mddev, char *page) static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { + unsigned long long n; int err; - char *e; - unsigned long long n = simple_strtoull(buf, &e, 10); + + if (cmd_match(buf, "none")) + n = MaxSector; + else { + err = kstrtoull(buf, 10, &n); + if (err < 0) + return err; + if (n != (sector_t)n) + return -EINVAL; + } err = mddev_lock(mddev); if (err) return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) err = -EBUSY; - else if (cmd_match(buf, "none")) - n = MaxSector; - else if (!*buf || (*e && *e != '\n')) - err = -EINVAL; if (!err) { mddev->recovery_cp = n; @@ -3936,14 +3945,14 @@ max_corrected_read_errors_show(struct mddev *mddev, char *page) { static ssize_t max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned int n; + int rv; - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&mddev->max_corr_read_errors, n); - return len; - } - return -EINVAL; + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + atomic_set(&mddev->max_corr_read_errors, n); + return len; } static struct md_sysfs_entry max_corr_read_errors = @@ -4300,15 +4309,18 @@ sync_min_show(struct mddev *mddev, char *page) static ssize_t sync_min_store(struct mddev *mddev, const char *buf, size_t len) { - int min; - char *e; + unsigned int min; + int rv; + if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_min = 0; - return len; + min = 0; + } else { + rv = kstrtouint(buf, 10, &min); + if (rv < 0) + return rv; + if (min == 0) + return -EINVAL; } - min = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || min <= 0) - return -EINVAL; mddev->sync_speed_min = min; return len; } @@ -4326,15 +4338,18 @@ sync_max_show(struct mddev *mddev, char *page) static ssize_t sync_max_store(struct mddev *mddev, const char *buf, size_t len) { - int max; - char *e; + unsigned int max; + int rv; + if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_max = 0; - return len; + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; } - max = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || max <= 0) - return -EINVAL; mddev->sync_speed_max = max; return len; } @@ -4517,12 +4532,13 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old; + unsigned long long old, new; int err; - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); @@ -4559,12 +4575,13 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old; + unsigned long long old, new; int err; - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); @@ -4606,11 +4623,13 @@ static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; - char *e; + unsigned long long new; int err; - unsigned long long new = simple_strtoull(buf, &e, 10); - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); if (err) @@ -9013,13 +9032,7 @@ static int get_ro(char *buffer, struct kernel_param *kp) } static int set_ro(const char *val, struct kernel_param *kp) { - char *e; - int num = simple_strtoul(val, &e, 10); - if (*val && (*e == '\0' || *e == '\n')) { - start_readonly = num; - return 0; - } - return -EINVAL; + return kstrtouint(val, 10, (unsigned int *)&start_readonly); } module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); -- cgit v1.2.3-59-g8ed1b From b1b4648648e18775082858eca2517322f63e57a1 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 8 May 2015 18:19:06 +1000 Subject: md/raid5: split wait_for_stripe and introduce wait_for_quiescent I noticed heavy spin lock contention at get_active_stripe(), introduced at being wake up stage, where a bunch of processes try to re-hold the spin lock again. After giving some thoughts on this issue, I found the lock could be relieved(and even avoided) if we turn the wait_for_stripe to per waitqueue for each lock hash and make the wake up exclusive: wake up one process each time, which avoids the lock contention naturally. Before go hacking with wait_for_stripe, I found it actually has 2 usages: for the array to enter or leave the quiescent state, and also to wait for an available stripe in each of the hash lists. So this patch splits the first usage off into a separate wait_queue, wait_for_quiescent, and the next patch will turn the second usage into one waitqueue for each hash value, and make it exclusive, to relieve the lock contention. v2: wake_up(wait_for_quiescent) when (active_stripes == 0) Commit log refactor suggestion from Neil. Signed-off-by: Yuanhan Liu Signed-off-by: NeilBrown --- drivers/md/raid5.c | 15 +++++++++------ drivers/md/raid5.h | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b6793d2e051f..a9112b39afee 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, if (do_wakeup) { wake_up(&conf->wait_for_stripe); + if (atomic_read(&conf->active_stripes) == 0) + wake_up(&conf->wait_for_quiescent); if (conf->retry_read_aligned) md_wakeup_thread(conf->mddev->thread); } @@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf->hash_locks + hash); do { - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0 || noquiesce, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); @@ -4760,7 +4762,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return; } @@ -4855,7 +4857,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, conf->device_lock); atomic_inc(&conf->active_aligned_reads); @@ -5699,7 +5701,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return handled; } @@ -6433,6 +6435,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); + init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -7466,7 +7469,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf->quiesce = 2; - wait_event_cmd(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7480,7 +7483,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 896d603ad0da..9b84b8820fc5 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -511,6 +511,7 @@ struct r5conf { struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; atomic_t empty_inactive_list_nr; struct llist_head released_stripes; + wait_queue_head_t wait_for_quiescent; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; unsigned long cache_state; -- cgit v1.2.3-59-g8ed1b From e9e4c377e2f563892c50d1d093dd55c7d518fc3d Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Fri, 8 May 2015 18:19:07 +1000 Subject: md/raid5: per hash value and exclusive wait_for_stripe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed heavy spin lock contention at get_active_stripe() with fsmark multiple thread write workloads. Here is how this hot contention comes from. We have limited stripes, and it's a multiple thread write workload. Hence, those stripes will be taken soon, which puts later processes to sleep for waiting free stripes. When enough stripes(>= 1/4 total stripes) are released, all process are woken, trying to get the lock. But there is one only being able to get this lock for each hash lock, making other processes spinning out there for acquiring the lock. Thus, it's effectiveless to wakeup all processes and let them battle for a lock that permits one to access only each time. Instead, we could make it be a exclusive wake up: wake up one process only. That avoids the heavy spin lock contention naturally. To do the exclusive wake up, we've to split wait_for_stripe into multiple wait queues, to make it per hash value, just like the hash lock. Here are some test results I have got with this patch applied(all test run 3 times): `fsmark.files_per_sec' ===================== next-20150317 this patch ------------------------- ------------------------- metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params ------------------------- ------------------------- -------- ------------------------------ 25.600 ±0.0 92.700 ±2.5 262.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 25.600 ±0.0 77.800 ±0.6 203.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 32.000 ±0.0 93.800 ±1.7 193.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 32.000 ±0.0 81.233 ±1.7 153.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 48.800 ±14.5 99.667 ±2.0 104.2% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 6.400 ±0.0 12.800 ±0.0 100.0% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 63.133 ±8.2 82.800 ±0.7 31.2% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 245.067 ±0.7 306.567 ±7.9 25.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose 17.533 ±0.3 21.000 ±0.8 19.8% ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose 188.167 ±1.9 215.033 ±3.1 14.3% ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync 254.500 ±1.8 290.733 ±2.4 14.2% ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync `time.system_time' ===================== next-20150317 this patch ------------------------- ------------------------- metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params ------------------------- ------------------------- -------- ------------------------------ 7235.603 ±1.2 185.163 ±1.9 -97.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 7666.883 ±2.9 202.750 ±1.0 -97.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 14567.893 ±0.7 421.230 ±0.4 -97.1% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 3697.667 ±14.0 148.190 ±1.7 -96.0% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 5572.867 ±3.8 310.717 ±1.4 -94.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 5565.050 ±0.5 313.277 ±1.5 -94.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 2420.707 ±17.1 171.043 ±2.7 -92.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 3743.300 ±4.6 379.827 ±3.5 -89.9% ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose 3308.687 ±6.3 363.050 ±2.0 -89.0% ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose Where, 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 40G, 30G, 120G: means the total test size 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' means the size of one ramdisk. So, it would be 48G in total. And we made a raid on those ramdisk As you can see, though there are no much performance gain for hard disk workload, the system time is dropped heavily, up to 97%. And as expected, the performance increased a lot, up to 260%, for fast device(ram disk). v2: use bits instead of array to note down wait queue need to wake up. Signed-off-by: Yuanhan Liu Signed-off-by: NeilBrown --- drivers/md/raid5.c | 27 +++++++++++++++++++-------- drivers/md/raid5.h | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a9112b39afee..9a3b143b0b68 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - bool do_wakeup = false; + unsigned long do_wakeup = 0; + int i = 0; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(&conf->empty_inactive_list_nr); list_splice_tail_init(list, conf->inactive_list + hash); - do_wakeup = true; + do_wakeup |= 1 << hash; spin_unlock_irqrestore(conf->hash_locks + hash, flags); } size--; hash--; } + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + if (do_wakeup & (1 << i)) + wake_up(&conf->wait_for_stripe[i]); + } + if (do_wakeup) { - wake_up(&conf->wait_for_stripe); if (atomic_read(&conf->active_stripes) == 0) wake_up(&conf->wait_for_quiescent); if (conf->retry_read_aligned) @@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - wait_event_lock_irq( - conf->wait_for_stripe, + wait_event_exclusive_cmd( + conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)), - *(conf->hash_locks + hash)); + spin_unlock_irq(conf->hash_locks + hash), + spin_lock_irq(conf->hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); } else { @@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); + if (!list_empty(conf->inactive_list + hash)) + wake_up(&conf->wait_for_stripe[hash]); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -2179,7 +2188,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { lock_device_hash_lock(conf, hash); - wait_event_cmd(conf->wait_for_stripe, + wait_event_exclusive_cmd(conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -6436,7 +6445,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); init_waitqueue_head(&conf->wait_for_quiescent); - init_waitqueue_head(&conf->wait_for_stripe); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + init_waitqueue_head(&conf->wait_for_stripe[i]); + } init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->hold_list); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9b84b8820fc5..02c3bf8fbfe7 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -512,7 +512,7 @@ struct r5conf { atomic_t empty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; - wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS]; wait_queue_head_t wait_for_overlap; unsigned long cache_state; #define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, -- cgit v1.2.3-59-g8ed1b From 713bc5c2deb437ef40d48374ae6a57e030bd9e35 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 28 May 2015 17:33:47 -0700 Subject: md/raid5: ignore released_stripes check conf->released_stripes list isn't always related to where there are free stripes pending. Active stripes can be in the list too. And even free stripes were active very recently. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9a3b143b0b68..59e44e99eef3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -681,8 +681,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { sh = get_free_stripe(conf, hash); - if (!sh && llist_empty(&conf->released_stripes) && - !test_bit(R5_DID_ALLOC, &conf->cache_state)) + if (!sh && !test_bit(R5_DID_ALLOC, + &conf->cache_state)) set_bit(R5_ALLOC_MORE, &conf->cache_state); } -- cgit v1.2.3-59-g8ed1b From 4e023612325a9034a542bfab79f78b1fe5ebb841 Mon Sep 17 00:00:00 2001 From: Firo Yang Date: Thu, 11 Jun 2015 09:41:10 +0800 Subject: md: fix a build warning Warning like this: drivers/md/md.c: In function "update_array_info": drivers/md/md.c:6394:26: warning: logical not is only applied to the left hand side of comparison [-Wlogical-not-parentheses] !mddev->persistent != info->not_persistent|| Fix it as Neil Brown said: mddev->persistent != !info->not_persistent || Signed-off-by: Firo Yang Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index b6bbcf0cc430..3d339e283cf6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6394,7 +6394,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->ctime != info->ctime || mddev->level != info->level || /* mddev->layout != info->layout || */ - !mddev->persistent != info->not_persistent|| + mddev->persistent != !info->not_persistent || mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) -- cgit v1.2.3-59-g8ed1b From bd6919228d7e1867ae9e24ab27e3e4a366c87d21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Jun 2015 17:01:40 +1000 Subject: md: clear mddev->private when it has been freed. If ->private is set when ->run is called, it is assumed to be a 'config' prepared as part of 'reshape'. So it is important when we free that config, that we also clear ->private. This is not often a problem as the mddev will normally be discarded shortly after the config us freed. However if an 'assemble' races with a final close, the assemble can use the old mddev which has a stale ->private. This leads to any of various sorts of crashes. So clear ->private after calling ->free(). Reported-by: Nate Clark Cc: stable@vger.kernel.org (v4.0+) Fixes: afa0f557cb15 ("md: rename ->stop to ->free") Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3d339e283cf6..939739f0f881 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5178,6 +5178,7 @@ int md_run(struct mddev *mddev) mddev_detach(mddev); if (mddev->private) pers->free(mddev, mddev->private); + mddev->private = NULL; module_put(pers->owner); bitmap_destroy(mddev); return err; @@ -5313,6 +5314,7 @@ static void md_clean(struct mddev *mddev) mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; + mddev->private = NULL; mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; @@ -5385,6 +5387,7 @@ static void __md_stop(struct mddev *mddev) mddev->pers = NULL; spin_unlock(&mddev->lock); pers->free(mddev, mddev->private); + mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); -- cgit v1.2.3-59-g8ed1b From 9a8c0fa861e4db60409b4dda254cef5e17e4d43c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Jun 2015 17:06:40 +1000 Subject: md: unlock mddev_lock on an error path. This error path retuns while still holding the lock - bad. Fixes: 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.") Cc: stable@vger.kernel.org (v4.0+) Signed-off-by: NeilBrown --- drivers/md/md.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 939739f0f881..5fcce7371ee9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4014,8 +4014,10 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) else rdev = md_import_device(dev, -1, -1); - if (IS_ERR(rdev)) + if (IS_ERR(rdev)) { + mddev_unlock(mddev); return PTR_ERR(rdev); + } err = bind_rdev_to_array(rdev, mddev); out: if (err) -- cgit v1.2.3-59-g8ed1b From ab16bfc732c436658d13455f28b0b4a2608a7476 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 17 Jun 2015 12:31:46 +1000 Subject: md: clear Blocked flag on failed devices when array is read-only. The Blocked flag indicates that a device has failed but that this fact hasn't been recorded in the metadata yet. Writes to such devices cannot be allowed until the metadata has been updated. On a read-only array, the Blocked flag will never be cleared. This prevents the device being removed from the array. If the metadata is being handled by the kernel (i.e. !mddev->external), then we can be sure that if the array is switch to writable, then a metadata update will happen and will record the failure. So we don't need the flag set. If metadata is externally managed, it is upto the external manager to clear the 'blocked' flag. Reported-by: XiaoNi Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5fcce7371ee9..202deb43b822 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8130,6 +8130,15 @@ void md_check_recovery(struct mddev *mddev) int spares = 0; if (mddev->ro) { + struct md_rdev *rdev; + if (!mddev->external && mddev->in_sync) + /* 'Blocked' flag not needed as failed devices + * will be recorded if array switched to read/write. + * Leaving it set will prevent the device + * from being removed. + */ + rdev_for_each(rdev, mddev) + clear_bit(Blocked, &rdev->flags); /* On a read-only array we can: * - remove failed devices * - add already-in_sync devices if the array itself -- cgit v1.2.3-59-g8ed1b