From 03a9e24ef2aaa5f1f9837356aed79c860521407a Mon Sep 17 00:00:00 2001 From: "colyli@suse.de" Date: Sat, 28 Jan 2017 21:11:49 +0800 Subject: md linear: fix a race between linear_add() and linear_congested() Recently I receive a bug report that on Linux v3.0 based kerenl, hot add disk to a md linear device causes kernel crash at linear_congested(). From the crash image analysis, I find in linear_congested(), mddev->raid_disks contains value N, but conf->disks[] only has N-1 pointers available. Then a NULL pointer deference crashes the kernel. There is a race between linear_add() and linear_congested(), RCU stuffs used in these two functions cannot avoid the race. Since Linuv v4.0 RCU code is replaced by introducing mddev_suspend(). After checking the upstream code, it seems linear_congested() is not called in generic_make_request() code patch, so mddev_suspend() cannot provent it from being called. The possible race still exists. Here I explain how the race still exists in current code. For a machine has many CPUs, on one CPU, linear_add() is called to add a hard disk to a md linear device; at the same time on other CPU, linear_congested() is called to detect whether this md linear device is congested before issuing an I/O request onto it. Now I use a possible code execution time sequence to demo how the possible race happens, seq linear_add() linear_congested() 0 conf=mddev->private 1 oldconf=mddev->private 2 mddev->raid_disks++ 3 for (i=0; iraid_disks;i++) 4 bdev_get_queue(conf->disks[i].rdev->bdev) 5 mddev->private=newconf In linear_add() mddev->raid_disks is increased in time seq 2, and on another CPU in linear_congested() the for-loop iterates conf->disks[i] by the increased mddev->raid_disks in time seq 3,4. But conf with one more element (which is a pointer to struct dev_info type) to conf->disks[] is not updated yet, accessing its structure member in time seq 4 will cause a NULL pointer deference fault. To fix this race, there are 2 parts of modification in the patch, 1) Add 'int raid_disks' in struct linear_conf, as a copy of mddev->raid_disks. It is initialized in linear_conf(), always being consistent with pointers number of 'struct dev_info disks[]'. When iterating conf->disks[] in linear_congested(), use conf->raid_disks to replace mddev->raid_disks in the for-loop, then NULL pointer deference will not happen again. 2) RCU stuffs are back again, and use kfree_rcu() in linear_add() to free oldconf memory. Because oldconf may be referenced as mddev->private in linear_congested(), kfree_rcu() makes sure that its memory will not be released until no one uses it any more. Also some code comments are added in this patch, to make this modification to be easier understandable. This patch can be applied for kernels since v4.0 after commit: 3be260cc18f8 ("md/linear: remove rcu protections in favour of suspend/resume"). But this bug is reported on Linux v3.0 based kernel, for people who maintain kernels before Linux v4.0, they need to do some back back port to this patch. Changelog: - V3: add 'int raid_disks' in struct linear_conf, and use kfree_rcu() to replace rcu_call() in linear_add(). - v2: add RCU stuffs by suggestion from Shaohua and Neil. - v1: initial effort. Signed-off-by: Coly Li Cc: Shaohua Li Cc: Neil Brown Cc: stable@vger.kernel.org Signed-off-by: Shaohua Li --- drivers/md/linear.c | 39 ++++++++++++++++++++++++++++++++++----- drivers/md/linear.h | 1 + 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5975c9915684..26a73b2002cf 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } +/* + * In linear_congested() conf->raid_disks is used as a copy of + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks + * and conf->disks[] are created in linear_conf(), they are always + * consitent with each other, but mddev->raid_disks does not. + */ static int linear_congested(struct mddev *mddev, int bits) { struct linear_conf *conf; int i, ret = 0; - conf = mddev->private; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < conf->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } + rcu_read_unlock(); return ret; } @@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + return conf; out: @@ -196,15 +217,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ mddev_suspend(mddev); - oldconf = mddev->private; + oldconf = rcu_dereference(mddev->private); mddev->raid_disks++; - mddev->private = newconf; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree(oldconf); + kfree_rcu(oldconf, rcu); return 0; } diff --git a/drivers/md/linear.h b/drivers/md/linear.h index b685ddd7d7f7..8d392e6098b3 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,6 +10,7 @@ struct linear_conf { struct rcu_head rcu; sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ struct dev_info disks[0]; }; #endif -- cgit v1.2.3-59-g8ed1b From 765d704db1f583630d52dc14c1ea573db6783459 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 4 Jan 2017 09:33:23 -0800 Subject: raid5: only dispatch IO from raid5d for harddisk raid We made raid5 stripe handling multi-thread before. It works well for SSD. But for harddisk, the multi-threading creates more disk seek, so not always improve performance. For several hard disks based raid5, multi-threading is required as raid5d becames a bottleneck especially for sequential write. To overcome the disk seek issue, we only dispatch IO from raid5d if the array is harddisk based. Other threads can still handle stripes, but can't dispatch IO. Idealy, we should control IO dispatching order according to IO position interrnally. Right now we still depend on block layer, which isn't very efficient sometimes though. My setup has 9 harddisks, each disk can do around 180M/s sequential write. So in theory, the raid5 can do 180 * 8 = 1440M/s sequential write. The test machine uses an ATOM CPU. I measure sequential write with large iodepth bandwidth to raid array: without patch: ~600M/s without patch and group_thread_cnt=4: 750M/s with patch and group_thread_cnt=4: 950M/s with patch, group_thread_cnt=4, skip_copy=1: 1150M/s We are pretty close to the maximum bandwidth in the large iodepth iodepth case. The performance gap of small iodepth sequential write between software raid and theory value is still very big though, because we don't have an efficient pipeline. Cc: NeilBrown Cc: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- drivers/md/raid5.h | 4 ++++ 2 files changed, 57 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3c7e106c12a2..9d744a8961d1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -863,6 +863,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp; + struct bio *bio; + + if (!conf->batch_bio_dispatch || !conf->group_cnt) + return; + + bio_list_init(&tmp); + spin_lock(&conf->pending_bios_lock); + bio_list_merge(&tmp, &conf->pending_bios); + bio_list_init(&conf->pending_bios); + spin_unlock(&conf->pending_bios_lock); + + while ((bio = bio_list_pop(&tmp))) + generic_make_request(bio); +} + +static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +{ + /* + * change group_cnt will drain all bios, so this is safe + * + * A read generally means a read-modify-write, which usually means a + * randwrite, so we don't delay it + */ + if (!conf->batch_bio_dispatch || !conf->group_cnt || + bio_op(bio) == REQ_OP_READ) { + generic_make_request(bio); + return; + } + spin_lock(&conf->pending_bios_lock); + bio_list_add(&conf->pending_bios, bio); + spin_unlock(&conf->pending_bios_lock); + md_wakeup_thread(conf->mddev->thread); +} + static void raid5_end_read_request(struct bio *bi); static void @@ -1043,7 +1080,7 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(bi); + defer_bio_issue(conf, bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1088,7 +1125,7 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(rbi); + defer_bio_issue(conf, rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -6126,6 +6163,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + flush_deferred_bios(conf); + r5l_flush_stripe_to_raid(conf->log); async_tx_issue_pending_all(); @@ -6711,6 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); + bio_list_init(&conf->pending_bios); + spin_lock_init(&conf->pending_bios_lock); + conf->batch_bio_dispatch = true; + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + conf->batch_bio_dispatch = false; + break; + } + } + conf->bypass_threshold = BYPASS_THRESHOLD; conf->recovery_disabled = mddev->recovery_disabled - 1; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 1440fa26e296..ebb89bda88f1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -684,6 +684,10 @@ struct r5conf { int group_cnt; int worker_cnt_per_group; struct r5l_log *log; + + struct bio_list pending_bios; + spinlock_t pending_bios_lock; + bool batch_bio_dispatch; }; -- cgit v1.2.3-59-g8ed1b From 03b047f45c29dff02f913a0234ca0cc1ca51966f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 11 Jan 2017 13:39:14 -0800 Subject: md/r5cache: enable chunk_aligned_read with write back cache Chunk aligned read significantly reduces CPU usage of raid456. However, it is not safe to fully bypass the write back cache. This patch enables chunk aligned read with write back cache. For chunk aligned read, we track stripes in write back cache at a bigger granularity, "big_stripe". Each chunk may contain more than one stripe (for example, a 256kB chunk contains 64 4kB-page, so this chunk contain 64 stripes). For chunk_aligned_read, these stripes are grouped into one big_stripe, so we only need one lookup for the whole chunk. For each big_stripe, struct big_stripe_info tracks how many stripes of this big_stripe are in the write back cache. We count how many stripes of this big_stripe are in the write back cache. These counters are tracked in a radix tree (big_stripe_tree). r5c_tree_index() is used to calculate keys for the radix tree. chunk_aligned_read() calls r5c_big_stripe_cached() to look up big_stripe of each chunk in the tree. If this big_stripe is in the tree, chunk_aligned_read() aborts. This look up is protected by rcu_read_lock(). It is necessary to remember whether a stripe is counted in big_stripe_tree. Instead of adding new flag, we reuses existing flags: STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these two flags are set, the stripe is counted in big_stripe_tree. This requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to r5c_try_caching_write(); and moving clear_bit of STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to r5c_finish_stripe_write_out(). Signed-off-by: Song Liu Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 171 ++++++++++++++++++++++++++++++++++++++++++----- drivers/md/raid5.c | 20 ++++-- drivers/md/raid5.h | 1 + 3 files changed, 168 insertions(+), 24 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 302dea3296ba..76c0e5063f1b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "md.h" #include "raid5.h" #include "bitmap.h" @@ -164,8 +165,59 @@ struct r5l_log { struct work_struct deferred_io_work; /* to disable write back during in degraded mode */ struct work_struct disable_writeback_work; + + /* to for chunk_aligned_read in writeback mode, details below */ + spinlock_t tree_lock; + struct radix_tree_root big_stripe_tree; }; +/* + * Enable chunk_aligned_read() with write back cache. + * + * Each chunk may contain more than one stripe (for example, a 256kB + * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For + * chunk_aligned_read, these stripes are grouped into one "big_stripe". + * For each big_stripe, we count how many stripes of this big_stripe + * are in the write back cache. These data are tracked in a radix tree + * (big_stripe_tree). We use radix_tree item pointer as the counter. + * r5c_tree_index() is used to calculate keys for the radix tree. + * + * chunk_aligned_read() calls r5c_big_stripe_cached() to look up + * big_stripe of each chunk in the tree. If this big_stripe is in the + * tree, chunk_aligned_read() aborts. This look up is protected by + * rcu_read_lock(). + * + * It is necessary to remember whether a stripe is counted in + * big_stripe_tree. Instead of adding new flag, we reuses existing flags: + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these + * two flags are set, the stripe is counted in big_stripe_tree. This + * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to + * r5c_try_caching_write(); and moving clear_bit of + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to + * r5c_finish_stripe_write_out(). + */ + +/* + * radix tree requests lowest 2 bits of data pointer to be 2b'00. + * So it is necessary to left shift the counter by 2 bits before using it + * as data pointer of the tree. + */ +#define R5C_RADIX_COUNT_SHIFT 2 + +/* + * calculate key for big_stripe_tree + * + * sect: align_bi->bi_iter.bi_sector or sh->sector + */ +static inline sector_t r5c_tree_index(struct r5conf *conf, + sector_t sect) +{ + sector_t offset; + + offset = sector_div(sect, conf->chunk_sectors); + return sect; +} + /* * an IO range starts from a meta data block and end at the next meta data * block. The io unit's the meta data block tracks data/parity followed it. io @@ -412,16 +464,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - - if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { - BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); - atomic_dec(&conf->r5c_cached_partial_stripes); - } - - if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { - BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); - atomic_dec(&conf->r5c_cached_full_stripes); - } } static void r5c_handle_data_cached(struct stripe_head *sh) @@ -2320,6 +2362,10 @@ int r5c_try_caching_write(struct r5conf *conf, int i; struct r5dev *dev; int to_cache = 0; + void **pslot; + sector_t tree_index; + int ret; + uintptr_t refcount; BUG_ON(!r5c_is_writeback(log)); @@ -2364,6 +2410,44 @@ int r5c_try_caching_write(struct r5conf *conf, } } + /* if the stripe is not counted in big_stripe_tree, add it now */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + if (pslot) { + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); + } else { + /* + * this radix_tree_insert can fail safely, so no + * need to call radix_tree_preload() + */ + ret = radix_tree_insert( + &log->big_stripe_tree, tree_index, + (void *)(1 << R5C_RADIX_COUNT_SHIFT)); + if (ret) { + spin_unlock(&log->tree_lock); + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + } + spin_unlock(&log->tree_lock); + + /* + * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is + * counted in the radix tree + */ + set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); + atomic_inc(&conf->r5c_cached_partial_stripes); + } + for (i = disks; i--; ) { dev = &sh->dev[i]; if (dev->towrite) { @@ -2438,17 +2522,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { + struct r5l_log *log = conf->log; int i; int do_wakeup = 0; + sector_t tree_index; + void **pslot; + uintptr_t refcount; - if (!conf->log || - !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) + if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) return; WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); - if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; for (i = sh->disks; i--; ) { @@ -2470,12 +2557,43 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (do_wakeup) wake_up(&conf->wait_for_overlap); - spin_lock_irq(&conf->log->stripe_in_journal_lock); + spin_lock_irq(&log->stripe_in_journal_lock); list_del_init(&sh->r5c); - spin_unlock_irq(&conf->log->stripe_in_journal_lock); + spin_unlock_irq(&log->stripe_in_journal_lock); sh->log_start = MaxSector; - atomic_dec(&conf->log->stripe_in_journal_count); - r5c_update_log_state(conf->log); + + atomic_dec(&log->stripe_in_journal_count); + r5c_update_log_state(log); + + /* stop counting this stripe in big_stripe_tree */ + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + BUG_ON(pslot == NULL); + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + if (refcount == 1) + radix_tree_delete(&log->big_stripe_tree, tree_index); + else + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); + spin_unlock(&log->tree_lock); + } + + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); + atomic_dec(&conf->r5c_cached_partial_stripes); + } + + if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); + atomic_dec(&conf->r5c_cached_full_stripes); + } } int @@ -2535,6 +2653,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, return 0; } +/* check whether this big stripe is in write back cache. */ +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) +{ + struct r5l_log *log = conf->log; + sector_t tree_index; + void *slot; + + if (!log) + return false; + + WARN_ON_ONCE(!rcu_read_lock_held()); + tree_index = r5c_tree_index(conf, sect); + slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); + return slot != NULL; +} + static int r5l_load_log(struct r5l_log *log) { struct md_rdev *rdev = log->rdev; @@ -2681,6 +2815,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->meta_pool) goto out_mempool; + spin_lock_init(&log->tree_lock); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9d744a8961d1..b62f671a93ab 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -281,13 +281,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, atomic_dec(&conf->r5c_cached_partial_stripes); list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); r5c_check_cached_full_stripe(conf); - } else { - /* partial stripe */ - if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, - &sh->state)) - atomic_inc(&conf->r5c_cached_partial_stripes); + } else + /* + * STRIPE_R5C_PARTIAL_STRIPE is set in + * r5c_try_caching_write(). No need to + * set it again. + */ list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); - } } } } @@ -5062,6 +5062,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) rdev->recovery_offset >= end_sector))) rdev = NULL; } + + if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } + if (rdev) { sector_t first_bad; int bad_sectors; @@ -5418,7 +5425,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && - !r5c_is_writeback(conf->log) && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); if (!bi) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ebb89bda88f1..c0687df5ba06 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -792,4 +792,5 @@ extern void r5c_check_stripe_cache_usage(struct r5conf *conf); extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; extern void r5c_update_on_rdev_error(struct mddev *mddev); +extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); #endif -- cgit v1.2.3-59-g8ed1b From 39b99586b321a9cbd4fe9abb5ea9346d2c4ca9c6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 24 Jan 2017 14:08:23 -0800 Subject: md/r5cache: improve journal device efficiency It is important to be able to flush all stripes in raid5-cache. Therefore, we need reserve some space on the journal device for these flushes. If flush operation includes pending writes to the stripe, we need to reserve (conf->raid_disk + 1) pages per stripe for the flush out. This reduces the efficiency of journal space. If we exclude these pending writes from flush operation, we only need (conf->max_degraded + 1) pages per stripe. With this patch, when log space is critical (R5C_LOG_CRITICAL=1), pending writes will be excluded from stripe flush out. Therefore, we can reduce reserved space for flush out and thus improve journal device efficiency. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 36 +++++++++++++++++++++++++----------- drivers/md/raid5.c | 42 +++++++++++++++++++++++++++++++++--------- 2 files changed, 58 insertions(+), 20 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 76c0e5063f1b..caae85331a02 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -389,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) /* * Total log space (in sectors) needed to flush all data in cache * - * Currently, writing-out phase automatically includes all pending writes - * to the same sector. So the reclaim of each stripe takes up to - * (conf->raid_disks + 1) pages of log space. + * To avoid deadlock due to log space, it is necessary to reserve log + * space to flush critical stripes (stripes that occupying log space near + * last_checkpoint). This function helps check how much log space is + * required to flush all cached stripes. * - * To totally avoid deadlock due to log space, the code reserves - * (conf->raid_disks + 1) pages for each stripe in cache, which is not - * necessary in most cases. + * To reduce log space requirements, two mechanisms are used to give cache + * flush higher priorities: + * 1. In handle_stripe_dirtying() and schedule_reconstruction(), + * stripes ALREADY in journal can be flushed w/o pending writes; + * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal + * can be delayed (r5l_add_no_space_stripe). * - * To improve this, we will need writing-out phase to be able to NOT include - * pending writes, which will reduce the requirement to - * (conf->max_degraded + 1) pages per stripe in cache. + * In cache flush, the stripe goes through 1 and then 2. For a stripe that + * already passed 1, flushing it requires at most (conf->max_degraded + 1) + * pages of journal space. For stripes that has not passed 1, flushing it + * requires (conf->raid_disks + 1) pages of journal space. There are at + * most (conf->group_cnt + 1) stripe that passed 1. So total journal space + * required to flush all cached stripes (in pages) is: + * + * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks + 1) + * or + * (stripe_in_journal_count) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks - max_degraded) */ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { @@ -408,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) if (!r5c_is_writeback(log)) return 0; - return BLOCK_SECTORS * (conf->raid_disks + 1) * - atomic_read(&log->stripe_in_journal_count); + return BLOCK_SECTORS * + ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + + (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); } /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b62f671a93ab..b0d1345c832c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2951,12 +2951,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) * like to flush data in journal to RAID disks first, so complex rmw * is handled in the write patch (handle_stripe_dirtying). * + * 2. when journal space is critical (R5C_LOG_CRITICAL=1) + * + * It is important to be able to flush all stripes in raid5-cache. + * Therefore, we need reserve some space on the journal device for + * these flushes. If flush operation includes pending writes to the + * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe + * for the flush out. If we exclude these pending writes from flush + * operation, we only need (conf->max_degraded + 1) pages per stripe. + * Therefore, excluding pending writes in these cases enables more + * efficient use of the journal device. + * + * Note: To make sure the stripe makes progress, we only delay + * towrite for stripes with data already in journal (injournal > 0). + * When LOG_CRITICAL, stripes with injournal == 0 will be sent to + * no_space_stripes list. + * */ -static inline bool delay_towrite(struct r5dev *dev, - struct stripe_head_state *s) +static inline bool delay_towrite(struct r5conf *conf, + struct r5dev *dev, + struct stripe_head_state *s) { - return !test_bit(R5_OVERWRITE, &dev->flags) && - !test_bit(R5_Insync, &dev->flags) && s->injournal; + /* case 1 above */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_Insync, &dev->flags) && s->injournal) + return true; + /* case 2 above */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + s->injournal > 0) + return true; + return false; } static void @@ -2979,7 +3003,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->towrite && !delay_towrite(dev, s)) { + if (dev->towrite && !delay_towrite(conf, dev, s)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantdrain, &dev->flags); if (!expand) @@ -3731,7 +3755,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if (((dev->towrite && !delay_towrite(dev, s)) || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && @@ -3755,8 +3779,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, } } - pr_debug("for sector %llu, rmw=%d rcw=%d\n", - (unsigned long long)sh->sector, rmw, rcw); + pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, sh->state, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ @@ -3796,7 +3820,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (((dev->towrite && !delay_towrite(dev, s)) || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && -- cgit v1.2.3-59-g8ed1b From 9356863c9409efc79029c01a85d015efae977e69 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 6 Feb 2017 13:41:39 +1100 Subject: md: ensure md devices are freed before module is unloaded. Commit: cbd199837750 ("md: Fix unfortunate interaction with evms") change mddev_put() so that it would not destroy an md device while ->ctime was non-zero. Unfortunately, we didn't make sure to clear ->ctime when unloading the module, so it is possible for an md device to remain after module unload. An attempt to open such a device will trigger an invalid memory reference in: get_gendisk -> kobj_lookup -> exact_lock -> get_disk when tring to access disk->fops, which was in the module that has been removed. So ensure we clear ->ctime in md_exit(), and explain how that is useful, as it isn't immediately obvious when looking at the code. Fixes: cbd199837750 ("md: Fix unfortunate interaction with evms") Tested-by: Guoqing Jiang Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 01175dac0db6..13020e5c8cc0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8980,7 +8980,14 @@ static __exit void md_exit(void) for_each_mddev(mddev, tmp) { export_array(mddev); + mddev->ctime = 0; mddev->hold_active = 0; + /* + * for_each_mddev() will call mddev_put() at the end of each + * iteration. As the mddev is now fully clear, this will + * schedule the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. + */ } destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); -- cgit v1.2.3-59-g8ed1b From e8fd52eec2cd25b917983b3f3aa738b722522376 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 10 Feb 2017 16:18:08 -0800 Subject: md/raid5-cache: stripe reclaim only counts valid stripes When log space is tight, we try to reclaim stripes from log head. There are stripes which can't be reclaimed right now if some conditions are met. We skip such stripes but accidentally count them, which might cause no stripes are claimed. Fixing this by only counting valid stripes. Cc: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index caae85331a02..a01f4daeb390 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1418,9 +1418,9 @@ static void r5c_do_reclaim(struct r5conf *conf) !test_bit(STRIPE_HANDLE, &sh->state) && atomic_read(&sh->count) == 0) { r5c_flush_stripe(conf, sh); + if (count++ >= R5C_RECLAIM_STRIPE_GROUP) + break; } - if (count++ >= R5C_RECLAIM_STRIPE_GROUP) - break; } spin_unlock(&conf->device_lock); spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); -- cgit v1.2.3-59-g8ed1b From e33fbb9cc73d6502e69eaf1c178e0c39059763ea Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 10 Feb 2017 16:18:09 -0800 Subject: md/raid5-cache: exclude reclaiming stripes in reclaim check stripes which are being reclaimed are still accounted into cached stripes. The reclaim takes time. r5c_do_reclaim isn't aware of the stripes and does unnecessary stripe reclaim. In practice, I saw one stripe is reclaimed one time. This will cause bad IO pattern. Fixing this by excluding the reclaing stripes in the check. Cc: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 14 ++++++++++++-- drivers/md/raid5.c | 2 ++ drivers/md/raid5.h | 2 ++ 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a01f4daeb390..3f307be01b10 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1327,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) atomic_inc(&conf->active_stripes); r5c_make_stripe_write_out(sh); + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_flushing_partial_stripes); + else + atomic_inc(&conf->r5c_flushing_full_stripes); raid5_release_stripe(sh); } @@ -1369,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf) unsigned long flags; int total_cached; int stripes_to_flush; + int flushing_partial, flushing_full; if (!r5c_is_writeback(log)) return; + flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); + flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + - atomic_read(&conf->r5c_cached_full_stripes); + atomic_read(&conf->r5c_cached_full_stripes) - + flushing_full - flushing_partial; if (total_cached > conf->min_nr_stripes * 3 / 4 || atomic_read(&conf->empty_inactive_list_nr) > 0) @@ -1384,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf) */ stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; else if (total_cached > conf->min_nr_stripes * 1 / 2 || - atomic_read(&conf->r5c_cached_full_stripes) > + atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > R5C_FULL_STRIPE_FLUSH_BATCH) /* * if stripe cache pressure moderate, or if there is many full @@ -2601,11 +2609,13 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); + atomic_dec(&conf->r5c_flushing_partial_stripes); atomic_dec(&conf->r5c_cached_partial_stripes); } if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); + atomic_dec(&conf->r5c_flushing_full_stripes); atomic_dec(&conf->r5c_cached_full_stripes); } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b0d1345c832c..e2fa88fc6a1b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6838,6 +6838,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->r5c_full_stripe_list); atomic_set(&conf->r5c_cached_partial_stripes, 0); INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); + atomic_set(&conf->r5c_flushing_full_stripes, 0); + atomic_set(&conf->r5c_flushing_partial_stripes, 0); conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index c0687df5ba06..4bb27b97bf6b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -663,6 +663,8 @@ struct r5conf { struct list_head r5c_full_stripe_list; atomic_t r5c_cached_partial_stripes; struct list_head r5c_partial_stripe_list; + atomic_t r5c_flushing_full_stripes; + atomic_t r5c_flushing_partial_stripes; atomic_t empty_inactive_list_nr; struct llist_head released_stripes; -- cgit v1.2.3-59-g8ed1b From 26483819f89c5cf9d27620d70c95afeeeb9bece5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 13 Feb 2017 16:21:49 -0800 Subject: md: disable WRITE SAME if it fails in underlayer disks This makes md do the same thing as dm for write same IO failure. Please see 7eee4ae(dm: disable WRITE SAME if it fails) for details why we need this. We did a little bit different than dm. Instead of disabling writesame in the first IO error, we disable it till next writesame IO coming after the first IO error. This way we don't need to clone a bio. Also reported here: https://bugzilla.kernel.org/show_bug.cgi?id=118581 Suggested-by: NeilBrown Acked-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/linear.c | 1 + drivers/md/md.h | 7 +++++++ drivers/md/multipath.c | 1 + drivers/md/raid0.c | 1 + 4 files changed, 10 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 26a73b2002cf..789008bc94ff 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -291,6 +291,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/md.h b/drivers/md/md.h index 2a514036a83d..42f8398181a8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -710,4 +710,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, { mddev->flags &= ~unsupported_flags; } + +static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) +{ + if (bio_op(bio) == REQ_OP_WRITE_SAME && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + mddev->queue->limits.max_write_same_sectors = 0; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index aa8c4e5c1ee2..065fe2823285 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; + mddev_check_writesame(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); return; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 848365d474f3..b3d264452fd5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); -- cgit v1.2.3-59-g8ed1b From 10273170fd5602d8090b1312e66ad4746ab02c94 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 14 Feb 2017 23:29:00 +0800 Subject: md: fail if mddev->bio_set can't be created The current behaviour is to fall back to allocate bio from 'fs_bio_set', that isn't a correct way because it might cause deadlock. So this patch simply return failure if mddev->bio_set can't be created. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/md.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 13020e5c8cc0..137a1fe124e8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5228,8 +5228,11 @@ int md_run(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } - if (mddev->bio_set == NULL) + if (mddev->bio_set == NULL) { mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!mddev->bio_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); -- cgit v1.2.3-59-g8ed1b From 8e58e327e25c7fffc3fb79a24c76637bdda37716 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 14 Feb 2017 23:29:01 +0800 Subject: md/raid1: use bio_clone_bioset_partial() in case of write behind Write behind need to replace pages in bio's bvecs, and we have to clone a fresh bio with new bvec table, so use the introduced bio_clone_bioset_partial() for it. For other bio_clone_mddev() cases, we will use fast clone since they don't need to touch bvec table. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7b0f647bcccb..691d6d9bf740 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1345,13 +1345,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 1; for (i = 0; i < disks; i++) { - struct bio *mbio; + struct bio *mbio = NULL; + sector_t offset; if (!r1_bio->bios[i]) continue; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); + offset = r1_bio->sector - bio->bi_iter.bi_sector; if (first_clone) { /* do behind I/O ? @@ -1361,8 +1360,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) + !waitqueue_active(&bitmap->behind_wait)) { + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset, + max_sectors); alloc_behind_pages(mbio, r1_bio); + } bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, @@ -1370,6 +1374,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, &r1_bio->state)); first_clone = 0; } + + if (!mbio) { + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(mbio, offset, max_sectors); + } + if (r1_bio->behind_bvecs) { struct bio_vec *bvec; int j; -- cgit v1.2.3-59-g8ed1b From ed7ef732ca9f7d6f42be8df6cc7bf4ace3534af3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 14 Feb 2017 23:29:02 +0800 Subject: md: remove unnecessary check on mddev mddev is never NULL and neither is ->bio_set, so remove the check. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 137a1fe124e8..0e408bc67904 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -193,9 +193,6 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev); struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev) { - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); } EXPORT_SYMBOL_GPL(bio_clone_mddev); -- cgit v1.2.3-59-g8ed1b From d7a1030839d35c04a620e841f406b9b2a8600041 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 14 Feb 2017 23:29:03 +0800 Subject: md: fast clone bio in bio_clone_mddev() Firstly bio_clone_mddev() is used in raid normal I/O and isn't in resync I/O path. Secondly all the direct access to bvec table in raid happens on resync I/O except for write behind of raid1, in which we still use bio_clone() for allocating new bvec table. So this patch replaces bio_clone() with bio_clone_fast() in bio_clone_mddev(). Also kill bio_clone_mddev() and call bio_clone_fast() directly, as suggested by Christoph Hellwig. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/faulty.c | 2 +- drivers/md/md.c | 7 ------- drivers/md/md.h | 2 -- drivers/md/raid1.c | 10 ++++++---- drivers/md/raid10.c | 11 +++++------ drivers/md/raid5.c | 4 ++-- 6 files changed, 14 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 685aa2d77e25..b0536cfd8e17 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); + struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); b->bi_bdev = conf->rdev->bdev; b->bi_private = bio; diff --git a/drivers/md/md.c b/drivers/md/md.c index 0e408bc67904..55e7e7a8714e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -190,13 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) -{ - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); -} -EXPORT_SYMBOL_GPL(bio_clone_mddev); - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat diff --git a/drivers/md/md.h b/drivers/md/md.h index 42f8398181a8..b8859cbf84b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 691d6d9bf740..ad5c9483bd50 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1108,7 +1108,7 @@ read_again: r1_bio->read_disk = rdisk; r1_bio->start_next_window = 0; - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); @@ -1376,7 +1376,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } if (!mbio) { - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, offset, max_sectors); } @@ -2286,7 +2286,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio->bi_vcnt = vcnt; } else { - wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); } bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); @@ -2424,7 +2425,8 @@ read_more: const unsigned long do_sync = r1_bio->master_bio->bi_opf & REQ_SYNC; r1_bio->read_disk = disk; - bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1920756828df..ade7d69234d5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1132,7 +1132,7 @@ read_again: } slot = r10_bio->read_slot; - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); @@ -1406,7 +1406,7 @@ retry_write: int d = r10_bio->devs[i].devnum; if (r10_bio->devs[i].bio) { struct md_rdev *rdev = conf->mirrors[d].rdev; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].bio = mbio; @@ -1457,7 +1457,7 @@ retry_write: smp_mb(); rdev = conf->mirrors[d].rdev; } - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].repl_bio = mbio; @@ -2565,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + @@ -2641,8 +2641,7 @@ read_more: mdname(mddev), bdevname(rdev->bdev, b), (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); + bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e2fa88fc6a1b..b193316804d4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5056,9 +5056,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } /* - * use bio_clone_mddev to make a copy of the bio + * use bio_clone_fast to make a copy of the bio */ - align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); if (!align_bi) return 0; /* -- cgit v1.2.3-59-g8ed1b From eae8263fb1f4256460270dd8f42334604dcdfac6 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 14 Feb 2017 16:26:24 +0900 Subject: md/raid5: Don't reinvent the wheel but use existing llist API Although llist provides proper APIs, they are not used. Make them used. Signed-off-by: Byungchul Park Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b193316804d4..7b7722bb2e8d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -353,17 +353,15 @@ static void release_inactive_stripe_list(struct r5conf *conf, static int release_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list) { - struct stripe_head *sh; + struct stripe_head *sh, *t; int count = 0; struct llist_node *head; head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); - while (head) { + llist_for_each_entry_safe(sh, t, head, release_list) { int hash; - sh = llist_entry(head, struct stripe_head, release_list); - head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ smp_mb(); clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); -- cgit v1.2.3-59-g8ed1b From fd76863e37fef26fe05547fddfa6e3d05e1682e6 Mon Sep 17 00:00:00 2001 From: "colyli@suse.de" Date: Sat, 18 Feb 2017 03:05:56 +0800 Subject: RAID1: a new I/O barrier implementation to remove resync window 'Commit 79ef3a8aa1cb ("raid1: Rewrite the implementation of iobarrier.")' introduces a sliding resync window for raid1 I/O barrier, this idea limits I/O barriers to happen only inside a slidingresync window, for regular I/Os out of this resync window they don't need to wait for barrier any more. On large raid1 device, it helps a lot to improve parallel writing I/O throughput when there are background resync I/Os performing at same time. The idea of sliding resync widow is awesome, but code complexity is a challenge. Sliding resync window requires several variables to work collectively, this is complexed and very hard to make it work correctly. Just grep "Fixes: 79ef3a8aa1" in kernel git log, there are 8 more patches to fix the original resync window patch. This is not the end, any further related modification may easily introduce more regreassion. Therefore I decide to implement a much simpler raid1 I/O barrier, by removing resync window code, I believe life will be much easier. The brief idea of the simpler barrier is, - Do not maintain a global unique resync window - Use multiple hash buckets to reduce I/O barrier conflicts, regular I/O only has to wait for a resync I/O when both them have same barrier bucket index, vice versa. - I/O barrier can be reduced to an acceptable number if there are enough barrier buckets Here I explain how the barrier buckets are designed, - BARRIER_UNIT_SECTOR_SIZE The whole LBA address space of a raid1 device is divided into multiple barrier units, by the size of BARRIER_UNIT_SECTOR_SIZE. Bio requests won't go across border of barrier unit size, that means maximum bio size is BARRIER_UNIT_SECTOR_SIZE<<9 (64MB) in bytes. For random I/O 64MB is large enough for both read and write requests, for sequential I/O considering underlying block layer may merge them into larger requests, 64MB is still good enough. Neil also points out that for resync operation, "we want the resync to move from region to region fairly quickly so that the slowness caused by having to synchronize with the resync is averaged out over a fairly small time frame". For full speed resync, 64MB should take less then 1 second. When resync is competing with other I/O, it could take up a few minutes. Therefore 64MB size is fairly good range for resync. - BARRIER_BUCKETS_NR There are BARRIER_BUCKETS_NR buckets in total, which is defined by, #define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - 2) #define BARRIER_BUCKETS_NR (1<> BARRIER_UNIT_SECTOR_BITS, BARRIER_BUCKETS_NR_BITS); } Here sector_nr is the start sector number of a bio. - Single bio won't go across boundary of a I/O barrier unit If a request goes across boundary of barrier unit, it will be split. A bio may be split in raid1_make_request() or raid1_sync_request(), if sectors returned by align_to_barrier_unit_end() is smaller than original bio size. Comparing to single sliding resync window, - Currently resync I/O grows linearly, therefore regular and resync I/O will conflict within a single barrier units. So the I/O behavior is similar to single sliding resync window. - But a barrier unit bucket is shared by all barrier units with identical barrier uinit index, the probability of conflict might be higher than single sliding resync window, in condition that writing I/Os always hit barrier units which have identical barrier bucket indexs with the resync I/Os. This is a very rare condition in real I/O work loads, I cannot imagine how it could happen in practice. - Therefore we can achieve a good enough low conflict rate with much simpler barrier algorithm and implementation. There are two changes should be noticed, - In raid1d(), I change the code to decrease conf->nr_pending[idx] into single loop, it looks like this, spin_lock_irqsave(&conf->device_lock, flags); conf->nr_queued[idx]--; spin_unlock_irqrestore(&conf->device_lock, flags); This change generates more spin lock operations, but in next patch of this patch set, it will be replaced by a single line code, atomic_dec(&conf->nr_queueud[idx]); So we don't need to worry about spin lock cost here. - Mainline raid1 code split original raid1_make_request() into raid1_read_request() and raid1_write_request(). If the original bio goes across an I/O barrier unit size, this bio will be split before calling raid1_read_request() or raid1_write_request(), this change the code logic more simple and clear. - In this patch wait_barrier() is moved from raid1_make_request() to raid1_write_request(). In raid_read_request(), original wait_barrier() is replaced by raid1_read_request(). The differnece is wait_read_barrier() only waits if array is frozen, using different barrier function in different code path makes the code more clean and easy to read. Changelog V4: - Add alloc_r1bio() to remove redundant r1bio memory allocation code. - Fix many typos in patch comments. - Use (PAGE_SHIFT - ilog2(sizeof(int))) to define BARRIER_BUCKETS_NR_BITS. V3: - Rebase the patch against latest upstream kernel code. - Many fixes by review comments from Neil, - Back to use pointers to replace arraries in struct r1conf - Remove total_barriers from struct r1conf - Add more patch comments to explain how/why the values of BARRIER_UNIT_SECTOR_SIZE and BARRIER_BUCKETS_NR are decided. - Use get_unqueued_pending() to replace get_all_pendings() and get_all_queued() - Increase bucket number from 512 to 1024 - Change code comments format by review from Shaohua. V2: - Use bio_split() to split the orignal bio if it goes across barrier unit bounday, to make the code more simple, by suggestion from Shaohua and Neil. - Use hash_long() to replace original linear hash, to avoid a possible confilict between resync I/O and sequential write I/O, by suggestion from Shaohua. - Add conf->total_barriers to record barrier depth, which is used to control number of parallel sync I/O barriers, by suggestion from Shaohua. - In V1 patch the bellowed barrier buckets related members in r1conf are allocated in memory page. To make the code more simple, V2 patch moves the memory space into struct r1conf, like this, - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; + int nr_pending[BARRIER_BUCKETS_NR]; + int nr_waiting[BARRIER_BUCKETS_NR]; + int nr_queued[BARRIER_BUCKETS_NR]; + int barrier[BARRIER_BUCKETS_NR]; This change is by the suggestion from Shaohua. - Remove some inrelavent code comments, by suggestion from Guoqing. - Add a missing wait_barrier() before jumping to retry_write, in raid1_make_write_request(). V1: - Original RFC patch for comments Signed-off-by: Coly Li Cc: Johannes Thumshirn Cc: Guoqing Jiang Reviewed-by: Neil Brown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 473 +++++++++++++++++++++++++++++------------------------ drivers/md/raid1.h | 57 ++++--- 2 files changed, 294 insertions(+), 236 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ad5c9483bd50..40297fd17f7e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -71,9 +71,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector); -static void lower_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t sector_nr); +static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) @@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) -#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -215,7 +213,7 @@ static void put_buf(struct r1bio *r1_bio) mempool_free(r1_bio, conf->r1buf_pool); - lower_barrier(conf); + lower_barrier(conf, r1_bio->sector); } static void reschedule_retry(struct r1bio *r1_bio) @@ -223,10 +221,12 @@ static void reschedule_retry(struct r1bio *r1_bio) unsigned long flags; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; + int idx; + idx = sector_to_idx(r1_bio->sector); spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); - conf->nr_queued ++; + conf->nr_queued[idx]++; spin_unlock_irqrestore(&conf->device_lock, flags); wake_up(&conf->wait_barrier); @@ -243,7 +243,6 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; - sector_t start_next_window = r1_bio->start_next_window; sector_t bi_sector = bio->bi_iter.bi_sector; if (bio->bi_phys_segments) { @@ -269,7 +268,7 @@ static void call_bio_endio(struct r1bio *r1_bio) * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf, start_next_window, bi_sector); + allow_barrier(conf, bi_sector); } } @@ -517,6 +516,25 @@ static void raid1_end_write_request(struct bio *bio) bio_put(to_put); } +static sector_t align_to_barrier_unit_end(sector_t start_sector, + sector_t sectors) +{ + sector_t len; + + WARN_ON(sectors == 0); + /* + * len is the number of sectors from start_sector to end of the + * barrier unit which start_sector belongs to. + */ + len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - + start_sector; + + if (len > sectors) + len = sectors; + + return len; +} + /* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector @@ -813,168 +831,168 @@ static void flush_pending_writes(struct r1conf *conf) */ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) { + int idx = sector_to_idx(sector_nr); + spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ - wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, + wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx], conf->resync_lock); /* block any new IO from starting */ - conf->barrier++; - conf->next_resync = sector_nr; + conf->barrier[idx]++; /* For these conditions we must wait: * A: while the array is in frozen state - * B: while barrier >= RESYNC_DEPTH, meaning resync reach - * the max count which allowed. - * C: next_resync + RESYNC_SECTORS > start_next_window, meaning - * next resync will reach to the window which normal bios are - * handling. - * D: while there are any active requests in the current window. + * B: while conf->nr_pending[idx] is not 0, meaning regular I/O + * existing in corresponding I/O barrier bucket. + * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches + * max resync count which allowed on current I/O barrier bucket. */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && - conf->barrier < RESYNC_DEPTH && - conf->current_window_requests == 0 && - (conf->start_next_window >= - conf->next_resync + RESYNC_SECTORS), + !conf->nr_pending[idx] && + conf->barrier[idx] < RESYNC_DEPTH, conf->resync_lock); - conf->nr_pending++; + conf->nr_pending[idx]++; spin_unlock_irq(&conf->resync_lock); } -static void lower_barrier(struct r1conf *conf) +static void lower_barrier(struct r1conf *conf, sector_t sector_nr) { unsigned long flags; - BUG_ON(conf->barrier <= 0); + int idx = sector_to_idx(sector_nr); + + BUG_ON(conf->barrier[idx] <= 0); + spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - conf->nr_pending--; + conf->barrier[idx]--; + conf->nr_pending[idx]--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } -static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) +static void _wait_barrier(struct r1conf *conf, int idx) { - bool wait = false; - - if (conf->array_frozen || !bio) - wait = true; - else if (conf->barrier && bio_data_dir(bio) == WRITE) { - if ((conf->mddev->curr_resync_completed - >= bio_end_sector(bio)) || - (conf->start_next_window + NEXT_NORMALIO_DISTANCE - <= bio->bi_iter.bi_sector)) - wait = false; - else - wait = true; + spin_lock_irq(&conf->resync_lock); + if (conf->array_frozen || conf->barrier[idx]) { + conf->nr_waiting[idx]++; + /* Wait for the barrier to drop. */ + wait_event_lock_irq( + conf->wait_barrier, + !conf->array_frozen && !conf->barrier[idx], + conf->resync_lock); + conf->nr_waiting[idx]--; } - return wait; + conf->nr_pending[idx]++; + spin_unlock_irq(&conf->resync_lock); } -static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) { - sector_t sector = 0; + int idx = sector_to_idx(sector_nr); spin_lock_irq(&conf->resync_lock); - if (need_to_wait_for_sync(conf, bio)) { - conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * per-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to allow conf->start_next_window - * to increase. - */ - raid1_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, - !conf->array_frozen && - (!conf->barrier || - ((conf->start_next_window < - conf->next_resync + RESYNC_SECTORS) && - current->bio_list && - !bio_list_empty(current->bio_list))), - conf->resync_lock); - conf->nr_waiting--; - } - - if (bio && bio_data_dir(bio) == WRITE) { - if (bio->bi_iter.bi_sector >= conf->next_resync) { - if (conf->start_next_window == MaxSector) - conf->start_next_window = - conf->next_resync + - NEXT_NORMALIO_DISTANCE; - - if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) - <= bio->bi_iter.bi_sector) - conf->next_window_requests++; - else - conf->current_window_requests++; - sector = conf->start_next_window; - } + if (conf->array_frozen) { + conf->nr_waiting[idx]++; + /* Wait for array to unfreeze */ + wait_event_lock_irq( + conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + conf->nr_waiting[idx]--; } - conf->nr_pending++; + conf->nr_pending[idx]++; spin_unlock_irq(&conf->resync_lock); - return sector; } -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector) +static void wait_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + _wait_barrier(conf, idx); +} + +static void wait_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _wait_barrier(conf, idx); +} + +static void _allow_barrier(struct r1conf *conf, int idx) { unsigned long flags; spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - if (start_next_window) { - if (start_next_window == conf->start_next_window) { - if (conf->start_next_window + NEXT_NORMALIO_DISTANCE - <= bi_sector) - conf->next_window_requests--; - else - conf->current_window_requests--; - } else - conf->current_window_requests--; - - if (!conf->current_window_requests) { - if (conf->next_window_requests) { - conf->current_window_requests = - conf->next_window_requests; - conf->next_window_requests = 0; - conf->start_next_window += - NEXT_NORMALIO_DISTANCE; - } else - conf->start_next_window = MaxSector; - } - } + conf->nr_pending[idx]--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } +static void allow_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + _allow_barrier(conf, idx); +} + +static void allow_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _allow_barrier(conf, idx); +} + +/* conf->resync_lock should be held */ +static int get_unqueued_pending(struct r1conf *conf) +{ + int idx, ret; + + for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + ret += conf->nr_pending[idx] - conf->nr_queued[idx]; + + return ret; +} + static void freeze_array(struct r1conf *conf, int extra) { - /* stop syncio and normal IO and wait for everything to + /* Stop sync I/O and normal I/O and wait for everything to * go quite. - * We wait until nr_pending match nr_queued+extra - * This is called in the context of one normal IO request - * that has failed. Thus any sync request that might be pending - * will be blocked by nr_pending, and we need to wait for - * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (extra) - * must match the number of pending IOs (nr_pending) before - * we continue. + * This is called in two situations: + * 1) management command handlers (reshape, remove disk, quiesce). + * 2) one normal I/O request failed. + + * After array_frozen is set to 1, new sync IO will be blocked at + * raise_barrier(), and new normal I/O will blocked at _wait_barrier() + * or wait_read_barrier(). The flying I/Os will either complete or be + * queued. When everything goes quite, there are only queued I/Os left. + + * Every flying I/O contributes to a conf->nr_pending[idx], idx is the + * barrier bucket index which this I/O request hits. When all sync and + * normal I/O are queued, sum of all conf->nr_pending[] will match sum + * of all conf->nr_queued[]. But normal I/O failure is an exception, + * in handle_read_error(), we may call freeze_array() before trying to + * fix the read error. In this case, the error read I/O is not queued, + * so get_unqueued_pending() == 1. + * + * Therefore before this function returns, we need to wait until + * get_unqueued_pendings(conf) gets equal to extra. For + * normal I/O context, extra is 1, in rested situations extra is 0. */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; raid1_log(conf->mddev, "wait freeze"); - wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); + wait_event_lock_irq_cmd( + conf->wait_barrier, + get_unqueued_pending(conf) == extra, + conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r1conf *conf) @@ -1070,11 +1088,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void raid1_read_request(struct mddev *mddev, struct bio *bio, - struct r1bio *r1_bio) +static inline struct r1bio * +alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + + return r1_bio; +} + +static void raid1_read_request(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; struct raid1_info *mirror; + struct r1bio *r1_bio; struct bio *read_bio; struct bitmap *bitmap = mddev->bitmap; const int op = bio_op(bio); @@ -1083,8 +1118,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, int max_sectors; int rdisk; - wait_barrier(conf, bio); + /* + * Still need barrier for READ in case that whole + * array is frozen. + */ + wait_read_barrier(conf, bio->bi_iter.bi_sector); + + r1_bio = alloc_r1bio(mddev, bio, 0); + /* + * We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); + + /* + * make_request() can abort the operation when read-ahead is being + * used and no empty request is available. + */ read_again: rdisk = read_balance(conf, r1_bio, &max_sectors); @@ -1106,7 +1162,6 @@ read_again: atomic_read(&bitmap->behind_writes) == 0); } r1_bio->read_disk = rdisk; - r1_bio->start_next_window = 0; read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, @@ -1151,22 +1206,16 @@ read_again: */ reschedule_retry(r1_bio); - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto read_again; } else generic_make_request(read_bio); } -static void raid1_write_request(struct mddev *mddev, struct bio *bio, - struct r1bio *r1_bio) +static void raid1_write_request(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; int i, disks; struct bitmap *bitmap = mddev->bitmap; unsigned long flags; @@ -1180,7 +1229,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int sectors_handled; int max_sectors; - sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1216,7 +1264,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } finish_wait(&conf->wait_barrier, &w); } - start_next_window = wait_barrier(conf, bio); + wait_barrier(conf, bio->bi_iter.bi_sector); + + r1_bio = alloc_r1bio(mddev, bio, 0); + + /* We might need to issue multiple writes to different + * devices if there are bad blocks around, so we keep + * track of the number of writes in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1237,7 +1297,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, disks = conf->raid_disks * 2; retry_write: - r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1304,25 +1363,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; - sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); + allow_barrier(conf, bio->bi_iter.bi_sector); raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); - start_next_window = wait_barrier(conf, bio); - /* - * We must make sure the multi r1bios of bio have - * the same value of bi_phys_segments - */ - if (bio->bi_phys_segments && old && - old != start_next_window) - /* Wait for the former r1bio(s) to complete */ - wait_event(conf->wait_barrier, - bio->bi_phys_segments == 1); + wait_barrier(conf, bio->bi_iter.bi_sector); goto retry_write; } @@ -1440,12 +1489,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, /* We need another r1_bio. It has already been counted * in bio->bi_phys_segments */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto retry_write; } @@ -1457,36 +1501,25 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, static void raid1_make_request(struct mddev *mddev, struct bio *bio) { - struct r1conf *conf = mddev->private; - struct r1bio *r1_bio; + struct bio *split; + sector_t sectors; - /* - * make_request() can abort the operation when read-ahead is being - * used and no empty request is available. - * - */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio); - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector; - - /* - * We might need to issue multiple reads to different devices if there - * are bad blocks around, so we keep track of the number of reads in - * bio->bi_phys_segments. If this is 0, there is only one r1_bio and - * no locking will be needed when requests complete. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); + /* if bio exceeds barrier unit boundary, split it */ + do { + sectors = align_to_barrier_unit_end( + bio->bi_iter.bi_sector, bio_sectors(bio)); + if (sectors < bio_sectors(bio)) { + split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } - if (bio_data_dir(bio) == READ) - raid1_read_request(mddev, bio, r1_bio); - else - raid1_write_request(mddev, bio, r1_bio); + if (bio_data_dir(split) == READ) + raid1_read_request(mddev, split); + else + raid1_write_request(mddev, split); + } while (split != bio); } static void raid1_status(struct seq_file *seq, struct mddev *mddev) @@ -1577,19 +1610,11 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_barrier(conf, NULL); - allow_barrier(conf, 0, 0); + wait_all_barriers(conf); + allow_all_barriers(conf); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; - - spin_lock_irq(&conf->resync_lock); - conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; - conf->start_next_window = MaxSector; - conf->current_window_requests += - conf->next_window_requests; - conf->next_window_requests = 0; - spin_unlock_irq(&conf->resync_lock); } static int raid1_spare_active(struct mddev *mddev) @@ -2337,8 +2362,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { - int m; + int m, idx; bool fail = false; + for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2364,7 +2390,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) if (fail) { spin_lock_irq(&conf->device_lock); list_add(&r1_bio->retry_list, &conf->bio_end_io_list); - conf->nr_queued++; + idx = sector_to_idx(r1_bio->sector); + conf->nr_queued[idx]++; spin_unlock_irq(&conf->device_lock); md_wakeup_thread(conf->mddev->thread); } else { @@ -2460,15 +2487,8 @@ read_more: generic_make_request(bio); bio = NULL; - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = mbio; - r1_bio->sectors = bio_sectors(mbio) - sectors_handled; - r1_bio->state = 0; + r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); set_bit(R1BIO_ReadError, &r1_bio->state); - r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_iter.bi_sector + - sectors_handled; goto read_more; } else { @@ -2487,6 +2507,7 @@ static void raid1d(struct md_thread *thread) struct r1conf *conf = mddev->private; struct list_head *head = &conf->retry_list; struct blk_plug plug; + int idx; md_check_recovery(mddev); @@ -2494,17 +2515,17 @@ static void raid1d(struct md_thread *thread) !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { LIST_HEAD(tmp); spin_lock_irqsave(&conf->device_lock, flags); - if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - while (!list_empty(&conf->bio_end_io_list)) { - list_move(conf->bio_end_io_list.prev, &tmp); - conf->nr_queued--; - } - } + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + list_splice_init(&conf->bio_end_io_list, &tmp); spin_unlock_irqrestore(&conf->device_lock, flags); while (!list_empty(&tmp)) { r1_bio = list_first_entry(&tmp, struct r1bio, retry_list); list_del(&r1_bio->retry_list); + idx = sector_to_idx(r1_bio->sector); + spin_lock_irqsave(&conf->device_lock, flags); + conf->nr_queued[idx]--; + spin_unlock_irqrestore(&conf->device_lock, flags); if (mddev->degraded) set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2525,7 +2546,8 @@ static void raid1d(struct md_thread *thread) } r1_bio = list_entry(head->prev, struct r1bio, retry_list); list_del(head->prev); - conf->nr_queued--; + idx = sector_to_idx(r1_bio->sector); + conf->nr_queued[idx]--; spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; @@ -2564,7 +2586,6 @@ static int init_resync(struct r1conf *conf) conf->poolinfo); if (!conf->r1buf_pool) return -ENOMEM; - conf->next_resync = 0; return 0; } @@ -2593,6 +2614,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int still_degraded = 0; int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ + int idx = sector_to_idx(sector_nr); if (!conf->r1buf_pool) if (init_resync(conf)) @@ -2642,7 +2664,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * If there is non-resync activity waiting for a turn, then let it * though before starting on this new sync request. */ - if (conf->nr_waiting) + if (conf->nr_waiting[idx]) schedule_timeout_uninterruptible(1); /* we are incrementing sector_nr below. To be safe, we check against @@ -2669,6 +2691,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, r1_bio->sector = sector_nr; r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); + /* make sure good_sectors won't go across barrier unit boundary */ + good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; @@ -2899,6 +2923,26 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; + conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, + sizeof(int), GFP_KERNEL); + if (!conf->nr_pending) + goto abort; + + conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, + sizeof(int), GFP_KERNEL); + if (!conf->nr_waiting) + goto abort; + + conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, + sizeof(int), GFP_KERNEL); + if (!conf->nr_queued) + goto abort; + + conf->barrier = kcalloc(BARRIER_BUCKETS_NR, + sizeof(int), GFP_KERNEL); + if (!conf->barrier) + goto abort; + conf->mirrors = kzalloc(sizeof(struct raid1_info) * mddev->raid_disks * 2, GFP_KERNEL); @@ -2954,9 +2998,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; - conf->start_next_window = MaxSector; - conf->current_window_requests = conf->next_window_requests = 0; - err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2999,6 +3040,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); kfree(conf); } return ERR_PTR(err); @@ -3100,6 +3145,10 @@ static void raid1_free(struct mddev *mddev, void *priv) kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); kfree(conf); } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c52ef424a24b..3442e8fe3fcd 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,6 +1,29 @@ #ifndef _RAID1_H #define _RAID1_H +/* + * each barrier unit size is 64MB fow now + * note: it must be larger than RESYNC_DEPTH + */ +#define BARRIER_UNIT_SECTOR_BITS 17 +#define BARRIER_UNIT_SECTOR_SIZE (1<<17) +/* + * In struct r1conf, the following members are related to I/O barrier + * buckets, + * int *nr_pending; + * int *nr_waiting; + * int *nr_queued; + * int *barrier; + * Each of them points to array of integers, each array is designed to + * have BARRIER_BUCKETS_NR elements and occupy a single memory page. The + * data width of integer variables is 4, equal to 1<<(ilog2(sizeof(int))), + * BARRIER_BUCKETS_NR_BITS is defined as (PAGE_SHIFT - ilog2(sizeof(int))) + * to make sure an array of integers with BARRIER_BUCKETS_NR elements just + * exactly occupies a single memory page. + */ +#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(int))) +#define BARRIER_BUCKETS_NR (1<> BARRIER_UNIT_SECTOR_BITS, + BARRIER_BUCKETS_NR_BITS); +} #endif -- cgit v1.2.3-59-g8ed1b From 824e47daddbfc6ebe1006b8659f080620472a136 Mon Sep 17 00:00:00 2001 From: "colyli@suse.de" Date: Sat, 18 Feb 2017 03:05:57 +0800 Subject: RAID1: avoid unnecessary spin locks in I/O barrier code When I run a parallel reading performan testing on a md raid1 device with two NVMe SSDs, I observe very bad throughput in supprise: by fio with 64KB block size, 40 seq read I/O jobs, 128 iodepth, overall throughput is only 2.7GB/s, this is around 50% of the idea performance number. The perf reports locking contention happens at allow_barrier() and wait_barrier() code, - 41.41% fio [kernel.kallsyms] [k] _raw_spin_lock_irqsave - _raw_spin_lock_irqsave + 89.92% allow_barrier + 9.34% __wake_up - 37.30% fio [kernel.kallsyms] [k] _raw_spin_lock_irq - _raw_spin_lock_irq - 100.00% wait_barrier The reason is, in these I/O barrier related functions, - raise_barrier() - lower_barrier() - wait_barrier() - allow_barrier() They always hold conf->resync_lock firstly, even there are only regular reading I/Os and no resync I/O at all. This is a huge performance penalty. The solution is a lockless-like algorithm in I/O barrier code, and only holding conf->resync_lock when it has to. The original idea is from Hannes Reinecke, and Neil Brown provides comments to improve it. I continue to work on it, and make the patch into current form. In the new simpler raid1 I/O barrier implementation, there are two wait barrier functions, - wait_barrier() Which calls _wait_barrier(), is used for regular write I/O. If there is resync I/O happening on the same I/O barrier bucket, or the whole array is frozen, task will wait until no barrier on same barrier bucket, or the whold array is unfreezed. - wait_read_barrier() Since regular read I/O won't interfere with resync I/O (read_balance() will make sure only uptodate data will be read out), it is unnecessary to wait for barrier in regular read I/Os, waiting in only necessary when the whole array is frozen. The operations on conf->nr_pending[idx], conf->nr_waiting[idx], conf-> barrier[idx] are very carefully designed in raise_barrier(), lower_barrier(), _wait_barrier() and wait_read_barrier(), in order to avoid unnecessary spin locks in these functions. Once conf-> nr_pengding[idx] is increased, a resync I/O with same barrier bucket index has to wait in raise_barrier(). Then in _wait_barrier() if no barrier raised in same barrier bucket index and array is not frozen, the regular I/O doesn't need to hold conf->resync_lock, it can just increase conf->nr_pending[idx], and return to its caller. wait_read_barrier() is very similar to _wait_barrier(), the only difference is it only waits when array is frozen. For heavy parallel reading I/Os, the lockless I/O barrier code almostly gets rid of all spin lock cost. This patch significantly improves raid1 reading peroformance. From my testing, a raid1 device built by two NVMe SSD, runs fio with 64KB blocksize, 40 seq read I/O jobs, 128 iodepth, overall throughput increases from 2.7GB/s to 4.6GB/s (+70%). Changelog V4: - Change conf->nr_queued[] to atomic_t. - Define BARRIER_BUCKETS_NR_BITS by (PAGE_SHIFT - ilog2(sizeof(atomic_t))) V3: - Add smp_mb__after_atomic() as Shaohua and Neil suggested. - Change conf->nr_queued[] from atomic_t to int. - Change conf->array_frozen from atomic_t back to int, and use READ_ONCE(conf->array_frozen) to check value of conf->array_frozen in _wait_barrier() and wait_read_barrier(). - In _wait_barrier() and wait_read_barrier(), add a call to wake_up(&conf->wait_barrier) after atomic_dec(&conf->nr_pending[idx]), to fix a deadlock between _wait_barrier()/wait_read_barrier and freeze_array(). V2: - Remove a spin_lock/unlock pair in raid1d(). - Add more code comments to explain why there is no racy when checking two atomic_t variables at same time. V1: - Original RFC patch for comments. Signed-off-by: Coly Li Cc: Shaohua Li Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Guoqing Jiang Reviewed-by: Neil Brown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 165 ++++++++++++++++++++++++++++++++++++----------------- drivers/md/raid1.h | 31 +++++----- 2 files changed, 130 insertions(+), 66 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 40297fd17f7e..fefbbfdb440b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -226,7 +226,7 @@ static void reschedule_retry(struct r1bio *r1_bio) idx = sector_to_idx(r1_bio->sector); spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); - conf->nr_queued[idx]++; + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); wake_up(&conf->wait_barrier); @@ -836,11 +836,21 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ - wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx], + wait_event_lock_irq(conf->wait_barrier, + !atomic_read(&conf->nr_waiting[idx]), conf->resync_lock); /* block any new IO from starting */ - conf->barrier[idx]++; + atomic_inc(&conf->barrier[idx]); + /* + * In raise_barrier() we firstly increase conf->barrier[idx] then + * check conf->nr_pending[idx]. In _wait_barrier() we firstly + * increase conf->nr_pending[idx] then check conf->barrier[idx]. + * A memory barrier here to make sure conf->nr_pending[idx] won't + * be fetched before conf->barrier[idx] is increased. Otherwise + * there will be a race between raise_barrier() and _wait_barrier(). + */ + smp_mb__after_atomic(); /* For these conditions we must wait: * A: while the array is in frozen state @@ -851,42 +861,81 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && - !conf->nr_pending[idx] && - conf->barrier[idx] < RESYNC_DEPTH, + !atomic_read(&conf->nr_pending[idx]) && + atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, conf->resync_lock); - conf->nr_pending[idx]++; + atomic_inc(&conf->nr_pending[idx]); spin_unlock_irq(&conf->resync_lock); } static void lower_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; int idx = sector_to_idx(sector_nr); - BUG_ON(conf->barrier[idx] <= 0); + BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier[idx]--; - conf->nr_pending[idx]--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + atomic_dec(&conf->barrier[idx]); + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } static void _wait_barrier(struct r1conf *conf, int idx) { - spin_lock_irq(&conf->resync_lock); - if (conf->array_frozen || conf->barrier[idx]) { - conf->nr_waiting[idx]++; - /* Wait for the barrier to drop. */ - wait_event_lock_irq( - conf->wait_barrier, - !conf->array_frozen && !conf->barrier[idx], - conf->resync_lock); - conf->nr_waiting[idx]--; - } + /* + * We need to increase conf->nr_pending[idx] very early here, + * then raise_barrier() can be blocked when it waits for + * conf->nr_pending[idx] to be 0. Then we can avoid holding + * conf->resync_lock when there is no barrier raised in same + * barrier unit bucket. Also if the array is frozen, I/O + * should be blocked until array is unfrozen. + */ + atomic_inc(&conf->nr_pending[idx]); + /* + * In _wait_barrier() we firstly increase conf->nr_pending[idx], then + * check conf->barrier[idx]. In raise_barrier() we firstly increase + * conf->barrier[idx], then check conf->nr_pending[idx]. A memory + * barrier is necessary here to make sure conf->barrier[idx] won't be + * fetched before conf->nr_pending[idx] is increased. Otherwise there + * will be a race between _wait_barrier() and raise_barrier(). + */ + smp_mb__after_atomic(); + + /* + * Don't worry about checking two atomic_t variables at same time + * here. If during we check conf->barrier[idx], the array is + * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is + * 0, it is safe to return and make the I/O continue. Because the + * array is frozen, all I/O returned here will eventually complete + * or be queued, no race will happen. See code comment in + * frozen_array(). + */ + if (!READ_ONCE(conf->array_frozen) && + !atomic_read(&conf->barrier[idx])) + return; - conf->nr_pending[idx]++; + /* + * After holding conf->resync_lock, conf->nr_pending[idx] + * should be decreased before waiting for barrier to drop. + * Otherwise, we may encounter a race condition because + * raise_barrer() might be waiting for conf->nr_pending[idx] + * to be 0 at same time. + */ + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for the barrier in same barrier unit bucket to drop. */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + !atomic_read(&conf->barrier[idx]), + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); } @@ -894,18 +943,32 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) { int idx = sector_to_idx(sector_nr); - spin_lock_irq(&conf->resync_lock); - if (conf->array_frozen) { - conf->nr_waiting[idx]++; - /* Wait for array to unfreeze */ - wait_event_lock_irq( - conf->wait_barrier, - !conf->array_frozen, - conf->resync_lock); - conf->nr_waiting[idx]--; - } + /* + * Very similar to _wait_barrier(). The difference is, for read + * I/O we don't need wait for sync I/O, but if the whole array + * is frozen, the read I/O still has to wait until the array is + * unfrozen. Since there is no ordering requirement with + * conf->barrier[idx] here, memory barrier is unnecessary as well. + */ + atomic_inc(&conf->nr_pending[idx]); - conf->nr_pending[idx]++; + if (!READ_ONCE(conf->array_frozen)) + return; + + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for array to be unfrozen */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); } @@ -926,11 +989,7 @@ static void wait_all_barriers(struct r1conf *conf) static void _allow_barrier(struct r1conf *conf, int idx) { - unsigned long flags; - - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending[idx]--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } @@ -955,7 +1014,8 @@ static int get_unqueued_pending(struct r1conf *conf) int idx, ret; for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - ret += conf->nr_pending[idx] - conf->nr_queued[idx]; + ret += atomic_read(&conf->nr_pending[idx]) - + atomic_read(&conf->nr_queued[idx]); return ret; } @@ -1000,8 +1060,8 @@ static void unfreeze_array(struct r1conf *conf) /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 0; - wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_barrier); } /* duplicate the data pages for behind I/O @@ -2391,8 +2451,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) spin_lock_irq(&conf->device_lock); list_add(&r1_bio->retry_list, &conf->bio_end_io_list); idx = sector_to_idx(r1_bio->sector); - conf->nr_queued[idx]++; + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * get_unqueued_pending() == extra to be true. + */ + wake_up(&conf->wait_barrier); md_wakeup_thread(conf->mddev->thread); } else { if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2523,9 +2588,7 @@ static void raid1d(struct md_thread *thread) retry_list); list_del(&r1_bio->retry_list); idx = sector_to_idx(r1_bio->sector); - spin_lock_irqsave(&conf->device_lock, flags); - conf->nr_queued[idx]--; - spin_unlock_irqrestore(&conf->device_lock, flags); + atomic_dec(&conf->nr_queued[idx]); if (mddev->degraded) set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2547,7 +2610,7 @@ static void raid1d(struct md_thread *thread) r1_bio = list_entry(head->prev, struct r1bio, retry_list); list_del(head->prev); idx = sector_to_idx(r1_bio->sector); - conf->nr_queued[idx]--; + atomic_dec(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; @@ -2664,7 +2727,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * If there is non-resync activity waiting for a turn, then let it * though before starting on this new sync request. */ - if (conf->nr_waiting[idx]) + if (atomic_read(&conf->nr_waiting[idx])) schedule_timeout_uninterruptible(1); /* we are incrementing sector_nr below. To be safe, we check against @@ -2924,22 +2987,22 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->nr_pending) goto abort; conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->nr_waiting) goto abort; conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->nr_queued) goto abort; conf->barrier = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->barrier) goto abort; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 3442e8fe3fcd..dd22a37d0d83 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -10,18 +10,19 @@ /* * In struct r1conf, the following members are related to I/O barrier * buckets, - * int *nr_pending; - * int *nr_waiting; - * int *nr_queued; - * int *barrier; - * Each of them points to array of integers, each array is designed to - * have BARRIER_BUCKETS_NR elements and occupy a single memory page. The - * data width of integer variables is 4, equal to 1<<(ilog2(sizeof(int))), - * BARRIER_BUCKETS_NR_BITS is defined as (PAGE_SHIFT - ilog2(sizeof(int))) - * to make sure an array of integers with BARRIER_BUCKETS_NR elements just - * exactly occupies a single memory page. + * atomic_t *nr_pending; + * atomic_t *nr_waiting; + * atomic_t *nr_queued; + * atomic_t *barrier; + * Each of them points to array of atomic_t variables, each array is + * designed to have BARRIER_BUCKETS_NR elements and occupy a single + * memory page. The data width of atomic_t variables is 4 bytes, equal + * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined + * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of + * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly + * occupies a single memory page. */ -#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(int))) +#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) #define BARRIER_BUCKETS_NR (1< Date: Sun, 19 Feb 2017 22:41:27 -0800 Subject: md/raid1: fix a use-after-free bug Commit fd76863 (RAID1: a new I/O barrier implementation to remove resync window) introduces a user-after-free bug. Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fefbbfdb440b..2e5e4805cbe1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -203,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio) static void put_buf(struct r1bio *r1_bio) { struct r1conf *conf = r1_bio->mddev->private; + sector_t sect = r1_bio->sector; int i; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -213,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio) mempool_free(r1_bio, conf->r1buf_pool); - lower_barrier(conf, r1_bio->sector); + lower_barrier(conf, sect); } static void reschedule_retry(struct r1bio *r1_bio) -- cgit v1.2.3-59-g8ed1b From d939cdfde34f50b95254b375f498447c82190b3e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 21 Feb 2017 11:57:01 -0800 Subject: md/linear: shutup lockdep warnning Commit 03a9e24(md linear: fix a race between linear_add() and linear_congested()) introduces the warnning. Acked-by: Coly Li Signed-off-by: Shaohua Li --- drivers/md/linear.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 789008bc94ff..5b06b0ddf7a3 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -224,7 +224,8 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) * oldconf until no one uses it anymore. */ mddev_suspend(mddev); - oldconf = rcu_dereference(mddev->private); + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; WARN_ONCE(mddev->raid_disks != newconf->raid_disks, "copied raid_disks doesn't match mddev->raid_disks"); -- cgit v1.2.3-59-g8ed1b From aff8da09f2381f0869faaf6637b0d892a3ee99ed Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 21 Feb 2017 10:56:19 -0800 Subject: md/raid1: handle flush request correctly I got a warning triggered in align_to_barrier_unit_end. It's a flush request so sectors == 0. The flush request happens to work well without the new barrier patch, but we'd better handle it explictly. Cc: NeilBrown Acked-by: Coly Li Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2e5e4805cbe1..8901f0c8c775 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1282,8 +1282,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) unsigned long flags; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - const unsigned long do_flush_fua = (bio->bi_opf & - (REQ_PREFLUSH | REQ_FUA)); + const unsigned long do_fua = (bio->bi_opf & REQ_FUA); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1509,7 +1508,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); + bio_set_op_attrs(mbio, op, do_fua | do_sync); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -1565,6 +1564,11 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) struct bio *split; sector_t sectors; + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + md_flush_request(mddev, bio); + return; + } + /* if bio exceeds barrier unit boundary, split it */ do { sectors = align_to_barrier_unit_end( -- cgit v1.2.3-59-g8ed1b From 1ec492232ed659acde8cc00b9ecc7529778e03e1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 21 Feb 2017 14:27:57 -0800 Subject: md/raid1: fix write behind issues introduced by bio_clone_bioset_partial There are two issues, introduced by commit 8e58e32(md/raid1: use bio_clone_bioset_partial() in case of write behind): - bio_clone_bioset_partial() uses bytes instead of sectors as parameters - in writebehind mode, we return bio if all !writemostly disk bios finish, which could happen before writemostly disk bios run. So all writemostly disk bios should have their bvec. Here we just make sure all bios are cloned instead of fast cloned. Reviewed-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8901f0c8c775..d4e8796b4569 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1472,8 +1472,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) !waitqueue_active(&bitmap->behind_wait)) { mbio = bio_clone_bioset_partial(bio, GFP_NOIO, mddev->bio_set, - offset, - max_sectors); + offset << 9, + max_sectors << 9); alloc_behind_pages(mbio, r1_bio); } @@ -1485,8 +1485,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) } if (!mbio) { - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, offset, max_sectors); + if (r1_bio->behind_bvecs) + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset << 9, + max_sectors << 9); + else { + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(mbio, offset, max_sectors); + } } if (r1_bio->behind_bvecs) { -- cgit v1.2.3-59-g8ed1b