From 24113d4878439baf1f23c1a33dfcc340fba66e97 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 6 Nov 2018 22:34:59 +0100 Subject: dm: avoid indirect call in __dm_make_request Indirect calls are inefficient because of retpolines that are used for spectre workaround. This patch replaces an indirect call with a condition (that can be predicted by the branch predictor). Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dddbca63e140..fa70c90e6757 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1696,10 +1696,7 @@ out: return ret; } -typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *); - -static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio, - process_bio_fn process_bio) +static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q->queuedata; blk_qc_t ret = BLK_QC_T_NONE; @@ -1719,26 +1716,15 @@ static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio, return ret; } - ret = process_bio(md, map, bio); + if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED) + ret = __process_bio(md, map, bio); + else + ret = __split_and_process_bio(md, map, bio); dm_put_live_table(md, srcu_idx); return ret; } -/* - * The request function that remaps the bio to one target and - * splits off any remainder. - */ -static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) -{ - return __dm_make_request(q, bio, __split_and_process_bio); -} - -static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio) -{ - return __dm_make_request(q, bio, __process_bio); -} - static int dm_any_congested(void *congested_data, int bdi_bits) { int r = bdi_bits; @@ -2229,12 +2215,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: - dm_init_normal_md_queue(md); - blk_queue_make_request(md->queue, dm_make_request); - break; case DM_TYPE_NVME_BIO_BASED: dm_init_normal_md_queue(md); - blk_queue_make_request(md->queue, dm_make_request_nvme); + blk_queue_make_request(md->queue, dm_make_request); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); -- cgit v1.2.3-59-g8ed1b From 2adc5c559a0770ef75d1647fbf557c7f56194ef8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 8 Nov 2018 14:59:41 -0500 Subject: dm rq: remove unused arguments from rq_completed() Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 4e06be4f0a62..202e9be5aea7 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -128,7 +128,7 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) * the md may be freed in dm_put() at the end of this function. * Or do dm_get() before calling this function and dm_put() later. */ -static void rq_completed(struct mapped_device *md, int rw, bool run_queue) +static void rq_completed(struct mapped_device *md) { /* nudge anyone waiting on suspend queue */ if (unlikely(waitqueue_active(&md->wait))) @@ -147,7 +147,6 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) */ static void dm_end_request(struct request *clone, blk_status_t error) { - int rw = rq_data_dir(clone); struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; struct request *rq = tio->orig; @@ -157,7 +156,7 @@ static void dm_end_request(struct request *clone, blk_status_t error) rq_end_stats(md, rq); blk_mq_end_request(rq, error); - rq_completed(md, rw, true); + rq_completed(md); } static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs) @@ -181,7 +180,6 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ { struct mapped_device *md = tio->md; struct request *rq = tio->orig; - int rw = rq_data_dir(rq); unsigned long delay_ms = delay_requeue ? 100 : 0; rq_end_stats(md, rq); @@ -191,7 +189,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ } dm_mq_delay_requeue_request(rq, delay_ms); - rq_completed(md, rw, false); + rq_completed(md); } static void dm_done(struct request *clone, blk_status_t error, bool mapped) @@ -246,15 +244,13 @@ static void dm_softirq_done(struct request *rq) bool mapped = true; struct dm_rq_target_io *tio = tio_from_request(rq); struct request *clone = tio->clone; - int rw; if (!clone) { struct mapped_device *md = tio->md; rq_end_stats(md, rq); - rw = rq_data_dir(rq); blk_mq_end_request(rq, tio->error); - rq_completed(md, rw, false); + rq_completed(md); return; } @@ -507,7 +503,7 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (map_request(tio) == DM_MAPIO_REQUEUE) { /* Undo dm_start_request() before requeuing */ rq_end_stats(md, rq); - rq_completed(md, rq_data_dir(rq), false); + rq_completed(md); return BLK_STS_RESOURCE; } -- cgit v1.2.3-59-g8ed1b From 935fcc56abc321ca828f2d1fbb732fd478fd3449 Mon Sep 17 00:00:00 2001 From: wuzhouhui Date: Thu, 22 Mar 2018 22:22:38 +0800 Subject: dm mpath: only flush workqueue when needed The workqueues are shared by many multipath devices, only flush whole workqueue when necessary. Otherwise, we just flush works as needed. Signed-off-by: wuzhouhui Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d6a66921daf4..2ee5e357a0a7 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1211,14 +1211,16 @@ static void flush_multipath_work(struct multipath *m) set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); smp_mb__after_atomic(); - flush_workqueue(kmpath_handlerd); + if (atomic_read(&m->pg_init_in_progress)) + flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); smp_mb__after_atomic(); } - flush_workqueue(kmultipathd); + if (m->queue_mode == DM_TYPE_BIO_BASED) + flush_work(&m->process_queued_bios); flush_work(&m->trigger_event); } -- cgit v1.2.3-59-g8ed1b From 53b471687012c80a8c03250cecf7121ffa8781f8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 3 Dec 2018 17:00:54 -0500 Subject: dm: remove indirect calls from __send_changing_extent_only() No need to be so fancy. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fa70c90e6757..57bac79848d3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1486,11 +1486,9 @@ static bool is_split_required_for_discard(struct dm_target *ti) } static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, - get_num_bios_fn get_num_bios, - is_split_required_fn is_split_required) + unsigned num_bios, bool is_split_required) { unsigned len; - unsigned num_bios; /* * Even though the device advertised support for this type of @@ -1498,11 +1496,10 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * * reconfiguration might also have changed that since the * check was performed. */ - num_bios = get_num_bios ? get_num_bios(ti) : 0; if (!num_bios) return -EOPNOTSUPP; - if (is_split_required && !is_split_required(ti)) + if (!is_split_required) len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); else len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); @@ -1517,23 +1514,23 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * static int __send_discard(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_discard_bios, - is_split_required_for_discard); + return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti), + is_split_required_for_discard(ti)); } static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL); + return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false); } static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL); + return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false); } static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL); + return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false); } static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, -- cgit v1.2.3-59-g8ed1b From e8ea141a0f5e1d7b23f47d10c480d4fc55caaf14 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Wed, 24 Oct 2018 16:04:36 +0800 Subject: dm writecache: fix typo in error msg for creating writecache_flush_thread The error msg should be "flush thread" instead of "endio thread" for writecache_flush_thread. Signed-off-by: Shenghui Wang Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 2d50eec94cd7..2b8cee35e4d5 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2061,7 +2061,7 @@ invalid_optional: if (IS_ERR(wc->flush_thread)) { r = PTR_ERR(wc->flush_thread); wc->flush_thread = NULL; - ti->error = "Couldn't spawn endio thread"; + ti->error = "Couldn't spawn flush thread"; goto bad; } wake_up_process(wc->flush_thread); -- cgit v1.2.3-59-g8ed1b From ef9923739e6d2233dc16aba70b7d81767d85cfcb Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Tue, 30 Oct 2018 15:35:54 +0800 Subject: dm bufio: update comment in dm-bufio.c * Hashtable has been replaced by rbtree to manage buffers. Update the comment. * Fix typo in the comment for dm_bufio_issue_flush Signed-off-by: Shenghui Wang Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index dc385b70e4c3..0e9fcceaefd2 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -65,7 +65,7 @@ /* * Linking of buffers: - * All buffers are linked to cache_hash with their hash_list field. + * All buffers are linked to buffer_tree with their node field. * * Clean buffers that are not being written (B_WRITING not set) * are linked to lru[LIST_CLEAN] with their lru_list field. @@ -457,7 +457,7 @@ static void free_buffer(struct dm_buffer *b) } /* - * Link buffer to the hash list and clean or dirty queue. + * Link buffer to the buffer tree and clean or dirty queue. */ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) { @@ -472,7 +472,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) } /* - * Unlink buffer from the hash list and dirty or clean queue. + * Unlink buffer from the buffer tree and dirty or clean queue. */ static void __unlink_buffer(struct dm_buffer *b) { @@ -993,7 +993,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, /* * We've had a period where the mutex was unlocked, so need to - * recheck the hash table. + * recheck the buffer tree. */ b = __find(c, block); if (b) { @@ -1327,7 +1327,7 @@ again: EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); /* - * Use dm-io to send and empty barrier flush the device. + * Use dm-io to send an empty barrier to flush the device. */ int dm_bufio_issue_flush(struct dm_bufio_client *c) { @@ -1356,7 +1356,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); * Then, we write the buffer to the original location if it was dirty. * * Then, if we are the only one who is holding the buffer, relink the buffer - * in the hash queue for the new location. + * in the buffer tree for the new location. * * If there was someone else holding the buffer, we write it to the new * location but not relink it, because that other user needs to have the buffer -- cgit v1.2.3-59-g8ed1b From 721b1d98fb517ae99ab3b757021cf81db41e67be Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Wed, 31 Oct 2018 17:53:08 -0400 Subject: dm snapshot: Fix excessive memory usage and workqueue stalls kcopyd has no upper limit to the number of jobs one can allocate and issue. Under certain workloads this can lead to excessive memory usage and workqueue stalls. For example, when creating multiple dm-snapshot targets with a 4K chunk size and then writing to the origin through the page cache. Syncing the page cache causes a large number of BIOs to be issued to the dm-snapshot origin target, which itself issues an even larger (because of the BIO splitting taking place) number of kcopyd jobs. Running the following test, from the device mapper test suite [1], dmtest run --suite snapshot -n many_snapshots_of_same_volume_N , with 8 active snapshots, results in the kcopyd job slab cache growing to 10G. Depending on the available system RAM this can lead to the OOM killer killing user processes: [463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=1, oom_score_adj=0 [463.492894] kthreadd cpuset=/ mems_allowed=0 [463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3 [463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [463.492952] Call Trace: [463.492964] dump_stack+0x7d/0xbb [463.492973] dump_header+0x6b/0x2fc [463.492987] ? lockdep_hardirqs_on+0xee/0x190 [463.493012] oom_kill_process+0x302/0x370 [463.493021] out_of_memory+0x113/0x560 [463.493030] __alloc_pages_slowpath+0xf40/0x1020 [463.493055] __alloc_pages_nodemask+0x348/0x3c0 [463.493067] cache_grow_begin+0x81/0x8b0 [463.493072] ? cache_grow_begin+0x874/0x8b0 [463.493078] fallback_alloc+0x1e4/0x280 [463.493092] kmem_cache_alloc_node+0xd6/0x370 [463.493098] ? copy_process.part.31+0x1c5/0x20d0 [463.493105] copy_process.part.31+0x1c5/0x20d0 [463.493115] ? __lock_acquire+0x3cc/0x1550 [463.493121] ? __switch_to_asm+0x34/0x70 [463.493129] ? kthread_create_worker_on_cpu+0x70/0x70 [463.493135] ? finish_task_switch+0x90/0x280 [463.493165] _do_fork+0xe0/0x6d0 [463.493191] ? kthreadd+0x19f/0x220 [463.493233] kernel_thread+0x25/0x30 [463.493235] kthreadd+0x1bf/0x220 [463.493242] ? kthread_create_on_cpu+0x90/0x90 [463.493248] ret_from_fork+0x3a/0x50 [463.493279] Mem-Info: [463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0 [463.493285] active_file:80216 inactive_file:80107 isolated_file:435 [463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0 [463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521 [463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0 [463.493285] free:33623 free_pcp:2392 free_cma:0 ... [463.493489] Unreclaimable slab info: [463.493513] Name Used Total [463.493522] bio-6 1028KB 1028KB [463.493525] bio-5 1028KB 1028KB [463.493528] dm_snap_pending_exception 236783KB 243789KB [463.493531] dm_exception 41KB 42KB [463.493534] bio-4 1216KB 1216KB [463.493537] bio-3 439396KB 439396KB [463.493539] kcopyd_job 6973427KB 6973427KB ... [463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child [463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB [463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB Moreover, issuing a large number of kcopyd jobs results in kcopyd hogging the CPU, while processing them. As a result, processing of work items, queued for execution on the same CPU as the currently running kcopyd thread, is stalled for long periods of time, hurting performance. Running the aforementioned test we get, in dmesg, messages like the following: [67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s! [67501.195586] Showing busy workqueues and worker pools: [67501.195591] workqueue events: flags=0x0 [67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195611] pending: cache_reap [67501.195641] workqueue mm_percpu_wq: flags=0x8 [67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195656] pending: vmstat_update [67501.195682] workqueue kblockd: flags=0x18 [67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256 [67501.195698] pending: blk_timeout_work [67501.195753] workqueue kcopyd: flags=0x8 [67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195768] pending: do_work [dm_mod] [67501.195802] workqueue kcopyd: flags=0x8 [67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195817] pending: do_work [dm_mod] [67501.195834] workqueue kcopyd: flags=0x8 [67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195848] pending: do_work [dm_mod] [67501.195881] workqueue kcopyd: flags=0x8 [67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195896] pending: do_work [dm_mod] [67501.195920] workqueue kcopyd: flags=0x8 [67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256 [67501.195935] in-flight: 67:do_work [dm_mod] [67501.195945] pending: do_work [dm_mod] [67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765 The root cause for these issues is the way dm-snapshot uses kcopyd. In particular, the lack of an explicit or implicit limit to the maximum number of in-flight COW jobs. The merging path is not affected because it implicitly limits the in-flight kcopyd jobs to one. Fix these issues by using a semaphore to limit the maximum number of in-flight kcopyd jobs. We grab the semaphore before allocating a new kcopyd job in start_copy() and start_full_bio() and release it after the job finishes in copy_callback(). The initial semaphore value is configurable through a module parameter, to allow fine tuning the maximum number of in-flight COW jobs. Setting this parameter to zero initializes the semaphore to INT_MAX. A default value of 2048 maximum in-flight kcopyd jobs was chosen. This value was decided experimentally as a trade-off between memory consumption, stalling the kernel's workqueues and maintaining a high enough throughput. Re-running the aforementioned test: * Workqueue stalls are eliminated * kcopyd's job slab cache uses a maximum of 130MB * The time taken by the test to write to the snapshot-origin target is reduced from 05m20.48s to 03m26.38s [1] https://github.com/jthornber/device-mapper-test-suite Signed-off-by: Nikos Tsironis Signed-off-by: Ilias Tsitsimpis Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ae4b33d10924..36805b12661e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "dm.h" @@ -105,6 +106,9 @@ struct dm_snapshot { /* The on disk metadata handler */ struct dm_exception_store *store; + /* Maximum number of in-flight COW jobs. */ + struct semaphore cow_count; + struct dm_kcopyd_client *kcopyd_client; /* Wait for events based on state_bits */ @@ -145,6 +149,19 @@ struct dm_snapshot { #define RUNNING_MERGE 0 #define SHUTDOWN_MERGE 1 +/* + * Maximum number of chunks being copied on write. + * + * The value was decided experimentally as a trade-off between memory + * consumption, stalling the kernel's workqueues and maintaining a high enough + * throughput. + */ +#define DEFAULT_COW_THRESHOLD 2048 + +static int cow_threshold = DEFAULT_COW_THRESHOLD; +module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); +MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); + DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, "A percentage of time allocated for copy on write"); @@ -1190,6 +1207,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_hash_tables; } + sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX); + s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(s->kcopyd_client)) { r = PTR_ERR(s->kcopyd_client); @@ -1575,6 +1594,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) rb_link_node(&pe->out_of_order_node, parent, p); rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } + up(&s->cow_count); } /* @@ -1598,6 +1618,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ + down(&s->cow_count); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } @@ -1617,6 +1638,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; + down(&s->cow_count); callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); -- cgit v1.2.3-59-g8ed1b From d7e6b8dfc7bcb3f4f3a18313581f67486a725b52 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Wed, 31 Oct 2018 17:53:09 -0400 Subject: dm kcopyd: Fix bug causing workqueue stalls When using kcopyd to run callbacks through dm_kcopyd_do_callback() or submitting copy jobs with a source size of 0, the jobs are pushed directly to the complete_jobs list, which could be under processing by the kcopyd thread. As a result, the kcopyd thread can continue running completed jobs indefinitely, without releasing the CPU, as long as someone keeps submitting new completed jobs through the aforementioned paths. Processing of work items, queued for execution on the same CPU as the currently running kcopyd thread, is thus stalled for excessive amounts of time, hurting performance. Running the following test, from the device mapper test suite [1], dmtest run --suite snapshot -n parallel_io_to_many_snaps_N , with 8 active snapshots, we get, in dmesg, messages like the following: [68899.948523] BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 95s! [68899.949282] Showing busy workqueues and worker pools: [68899.949288] workqueue events: flags=0x0 [68899.949295] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256 [68899.949306] pending: vmstat_shepherd, cache_reap [68899.949331] workqueue mm_percpu_wq: flags=0x8 [68899.949337] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949345] pending: vmstat_update [68899.949387] workqueue dm_bufio_cache: flags=0x8 [68899.949392] pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=1/256 [68899.949400] pending: work_fn [dm_bufio] [68899.949423] workqueue kcopyd: flags=0x8 [68899.949429] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949437] pending: do_work [dm_mod] [68899.949452] workqueue kcopyd: flags=0x8 [68899.949458] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256 [68899.949466] in-flight: 13:do_work [dm_mod] [68899.949474] pending: do_work [dm_mod] [68899.949487] workqueue kcopyd: flags=0x8 [68899.949493] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949501] pending: do_work [dm_mod] [68899.949515] workqueue kcopyd: flags=0x8 [68899.949521] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949529] pending: do_work [dm_mod] [68899.949541] workqueue kcopyd: flags=0x8 [68899.949547] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949555] pending: do_work [dm_mod] [68899.949568] pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=95s workers=4 idle: 27130 27223 1084 Fix this by splitting the complete_jobs list into two parts: A user facing part, named callback_jobs, and one used internally by kcopyd, retaining the name complete_jobs. dm_kcopyd_do_callback() and dispatch_job() now push their jobs to the callback_jobs list, which is spliced to the complete_jobs list once, every time the kcopyd thread wakes up. This prevents kcopyd from hogging the CPU indefinitely and causing workqueue stalls. Re-running the aforementioned test: * Workqueue stalls are eliminated * The maximum writing time among all targets is reduced from 09m37.10s to 06m04.85s and the total run time of the test is reduced from 10m43.591s to 7m19.199s [1] https://github.com/jthornber/device-mapper-test-suite Signed-off-by: Nikos Tsironis Signed-off-by: Ilias Tsitsimpis Signed-off-by: Mike Snitzer --- drivers/md/dm-kcopyd.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 2fc4213e02b5..671c24332802 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -56,15 +56,17 @@ struct dm_kcopyd_client { atomic_t nr_jobs; /* - * We maintain three lists of jobs: + * We maintain four lists of jobs: * * i) jobs waiting for pages * ii) jobs that have pages, and are waiting for the io to be issued. - * iii) jobs that have completed. + * iii) jobs that don't need to do any IO and just run a callback + * iv) jobs that have completed. * - * All three of these are protected by job_lock. + * All four of these are protected by job_lock. */ spinlock_t job_lock; + struct list_head callback_jobs; struct list_head complete_jobs; struct list_head io_jobs; struct list_head pages_jobs; @@ -625,6 +627,7 @@ static void do_work(struct work_struct *work) struct dm_kcopyd_client *kc = container_of(work, struct dm_kcopyd_client, kcopyd_work); struct blk_plug plug; + unsigned long flags; /* * The order that these are called is *very* important. @@ -633,6 +636,10 @@ static void do_work(struct work_struct *work) * list. io jobs call wake when they complete and it all * starts again. */ + spin_lock_irqsave(&kc->job_lock, flags); + list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs); + spin_unlock_irqrestore(&kc->job_lock, flags); + blk_start_plug(&plug); process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); @@ -650,7 +657,7 @@ static void dispatch_job(struct kcopyd_job *job) struct dm_kcopyd_client *kc = job->kc; atomic_inc(&kc->nr_jobs); if (unlikely(!job->source.count)) - push(&kc->complete_jobs, job); + push(&kc->callback_jobs, job); else if (job->pages == &zero_page_list) push(&kc->io_jobs, job); else @@ -858,7 +865,7 @@ void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) job->read_err = read_err; job->write_err = write_err; - push(&kc->complete_jobs, job); + push(&kc->callback_jobs, job); wake(kc); } EXPORT_SYMBOL(dm_kcopyd_do_callback); @@ -888,6 +895,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro return ERR_PTR(-ENOMEM); spin_lock_init(&kc->job_lock); + INIT_LIST_HEAD(&kc->callback_jobs); INIT_LIST_HEAD(&kc->complete_jobs); INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); @@ -939,6 +947,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) /* Wait for completion of all jobs submitted by this client. */ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); + BUG_ON(!list_empty(&kc->callback_jobs)); BUG_ON(!list_empty(&kc->complete_jobs)); BUG_ON(!list_empty(&kc->io_jobs)); BUG_ON(!list_empty(&kc->pages_jobs)); -- cgit v1.2.3-59-g8ed1b From 8d683dcd65c037efc9fb38c696ec9b65b306e573 Mon Sep 17 00:00:00 2001 From: AliOS system security Date: Mon, 5 Nov 2018 15:31:42 +0800 Subject: dm crypt: use u64 instead of sector_t to store iv_offset The iv_offset in the mapping table of crypt target is a 64bit number when IV algorithm is plain64, plain64be, essiv or benbi. It will be assigned to iv_offset of struct crypt_config, cc_sector of struct convert_context and iv_sector of struct dm_crypt_request. These structures members are defined as a sector_t. But sector_t is 32bit when CONFIG_LBDAF is not set in 32bit kernel. In this situation sector_t is not big enough to store the 64bit iv_offset. Here is a reproducer. Prepare test image and device (loop is automatically allocated by cryptsetup): # dd if=/dev/zero of=tst.img bs=1M count=1 # echo "tst"|cryptsetup open --type plain -c aes-xts-plain64 \ --skip 500000000000000000 tst.img test On 32bit system (use IV offset value that overflows to 64bit; CONFIG_LBDAF if off) and device checksum is wrong: # dmsetup table test --showkeys 0 2048 crypt aes-xts-plain64 dfa7cfe3c481f2239155739c42e539ae8f2d38f304dcc89d20b26f69daaf0933 3551657984 7:0 0 # sha256sum /dev/mapper/test 533e25c09176632b3794f35303488c4a8f3f965dffffa6ec2df347c168cb6c19 /dev/mapper/test On 64bit system (and on 32bit system with the patch), table and checksum is now correct: # dmsetup table test --showkeys 0 2048 crypt aes-xts-plain64 dfa7cfe3c481f2239155739c42e539ae8f2d38f304dcc89d20b26f69daaf0933 500000000000000000 7:0 0 # sha256sum /dev/mapper/test 5d16160f9d5f8c33d8051e65fdb4f003cc31cd652b5abb08f03aa6fce0df75fc /dev/mapper/test Signed-off-by: AliOS system security Tested-and-Reviewed-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b8eec515a003..9a4dec0a0f71 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -49,7 +49,7 @@ struct convert_context { struct bio *bio_out; struct bvec_iter iter_in; struct bvec_iter iter_out; - sector_t cc_sector; + u64 cc_sector; atomic_t cc_pending; union { struct skcipher_request *req; @@ -81,7 +81,7 @@ struct dm_crypt_request { struct convert_context *ctx; struct scatterlist sg_in[4]; struct scatterlist sg_out[4]; - sector_t iv_sector; + u64 iv_sector; }; struct crypt_config; @@ -160,7 +160,7 @@ struct crypt_config { struct iv_lmk_private lmk; struct iv_tcw_private tcw; } iv_gen_private; - sector_t iv_offset; + u64 iv_offset; unsigned int iv_size; unsigned short int sector_size; unsigned char sector_shift; -- cgit v1.2.3-59-g8ed1b From ef87bfc24f9b8da82c89aff493df20f078bc9cb1 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 7 Nov 2018 22:24:55 +0100 Subject: dm: Check for device sector overflow if CONFIG_LBDAF is not set Reference to a device in device-mapper table contains offset in sectors. If the sector_t is 32bit integer (CONFIG_LBDAF is not set), then several device-mapper targets can overflow this offset and validity check is then performed on a wrong offset and a wrong table is activated. See for example (on 32bit without CONFIG_LBDAF) this overflow: # dmsetup create test --table "0 2048 linear /dev/sdg 4294967297" # dmsetup table test 0 2048 linear 8:96 1 This patch adds explicit check for overflow if the offset is sector_t type. Signed-off-by: Milan Broz Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-delay.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/md/dm-linear.c | 2 +- drivers/md/dm-raid1.c | 3 ++- drivers/md/dm-unstripe.c | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9a4dec0a0f71..fc7d8b8a654f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2781,7 +2781,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -EINVAL; - if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { ti->error = "Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2fb7bb4304ad..fddffe251bf6 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -141,7 +141,7 @@ static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **a unsigned long long tmpll; char dummy; - if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { ti->error = "Invalid device sector"; return -EINVAL; } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 3cb97fa4c11d..8261aa8c7fe1 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -213,7 +213,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) devname = dm_shift_arg(&as); r = -EINVAL; - if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { + if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { ti->error = "Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 8d7ddee6ac4d..ad980a38fb1e 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -45,7 +45,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -EINVAL; - if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) { ti->error = "Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 79eab1071ec2..5a51151f680d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -943,7 +943,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, char dummy; int ret; - if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { + if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1 || + offset != (sector_t)offset) { ti->error = "Invalid offset"; return -EINVAL; } diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 954b7ab4e684..e673dacf6418 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -78,7 +78,7 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err; } - if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1) { + if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { ti->error = "Invalid striped device offset"; goto err; } -- cgit v1.2.3-59-g8ed1b From a00f5276e26636cbf72f24f79831026d2e2868e7 Mon Sep 17 00:00:00 2001 From: Sweet Tea Date: Tue, 13 Nov 2018 08:04:24 -0500 Subject: dm flakey: Properly corrupt multi-page bios. The flakey target is documented to be able to corrupt the Nth byte in a bio, but does not corrupt byte indices after the first biovec in the bio. Change the corrupting function to actually corrupt the Nth byte no matter in which biovec that index falls. A test device generating two-page bios, atop a flakey device configured to corrupt a byte index on the second page, verified both the failure to corrupt before this patch and the expected corruption after this change. Signed-off-by: John Dorminy Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 8261aa8c7fe1..a9bc518156f2 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -287,20 +287,31 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) { - unsigned bio_bytes = bio_cur_bytes(bio); - char *data = bio_data(bio); + unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; + + struct bvec_iter iter; + struct bio_vec bvec; + + if (!bio_has_data(bio)) + return; /* - * Overwrite the Nth byte of the data returned. + * Overwrite the Nth byte of the bio's data, on whichever page + * it falls. */ - if (data && bio_bytes >= fc->corrupt_bio_byte) { - data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; - - DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " - "(rw=%c bi_opf=%u bi_sector=%llu cur_bytes=%u)\n", - bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, - (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, bio_bytes); + bio_for_each_segment(bvec, bio, iter) { + if (bio_iter_len(bio, iter) > corrupt_bio_byte) { + char *segment = (page_address(bio_iter_page(bio, iter)) + + bio_iter_offset(bio, iter)); + segment[corrupt_bio_byte] = fc->corrupt_bio_value; + DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " + "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", + bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, + (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + break; + } + corrupt_bio_byte -= bio_iter_len(bio, iter); } } -- cgit v1.2.3-59-g8ed1b From e8c2566f83527f6bcced86739adc5cd5262e2dd2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Nov 2018 15:15:31 +0000 Subject: dm integrity: fix spelling mistake in workqueue name Rename the workqueue from dm-intergrity-recalc to dm-integrity-recalc. Signed-off-by: Colin Ian King Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index bb3096bf2cc6..9da55b742329 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3460,7 +3460,7 @@ try_smaller_buffer: ti->error = "Recalculate is only valid with internal hash"; goto bad; } - ic->recalc_wq = alloc_workqueue("dm-intergrity-recalc", WQ_MEM_RECLAIM, 1); + ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); if (!ic->recalc_wq ) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; -- cgit v1.2.3-59-g8ed1b From af331ebae7e10699a023fcce4f0671a01419ee3c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 5 Dec 2018 20:53:00 -0800 Subject: dm crypt: log the encryption algorithm implementation Log the encryption algorithm's driver name when a dm-crypt target is created. This will help people determine whether the expected implementation is being used. In some cases we've seen people do benchmarks and reject using encryption for performance reasons, when in fact they used a much slower implementation than was possible on the hardware. It can make an enormous difference; e.g., AES-XTS on ARM can be over 10x faster with the crypto extensions than without. It can also be useful to know if an implementation using an external crypto accelerator is being used instead of a software implementation. Example message: [ 29.307629] device-mapper: crypt: xts(aes) using implementation "xts-aes-ce" We've already found the similar message in fs/crypto/keyinfo.c to be very useful. Signed-off-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index fc7d8b8a654f..509fb20f7f86 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1885,6 +1885,13 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) } } + /* + * dm-crypt performance can vary greatly depending on which crypto + * algorithm implementation is used. Help people debug performance + * problems by logging the ->cra_driver_name. + */ + DMINFO("%s using implementation \"%s\"", ciphermode, + crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name); return 0; } @@ -1903,6 +1910,8 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) return err; } + DMINFO("%s using implementation \"%s\"", ciphermode, + crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name); return 0; } -- cgit v1.2.3-59-g8ed1b From bbf6a566920e1b0244b27b5f9882fb7f8bc83a31 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 5 Dec 2018 20:54:13 -0800 Subject: dm verity: log the hash algorithm implementation Log the hash algorithm's driver name when a dm-verity target is created. This will help people determine whether the expected implementation is being used. It can make an enormous difference; e.g., SHA-256 on ARM can be 8x faster with the crypto extensions than without. It can also be useful to know if an implementation using an external crypto accelerator is being used instead of a software implementation. Example message: [ 35.281945] device-mapper: verity: sha256 using implementation "sha256-ce" We've already found the similar message in fs/crypto/keyinfo.c to be very useful. Signed-off-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index fc65f0dedf7f..f4c31ffaa88e 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1040,6 +1040,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) v->tfm = NULL; goto bad; } + + /* + * dm-verity performance can vary greatly depending on which hash + * algorithm implementation is used. Help people debug performance + * problems by logging the ->cra_driver_name. + */ + DMINFO("%s using implementation \"%s\"", v->alg_name, + crypto_hash_alg_common(v->tfm)->base.cra_driver_name); + v->digest_size = crypto_ahash_digestsize(v->tfm); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; -- cgit v1.2.3-59-g8ed1b From 34743bfddef2c50af20234ae873324ca49320a55 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 10 Dec 2018 11:55:56 -0500 Subject: dm rq: cleanup leftover code from recently removed q->mq_ops branching When commit 6a23e05c2fe3c6 ("dm: remove legacy request-based IO path") removed some q->mq_ops branching from map_request() it left in place a goto that was only needed if that branching (and conditional 'r' assignment) existed. Now that the branching is gone map_request()'s goto can be removed too. Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 202e9be5aea7..4eb5f8c56535 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -372,7 +372,6 @@ static int map_request(struct dm_rq_target_io *tio) blk_status_t ret; r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); -check_again: switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -392,8 +391,7 @@ check_again: blk_rq_unprep_clone(clone); tio->ti->type->release_clone_rq(clone); tio->clone = NULL; - r = DM_MAPIO_REQUEUE; - goto check_again; + return DM_MAPIO_REQUEUE; } break; case DM_MAPIO_REQUEUE: -- cgit v1.2.3-59-g8ed1b From 74694bcbdf7e28a5ad548cdda9ac56d30be00d13 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 18 Dec 2018 17:35:41 +0100 Subject: dm raid: fix false -EBUSY when handling check/repair message Sending a check/repair message infrequently leads to -EBUSY instead of properly identifying an active resync. This occurs because raid_message() is testing recovery bits in a racy way. Fix by calling decipher_sync_action() from raid_message() to properly identify the idle state of the RAID device. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e1dd1622a290..adcfe8ae10aa 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3690,8 +3690,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) return -EBUSY; else if (!strcasecmp(argv[0], "resync")) ; /* MD_RECOVERY_NEEDED set below */ -- cgit v1.2.3-59-g8ed1b From c6d6e9b0f6b4201c77f2cea3964dd122697e3543 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Dec 2018 09:25:37 -0800 Subject: dm: do not allow readahead to limit IO size Update DM to set the bdi's io_pages. This fixes reads to be capped at the device's max request size (even if user's read IO exceeds the established readahead setting). Fixes: 9491ae4a ("mm: don't cap request size based on read-ahead setting") Cc: stable@vger.kernel.org Reviewed-by: Jens Axboe Signed-off-by: Jaegeuk Kim Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 844f7d0f2ef8..4b1be754cc41 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1927,6 +1927,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ if (blk_queue_is_zoned(q)) blk_revalidate_disk_zones(t->md->disk); + + /* Allow reads to exceed readahead limits */ + q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9); } unsigned int dm_table_get_num_targets(struct dm_table *t) -- cgit v1.2.3-59-g8ed1b