aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/dm.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-01-31 11:05:47 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2018-01-31 11:05:47 -0800
commit0be600a5add76e8e8b9e1119f2a7426ff849aca8 (patch)
treed5fcc2b119f03143f9bed1b9aa5cb85458c8bd03 /drivers/md/dm.c
parentMerge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md (diff)
parentdm cache: Documentation: update default migration_throttling value (diff)
downloadlinux-dev-0be600a5add76e8e8b9e1119f2a7426ff849aca8.tar.xz
linux-dev-0be600a5add76e8e8b9e1119f2a7426ff849aca8.zip
Merge tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - DM core fixes to ensure that bio submission follows a depth-first tree walk; this is critical to allow forward progress without the need to use the bioset's BIOSET_NEED_RESCUER. - Remove DM core's BIOSET_NEED_RESCUER based dm_offload infrastructure. - DM core cleanups and improvements to make bio-based DM more efficient (e.g. reduced memory footprint as well leveraging per-bio-data more). - Introduce new bio-based mode (DM_TYPE_NVME_BIO_BASED) that leverages the more direct IO submission path in the block layer; this mode is used by DM multipath and also optimizes targets like DM thin-pool that stack directly on NVMe data device. - DM multipath improvements to factor out legacy SCSI-only (e.g. scsi_dh) code paths to allow for more optimized support for NVMe multipath. - A fix for DM multipath path selectors (service-time and queue-length) to select paths in a more balanced way; largely academic but doesn't hurt. - Numerous DM raid target fixes and improvements. - Add a new DM "unstriped" target that enables Intel to workaround firmware limitations in some NVMe drives that are striped internally (this target also works when stacked above the DM "striped" target). - Various Documentation fixes and improvements. - Misc cleanups and fixes across various DM infrastructure and targets (e.g. bufio, flakey, log-writes, snapshot). * tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (69 commits) dm cache: Documentation: update default migration_throttling value dm mpath selector: more evenly distribute ties dm unstripe: fix target length versus number of stripes size check dm thin: fix trailing semicolon in __remap_and_issue_shared_cell dm table: fix NVMe bio-based dm_table_determine_type() validation dm: various cleanups to md->queue initialization code dm mpath: delay the retry of a request if the target responded as busy dm mpath: return DM_MAPIO_DELAY_REQUEUE if QUEUE_IO or PG_INIT_REQUIRED dm mpath: return DM_MAPIO_REQUEUE on blk-mq rq allocation failure dm log writes: fix max length used for kstrndup dm: backfill missing calls to mutex_destroy() dm snapshot: use mutex instead of rw_semaphore dm flakey: check for null arg_name in parse_features() dm thin: extend thinpool status format string with omitted fields dm thin: fixes in thin-provisioning.txt dm thin: document representation of <highest mapped sector> when there is none dm thin: fix documentation relative to low water mark threshold dm cache: be consistent in specifying sectors and SI units in cache.txt dm cache: delete obsoleted paragraph in cache.txt dm cache: fix grammar in cache-policies.txt ...
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--drivers/md/dm.c659
1 files changed, 390 insertions, 269 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8c26bfc35335..d6de00f367ef 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -60,18 +60,73 @@ void dm_issue_global_event(void)
}
/*
- * One of these is allocated per bio.
+ * One of these is allocated (on-stack) per original bio.
*/
+struct clone_info {
+ struct dm_table *map;
+ struct bio *bio;
+ struct dm_io *io;
+ sector_t sector;
+ unsigned sector_count;
+};
+
+/*
+ * One of these is allocated per clone bio.
+ */
+#define DM_TIO_MAGIC 7282014
+struct dm_target_io {
+ unsigned magic;
+ struct dm_io *io;
+ struct dm_target *ti;
+ unsigned target_bio_nr;
+ unsigned *len_ptr;
+ bool inside_dm_io;
+ struct bio clone;
+};
+
+/*
+ * One of these is allocated per original bio.
+ * It contains the first clone used for that original.
+ */
+#define DM_IO_MAGIC 5191977
struct dm_io {
+ unsigned magic;
struct mapped_device *md;
blk_status_t status;
atomic_t io_count;
- struct bio *bio;
+ struct bio *orig_bio;
unsigned long start_time;
spinlock_t endio_lock;
struct dm_stats_aux stats_aux;
+ /* last member of dm_target_io is 'struct bio' */
+ struct dm_target_io tio;
};
+void *dm_per_bio_data(struct bio *bio, size_t data_size)
+{
+ struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ if (!tio->inside_dm_io)
+ return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
+ return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
+}
+EXPORT_SYMBOL_GPL(dm_per_bio_data);
+
+struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
+{
+ struct dm_io *io = (struct dm_io *)((char *)data + data_size);
+ if (io->magic == DM_IO_MAGIC)
+ return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
+ BUG_ON(io->magic != DM_TIO_MAGIC);
+ return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
+}
+EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
+
+unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
+{
+ return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
+}
+EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
+
#define MINOR_ALLOCED ((void *)-1)
/*
@@ -93,8 +148,8 @@ static int dm_numa_node = DM_NUMA_NODE;
* For mempools pre-allocation at the table loading time.
*/
struct dm_md_mempools {
- mempool_t *io_pool;
struct bio_set *bs;
+ struct bio_set *io_bs;
};
struct table_device {
@@ -103,7 +158,6 @@ struct table_device {
struct dm_dev dm_dev;
};
-static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_cache;
@@ -170,14 +224,9 @@ static int __init local_init(void)
{
int r = -ENOMEM;
- /* allocate a slab for the dm_ios */
- _io_cache = KMEM_CACHE(dm_io, 0);
- if (!_io_cache)
- return r;
-
_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
if (!_rq_tio_cache)
- goto out_free_io_cache;
+ return r;
_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
__alignof__(struct request), 0, NULL);
@@ -212,8 +261,6 @@ out_free_rq_cache:
kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
kmem_cache_destroy(_rq_tio_cache);
-out_free_io_cache:
- kmem_cache_destroy(_io_cache);
return r;
}
@@ -225,7 +272,6 @@ static void local_exit(void)
kmem_cache_destroy(_rq_cache);
kmem_cache_destroy(_rq_tio_cache);
- kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
dm_uevent_exit();
@@ -486,18 +532,69 @@ out:
return r;
}
-static struct dm_io *alloc_io(struct mapped_device *md)
+static void start_io_acct(struct dm_io *io);
+
+static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
- return mempool_alloc(md->io_pool, GFP_NOIO);
+ struct dm_io *io;
+ struct dm_target_io *tio;
+ struct bio *clone;
+
+ clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs);
+ if (!clone)
+ return NULL;
+
+ tio = container_of(clone, struct dm_target_io, clone);
+ tio->inside_dm_io = true;
+ tio->io = NULL;
+
+ io = container_of(tio, struct dm_io, tio);
+ io->magic = DM_IO_MAGIC;
+ io->status = 0;
+ atomic_set(&io->io_count, 1);
+ io->orig_bio = bio;
+ io->md = md;
+ spin_lock_init(&io->endio_lock);
+
+ start_io_acct(io);
+
+ return io;
}
static void free_io(struct mapped_device *md, struct dm_io *io)
{
- mempool_free(io, md->io_pool);
+ bio_put(&io->tio.clone);
+}
+
+static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
+ unsigned target_bio_nr, gfp_t gfp_mask)
+{
+ struct dm_target_io *tio;
+
+ if (!ci->io->tio.io) {
+ /* the dm_target_io embedded in ci->io is available */
+ tio = &ci->io->tio;
+ } else {
+ struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs);
+ if (!clone)
+ return NULL;
+
+ tio = container_of(clone, struct dm_target_io, clone);
+ tio->inside_dm_io = false;
+ }
+
+ tio->magic = DM_TIO_MAGIC;
+ tio->io = ci->io;
+ tio->ti = ti;
+ tio->target_bio_nr = target_bio_nr;
+
+ return tio;
}
static void free_tio(struct dm_target_io *tio)
{
+ if (tio->inside_dm_io)
+ return;
bio_put(&tio->clone);
}
@@ -510,17 +607,15 @@ int md_in_flight(struct mapped_device *md)
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
- struct bio *bio = io->bio;
- int cpu;
+ struct bio *bio = io->orig_bio;
int rw = bio_data_dir(bio);
io->start_time = jiffies;
- cpu = part_stat_lock();
- part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
- part_stat_unlock();
+ generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);
+
atomic_set(&dm_disk(md)->part0.in_flight[rw],
- atomic_inc_return(&md->pending[rw]));
+ atomic_inc_return(&md->pending[rw]));
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@ -531,7 +626,7 @@ static void start_io_acct(struct dm_io *io)
static void end_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
- struct bio *bio = io->bio;
+ struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
int pending;
int rw = bio_data_dir(bio);
@@ -752,15 +847,6 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
return 0;
}
-/*-----------------------------------------------------------------
- * CRUD START:
- * A more elegant soln is in the works that uses the queue
- * merge fn, unfortunately there are a couple of changes to
- * the block layer that I want to make for this. So in the
- * interests of getting something for people to use I give
- * you this clearly demarcated crap.
- *---------------------------------------------------------------*/
-
static int __noflush_suspending(struct mapped_device *md)
{
return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
@@ -780,8 +866,7 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
/* Push-back supersedes any I/O errors */
if (unlikely(error)) {
spin_lock_irqsave(&io->endio_lock, flags);
- if (!(io->status == BLK_STS_DM_REQUEUE &&
- __noflush_suspending(md)))
+ if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
io->status = error;
spin_unlock_irqrestore(&io->endio_lock, flags);
}
@@ -793,7 +878,8 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
*/
spin_lock_irqsave(&md->deferred_lock, flags);
if (__noflush_suspending(md))
- bio_list_add_head(&md->deferred, io->bio);
+ /* NOTE early return due to BLK_STS_DM_REQUEUE below */
+ bio_list_add_head(&md->deferred, io->orig_bio);
else
/* noflush suspend was interrupted. */
io->status = BLK_STS_IOERR;
@@ -801,7 +887,7 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
}
io_error = io->status;
- bio = io->bio;
+ bio = io->orig_bio;
end_io_acct(io);
free_io(md, io);
@@ -847,7 +933,7 @@ static void clone_endio(struct bio *bio)
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
- if (unlikely(error == BLK_STS_TARGET)) {
+ if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
if (bio_op(bio) == REQ_OP_WRITE_SAME &&
!bio->bi_disk->queue->limits.max_write_same_sectors)
disable_write_same(md);
@@ -1005,7 +1091,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
- * allowed for all bio types except REQ_PREFLUSH.
+ * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
*
* dm_accept_partial_bio informs the dm that the target only wants to process
* additional n_sectors sectors of the bio and the rest of the data should be
@@ -1055,7 +1141,7 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
{
#ifdef CONFIG_BLK_DEV_ZONED
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
- struct bio *report_bio = tio->io->bio;
+ struct bio *report_bio = tio->io->orig_bio;
struct blk_zone_report_hdr *hdr = NULL;
struct blk_zone *zone;
unsigned int nr_rep = 0;
@@ -1122,67 +1208,15 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
}
EXPORT_SYMBOL_GPL(dm_remap_zone_report);
-/*
- * Flush current->bio_list when the target map method blocks.
- * This fixes deadlocks in snapshot and possibly in other targets.
- */
-struct dm_offload {
- struct blk_plug plug;
- struct blk_plug_cb cb;
-};
-
-static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
-{
- struct dm_offload *o = container_of(cb, struct dm_offload, cb);
- struct bio_list list;
- struct bio *bio;
- int i;
-
- INIT_LIST_HEAD(&o->cb.list);
-
- if (unlikely(!current->bio_list))
- return;
-
- for (i = 0; i < 2; i++) {
- list = current->bio_list[i];
- bio_list_init(&current->bio_list[i]);
-
- while ((bio = bio_list_pop(&list))) {
- struct bio_set *bs = bio->bi_pool;
- if (unlikely(!bs) || bs == fs_bio_set ||
- !bs->rescue_workqueue) {
- bio_list_add(&current->bio_list[i], bio);
- continue;
- }
-
- spin_lock(&bs->rescue_lock);
- bio_list_add(&bs->rescue_list, bio);
- queue_work(bs->rescue_workqueue, &bs->rescue_work);
- spin_unlock(&bs->rescue_lock);
- }
- }
-}
-
-static void dm_offload_start(struct dm_offload *o)
-{
- blk_start_plug(&o->plug);
- o->cb.callback = flush_current_bio_list;
- list_add(&o->cb.list, &current->plug->cb_list);
-}
-
-static void dm_offload_end(struct dm_offload *o)
-{
- list_del(&o->cb.list);
- blk_finish_plug(&o->plug);
-}
-
-static void __map_bio(struct dm_target_io *tio)
+static blk_qc_t __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
- struct dm_offload o;
struct bio *clone = &tio->clone;
+ struct dm_io *io = tio->io;
+ struct mapped_device *md = io->md;
struct dm_target *ti = tio->ti;
+ blk_qc_t ret = BLK_QC_T_NONE;
clone->bi_end_io = clone_endio;
@@ -1191,44 +1225,37 @@ static void __map_bio(struct dm_target_io *tio)
* anything, the target has assumed ownership of
* this io.
*/
- atomic_inc(&tio->io->io_count);
+ atomic_inc(&io->io_count);
sector = clone->bi_iter.bi_sector;
- dm_offload_start(&o);
r = ti->type->map(ti, clone);
- dm_offload_end(&o);
-
switch (r) {
case DM_MAPIO_SUBMITTED:
break;
case DM_MAPIO_REMAPPED:
/* the bio has been remapped so dispatch it */
trace_block_bio_remap(clone->bi_disk->queue, clone,
- bio_dev(tio->io->bio), sector);
- generic_make_request(clone);
+ bio_dev(io->orig_bio), sector);
+ if (md->type == DM_TYPE_NVME_BIO_BASED)
+ ret = direct_make_request(clone);
+ else
+ ret = generic_make_request(clone);
break;
case DM_MAPIO_KILL:
- dec_pending(tio->io, BLK_STS_IOERR);
free_tio(tio);
+ dec_pending(io, BLK_STS_IOERR);
break;
case DM_MAPIO_REQUEUE:
- dec_pending(tio->io, BLK_STS_DM_REQUEUE);
free_tio(tio);
+ dec_pending(io, BLK_STS_DM_REQUEUE);
break;
default:
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
-}
-struct clone_info {
- struct mapped_device *md;
- struct dm_table *map;
- struct bio *bio;
- struct dm_io *io;
- sector_t sector;
- unsigned sector_count;
-};
+ return ret;
+}
static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
{
@@ -1272,28 +1299,49 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
return 0;
}
-static struct dm_target_io *alloc_tio(struct clone_info *ci,
- struct dm_target *ti,
- unsigned target_bio_nr)
+static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
+ struct dm_target *ti, unsigned num_bios)
{
struct dm_target_io *tio;
- struct bio *clone;
+ int try;
- clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
- tio = container_of(clone, struct dm_target_io, clone);
+ if (!num_bios)
+ return;
- tio->io = ci->io;
- tio->ti = ti;
- tio->target_bio_nr = target_bio_nr;
+ if (num_bios == 1) {
+ tio = alloc_tio(ci, ti, 0, GFP_NOIO);
+ bio_list_add(blist, &tio->clone);
+ return;
+ }
- return tio;
+ for (try = 0; try < 2; try++) {
+ int bio_nr;
+ struct bio *bio;
+
+ if (try)
+ mutex_lock(&ci->io->md->table_devices_lock);
+ for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
+ tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
+ if (!tio)
+ break;
+
+ bio_list_add(blist, &tio->clone);
+ }
+ if (try)
+ mutex_unlock(&ci->io->md->table_devices_lock);
+ if (bio_nr == num_bios)
+ return;
+
+ while ((bio = bio_list_pop(blist))) {
+ tio = container_of(bio, struct dm_target_io, clone);
+ free_tio(tio);
+ }
+ }
}
-static void __clone_and_map_simple_bio(struct clone_info *ci,
- struct dm_target *ti,
- unsigned target_bio_nr, unsigned *len)
+static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
+ struct dm_target_io *tio, unsigned *len)
{
- struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
struct bio *clone = &tio->clone;
tio->len_ptr = len;
@@ -1302,16 +1350,22 @@ static void __clone_and_map_simple_bio(struct clone_info *ci,
if (len)
bio_setup_sector(clone, ci->sector, *len);
- __map_bio(tio);
+ return __map_bio(tio);
}
static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
unsigned num_bios, unsigned *len)
{
- unsigned target_bio_nr;
+ struct bio_list blist = BIO_EMPTY_LIST;
+ struct bio *bio;
+ struct dm_target_io *tio;
+
+ alloc_multiple_bios(&blist, ci, ti, num_bios);
- for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
- __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
+ while ((bio = bio_list_pop(&blist))) {
+ tio = container_of(bio, struct dm_target_io, clone);
+ (void) __clone_and_map_simple_bio(ci, tio, len);
+ }
}
static int __send_empty_flush(struct clone_info *ci)
@@ -1327,32 +1381,22 @@ static int __send_empty_flush(struct clone_info *ci)
}
static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
- sector_t sector, unsigned *len)
+ sector_t sector, unsigned *len)
{
struct bio *bio = ci->bio;
struct dm_target_io *tio;
- unsigned target_bio_nr;
- unsigned num_target_bios = 1;
- int r = 0;
+ int r;
- /*
- * Does the target want to receive duplicate copies of the bio?
- */
- if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
- num_target_bios = ti->num_write_bios(ti, bio);
-
- for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
- tio = alloc_tio(ci, ti, target_bio_nr);
- tio->len_ptr = len;
- r = clone_bio(tio, bio, sector, *len);
- if (r < 0) {
- free_tio(tio);
- break;
- }
- __map_bio(tio);
+ tio = alloc_tio(ci, ti, 0, GFP_NOIO);
+ tio->len_ptr = len;
+ r = clone_bio(tio, bio, sector, *len);
+ if (r < 0) {
+ free_tio(tio);
+ return r;
}
+ (void) __map_bio(tio);
- return r;
+ return 0;
}
typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
@@ -1379,56 +1423,50 @@ static bool is_split_required_for_discard(struct dm_target *ti)
return ti->split_discard_bios;
}
-static int __send_changing_extent_only(struct clone_info *ci,
+static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
get_num_bios_fn get_num_bios,
is_split_required_fn is_split_required)
{
- struct dm_target *ti;
unsigned len;
unsigned num_bios;
- do {
- ti = dm_table_find_target(ci->map, ci->sector);
- if (!dm_target_is_valid(ti))
- return -EIO;
-
- /*
- * Even though the device advertised support for this type of
- * request, that does not mean every target supports it, and
- * reconfiguration might also have changed that since the
- * check was performed.
- */
- num_bios = get_num_bios ? get_num_bios(ti) : 0;
- if (!num_bios)
- return -EOPNOTSUPP;
+ /*
+ * Even though the device advertised support for this type of
+ * request, that does not mean every target supports it, and
+ * reconfiguration might also have changed that since the
+ * check was performed.
+ */
+ num_bios = get_num_bios ? get_num_bios(ti) : 0;
+ if (!num_bios)
+ return -EOPNOTSUPP;
- if (is_split_required && !is_split_required(ti))
- len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
- else
- len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
+ if (is_split_required && !is_split_required(ti))
+ len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+ else
+ len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
- __send_duplicate_bios(ci, ti, num_bios, &len);
+ __send_duplicate_bios(ci, ti, num_bios, &len);
- ci->sector += len;
- } while (ci->sector_count -= len);
+ ci->sector += len;
+ ci->sector_count -= len;
return 0;
}
-static int __send_discard(struct clone_info *ci)
+static int __send_discard(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, get_num_discard_bios,
+ return __send_changing_extent_only(ci, ti, get_num_discard_bios,
is_split_required_for_discard);
}
-static int __send_write_same(struct clone_info *ci)
+static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
}
-static int __send_write_zeroes(struct clone_info *ci)
+static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
}
/*
@@ -1441,17 +1479,17 @@ static int __split_and_process_non_flush(struct clone_info *ci)
unsigned len;
int r;
- if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
- return __send_discard(ci);
- else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
- return __send_write_same(ci);
- else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
- return __send_write_zeroes(ci);
-
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+ return __send_discard(ci, ti);
+ else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
+ return __send_write_same(ci, ti);
+ else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+ return __send_write_zeroes(ci, ti);
+
if (bio_op(bio) == REQ_OP_ZONE_REPORT)
len = ci->sector_count;
else
@@ -1468,34 +1506,33 @@ static int __split_and_process_non_flush(struct clone_info *ci)
return 0;
}
+static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
+{
+ ci->map = map;
+ ci->io = alloc_io(md, bio);
+ ci->sector = bio->bi_iter.bi_sector;
+}
+
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
-static void __split_and_process_bio(struct mapped_device *md,
- struct dm_table *map, struct bio *bio)
+static blk_qc_t __split_and_process_bio(struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
{
struct clone_info ci;
+ blk_qc_t ret = BLK_QC_T_NONE;
int error = 0;
if (unlikely(!map)) {
bio_io_error(bio);
- return;
+ return ret;
}
- ci.map = map;
- ci.md = md;
- ci.io = alloc_io(md);
- ci.io->status = 0;
- atomic_set(&ci.io->io_count, 1);
- ci.io->bio = bio;
- ci.io->md = md;
- spin_lock_init(&ci.io->endio_lock);
- ci.sector = bio->bi_iter.bi_sector;
-
- start_io_acct(ci.io);
+ init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.md->flush_bio;
+ ci.bio = &ci.io->md->flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1506,32 +1543,95 @@ static void __split_and_process_bio(struct mapped_device *md,
} else {
ci.bio = bio;
ci.sector_count = bio_sectors(bio);
- while (ci.sector_count && !error)
+ while (ci.sector_count && !error) {
error = __split_and_process_non_flush(&ci);
+ if (current->bio_list && ci.sector_count && !error) {
+ /*
+ * Remainder must be passed to generic_make_request()
+ * so that it gets handled *after* bios already submitted
+ * have been completely processed.
+ * We take a clone of the original to store in
+ * ci.io->orig_bio to be used by end_io_acct() and
+ * for dec_pending to use for completion handling.
+ * As this path is not used for REQ_OP_ZONE_REPORT,
+ * the usage of io->orig_bio in dm_remap_zone_report()
+ * won't be affected by this reassignment.
+ */
+ struct bio *b = bio_clone_bioset(bio, GFP_NOIO,
+ md->queue->bio_split);
+ ci.io->orig_bio = b;
+ bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9);
+ bio_chain(b, bio);
+ ret = generic_make_request(bio);
+ break;
+ }
+ }
}
/* drop the extra reference count */
dec_pending(ci.io, errno_to_blk_status(error));
+ return ret;
}
-/*-----------------------------------------------------------------
- * CRUD END
- *---------------------------------------------------------------*/
/*
- * The request function that just remaps the bio built up by
- * dm_merge_bvec.
+ * Optimized variant of __split_and_process_bio that leverages the
+ * fact that targets that use it do _not_ have a need to split bios.
*/
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t __process_bio(struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
+{
+ struct clone_info ci;
+ blk_qc_t ret = BLK_QC_T_NONE;
+ int error = 0;
+
+ if (unlikely(!map)) {
+ bio_io_error(bio);
+ return ret;
+ }
+
+ init_clone_info(&ci, md, map, bio);
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ ci.bio = &ci.io->md->flush_bio;
+ ci.sector_count = 0;
+ error = __send_empty_flush(&ci);
+ /* dec_pending submits any data associated with flush */
+ } else {
+ struct dm_target *ti = md->immutable_target;
+ struct dm_target_io *tio;
+
+ /*
+ * Defend against IO still getting in during teardown
+ * - as was seen for a time with nvme-fcloop
+ */
+ if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
+ error = -EIO;
+ goto out;
+ }
+
+ tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
+ ci.bio = bio;
+ ci.sector_count = bio_sectors(bio);
+ ret = __clone_and_map_simple_bio(&ci, tio, NULL);
+ }
+out:
+ /* drop the extra reference count */
+ dec_pending(ci.io, errno_to_blk_status(error));
+ return ret;
+}
+
+typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
+
+static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
+ process_bio_fn process_bio)
{
- int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
+ blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
struct dm_table *map;
map = dm_get_live_table(md, &srcu_idx);
- generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
-
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
@@ -1540,12 +1640,27 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
queue_io(md, bio);
else
bio_io_error(bio);
- return BLK_QC_T_NONE;
+ return ret;
}
- __split_and_process_bio(md, map, bio);
+ ret = process_bio(md, map, bio);
+
dm_put_live_table(md, srcu_idx);
- return BLK_QC_T_NONE;
+ return ret;
+}
+
+/*
+ * The request function that remaps the bio to one target and
+ * splits off any remainder.
+ */
+static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
+{
+ return __dm_make_request(q, bio, __split_and_process_bio);
+}
+
+static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
+{
+ return __dm_make_request(q, bio, __process_bio);
}
static int dm_any_congested(void *congested_data, int bdi_bits)
@@ -1626,20 +1741,9 @@ static const struct dax_operations dm_dax_ops;
static void dm_wq_work(struct work_struct *work);
-void dm_init_md_queue(struct mapped_device *md)
-{
- /*
- * Initialize data that will only be used by a non-blk-mq DM queue
- * - must do so here (in alloc_dev callchain) before queue is used
- */
- md->queue->queuedata = md;
- md->queue->backing_dev_info->congested_data = md;
-}
-
-void dm_init_normal_md_queue(struct mapped_device *md)
+static void dm_init_normal_md_queue(struct mapped_device *md)
{
md->use_blk_mq = false;
- dm_init_md_queue(md);
/*
* Initialize aspects of queue that aren't relevant for blk-mq
@@ -1653,9 +1757,10 @@ static void cleanup_mapped_device(struct mapped_device *md)
destroy_workqueue(md->wq);
if (md->kworker_task)
kthread_stop(md->kworker_task);
- mempool_destroy(md->io_pool);
if (md->bs)
bioset_free(md->bs);
+ if (md->io_bs)
+ bioset_free(md->io_bs);
if (md->dax_dev) {
kill_dax(md->dax_dev);
@@ -1681,6 +1786,10 @@ static void cleanup_mapped_device(struct mapped_device *md)
md->bdev = NULL;
}
+ mutex_destroy(&md->suspend_lock);
+ mutex_destroy(&md->type_lock);
+ mutex_destroy(&md->table_devices_lock);
+
dm_mq_cleanup_mapped_device(md);
}
@@ -1734,10 +1843,10 @@ static struct mapped_device *alloc_dev(int minor)
md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
+ md->queue->queuedata = md;
+ md->queue->backing_dev_info->congested_data = md;
- dm_init_md_queue(md);
-
- md->disk = alloc_disk_node(1, numa_node_id);
+ md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk)
goto bad;
@@ -1820,17 +1929,22 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
struct dm_md_mempools *p = dm_table_get_md_mempools(t);
- if (md->bs) {
- /* The md already has necessary mempools. */
- if (dm_table_bio_based(t)) {
- /*
- * Reload bioset because front_pad may have changed
- * because a different table was loaded.
- */
+ if (dm_table_bio_based(t)) {
+ /*
+ * The md may already have mempools that need changing.
+ * If so, reload bioset because front_pad may have changed
+ * because a different table was loaded.
+ */
+ if (md->bs) {
bioset_free(md->bs);
- md->bs = p->bs;
- p->bs = NULL;
+ md->bs = NULL;
+ }
+ if (md->io_bs) {
+ bioset_free(md->io_bs);
+ md->io_bs = NULL;
}
+
+ } else if (md->bs) {
/*
* There's no need to reload with request-based dm
* because the size of front_pad doesn't change.
@@ -1842,13 +1956,12 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
goto out;
}
- BUG_ON(!p || md->io_pool || md->bs);
+ BUG_ON(!p || md->bs || md->io_bs);
- md->io_pool = p->io_pool;
- p->io_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
-
+ md->io_bs = p->io_bs;
+ p->io_bs = NULL;
out:
/* mempool bind completed, no longer need any mempools in the table */
dm_table_free_md_mempools(t);
@@ -1894,6 +2007,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
{
struct dm_table *old_map;
struct request_queue *q = md->queue;
+ bool request_based = dm_table_request_based(t);
sector_t size;
lockdep_assert_held(&md->suspend_lock);
@@ -1917,12 +2031,15 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
* This must be done before setting the queue restrictions,
* because request-based dm may be run just after the setting.
*/
- if (dm_table_request_based(t)) {
+ if (request_based)
dm_stop_queue(q);
+
+ if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
/*
- * Leverage the fact that request-based DM targets are
- * immutable singletons and establish md->immutable_target
- * - used to optimize both dm_request_fn and dm_mq_queue_rq
+ * Leverage the fact that request-based DM targets and
+ * NVMe bio based targets are immutable singletons
+ * - used to optimize both dm_request_fn and dm_mq_queue_rq;
+ * and __process_bio.
*/
md->immutable_target = dm_table_get_immutable_target(t);
}
@@ -1962,13 +2079,18 @@ static struct dm_table *__unbind(struct mapped_device *md)
*/
int dm_create(int minor, struct mapped_device **result)
{
+ int r;
struct mapped_device *md;
md = alloc_dev(minor);
if (!md)
return -ENXIO;
- dm_sysfs_init(md);
+ r = dm_sysfs_init(md);
+ if (r) {
+ free_dev(md);
+ return r;
+ }
*result = md;
return 0;
@@ -2026,6 +2148,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
switch (type) {
case DM_TYPE_REQUEST_BASED:
+ dm_init_normal_md_queue(md);
r = dm_old_init_request_queue(md, t);
if (r) {
DMERR("Cannot initialize queue for request-based mapped device");
@@ -2043,15 +2166,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
case DM_TYPE_DAX_BIO_BASED:
dm_init_normal_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
- /*
- * DM handles splitting bios as needed. Free the bio_split bioset
- * since it won't be used (saves 1 process per bio-based DM device).
- */
- bioset_free(md->queue->bio_split);
- md->queue->bio_split = NULL;
-
- if (type == DM_TYPE_DAX_BIO_BASED)
- queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
+ break;
+ case DM_TYPE_NVME_BIO_BASED:
+ dm_init_normal_md_queue(md);
+ blk_queue_make_request(md->queue, dm_make_request_nvme);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
@@ -2130,7 +2248,6 @@ EXPORT_SYMBOL_GPL(dm_device_name);
static void __dm_destroy(struct mapped_device *md, bool wait)
{
- struct request_queue *q = dm_get_md_queue(md);
struct dm_table *map;
int srcu_idx;
@@ -2141,7 +2258,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
- blk_set_queue_dying(q);
+ blk_set_queue_dying(md->queue);
if (dm_request_based(md) && md->kworker_task)
kthread_flush_worker(&md->kworker);
@@ -2752,11 +2869,12 @@ int dm_noflush_suspending(struct dm_target *ti)
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
- unsigned integrity, unsigned per_io_data_size)
+ unsigned integrity, unsigned per_io_data_size,
+ unsigned min_pool_size)
{
struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
unsigned int pool_size = 0;
- unsigned int front_pad;
+ unsigned int front_pad, io_front_pad;
if (!pools)
return NULL;
@@ -2764,16 +2882,19 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
switch (type) {
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
- pool_size = dm_get_reserved_bio_based_ios();
+ case DM_TYPE_NVME_BIO_BASED:
+ pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-
- pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
- if (!pools->io_pool)
+ io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
+ pools->io_bs = bioset_create(pool_size, io_front_pad, 0);
+ if (!pools->io_bs)
+ goto out;
+ if (integrity && bioset_integrity_create(pools->io_bs, pool_size))
goto out;
break;
case DM_TYPE_REQUEST_BASED:
case DM_TYPE_MQ_REQUEST_BASED:
- pool_size = dm_get_reserved_rq_based_ios();
+ pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_io_data_size is used for blk-mq pdu at queue allocation */
break;
@@ -2781,7 +2902,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
BUG();
}
- pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
+ pools->bs = bioset_create(pool_size, front_pad, 0);
if (!pools->bs)
goto out;
@@ -2801,10 +2922,10 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (!pools)
return;
- mempool_destroy(pools->io_pool);
-
if (pools->bs)
bioset_free(pools->bs);
+ if (pools->io_bs)
+ bioset_free(pools->io_bs);
kfree(pools);
}