aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/dm.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--drivers/md/dm.c606
1 files changed, 350 insertions, 256 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb0a551bd88..99642f69bfa7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -88,10 +88,6 @@ struct clone_info {
bool submit_as_polled:1;
};
-#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
-#define DM_IO_BIO_OFFSET \
- (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
-
static inline struct dm_target_io *clone_to_tio(struct bio *clone)
{
return container_of(clone, struct dm_target_io, clone);
@@ -136,14 +132,6 @@ static int get_swap_bios(void)
return latch;
}
-/*
- * For mempools pre-allocation at the table loading time.
- */
-struct dm_md_mempools {
- struct bio_set bs;
- struct bio_set io_bs;
-};
-
struct table_device {
struct list_head list;
refcount_t count;
@@ -423,7 +411,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
{
- struct dm_target *tgt;
+ struct dm_target *ti;
struct dm_table *map;
int r;
@@ -434,17 +422,17 @@ retry:
return r;
/* We only support devices that have a single target */
- if (dm_table_get_num_targets(map) != 1)
+ if (map->num_targets != 1)
return r;
- tgt = dm_table_get_target(map, 0);
- if (!tgt->type->prepare_ioctl)
+ ti = dm_table_get_target(map, 0);
+ if (!ti->type->prepare_ioctl)
return r;
if (dm_suspended_md(md))
return -EAGAIN;
- r = tgt->type->prepare_ioctl(tgt, bdev);
+ r = ti->type->prepare_ioctl(ti, bdev);
if (r == -ENOTCONN && !fatal_signal_pending(current)) {
dm_put_live_table(md, *srcu_idx);
msleep(10);
@@ -563,6 +551,10 @@ static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
unsigned long flags;
/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
spin_lock_irqsave(&io->lock, flags);
+ if (dm_io_flagged(io, DM_IO_ACCOUNTED)) {
+ spin_unlock_irqrestore(&io->lock, flags);
+ return;
+ }
dm_io_set_flag(io, DM_IO_ACCOUNTED);
spin_unlock_irqrestore(&io->lock, flags);
}
@@ -581,10 +573,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
struct dm_target_io *tio;
struct bio *clone;
- clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs);
- /* Set default bdev, but target must bio_set_dev() before issuing IO */
- clone->bi_bdev = md->disk->part0;
-
+ clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
tio = clone_to_tio(clone);
tio->flags = 0;
dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
@@ -617,6 +606,7 @@ static void free_io(struct dm_io *io)
static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
{
+ struct mapped_device *md = ci->io->md;
struct dm_target_io *tio;
struct bio *clone;
@@ -626,13 +616,10 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
/* alloc_io() already initialized embedded clone */
clone = &tio->clone;
} else {
- struct mapped_device *md = ci->io->md;
-
- clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs);
+ clone = bio_alloc_clone(NULL, ci->bio, gfp_mask,
+ &md->mempools->bs);
if (!clone)
return NULL;
- /* Set default bdev, but target must bio_set_dev() before issuing IO */
- clone->bi_bdev = md->disk->part0;
/* REQ_DM_POLL_LIST shouldn't be inherited */
clone->bi_opf &= ~REQ_DM_POLL_LIST;
@@ -648,6 +635,11 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
tio->len_ptr = len;
tio->old_sector = 0;
+ /* Set default bdev, but target must bio_set_dev() before issuing IO */
+ clone->bi_bdev = md->disk->part0;
+ if (unlikely(ti->needs_bio_set_dev))
+ bio_set_dev(clone, md->disk->part0);
+
if (len) {
clone->bi_iter.bi_size = to_bytes(*len);
if (bio_integrity(clone))
@@ -718,18 +710,18 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
}
static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md,
- int *srcu_idx, struct bio *bio)
+ int *srcu_idx, blk_opf_t bio_opf)
{
- if (bio->bi_opf & REQ_NOWAIT)
+ if (bio_opf & REQ_NOWAIT)
return dm_get_live_table_fast(md);
else
return dm_get_live_table(md, srcu_idx);
}
static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx,
- struct bio *bio)
+ blk_opf_t bio_opf)
{
- if (bio->bi_opf & REQ_NOWAIT)
+ if (bio_opf & REQ_NOWAIT)
dm_put_live_table_fast(md);
else
dm_put_live_table(md, srcu_idx);
@@ -886,22 +878,63 @@ static int __noflush_suspending(struct mapped_device *md)
return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
-static void dm_io_complete(struct dm_io *io)
+static void dm_requeue_add_io(struct dm_io *io, bool first_stage)
{
- blk_status_t io_error;
struct mapped_device *md = io->md;
+
+ if (first_stage) {
+ struct dm_io *next = md->requeue_list;
+
+ md->requeue_list = io;
+ io->next = next;
+ } else {
+ bio_list_add_head(&md->deferred, io->orig_bio);
+ }
+}
+
+static void dm_kick_requeue(struct mapped_device *md, bool first_stage)
+{
+ if (first_stage)
+ queue_work(md->wq, &md->requeue_work);
+ else
+ queue_work(md->wq, &md->work);
+}
+
+/*
+ * Return true if the dm_io's original bio is requeued.
+ * io->status is updated with error if requeue disallowed.
+ */
+static bool dm_handle_requeue(struct dm_io *io, bool first_stage)
+{
struct bio *bio = io->orig_bio;
+ bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE);
+ bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) &&
+ (bio->bi_opf & REQ_POLLED));
+ struct mapped_device *md = io->md;
+ bool requeued = false;
- if (io->status == BLK_STS_DM_REQUEUE) {
+ if (handle_requeue || handle_polled_eagain) {
unsigned long flags;
+
+ if (bio->bi_opf & REQ_POLLED) {
+ /*
+ * Upper layer won't help us poll split bio
+ * (io->orig_bio may only reflect a subset of the
+ * pre-split original) so clear REQ_POLLED.
+ */
+ bio_clear_polled(bio);
+ }
+
/*
- * Target requested pushing back the I/O.
+ * Target requested pushing back the I/O or
+ * polled IO hit BLK_STS_AGAIN.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
- if (__noflush_suspending(md) &&
- !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
- /* NOTE early return due to BLK_STS_DM_REQUEUE below */
- bio_list_add_head(&md->deferred, bio);
+ if ((__noflush_suspending(md) &&
+ !WARN_ON_ONCE(dm_is_zone_write(md, bio))) ||
+ handle_polled_eagain || first_stage) {
+ dm_requeue_add_io(io, first_stage);
+ requeued = true;
} else {
/*
* noflush suspend was interrupted or this is
@@ -912,6 +945,23 @@ static void dm_io_complete(struct dm_io *io)
spin_unlock_irqrestore(&md->deferred_lock, flags);
}
+ if (requeued)
+ dm_kick_requeue(md, first_stage);
+
+ return requeued;
+}
+
+static void __dm_io_complete(struct dm_io *io, bool first_stage)
+{
+ struct bio *bio = io->orig_bio;
+ struct mapped_device *md = io->md;
+ blk_status_t io_error;
+ bool requeued;
+
+ requeued = dm_handle_requeue(io, first_stage);
+ if (requeued && first_stage)
+ return;
+
io_error = io->status;
if (dm_io_flagged(io, DM_IO_ACCOUNTED))
dm_end_io_acct(io);
@@ -931,21 +981,9 @@ static void dm_io_complete(struct dm_io *io)
if (unlikely(wq_has_sleeper(&md->wait)))
wake_up(&md->wait);
- if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) {
- if (bio->bi_opf & REQ_POLLED) {
- /*
- * Upper layer won't help us poll split bio (io->orig_bio
- * may only reflect a subset of the pre-split original)
- * so clear REQ_POLLED in case of requeue.
- */
- bio_clear_polled(bio);
- if (io_error == BLK_STS_AGAIN) {
- /* io_uring doesn't handle BLK_STS_AGAIN (yet) */
- queue_io(md, bio);
- }
- }
+ /* Return early if the original bio was requeued */
+ if (requeued)
return;
- }
if (bio_is_flush_with_data(bio)) {
/*
@@ -962,6 +1000,58 @@ static void dm_io_complete(struct dm_io *io)
}
}
+static void dm_wq_requeue_work(struct work_struct *work)
+{
+ struct mapped_device *md = container_of(work, struct mapped_device,
+ requeue_work);
+ unsigned long flags;
+ struct dm_io *io;
+
+ /* reuse deferred lock to simplify dm_handle_requeue */
+ spin_lock_irqsave(&md->deferred_lock, flags);
+ io = md->requeue_list;
+ md->requeue_list = NULL;
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
+
+ while (io) {
+ struct dm_io *next = io->next;
+
+ dm_io_rewind(io, &md->queue->bio_split);
+
+ io->next = NULL;
+ __dm_io_complete(io, false);
+ io = next;
+ }
+}
+
+/*
+ * Two staged requeue:
+ *
+ * 1) io->orig_bio points to the real original bio, and the part mapped to
+ * this io must be requeued, instead of other parts of the original bio.
+ *
+ * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io.
+ */
+static void dm_io_complete(struct dm_io *io)
+{
+ bool first_requeue;
+
+ /*
+ * Only dm_io that has been split needs two stage requeue, otherwise
+ * we may run into long bio clone chain during suspend and OOM could
+ * be triggered.
+ *
+ * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they
+ * also aren't handled via the first stage requeue.
+ */
+ if (dm_io_flagged(io, DM_IO_WAS_SPLIT))
+ first_requeue = true;
+ else
+ first_requeue = false;
+
+ __dm_io_complete(io, first_requeue);
+}
+
/*
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
@@ -1023,23 +1113,19 @@ static void clone_endio(struct bio *bio)
struct dm_io *io = tio->io;
struct mapped_device *md = io->md;
- if (likely(bio->bi_bdev != md->disk->part0)) {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-
- if (unlikely(error == BLK_STS_TARGET)) {
- if (bio_op(bio) == REQ_OP_DISCARD &&
- !bdev_max_discard_sectors(bio->bi_bdev))
- disable_discard(md);
- else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
- !q->limits.max_write_zeroes_sectors)
- disable_write_zeroes(md);
- }
-
- if (static_branch_unlikely(&zoned_enabled) &&
- unlikely(blk_queue_is_zoned(q)))
- dm_zone_endio(io, bio);
+ if (unlikely(error == BLK_STS_TARGET)) {
+ if (bio_op(bio) == REQ_OP_DISCARD &&
+ !bdev_max_discard_sectors(bio->bi_bdev))
+ disable_discard(md);
+ else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bdev_write_zeroes_sectors(bio->bi_bdev))
+ disable_write_zeroes(md);
}
+ if (static_branch_unlikely(&zoned_enabled) &&
+ unlikely(bdev_is_zoned(bio->bi_bdev)))
+ dm_zone_endio(io, bio);
+
if (endio) {
int r = endio(ti, bio, &error);
switch (r) {
@@ -1090,23 +1176,18 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
sector_t target_offset = dm_target_offset(ti, sector);
sector_t len = max_io_len_target_boundary(ti, target_offset);
- sector_t max_len;
/*
* Does the target need to split IO even further?
* - varied (per target) IO splitting is a tenet of DM; this
* explains why stacked chunk_sectors based splitting via
- * blk_max_size_offset() isn't possible here. So pass in
- * ti->max_io_len to override stacked chunk_sectors.
+ * blk_queue_split() isn't possible here.
*/
- if (ti->max_io_len) {
- max_len = blk_max_size_offset(ti->table->md->queue,
- target_offset, ti->max_io_len);
- if (len > max_len)
- len = max_len;
- }
-
- return len;
+ if (!ti->max_io_len)
+ return len;
+ return min_t(sector_t, len,
+ min(queue_max_sectors(ti->table->md->queue),
+ blk_chunk_sectors_left(target_offset, ti->max_io_len)));
}
int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
@@ -1249,6 +1330,7 @@ out:
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
struct dm_target_io *tio = clone_to_tio(bio);
+ struct dm_io *io = tio->io;
unsigned bio_sectors = bio_sectors(bio);
BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
@@ -1264,8 +1346,9 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
* __split_and_process_bio() may have already saved mapped part
* for accounting but it is being reduced so update accordingly.
*/
- dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT);
- tio->io->sectors = n_sectors;
+ dm_io_set_flag(io, DM_IO_WAS_SPLIT);
+ io->sectors = n_sectors;
+ io->sector_offset = bio_sectors(io->orig_bio);
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
@@ -1388,17 +1471,7 @@ static void setup_split_accounting(struct clone_info *ci, unsigned len)
*/
dm_io_set_flag(io, DM_IO_WAS_SPLIT);
io->sectors = len;
- }
-
- if (static_branch_unlikely(&stats_enabled) &&
- unlikely(dm_stats_used(&io->md->stats))) {
- /*
- * Save bi_sector in terms of its offset from end of
- * original bio, only needed for DM-stats' benefit.
- * - saved regardless of whether split needed so that
- * dm_accept_partial_bio() doesn't need to.
- */
- io->sector_offset = bio_end_sector(ci->bio) - ci->sector;
+ io->sector_offset = bio_sectors(ci->bio);
}
}
@@ -1432,11 +1505,11 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
}
static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
- unsigned num_bios, unsigned *len)
+ unsigned int num_bios, unsigned *len)
{
struct bio_list blist = BIO_EMPTY_LIST;
struct bio *clone;
- int ret = 0;
+ unsigned int ret = 0;
switch (num_bios) {
case 0:
@@ -1464,8 +1537,7 @@ static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
static void __send_empty_flush(struct clone_info *ci)
{
- unsigned target_nr = 0;
- struct dm_target *ti;
+ struct dm_table *t = ci->map;
struct bio flush_bio;
/*
@@ -1480,8 +1552,9 @@ static void __send_empty_flush(struct clone_info *ci)
ci->sector_count = 0;
ci->io->tio.clone.bi_iter.bi_size = 0;
- while ((ti = dm_table_get_target(ci->map, target_nr++))) {
- int bios;
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ unsigned int bios;
+ struct dm_target *ti = dm_table_get_target(t, i);
atomic_add(ti->num_flush_bios, &ci->io->io_count);
bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
@@ -1501,7 +1574,7 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target
unsigned num_bios)
{
unsigned len;
- int bios;
+ unsigned int bios;
len = min_t(sector_t, ci->sector_count,
max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
@@ -1520,7 +1593,7 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target
static bool is_abnormal_io(struct bio *bio)
{
- unsigned int op = bio_op(bio);
+ enum req_op op = bio_op(bio);
if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
switch (op) {
@@ -1551,6 +1624,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci,
case REQ_OP_WRITE_ZEROES:
num_bios = ti->num_write_zeroes_bios;
break;
+ default:
+ break;
}
/*
@@ -1620,14 +1695,19 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
ti = dm_table_find_target(ci->map, ci->sector);
if (unlikely(!ti))
return BLK_STS_IOERR;
- else if (unlikely(ci->is_abnormal_io))
+
+ if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) &&
+ unlikely(!dm_target_supports_nowait(ti->type)))
+ return BLK_STS_NOTSUPP;
+
+ if (unlikely(ci->is_abnormal_io))
return __process_abnormal_io(ci, ti);
/*
* Only support bio polling for normal IO, and the target io is
* exactly inside the dm_io instance (verified in dm_poll_dm_io)
*/
- ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
+ ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
setup_split_accounting(ci, len);
@@ -1722,8 +1802,9 @@ static void dm_submit_bio(struct bio *bio)
struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
int srcu_idx;
struct dm_table *map;
+ blk_opf_t bio_opf = bio->bi_opf;
- map = dm_get_live_table_bio(md, &srcu_idx, bio);
+ map = dm_get_live_table_bio(md, &srcu_idx, bio_opf);
/* If suspended, or map not yet available, queue this IO for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
@@ -1739,7 +1820,7 @@ static void dm_submit_bio(struct bio *bio)
dm_split_and_process_bio(md, map, bio);
out:
- dm_put_live_table_bio(md, srcu_idx, bio);
+ dm_put_live_table_bio(md, srcu_idx, bio_opf);
}
static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
@@ -1876,8 +1957,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
{
if (md->wq)
destroy_workqueue(md->wq);
- bioset_exit(&md->bs);
- bioset_exit(&md->io_bs);
+ dm_free_md_mempools(md->mempools);
if (md->dax_dev) {
dax_remove_host(md->disk);
@@ -1896,7 +1976,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
del_gendisk(md->disk);
}
dm_queue_destroy_crypto_profile(md->queue);
- blk_cleanup_disk(md->disk);
+ put_disk(md->disk);
}
if (md->pending_io) {
@@ -1971,9 +2051,11 @@ static struct mapped_device *alloc_dev(int minor)
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
+ INIT_WORK(&md->requeue_work, dm_wq_requeue_work);
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
+ md->requeue_list = NULL;
md->swap_bios = get_swap_bios();
sema_init(&md->swap_bios_semaphore, md->swap_bios);
mutex_init(&md->swap_bios_lock);
@@ -2049,48 +2131,6 @@ static void free_dev(struct mapped_device *md)
kvfree(md);
}
-static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
-{
- struct dm_md_mempools *p = dm_table_get_md_mempools(t);
- int ret = 0;
-
- if (dm_table_bio_based(t)) {
- /*
- * The md may already have mempools that need changing.
- * If so, reload bioset because front_pad may have changed
- * because a different table was loaded.
- */
- bioset_exit(&md->bs);
- bioset_exit(&md->io_bs);
-
- } else if (bioset_initialized(&md->bs)) {
- /*
- * There's no need to reload with request-based dm
- * because the size of front_pad doesn't change.
- * Note for future: If you are to reload bioset,
- * prep-ed requests in the queue may refer
- * to bio from the old bioset, so you must walk
- * through the queue to unprep.
- */
- goto out;
- }
-
- BUG_ON(!p ||
- bioset_initialized(&md->bs) ||
- bioset_initialized(&md->io_bs));
-
- ret = bioset_init_from_src(&md->bs, &p->bs);
- if (ret)
- goto out;
- ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
- if (ret)
- bioset_exit(&md->bs);
-out:
- /* mempool bind completed, no longer need any mempools in the table */
- dm_table_free_md_mempools(t);
- return ret;
-}
-
/*
* Bind a table to the device.
*/
@@ -2144,12 +2184,28 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
* immutable singletons - used to optimize dm_mq_queue_rq.
*/
md->immutable_target = dm_table_get_immutable_target(t);
- }
- ret = __bind_mempools(md, t);
- if (ret) {
- old_map = ERR_PTR(ret);
- goto out;
+ /*
+ * There is no need to reload with request-based dm because the
+ * size of front_pad doesn't change.
+ *
+ * Note for future: If you are to reload bioset, prep-ed
+ * requests in the queue may refer to bio from the old bioset,
+ * so you must walk through the queue to unprep.
+ */
+ if (!md->mempools) {
+ md->mempools = t->mempools;
+ t->mempools = NULL;
+ }
+ } else {
+ /*
+ * The md may already have mempools that need changing.
+ * If so, reload bioset because front_pad may have changed
+ * because a different table was loaded.
+ */
+ dm_free_md_mempools(md->mempools);
+ md->mempools = t->mempools;
+ t->mempools = NULL;
}
ret = dm_table_set_restrictions(t, md->queue, limits);
@@ -3006,54 +3062,6 @@ int dm_noflush_suspending(struct dm_target *ti)
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
- unsigned per_io_data_size, unsigned min_pool_size,
- bool integrity, bool poll)
-{
- struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
- unsigned int pool_size = 0;
- unsigned int front_pad, io_front_pad;
- int ret;
-
- if (!pools)
- return NULL;
-
- switch (type) {
- case DM_TYPE_BIO_BASED:
- case DM_TYPE_DAX_BIO_BASED:
- pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
- front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
- io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
- ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0);
- if (ret)
- goto out;
- if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
- goto out;
- break;
- case DM_TYPE_REQUEST_BASED:
- pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
- front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
- /* per_io_data_size is used for blk-mq pdu at queue allocation */
- break;
- default:
- BUG();
- }
-
- ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
- if (ret)
- goto out;
-
- if (integrity && bioset_integrity_create(&pools->bs, pool_size))
- goto out;
-
- return pools;
-
-out:
- dm_free_md_mempools(pools);
-
- return NULL;
-}
-
void dm_free_md_mempools(struct dm_md_mempools *pools)
{
if (!pools)
@@ -3069,11 +3077,14 @@ struct dm_pr {
u64 old_key;
u64 new_key;
u32 flags;
+ bool abort;
bool fail_early;
+ int ret;
+ enum pr_type type;
};
static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
- void *data)
+ struct dm_pr *pr)
{
struct mapped_device *md = bdev->bd_disk->private_data;
struct dm_table *table;
@@ -3085,15 +3096,21 @@ static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
goto out;
/* We only support devices that have a single target */
- if (dm_table_get_num_targets(table) != 1)
+ if (table->num_targets != 1)
goto out;
ti = dm_table_get_target(table, 0);
+ if (dm_suspended_md(md)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
ret = -EINVAL;
if (!ti->type->iterate_devices)
goto out;
- ret = ti->type->iterate_devices(ti, fn, data);
+ ti->type->iterate_devices(ti, fn, pr);
+ ret = 0;
out:
dm_put_live_table(md, srcu_idx);
return ret;
@@ -3107,10 +3124,24 @@ static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
{
struct dm_pr *pr = data;
const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+ int ret;
+
+ if (!ops || !ops->pr_register) {
+ pr->ret = -EOPNOTSUPP;
+ return -1;
+ }
+
+ ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
+ if (!ret)
+ return 0;
+
+ if (!pr->ret)
+ pr->ret = ret;
+
+ if (pr->fail_early)
+ return -1;
- if (!ops || !ops->pr_register)
- return -EOPNOTSUPP;
- return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
+ return 0;
}
static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
@@ -3121,82 +3152,145 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
.new_key = new_key,
.flags = flags,
.fail_early = true,
+ .ret = 0,
};
int ret;
ret = dm_call_pr(bdev, __dm_pr_register, &pr);
- if (ret && new_key) {
- /* unregister all paths if we failed to register any path */
- pr.old_key = new_key;
- pr.new_key = 0;
- pr.flags = 0;
- pr.fail_early = false;
- dm_call_pr(bdev, __dm_pr_register, &pr);
+ if (ret) {
+ /* Didn't even get to register a path */
+ return ret;
}
+ if (!pr.ret)
+ return 0;
+ ret = pr.ret;
+
+ if (!new_key)
+ return ret;
+
+ /* unregister all paths if we failed to register any path */
+ pr.old_key = new_key;
+ pr.new_key = 0;
+ pr.flags = 0;
+ pr.fail_early = false;
+ (void) dm_call_pr(bdev, __dm_pr_register, &pr);
return ret;
}
+
+static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_pr *pr = data;
+ const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+
+ if (!ops || !ops->pr_reserve) {
+ pr->ret = -EOPNOTSUPP;
+ return -1;
+ }
+
+ pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags);
+ if (!pr->ret)
+ return -1;
+
+ return 0;
+}
+
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
u32 flags)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
- const struct pr_ops *ops;
- int r, srcu_idx;
+ struct dm_pr pr = {
+ .old_key = key,
+ .flags = flags,
+ .type = type,
+ .fail_early = false,
+ .ret = 0,
+ };
+ int ret;
- r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
- if (r < 0)
- goto out;
+ ret = dm_call_pr(bdev, __dm_pr_reserve, &pr);
+ if (ret)
+ return ret;
- ops = bdev->bd_disk->fops->pr_ops;
- if (ops && ops->pr_reserve)
- r = ops->pr_reserve(bdev, key, type, flags);
- else
- r = -EOPNOTSUPP;
-out:
- dm_unprepare_ioctl(md, srcu_idx);
- return r;
+ return pr.ret;
+}
+
+/*
+ * If there is a non-All Registrants type of reservation, the release must be
+ * sent down the holding path. For the cases where there is no reservation or
+ * the path is not the holder the device will also return success, so we must
+ * try each path to make sure we got the correct path.
+ */
+static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_pr *pr = data;
+ const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+
+ if (!ops || !ops->pr_release) {
+ pr->ret = -EOPNOTSUPP;
+ return -1;
+ }
+
+ pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type);
+ if (pr->ret)
+ return -1;
+
+ return 0;
}
static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
- const struct pr_ops *ops;
- int r, srcu_idx;
+ struct dm_pr pr = {
+ .old_key = key,
+ .type = type,
+ .fail_early = false,
+ };
+ int ret;
- r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
- if (r < 0)
- goto out;
+ ret = dm_call_pr(bdev, __dm_pr_release, &pr);
+ if (ret)
+ return ret;
- ops = bdev->bd_disk->fops->pr_ops;
- if (ops && ops->pr_release)
- r = ops->pr_release(bdev, key, type);
- else
- r = -EOPNOTSUPP;
-out:
- dm_unprepare_ioctl(md, srcu_idx);
- return r;
+ return pr.ret;
+}
+
+static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_pr *pr = data;
+ const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+
+ if (!ops || !ops->pr_preempt) {
+ pr->ret = -EOPNOTSUPP;
+ return -1;
+ }
+
+ pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type,
+ pr->abort);
+ if (!pr->ret)
+ return -1;
+
+ return 0;
}
static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
enum pr_type type, bool abort)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
- const struct pr_ops *ops;
- int r, srcu_idx;
+ struct dm_pr pr = {
+ .new_key = new_key,
+ .old_key = old_key,
+ .type = type,
+ .fail_early = false,
+ };
+ int ret;
- r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
- if (r < 0)
- goto out;
+ ret = dm_call_pr(bdev, __dm_pr_preempt, &pr);
+ if (ret)
+ return ret;
- ops = bdev->bd_disk->fops->pr_ops;
- if (ops && ops->pr_preempt)
- r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
- else
- r = -EOPNOTSUPP;
-out:
- dm_unprepare_ioctl(md, srcu_idx);
- return r;
+ return pr.ret;
}
static int dm_pr_clear(struct block_device *bdev, u64 key)