aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig6
-rw-r--r--drivers/md/Makefile4
-rw-r--r--drivers/md/bcache/bcache.h1
-rw-r--r--drivers/md/bcache/request.c20
-rw-r--r--drivers/md/bcache/stats.c14
-rw-r--r--drivers/md/bcache/stats.h1
-rw-r--r--drivers/md/bcache/super.c15
-rw-r--r--drivers/md/bcache/sysfs.c4
-rw-r--r--drivers/md/dm-cache-target.c82
-rw-r--r--drivers/md/dm-core.h65
-rw-r--r--drivers/md/dm-crypt.c31
-rw-r--r--drivers/md/dm-era-target.c24
-rw-r--r--drivers/md/dm-flakey.c7
-rw-r--r--drivers/md/dm-integrity.c81
-rw-r--r--drivers/md/dm-io-tracker.h81
-rw-r--r--drivers/md/dm-kcopyd.c41
-rw-r--r--drivers/md/dm-linear.c7
-rw-r--r--drivers/md/dm-ps-io-affinity.c1
-rw-r--r--drivers/md/dm-raid1.c2
-rw-r--r--drivers/md/dm-rq.c9
-rw-r--r--drivers/md/dm-snap.c3
-rw-r--r--drivers/md/dm-table.c23
-rw-r--r--drivers/md/dm-thin-metadata.c91
-rw-r--r--drivers/md/dm-verity-verify-sig.c2
-rw-r--r--drivers/md/dm-writecache.c140
-rw-r--r--drivers/md/dm-zone.c660
-rw-r--r--drivers/md/dm-zoned-metadata.c7
-rw-r--r--drivers/md/dm-zoned-reclaim.c2
-rw-r--r--drivers/md/dm.c230
-rw-r--r--drivers/md/dm.h30
-rw-r--r--drivers/md/md-bitmap.c2
-rw-r--r--drivers/md/md-faulty.c2
-rw-r--r--drivers/md/md-linear.c2
-rw-r--r--drivers/md/md-multipath.c2
-rw-r--r--drivers/md/md.c141
-rw-r--r--drivers/md/md.h19
-rw-r--r--drivers/md/persistent-data/dm-array.c52
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h13
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c7
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c16
-rw-r--r--drivers/md/persistent-data/dm-btree.c542
-rw-r--r--drivers/md/persistent-data/dm-btree.h10
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c534
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.h34
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c83
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c105
-rw-r--r--drivers/md/persistent-data/dm-space-map.h18
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c61
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h22
-rw-r--r--drivers/md/raid0.c3
-rw-r--r--drivers/md/raid1.c15
-rw-r--r--drivers/md/raid1.h1
-rw-r--r--drivers/md/raid10.c6
-rw-r--r--drivers/md/raid10.h1
-rw-r--r--drivers/md/raid5.c65
55 files changed, 2635 insertions, 805 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f2014385d48b..0602e82a9516 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -47,7 +47,7 @@ config MD_AUTODETECT
If unsure, say Y.
config MD_LINEAR
- tristate "Linear (append) mode"
+ tristate "Linear (append) mode (deprecated)"
depends on BLK_DEV_MD
help
If you say Y here, then your multiple devices driver will be able to
@@ -158,7 +158,7 @@ config MD_RAID456
If unsure, say Y.
config MD_MULTIPATH
- tristate "Multipath I/O support"
+ tristate "Multipath I/O support (deprecated)"
depends on BLK_DEV_MD
help
MD_MULTIPATH provides a simple multi-path personality for use
@@ -169,7 +169,7 @@ config MD_MULTIPATH
If unsure, say N.
config MD_FAULTY
- tristate "Faulty test module for MD"
+ tristate "Faulty test module for MD (deprecated)"
depends on BLK_DEV_MD
help
The "faulty" module allows for a block device that occasionally returns
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index ef7ddc27685c..a74aaf8b1445 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif
+ifeq ($(CONFIG_BLK_DEV_ZONED),y)
+dm-mod-objs += dm-zone.o
+endif
+
ifeq ($(CONFIG_DM_VERITY_FEC),y)
dm-verity-objs += dm-verity-fec.o
endif
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0a4551e165ab..5fc989a6d452 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -364,7 +364,6 @@ struct cached_dev {
/* The rest of this all shows up in sysfs */
unsigned int sequential_cutoff;
- unsigned int readahead;
unsigned int io_disable:1;
unsigned int verify:1;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 29c231758293..6d1de889baeb 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -880,9 +880,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
struct bio *bio, unsigned int sectors)
{
int ret = MAP_CONTINUE;
- unsigned int reada = 0;
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
struct bio *miss, *cache_bio;
+ unsigned int size_limit;
s->cache_missed = 1;
@@ -892,14 +892,10 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
goto out_submit;
}
- if (!(bio->bi_opf & REQ_RAHEAD) &&
- !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
- s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
- reada = min_t(sector_t, dc->readahead >> 9,
- get_capacity(bio->bi_bdev->bd_disk) -
- bio_end_sector(bio));
-
- s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
+ /* Limitation for valid replace key size and cache_bio bvecs number */
+ size_limit = min_t(unsigned int, BIO_MAX_VECS * PAGE_SECTORS,
+ (1 << KEY_SIZE_BITS) - 1);
+ s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio));
s->iop.replace_key = KEY(s->iop.inode,
bio->bi_iter.bi_sector + s->insert_bio_sectors,
@@ -911,7 +907,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->iop.replace = true;
- miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
+ miss = bio_next_split(bio, s->insert_bio_sectors, GFP_NOIO,
+ &s->d->bio_split);
/* btree_search_recurse()'s btree iterator is no good anymore */
ret = miss == bio ? MAP_DONE : -EINTR;
@@ -933,9 +930,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
goto out_put;
- if (reada)
- bch_mark_cache_readahead(s->iop.c, s->d);
-
s->cache_miss = miss;
s->iop.bio = cache_bio;
bio_get(cache_bio);
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 503aafe188dc..4c7ee5fedb9d 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -46,7 +46,6 @@ read_attribute(cache_misses);
read_attribute(cache_bypass_hits);
read_attribute(cache_bypass_misses);
read_attribute(cache_hit_ratio);
-read_attribute(cache_readaheads);
read_attribute(cache_miss_collisions);
read_attribute(bypassed);
@@ -64,7 +63,6 @@ SHOW(bch_stats)
DIV_SAFE(var(cache_hits) * 100,
var(cache_hits) + var(cache_misses)));
- var_print(cache_readaheads);
var_print(cache_miss_collisions);
sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
#undef var
@@ -86,7 +84,6 @@ static struct attribute *bch_stats_files[] = {
&sysfs_cache_bypass_hits,
&sysfs_cache_bypass_misses,
&sysfs_cache_hit_ratio,
- &sysfs_cache_readaheads,
&sysfs_cache_miss_collisions,
&sysfs_bypassed,
NULL
@@ -113,7 +110,6 @@ void bch_cache_accounting_clear(struct cache_accounting *acc)
acc->total.cache_misses = 0;
acc->total.cache_bypass_hits = 0;
acc->total.cache_bypass_misses = 0;
- acc->total.cache_readaheads = 0;
acc->total.cache_miss_collisions = 0;
acc->total.sectors_bypassed = 0;
}
@@ -145,7 +141,6 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
scale_stat(&stats->cache_misses);
scale_stat(&stats->cache_bypass_hits);
scale_stat(&stats->cache_bypass_misses);
- scale_stat(&stats->cache_readaheads);
scale_stat(&stats->cache_miss_collisions);
scale_stat(&stats->sectors_bypassed);
}
@@ -168,7 +163,6 @@ static void scale_accounting(struct timer_list *t)
move_stat(cache_misses);
move_stat(cache_bypass_hits);
move_stat(cache_bypass_misses);
- move_stat(cache_readaheads);
move_stat(cache_miss_collisions);
move_stat(sectors_bypassed);
@@ -209,14 +203,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
mark_cache_stats(&c->accounting.collector, hit, bypass);
}
-void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
-{
- struct cached_dev *dc = container_of(d, struct cached_dev, disk);
-
- atomic_inc(&dc->accounting.collector.cache_readaheads);
- atomic_inc(&c->accounting.collector.cache_readaheads);
-}
-
void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
{
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index abfaabf7e7fc..ca4f435f7216 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -7,7 +7,6 @@ struct cache_stat_collector {
atomic_t cache_misses;
atomic_t cache_bypass_hits;
atomic_t cache_bypass_misses;
- atomic_t cache_readaheads;
atomic_t cache_miss_collisions;
atomic_t sectors_bypassed;
};
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index bea8c4429ae8..185246a0d855 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -890,13 +890,9 @@ static void bcache_device_free(struct bcache_device *d)
if (disk_added)
del_gendisk(disk);
- if (disk->queue)
- blk_cleanup_queue(disk->queue);
-
+ blk_cleanup_disk(disk);
ida_simple_remove(&bcache_device_idx,
first_minor_to_idx(disk->first_minor));
- if (disk_added)
- put_disk(disk);
}
bioset_exit(&d->bio_split);
@@ -946,7 +942,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
goto err;
- d->disk = alloc_disk(BCACHE_MINORS);
+ d->disk = blk_alloc_disk(NUMA_NO_NODE);
if (!d->disk)
goto err;
@@ -955,14 +951,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
d->disk->major = bcache_major;
d->disk->first_minor = idx_to_first_minor(idx);
+ d->disk->minors = BCACHE_MINORS;
d->disk->fops = ops;
d->disk->private_data = d;
- q = blk_alloc_queue(NUMA_NO_NODE);
- if (!q)
- return -ENOMEM;
-
- d->disk->queue = q;
+ q = d->disk->queue;
q->limits.max_hw_sectors = UINT_MAX;
q->limits.max_sectors = UINT_MAX;
q->limits.max_segment_size = UINT_MAX;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index cc89f3156d1a..05ac1d6fbbf3 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -137,7 +137,6 @@ rw_attribute(io_disable);
rw_attribute(discard);
rw_attribute(running);
rw_attribute(label);
-rw_attribute(readahead);
rw_attribute(errors);
rw_attribute(io_error_limit);
rw_attribute(io_error_halflife);
@@ -260,7 +259,6 @@ SHOW(__bch_cached_dev)
var_printf(partial_stripes_expensive, "%u");
var_hprint(sequential_cutoff);
- var_hprint(readahead);
sysfs_print(running, atomic_read(&dc->running));
sysfs_print(state, states[BDEV_STATE(&dc->sb)]);
@@ -365,7 +363,6 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(sequential_cutoff,
dc->sequential_cutoff,
0, UINT_MAX);
- d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats)
bch_cache_accounting_clear(&dc->accounting);
@@ -538,7 +535,6 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_running,
&sysfs_state,
&sysfs_label,
- &sysfs_readahead,
#ifdef CONFIG_BCACHE_DEBUG
&sysfs_verify,
&sysfs_bypass_torture_test,
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6ab01ff25747..8e4ced5a2516 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -8,6 +8,7 @@
#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
+#include "dm-io-tracker.h"
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
@@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
/*----------------------------------------------------------------*/
-struct io_tracker {
- spinlock_t lock;
-
- /*
- * Sectors of in-flight IO.
- */
- sector_t in_flight;
-
- /*
- * The time, in jiffies, when this device became idle (if it is
- * indeed idle).
- */
- unsigned long idle_time;
- unsigned long last_update_time;
-};
-
-static void iot_init(struct io_tracker *iot)
-{
- spin_lock_init(&iot->lock);
- iot->in_flight = 0ul;
- iot->idle_time = 0ul;
- iot->last_update_time = jiffies;
-}
-
-static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
-{
- if (iot->in_flight)
- return false;
-
- return time_after(jiffies, iot->idle_time + jifs);
-}
-
-static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
-{
- bool r;
-
- spin_lock_irq(&iot->lock);
- r = __iot_idle_for(iot, jifs);
- spin_unlock_irq(&iot->lock);
-
- return r;
-}
-
-static void iot_io_begin(struct io_tracker *iot, sector_t len)
-{
- spin_lock_irq(&iot->lock);
- iot->in_flight += len;
- spin_unlock_irq(&iot->lock);
-}
-
-static void __iot_io_end(struct io_tracker *iot, sector_t len)
-{
- if (!len)
- return;
-
- iot->in_flight -= len;
- if (!iot->in_flight)
- iot->idle_time = jiffies;
-}
-
-static void iot_io_end(struct io_tracker *iot, sector_t len)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&iot->lock, flags);
- __iot_io_end(iot, len);
- spin_unlock_irqrestore(&iot->lock, flags);
-}
-
-/*----------------------------------------------------------------*/
-
/*
* Represents a chunk of future work. 'input' allows continuations to pass
* values between themselves, typically error values.
@@ -470,7 +400,7 @@ struct cache {
struct batcher committer;
struct work_struct commit_ws;
- struct io_tracker tracker;
+ struct dm_io_tracker tracker;
mempool_t migration_pool;
@@ -866,7 +796,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
if (accountable_bio(cache, bio)) {
pb = get_per_bio_data(bio);
pb->len = bio_sectors(bio);
- iot_io_begin(&cache->tracker, pb->len);
+ dm_iot_io_begin(&cache->tracker, pb->len);
}
}
@@ -874,7 +804,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
{
struct per_bio_data *pb = get_per_bio_data(bio);
- iot_io_end(&cache->tracker, pb->len);
+ dm_iot_io_end(&cache->tracker, pb->len);
}
static void accounted_request(struct cache *cache, struct bio *bio)
@@ -1642,7 +1572,7 @@ enum busy {
static enum busy spare_migration_bandwidth(struct cache *cache)
{
- bool idle = iot_idle_for(&cache->tracker, HZ);
+ bool idle = dm_iot_idle_for(&cache->tracker, HZ);
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
cache->sectors_per_block;
@@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
batcher_init(&cache->committer, commit_op, cache,
issue_op, cache, cache->wq);
- iot_init(&cache->tracker);
+ dm_iot_init(&cache->tracker);
init_rwsem(&cache->background_work_lock);
prevent_background_work(cache);
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 5953ff2bd260..edc1553c4eea 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -114,8 +114,27 @@ struct mapped_device {
bool init_tio_pdu:1;
struct srcu_struct io_barrier;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int nr_zones;
+ unsigned int *zwp_offset;
+#endif
};
+/*
+ * Bits for the flags field of struct mapped_device.
+ */
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
+#define DMF_SUSPENDED 1
+#define DMF_FROZEN 2
+#define DMF_FREEING 3
+#define DMF_DELETING 4
+#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_DEFERRED_REMOVE 6
+#define DMF_SUSPENDED_INTERNALLY 7
+#define DMF_POST_SUSPENDING 8
+#define DMF_EMULATE_ZONE_APPEND 9
+
void disable_discard(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
@@ -130,6 +149,13 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md)
return &md->stats;
}
+static inline bool dm_emulate_zone_append(struct mapped_device *md)
+{
+ if (blk_queue_is_zoned(md->queue))
+ return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ return false;
+}
+
#define DM_TABLE_MAX_DEPTH 16
struct dm_table {
@@ -173,6 +199,45 @@ struct dm_table {
#endif
};
+/*
+ * One of these is allocated per clone bio.
+ */
+#define DM_TIO_MAGIC 7282014
+struct dm_target_io {
+ unsigned int magic;
+ struct dm_io *io;
+ struct dm_target *ti;
+ unsigned int target_bio_nr;
+ unsigned int *len_ptr;
+ bool inside_dm_io;
+ struct bio clone;
+};
+
+/*
+ * One of these is allocated per original bio.
+ * It contains the first clone used for that original.
+ */
+#define DM_IO_MAGIC 5191977
+struct dm_io {
+ unsigned int magic;
+ struct mapped_device *md;
+ blk_status_t status;
+ atomic_t io_count;
+ struct bio *orig_bio;
+ unsigned long start_time;
+ spinlock_t endio_lock;
+ struct dm_stats_aux stats_aux;
+ /* last member of dm_target_io is 'struct bio' */
+ struct dm_target_io tio;
+};
+
+static inline void dm_io_inc_pending(struct dm_io *io)
+{
+ atomic_inc(&io->io_count);
+}
+
+void dm_io_dec_pending(struct dm_io *io, blk_status_t error);
+
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b0ab080f2567..50f4cbd600d5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3138,11 +3138,10 @@ static int crypt_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct crypt_config *cc = ti->private;
- sector_t sector = cc->start + dm_target_offset(ti, args->next_sector);
- args->start = cc->start;
- return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
- dm_report_zones_cb, args);
+ return dm_report_zones(cc->dev->bdev, cc->start,
+ cc->start + dm_target_offset(ti, args->next_sector),
+ args, nr_zones);
}
#else
#define crypt_report_zones NULL
@@ -3281,14 +3280,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->start = tmpll;
- /*
- * For zoned block devices, we need to preserve the issuer write
- * ordering. To do so, disable write workqueues and force inline
- * encryption completion.
- */
if (bdev_is_zoned(cc->dev->bdev)) {
+ /*
+ * For zoned block devices, we need to preserve the issuer write
+ * ordering. To do so, disable write workqueues and force inline
+ * encryption completion.
+ */
set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags);
+
+ /*
+ * All zone append writes to a zone of a zoned block device will
+ * have the same BIO sector, the start of the zone. When the
+ * cypher IV mode uses sector values, all data targeting a
+ * zone will be encrypted using the first sector numbers of the
+ * zone. This will not result in write errors but will
+ * cause most reads to fail as reads will use the sector values
+ * for the actual data locations, resulting in IV mismatch.
+ * To avoid this problem, ask DM core to emulate zone append
+ * operations with regular writes.
+ */
+ DMDEBUG("Zone append operations will be emulated");
+ ti->emulate_zone_append = true;
}
if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index d9ac7372108c..3b748393fca5 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -363,28 +363,32 @@ static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata
core->root = le64_to_cpu(disk->root);
}
-static void ws_inc(void *context, const void *value)
+static void ws_inc(void *context, const void *value, unsigned count)
{
struct era_metadata *md = context;
struct writeset_disk ws_d;
dm_block_t b;
+ unsigned i;
- memcpy(&ws_d, value, sizeof(ws_d));
- b = le64_to_cpu(ws_d.root);
-
- dm_tm_inc(md->tm, b);
+ for (i = 0; i < count; i++) {
+ memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
+ b = le64_to_cpu(ws_d.root);
+ dm_tm_inc(md->tm, b);
+ }
}
-static void ws_dec(void *context, const void *value)
+static void ws_dec(void *context, const void *value, unsigned count)
{
struct era_metadata *md = context;
struct writeset_disk ws_d;
dm_block_t b;
+ unsigned i;
- memcpy(&ws_d, value, sizeof(ws_d));
- b = le64_to_cpu(ws_d.root);
-
- dm_bitset_del(&md->bitset_info, b);
+ for (i = 0; i < count; i++) {
+ memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
+ b = le64_to_cpu(ws_d.root);
+ dm_bitset_del(&md->bitset_info, b);
+ }
}
static int ws_eq(void *context, const void *value1, const void *value2)
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b7fee9936f05..5877220c01ed 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -463,11 +463,10 @@ static int flakey_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct flakey_c *fc = ti->private;
- sector_t sector = flakey_map_sector(ti, args->next_sector);
- args->start = fc->start;
- return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
- dm_report_zones_cb, args);
+ return dm_report_zones(fc->dev->bdev, fc->start,
+ flakey_map_sector(ti, args->next_sector),
+ args, nr_zones);
}
#else
#define flakey_report_zones NULL
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 781942aeddd1..20f2510db1f6 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -66,14 +66,14 @@ struct superblock {
__u8 magic[8];
__u8 version;
__u8 log2_interleave_sectors;
- __u16 integrity_tag_size;
- __u32 journal_sections;
- __u64 provided_data_sectors; /* userspace uses this value */
- __u32 flags;
+ __le16 integrity_tag_size;
+ __le32 journal_sections;
+ __le64 provided_data_sectors; /* userspace uses this value */
+ __le32 flags;
__u8 log2_sectors_per_block;
__u8 log2_blocks_per_bitmap_bit;
__u8 pad[2];
- __u64 recalc_sector;
+ __le64 recalc_sector;
__u8 pad2[8];
__u8 salt[SALT_SIZE];
};
@@ -86,16 +86,16 @@ struct superblock {
#define JOURNAL_ENTRY_ROUNDUP 8
-typedef __u64 commit_id_t;
+typedef __le64 commit_id_t;
#define JOURNAL_MAC_PER_SECTOR 8
struct journal_entry {
union {
struct {
- __u32 sector_lo;
- __u32 sector_hi;
+ __le32 sector_lo;
+ __le32 sector_hi;
} s;
- __u64 sector;
+ __le64 sector;
} u;
commit_id_t last_bytes[];
/* __u8 tag[0]; */
@@ -806,7 +806,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
}
if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
- uint64_t section_le;
+ __le64 section_le;
r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE);
if (unlikely(r < 0)) {
@@ -1640,7 +1640,7 @@ static void integrity_end_io(struct bio *bio)
static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
const char *data, char *result)
{
- __u64 sector_le = cpu_to_le64(sector);
+ __le64 sector_le = cpu_to_le64(sector);
SHASH_DESC_ON_STACK(req, ic->internal_hash);
int r;
unsigned digest_size;
@@ -2689,30 +2689,26 @@ next_chunk:
if (unlikely(dm_integrity_failed(ic)))
goto err;
- if (!ic->discard) {
- io_req.bi_op = REQ_OP_READ;
- io_req.bi_op_flags = 0;
- io_req.mem.type = DM_IO_VMA;
- io_req.mem.ptr.addr = ic->recalc_buffer;
- io_req.notify.fn = NULL;
- io_req.client = ic->io;
- io_loc.bdev = ic->dev->bdev;
- io_loc.sector = get_data_sector(ic, area, offset);
- io_loc.count = n_sectors;
+ io_req.bi_op = REQ_OP_READ;
+ io_req.bi_op_flags = 0;
+ io_req.mem.type = DM_IO_VMA;
+ io_req.mem.ptr.addr = ic->recalc_buffer;
+ io_req.notify.fn = NULL;
+ io_req.client = ic->io;
+ io_loc.bdev = ic->dev->bdev;
+ io_loc.sector = get_data_sector(ic, area, offset);
+ io_loc.count = n_sectors;
- r = dm_io(&io_req, 1, &io_loc, NULL);
- if (unlikely(r)) {
- dm_integrity_io_error(ic, "reading data", r);
- goto err;
- }
+ r = dm_io(&io_req, 1, &io_loc, NULL);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "reading data", r);
+ goto err;
+ }
- t = ic->recalc_tags;
- for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
- integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
- t += ic->tag_size;
- }
- } else {
- t = ic->recalc_tags + (n_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
+ t = ic->recalc_tags;
+ for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
+ integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+ t += ic->tag_size;
}
metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
@@ -3826,7 +3822,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
for (i = 0; i < ic->journal_sections; i++) {
struct scatterlist sg;
struct skcipher_request *section_req;
- __u32 section_le = cpu_to_le32(i);
+ __le32 section_le = cpu_to_le32(i);
memset(crypt_iv, 0x00, ivsize);
memset(crypt_data, 0x00, crypt_len);
@@ -4368,13 +4364,11 @@ try_smaller_buffer:
goto bad;
}
INIT_WORK(&ic->recalc_work, integrity_recalc);
- if (!ic->discard) {
- ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
- if (!ic->recalc_buffer) {
- ti->error = "Cannot allocate buffer for recalculating";
- r = -ENOMEM;
- goto bad;
- }
+ ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
+ if (!ic->recalc_buffer) {
+ ti->error = "Cannot allocate buffer for recalculating";
+ r = -ENOMEM;
+ goto bad;
}
ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block,
ic->tag_size, GFP_KERNEL);
@@ -4383,9 +4377,6 @@ try_smaller_buffer:
r = -ENOMEM;
goto bad;
}
- if (ic->discard)
- memset(ic->recalc_tags, DISCARD_FILLER,
- (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size);
} else {
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
ti->error = "Recalculate can only be specified with internal_hash";
@@ -4579,7 +4570,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
diff --git a/drivers/md/dm-io-tracker.h b/drivers/md/dm-io-tracker.h
new file mode 100644
index 000000000000..bdcc6273ebf0
--- /dev/null
+++ b/drivers/md/dm-io-tracker.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2021 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_IO_TRACKER_H
+#define DM_IO_TRACKER_H
+
+#include <linux/jiffies.h>
+
+struct dm_io_tracker {
+ spinlock_t lock;
+
+ /*
+ * Sectors of in-flight IO.
+ */
+ sector_t in_flight;
+
+ /*
+ * The time, in jiffies, when this device became idle
+ * (if it is indeed idle).
+ */
+ unsigned long idle_time;
+ unsigned long last_update_time;
+};
+
+static inline void dm_iot_init(struct dm_io_tracker *iot)
+{
+ spin_lock_init(&iot->lock);
+ iot->in_flight = 0ul;
+ iot->idle_time = 0ul;
+ iot->last_update_time = jiffies;
+}
+
+static inline bool dm_iot_idle_for(struct dm_io_tracker *iot, unsigned long j)
+{
+ bool r = false;
+
+ spin_lock_irq(&iot->lock);
+ if (!iot->in_flight)
+ r = time_after(jiffies, iot->idle_time + j);
+ spin_unlock_irq(&iot->lock);
+
+ return r;
+}
+
+static inline unsigned long dm_iot_idle_time(struct dm_io_tracker *iot)
+{
+ unsigned long r = 0;
+
+ spin_lock_irq(&iot->lock);
+ if (!iot->in_flight)
+ r = jiffies - iot->idle_time;
+ spin_unlock_irq(&iot->lock);
+
+ return r;
+}
+
+static inline void dm_iot_io_begin(struct dm_io_tracker *iot, sector_t len)
+{
+ spin_lock_irq(&iot->lock);
+ iot->in_flight += len;
+ spin_unlock_irq(&iot->lock);
+}
+
+static inline void dm_iot_io_end(struct dm_io_tracker *iot, sector_t len)
+{
+ unsigned long flags;
+
+ if (!len)
+ return;
+
+ spin_lock_irqsave(&iot->lock, flags);
+ iot->in_flight -= len;
+ if (!iot->in_flight)
+ iot->idle_time = jiffies;
+ spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+#endif
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 1bbe4a34ef4c..37b03ab7e5c9 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -341,7 +341,7 @@ static void client_free_pages(struct dm_kcopyd_client *kc)
struct kcopyd_job {
struct dm_kcopyd_client *kc;
struct list_head list;
- unsigned long flags;
+ unsigned flags;
/*
* Error state of the job.
@@ -418,7 +418,7 @@ static struct kcopyd_job *pop_io_job(struct list_head *jobs,
* constraint and sequential writes that are at the right position.
*/
list_for_each_entry(job, jobs, list) {
- if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
+ if (job->rw == READ || !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
list_del(&job->list);
return job;
}
@@ -437,9 +437,8 @@ static struct kcopyd_job *pop(struct list_head *jobs,
struct dm_kcopyd_client *kc)
{
struct kcopyd_job *job = NULL;
- unsigned long flags;
- spin_lock_irqsave(&kc->job_lock, flags);
+ spin_lock_irq(&kc->job_lock);
if (!list_empty(jobs)) {
if (jobs == &kc->io_jobs)
@@ -449,7 +448,7 @@ static struct kcopyd_job *pop(struct list_head *jobs,
list_del(&job->list);
}
}
- spin_unlock_irqrestore(&kc->job_lock, flags);
+ spin_unlock_irq(&kc->job_lock);
return job;
}
@@ -467,12 +466,11 @@ static void push(struct list_head *jobs, struct kcopyd_job *job)
static void push_head(struct list_head *jobs, struct kcopyd_job *job)
{
- unsigned long flags;
struct dm_kcopyd_client *kc = job->kc;
- spin_lock_irqsave(&kc->job_lock, flags);
+ spin_lock_irq(&kc->job_lock);
list_add(&job->list, jobs);
- spin_unlock_irqrestore(&kc->job_lock, flags);
+ spin_unlock_irq(&kc->job_lock);
}
/*
@@ -525,7 +523,7 @@ static void complete_io(unsigned long error, void *context)
else
job->read_err = 1;
- if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
+ if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) {
push(&kc->complete_jobs, job);
wake(kc);
return;
@@ -565,7 +563,7 @@ static int run_io_job(struct kcopyd_job *job)
* If we need to write sequentially and some reads or writes failed,
* no point in continuing.
*/
- if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
+ if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
job->master_job->write_err) {
job->write_err = job->master_job->write_err;
return -EIO;
@@ -648,7 +646,6 @@ static void do_work(struct work_struct *work)
struct dm_kcopyd_client *kc = container_of(work,
struct dm_kcopyd_client, kcopyd_work);
struct blk_plug plug;
- unsigned long flags;
/*
* The order that these are called is *very* important.
@@ -657,9 +654,9 @@ static void do_work(struct work_struct *work)
* list. io jobs call wake when they complete and it all
* starts again.
*/
- spin_lock_irqsave(&kc->job_lock, flags);
+ spin_lock_irq(&kc->job_lock);
list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
- spin_unlock_irqrestore(&kc->job_lock, flags);
+ spin_unlock_irq(&kc->job_lock);
blk_start_plug(&plug);
process_jobs(&kc->complete_jobs, kc, run_complete_job);
@@ -709,7 +706,7 @@ static void segment_complete(int read_err, unsigned long write_err,
* Only dispatch more work if there hasn't been an error.
*/
if ((!job->read_err && !job->write_err) ||
- test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
+ job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) {
/* get the next chunk of work */
progress = job->progress;
count = job->source.count - progress;
@@ -801,10 +798,10 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
* we need to write sequentially. If one of the destination is a
* host-aware device, then leave it to the caller to choose what to do.
*/
- if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
+ if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
for (i = 0; i < job->num_dests; i++) {
if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
- set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags);
+ job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
break;
}
}
@@ -813,9 +810,9 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
/*
* If we need to write sequentially, errors cannot be ignored.
*/
- if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
- test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags))
- clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);
+ if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
+ job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))
+ job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR);
if (from) {
job->source = *from;
@@ -983,3 +980,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);
+
+void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
+{
+ flush_workqueue(kc->kcopyd_wq);
+}
+EXPORT_SYMBOL(dm_kcopyd_client_flush);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 92db0f5e7f28..c91f1e2e2f65 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -140,11 +140,10 @@ static int linear_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct linear_c *lc = ti->private;
- sector_t sector = linear_map_sector(ti, args->next_sector);
- args->start = lc->start;
- return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
- dm_report_zones_cb, args);
+ return dm_report_zones(lc->dev->bdev, lc->start,
+ linear_map_sector(ti, args->next_sector),
+ args, nr_zones);
}
#else
#define linear_report_zones NULL
diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c
index 077655cd4fae..cb8e83bfb1a7 100644
--- a/drivers/md/dm-ps-io-affinity.c
+++ b/drivers/md/dm-ps-io-affinity.c
@@ -91,7 +91,6 @@ static int ioa_add_path(struct path_selector *ps, struct dm_path *path,
cpumask_set_cpu(cpu, s->path_mask);
s->path_map[cpu] = pi;
refcount_inc(&pi->refcount);
- continue;
}
if (refcount_dec_and_test(&pi->refcount)) {
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index b0a82f29a2e4..ebb4810cc3b4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -364,7 +364,7 @@ static void recover(struct mirror_set *ms, struct dm_region *reg)
/* hand to kcopyd */
if (!errors_handled(ms))
- set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
+ flags |= BIT(DM_KCOPYD_IGNORE_ERROR);
dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
flags, recovery_complete, reg);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9c3bc3711b33..0dbd48cbdff9 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -530,7 +530,6 @@ static const struct blk_mq_ops dm_mq_ops = {
int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
{
- struct request_queue *q;
struct dm_target *immutable_tgt;
int err;
@@ -557,12 +556,10 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
if (err)
goto out_kfree_tag_set;
- q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true);
- if (IS_ERR(q)) {
- err = PTR_ERR(q);
+ err = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+ if (err)
goto out_tag_set;
- }
-
+ elevator_init_mq(md->queue);
return 0;
out_tag_set:
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a2acb014c13a..751ec5ea1dbb 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -855,7 +855,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
static uint32_t __minimum_chunk_size(struct origin *o)
{
struct dm_snapshot *snap;
- unsigned chunk_size = 0;
+ unsigned chunk_size = rounddown_pow_of_two(UINT_MAX);
if (o)
list_for_each_entry(snap, &o->snapshots, list)
@@ -1409,6 +1409,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (!s->store->chunk_size) {
ti->error = "Chunk size not set";
+ r = -EINVAL;
goto bad_read_metadata;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ee47a332b462..0543cdf89e92 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -249,7 +249,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
* If the target is mapped to zoned block device(s), check
* that the zones are not partially mapped.
*/
- if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
+ if (bdev_is_zoned(bdev)) {
unsigned int zone_sectors = bdev_zone_sectors(bdev);
if (start & (zone_sectors - 1)) {
@@ -1244,7 +1244,7 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
return args.err;
}
-static struct blk_ksm_ll_ops dm_ksm_ll_ops = {
+static const struct blk_ksm_ll_ops dm_ksm_ll_ops = {
.keyslot_evict = dm_keyslot_evict,
};
@@ -1981,11 +1981,12 @@ static int device_requires_stable_pages(struct dm_target *ti,
return blk_queue_stable_writes(q);
}
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
- struct queue_limits *limits)
+int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+ struct queue_limits *limits)
{
bool wc = false, fua = false;
int page_size = PAGE_SIZE;
+ int r;
/*
* Copy table's limits to the DM device's request_queue
@@ -2065,19 +2066,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
/*
- * For a zoned target, the number of zones should be updated for the
- * correct value to be exposed in sysfs queue/nr_zones. For a BIO based
- * target, this is all that is needed.
+ * For a zoned target, setup the zones related queue attributes
+ * and resources necessary for zone append emulation if necessary.
*/
-#ifdef CONFIG_BLK_DEV_ZONED
if (blk_queue_is_zoned(q)) {
- WARN_ON_ONCE(queue_is_mq(q));
- q->nr_zones = blkdev_nr_zones(t->md->disk);
+ r = dm_set_zones_restrictions(t, q);
+ if (r)
+ return r;
}
-#endif
dm_update_keyslot_manager(q, t);
blk_queue_update_readahead(q);
+
+ return 0;
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index e75b20480e46..c88ed14d49e6 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -311,28 +311,53 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
*t = v & ((1 << 24) - 1);
}
-static void data_block_inc(void *context, const void *value_le)
+/*
+ * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
+ * possible. 'with_runs' reads contiguous runs of blocks, and calls the
+ * given sm function.
+ */
+typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);
+
+static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn)
{
- struct dm_space_map *sm = context;
- __le64 v_le;
- uint64_t b;
+ uint64_t b, begin, end;
uint32_t t;
+ bool in_run = false;
+ unsigned i;
- memcpy(&v_le, value_le, sizeof(v_le));
- unpack_block_time(le64_to_cpu(v_le), &b, &t);
- dm_sm_inc_block(sm, b);
+ for (i = 0; i < count; i++, value_le++) {
+ /* We know value_le is 8 byte aligned */
+ unpack_block_time(le64_to_cpu(*value_le), &b, &t);
+
+ if (in_run) {
+ if (b == end) {
+ end++;
+ } else {
+ fn(sm, begin, end);
+ begin = b;
+ end = b + 1;
+ }
+ } else {
+ in_run = true;
+ begin = b;
+ end = b + 1;
+ }
+ }
+
+ if (in_run)
+ fn(sm, begin, end);
}
-static void data_block_dec(void *context, const void *value_le)
+static void data_block_inc(void *context, const void *value_le, unsigned count)
{
- struct dm_space_map *sm = context;
- __le64 v_le;
- uint64_t b;
- uint32_t t;
+ with_runs((struct dm_space_map *) context,
+ (const __le64 *) value_le, count, dm_sm_inc_blocks);
+}
- memcpy(&v_le, value_le, sizeof(v_le));
- unpack_block_time(le64_to_cpu(v_le), &b, &t);
- dm_sm_dec_block(sm, b);
+static void data_block_dec(void *context, const void *value_le, unsigned count)
+{
+ with_runs((struct dm_space_map *) context,
+ (const __le64 *) value_le, count, dm_sm_dec_blocks);
}
static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
@@ -349,27 +374,25 @@ static int data_block_equal(void *context, const void *value1_le, const void *va
return b1 == b2;
}
-static void subtree_inc(void *context, const void *value)
+static void subtree_inc(void *context, const void *value, unsigned count)
{
struct dm_btree_info *info = context;
- __le64 root_le;
- uint64_t root;
+ const __le64 *root_le = value;
+ unsigned i;
- memcpy(&root_le, value, sizeof(root_le));
- root = le64_to_cpu(root_le);
- dm_tm_inc(info->tm, root);
+ for (i = 0; i < count; i++, root_le++)
+ dm_tm_inc(info->tm, le64_to_cpu(*root_le));
}
-static void subtree_dec(void *context, const void *value)
+static void subtree_dec(void *context, const void *value, unsigned count)
{
struct dm_btree_info *info = context;
- __le64 root_le;
- uint64_t root;
+ const __le64 *root_le = value;
+ unsigned i;
- memcpy(&root_le, value, sizeof(root_le));
- root = le64_to_cpu(root_le);
- if (dm_btree_del(info, root))
- DMERR("btree delete failed");
+ for (i = 0; i < count; i++, root_le++)
+ if (dm_btree_del(info, le64_to_cpu(*root_le)))
+ DMERR("btree delete failed");
}
static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
@@ -1761,11 +1784,7 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
int r = 0;
pmd_write_lock(pmd);
- for (; b != e; b++) {
- r = dm_sm_inc_block(pmd->data_sm, b);
- if (r)
- break;
- }
+ r = dm_sm_inc_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
@@ -1776,11 +1795,7 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
int r = 0;
pmd_write_lock(pmd);
- for (; b != e; b++) {
- r = dm_sm_dec_block(pmd->data_sm, b);
- if (r)
- break;
- }
+ r = dm_sm_dec_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
index 29385dc470d5..db61a1f43ae9 100644
--- a/drivers/md/dm-verity-verify-sig.c
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -15,7 +15,7 @@
#define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s
static bool require_signatures;
-module_param(require_signatures, bool, false);
+module_param(require_signatures, bool, 0444);
MODULE_PARM_DESC(require_signatures,
"Verify the roothash of dm-verity hash tree");
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index aecc246ade26..e21e29e81bbf 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -15,6 +15,8 @@
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
+#include <linux/delay.h>
+#include "dm-io-tracker.h"
#define DM_MSG_PREFIX "writecache"
@@ -28,6 +30,7 @@
#define AUTOCOMMIT_MSEC 1000
#define MAX_AGE_DIV 16
#define MAX_AGE_UNSPECIFIED -1UL
+#define PAUSE_WRITEBACK (HZ * 3)
#define BITMAP_GRANULARITY 65536
#if BITMAP_GRANULARITY < PAGE_SIZE
@@ -123,6 +126,7 @@ struct dm_writecache {
size_t freelist_high_watermark;
size_t freelist_low_watermark;
unsigned long max_age;
+ unsigned long pause;
unsigned uncommitted_blocks;
unsigned autocommit_blocks;
@@ -171,17 +175,22 @@ struct dm_writecache {
bool flush_on_suspend:1;
bool cleaner:1;
bool cleaner_set:1;
+ bool metadata_only:1;
+ bool pause_set:1;
unsigned high_wm_percent_value;
unsigned low_wm_percent_value;
unsigned autocommit_time_value;
unsigned max_age_value;
+ unsigned pause_value;
unsigned writeback_all;
struct workqueue_struct *writeback_wq;
struct work_struct writeback_work;
struct work_struct flush_work;
+ struct dm_io_tracker iot;
+
struct dm_io_client *dm_io;
raw_spinlock_t endio_list_lock;
@@ -532,7 +541,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
region.bdev = wc->ssd_dev->bdev;
region.sector = 0;
- region.count = PAGE_SIZE >> SECTOR_SHIFT;
+ region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
if (unlikely(region.sector + region.count > wc->metadata_sectors))
region.count = wc->metadata_sectors - region.sector;
@@ -1301,8 +1310,12 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
writecache_flush(wc);
if (writecache_has_error(wc))
goto unlock_error;
+ if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
+ goto unlock_remap_origin;
goto unlock_submit;
} else {
+ if (dm_bio_get_target_bio_nr(bio))
+ goto unlock_remap_origin;
writecache_offload_bio(wc, bio);
goto unlock_return;
}
@@ -1360,24 +1373,29 @@ read_next_block:
} else {
do {
bool found_entry = false;
+ bool search_used = false;
if (writecache_has_error(wc))
goto unlock_error;
e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
if (e) {
- if (!writecache_entry_is_committed(wc, e))
+ if (!writecache_entry_is_committed(wc, e)) {
+ search_used = true;
goto bio_copy;
+ }
if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
wc->overwrote_committed = true;
+ search_used = true;
goto bio_copy;
}
found_entry = true;
} else {
- if (unlikely(wc->cleaner))
+ if (unlikely(wc->cleaner) ||
+ (wc->metadata_only && !(bio->bi_opf & REQ_META)))
goto direct_write;
}
e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
- if (!found_entry) {
+ if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
if (e) {
@@ -1404,13 +1422,31 @@ bio_copy:
sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
while (bio_size < bio->bi_iter.bi_size) {
- struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
- if (!f)
- break;
- write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
- (bio_size >> SECTOR_SHIFT), wc->seq_count);
- writecache_insert_entry(wc, f);
- wc->uncommitted_blocks++;
+ if (!search_used) {
+ struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+ if (!f)
+ break;
+ write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+ (bio_size >> SECTOR_SHIFT), wc->seq_count);
+ writecache_insert_entry(wc, f);
+ wc->uncommitted_blocks++;
+ } else {
+ struct wc_entry *f;
+ struct rb_node *next = rb_next(&e->rb_node);
+ if (!next)
+ break;
+ f = container_of(next, struct wc_entry, rb_node);
+ if (f != e + 1)
+ break;
+ if (read_original_sector(wc, f) !=
+ read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+ break;
+ if (unlikely(f->write_in_progress))
+ break;
+ if (writecache_entry_is_committed(wc, f))
+ wc->overwrote_committed = true;
+ e = f;
+ }
bio_size += wc->block_size;
current_cache_sec += wc->block_size >> SECTOR_SHIFT;
}
@@ -1438,6 +1474,12 @@ bio_copy:
}
unlock_remap_origin:
+ if (likely(wc->pause != 0)) {
+ if (bio_op(bio) == REQ_OP_WRITE) {
+ dm_iot_io_begin(&wc->iot, 1);
+ bio->bi_private = (void *)2;
+ }
+ }
bio_set_dev(bio, wc->dev->bdev);
wc_unlock(wc);
return DM_MAPIO_REMAPPED;
@@ -1468,11 +1510,13 @@ static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t
{
struct dm_writecache *wc = ti->private;
- if (bio->bi_private != NULL) {
+ if (bio->bi_private == (void *)1) {
int dir = bio_data_dir(bio);
if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
wake_up(&wc->bio_in_progress_wait[dir]);
+ } else if (bio->bi_private == (void *)2) {
+ dm_iot_io_end(&wc->iot, 1);
}
return 0;
}
@@ -1642,7 +1686,7 @@ pop_from_list:
return 0;
}
-static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
+static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
struct dm_writecache *wc = wb->wc;
unsigned block_size = wc->block_size;
@@ -1703,7 +1747,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
max_pages = WB_LIST_INLINE;
}
- BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
+ BUG_ON(!wc_add_block(wb, e));
wb->wc_list[0] = e;
wb->wc_list_n = 1;
@@ -1713,7 +1757,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
if (read_original_sector(wc, f) !=
read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
break;
- if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
+ if (!wc_add_block(wb, f))
break;
wbl->size--;
list_del(&f->lru);
@@ -1794,6 +1838,27 @@ static void writecache_writeback(struct work_struct *work)
struct writeback_list wbl;
unsigned long n_walked;
+ if (!WC_MODE_PMEM(wc)) {
+ /* Wait for any active kcopyd work on behalf of ssd writeback */
+ dm_kcopyd_client_flush(wc->dm_kcopyd);
+ }
+
+ if (likely(wc->pause != 0)) {
+ while (1) {
+ unsigned long idle;
+ if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
+ unlikely(dm_suspended(wc->ti)))
+ break;
+ idle = dm_iot_idle_time(&wc->iot);
+ if (idle >= wc->pause)
+ break;
+ idle = wc->pause - idle;
+ if (idle > HZ)
+ idle = HZ;
+ schedule_timeout_idle(idle);
+ }
+ }
+
wc_lock(wc);
restart:
if (writecache_has_error(wc)) {
@@ -1822,8 +1887,9 @@ restart:
n_walked++;
if (unlikely(n_walked > WRITEBACK_LATENCY) &&
- likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
- queue_work(wc->writeback_wq, &wc->writeback_work);
+ likely(!wc->writeback_all)) {
+ if (likely(!dm_suspended(wc->ti)))
+ queue_work(wc->writeback_wq, &wc->writeback_work);
break;
}
@@ -1845,15 +1911,13 @@ restart:
if (unlikely(read_original_sector(wc, f) ==
read_original_sector(wc, e))) {
BUG_ON(!f->write_in_progress);
- list_del(&e->lru);
- list_add(&e->lru, &skipped);
+ list_move(&e->lru, &skipped);
cond_resched();
continue;
}
}
wc->writeback_size++;
- list_del(&e->lru);
- list_add(&e->lru, &wbl.list);
+ list_move(&e->lru, &wbl.list);
wbl.size++;
e->write_in_progress = true;
e->wc_list_contiguous = 1;
@@ -1888,8 +1952,7 @@ restart:
// break;
wc->writeback_size++;
- list_del(&g->lru);
- list_add(&g->lru, &wbl.list);
+ list_move(&g->lru, &wbl.list);
wbl.size++;
g->write_in_progress = true;
g->wc_list_contiguous = BIO_MAX_VECS;
@@ -2065,7 +2128,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct wc_memory_superblock s;
static struct dm_arg _args[] = {
- {0, 16, "Invalid number of feature args"},
+ {0, 18, "Invalid number of feature args"},
};
as.argc = argc;
@@ -2109,6 +2172,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
INIT_WORK(&wc->writeback_work, writecache_writeback);
INIT_WORK(&wc->flush_work, writecache_flush_work);
+ dm_iot_init(&wc->iot);
+
raw_spin_lock_init(&wc->endio_list_lock);
INIT_LIST_HEAD(&wc->endio_list);
wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
@@ -2156,6 +2221,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
} else {
+ wc->pause = PAUSE_WRITEBACK;
r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
if (r) {
ti->error = "Could not allocate mempool";
@@ -2292,6 +2358,20 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
wc->writeback_fua = false;
wc->writeback_fua_set = true;
} else goto invalid_optional;
+ } else if (!strcasecmp(string, "metadata_only")) {
+ wc->metadata_only = true;
+ } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
+ unsigned pause_msecs;
+ if (WC_MODE_PMEM(wc))
+ goto invalid_optional;
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
+ goto invalid_optional;
+ if (pause_msecs > 60000)
+ goto invalid_optional;
+ wc->pause = msecs_to_jiffies(pause_msecs);
+ wc->pause_set = true;
+ wc->pause_value = pause_msecs;
} else {
invalid_optional:
r = -EINVAL;
@@ -2463,7 +2543,7 @@ overflow:
goto bad;
}
- ti->num_flush_bios = 1;
+ ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
ti->flush_supported = true;
ti->num_discard_bios = 1;
@@ -2515,6 +2595,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
extra_args++;
if (wc->writeback_fua_set)
extra_args++;
+ if (wc->metadata_only)
+ extra_args++;
+ if (wc->pause_set)
+ extra_args += 2;
DMEMIT("%u", extra_args);
if (wc->start_sector_set)
@@ -2535,13 +2619,17 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
DMEMIT(" cleaner");
if (wc->writeback_fua_set)
DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
+ if (wc->metadata_only)
+ DMEMIT(" metadata_only");
+ if (wc->pause_set)
+ DMEMIT(" pause_writeback %u", wc->pause_value);
break;
}
}
static struct target_type writecache_target = {
.name = "writecache",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = writecache_ctr,
.dtr = writecache_dtr,
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
new file mode 100644
index 000000000000..6d82a34438c8
--- /dev/null
+++ b/drivers/md/dm-zone.c
@@ -0,0 +1,660 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+
+#include "dm-core.h"
+
+#define DM_MSG_PREFIX "zone"
+
+#define DM_ZONE_INVALID_WP_OFST UINT_MAX
+
+/*
+ * For internal zone reports bypassing the top BIO submission path.
+ */
+static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
+ sector_t sector, unsigned int nr_zones,
+ report_zones_cb cb, void *data)
+{
+ struct gendisk *disk = md->disk;
+ int ret;
+ struct dm_report_zones_args args = {
+ .next_sector = sector,
+ .orig_data = data,
+ .orig_cb = cb,
+ };
+
+ do {
+ struct dm_target *tgt;
+
+ tgt = dm_table_find_target(t, args.next_sector);
+ if (WARN_ON_ONCE(!tgt->type->report_zones))
+ return -EIO;
+
+ args.tgt = tgt;
+ ret = tgt->type->report_zones(tgt, &args,
+ nr_zones - args.zone_idx);
+ if (ret < 0)
+ return ret;
+ } while (args.zone_idx < nr_zones &&
+ args.next_sector < get_capacity(disk));
+
+ return args.zone_idx;
+}
+
+/*
+ * User facing dm device block device report zone operation. This calls the
+ * report_zones operation for each target of a device table. This operation is
+ * generally implemented by targets using dm_report_zones().
+ */
+int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct mapped_device *md = disk->private_data;
+ struct dm_table *map;
+ int srcu_idx, ret;
+
+ if (dm_suspended_md(md))
+ return -EAGAIN;
+
+ map = dm_get_live_table(md, &srcu_idx);
+ if (!map)
+ return -EIO;
+
+ ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
+
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
+static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ struct dm_report_zones_args *args = data;
+ sector_t sector_diff = args->tgt->begin - args->start;
+
+ /*
+ * Ignore zones beyond the target range.
+ */
+ if (zone->start >= args->start + args->tgt->len)
+ return 0;
+
+ /*
+ * Remap the start sector and write pointer position of the zone
+ * to match its position in the target range.
+ */
+ zone->start += sector_diff;
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+ if (zone->cond == BLK_ZONE_COND_FULL)
+ zone->wp = zone->start + zone->len;
+ else if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->wp = zone->start;
+ else
+ zone->wp += sector_diff;
+ }
+
+ args->next_sector = zone->start + zone->len;
+ return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+}
+
+/*
+ * Helper for drivers of zoned targets to implement struct target_type
+ * report_zones operation.
+ */
+int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+ /*
+ * Set the target mapping start sector first so that
+ * dm_report_zones_cb() can correctly remap zone information.
+ */
+ args->start = start;
+
+ return blkdev_report_zones(bdev, sector, nr_zones,
+ dm_report_zones_cb, args);
+}
+EXPORT_SYMBOL_GPL(dm_report_zones);
+
+bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
+{
+ struct request_queue *q = md->queue;
+
+ if (!blk_queue_is_zoned(q))
+ return false;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE:
+ return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
+ default:
+ return false;
+ }
+}
+
+void dm_cleanup_zoned_dev(struct mapped_device *md)
+{
+ struct request_queue *q = md->queue;
+
+ if (q) {
+ kfree(q->conv_zones_bitmap);
+ q->conv_zones_bitmap = NULL;
+ kfree(q->seq_zones_wlock);
+ q->seq_zones_wlock = NULL;
+ }
+
+ kvfree(md->zwp_offset);
+ md->zwp_offset = NULL;
+ md->nr_zones = 0;
+}
+
+static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
+{
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return zone->wp - zone->start;
+ case BLK_ZONE_COND_FULL:
+ return zone->len;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ default:
+ /*
+ * Conventional, offline and read-only zones do not have a valid
+ * write pointer. Use 0 as for an empty zone.
+ */
+ return 0;
+ }
+}
+
+static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ struct mapped_device *md = data;
+ struct request_queue *q = md->queue;
+
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!q->conv_zones_bitmap) {
+ q->conv_zones_bitmap =
+ kcalloc(BITS_TO_LONGS(q->nr_zones),
+ sizeof(unsigned long), GFP_NOIO);
+ if (!q->conv_zones_bitmap)
+ return -ENOMEM;
+ }
+ set_bit(idx, q->conv_zones_bitmap);
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ if (!q->seq_zones_wlock) {
+ q->seq_zones_wlock =
+ kcalloc(BITS_TO_LONGS(q->nr_zones),
+ sizeof(unsigned long), GFP_NOIO);
+ if (!q->seq_zones_wlock)
+ return -ENOMEM;
+ }
+ if (!md->zwp_offset) {
+ md->zwp_offset =
+ kvcalloc(q->nr_zones, sizeof(unsigned int),
+ GFP_KERNEL);
+ if (!md->zwp_offset)
+ return -ENOMEM;
+ }
+ md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
+
+ break;
+ default:
+ DMERR("Invalid zone type 0x%x at sectors %llu",
+ (int)zone->type, zone->start);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+/*
+ * Revalidate the zones of a mapped device to initialize resource necessary
+ * for zone append emulation. Note that we cannot simply use the block layer
+ * blk_revalidate_disk_zones() function here as the mapped device is suspended
+ * (this is called from __bind() context).
+ */
+static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
+{
+ struct request_queue *q = md->queue;
+ unsigned int noio_flag;
+ int ret;
+
+ /*
+ * Check if something changed. If yes, cleanup the current resources
+ * and reallocate everything.
+ */
+ if (!q->nr_zones || q->nr_zones != md->nr_zones)
+ dm_cleanup_zoned_dev(md);
+ if (md->nr_zones)
+ return 0;
+
+ /*
+ * Scan all zones to initialize everything. Ensure that all vmalloc
+ * operations in this context are done as if GFP_NOIO was specified.
+ */
+ noio_flag = memalloc_noio_save();
+ ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones,
+ dm_zone_revalidate_cb, md);
+ memalloc_noio_restore(noio_flag);
+ if (ret < 0)
+ goto err;
+ if (ret != q->nr_zones) {
+ ret = -EIO;
+ goto err;
+ }
+
+ md->nr_zones = q->nr_zones;
+
+ return 0;
+
+err:
+ DMERR("Revalidate zones failed %d", ret);
+ dm_cleanup_zoned_dev(md);
+ return ret;
+}
+
+static int device_not_zone_append_capable(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ return !blk_queue_is_zoned(bdev_get_queue(dev->bdev));
+}
+
+static bool dm_table_supports_zone_append(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned int i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (ti->emulate_zone_append)
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
+ return false;
+ }
+
+ return true;
+}
+
+int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
+{
+ struct mapped_device *md = t->md;
+
+ /*
+ * For a zoned target, the number of zones should be updated for the
+ * correct value to be exposed in sysfs queue/nr_zones.
+ */
+ WARN_ON_ONCE(queue_is_mq(q));
+ q->nr_zones = blkdev_nr_zones(md->disk);
+
+ /* Check if zone append is natively supported */
+ if (dm_table_supports_zone_append(t)) {
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ dm_cleanup_zoned_dev(md);
+ return 0;
+ }
+
+ /*
+ * Mark the mapped device as needing zone append emulation and
+ * initialize the emulation resources once the capacity is set.
+ */
+ set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ if (!get_capacity(md->disk))
+ return 0;
+
+ return dm_revalidate_zones(md, t);
+}
+
+static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ unsigned int *wp_offset = data;
+
+ *wp_offset = dm_get_zone_wp_offset(zone);
+
+ return 0;
+}
+
+static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
+ unsigned int *wp_ofst)
+{
+ sector_t sector = zno * blk_queue_zone_sectors(md->queue);
+ unsigned int noio_flag;
+ struct dm_table *t;
+ int srcu_idx, ret;
+
+ t = dm_get_live_table(md, &srcu_idx);
+ if (!t)
+ return -EIO;
+
+ /*
+ * Ensure that all memory allocations in this context are done as if
+ * GFP_NOIO was specified.
+ */
+ noio_flag = memalloc_noio_save();
+ ret = dm_blk_do_report_zones(md, t, sector, 1,
+ dm_update_zone_wp_offset_cb, wp_ofst);
+ memalloc_noio_restore(noio_flag);
+
+ dm_put_live_table(md, srcu_idx);
+
+ if (ret != 1)
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * First phase of BIO mapping for targets with zone append emulation:
+ * check all BIO that change a zone writer pointer and change zone
+ * append operations into regular write operations.
+ */
+static bool dm_zone_map_bio_begin(struct mapped_device *md,
+ struct bio *orig_bio, struct bio *clone)
+{
+ sector_t zsectors = blk_queue_zone_sectors(md->queue);
+ unsigned int zno = bio_zone_no(orig_bio);
+ unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
+
+ /*
+ * If the target zone is in an error state, recover by inspecting the
+ * zone to get its current write pointer position. Note that since the
+ * target zone is already locked, a BIO issuing context should never
+ * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
+ */
+ if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
+ if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
+ return false;
+ WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
+ }
+
+ switch (bio_op(orig_bio)) {
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_FINISH:
+ return true;
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE:
+ /* Writes must be aligned to the zone write pointer */
+ if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
+ return false;
+ break;
+ case REQ_OP_ZONE_APPEND:
+ /*
+ * Change zone append operations into a non-mergeable regular
+ * writes directed at the current write pointer position of the
+ * target zone.
+ */
+ clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
+ (orig_bio->bi_opf & (~REQ_OP_MASK));
+ clone->bi_iter.bi_sector =
+ orig_bio->bi_iter.bi_sector + zwp_offset;
+ break;
+ default:
+ DMWARN_LIMIT("Invalid BIO operation");
+ return false;
+ }
+
+ /* Cannot write to a full zone */
+ if (zwp_offset >= zsectors)
+ return false;
+
+ return true;
+}
+
+/*
+ * Second phase of BIO mapping for targets with zone append emulation:
+ * update the zone write pointer offset array to account for the additional
+ * data written to a zone. Note that at this point, the remapped clone BIO
+ * may already have completed, so we do not touch it.
+ */
+static blk_status_t dm_zone_map_bio_end(struct mapped_device *md,
+ struct bio *orig_bio,
+ unsigned int nr_sectors)
+{
+ unsigned int zno = bio_zone_no(orig_bio);
+ unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
+
+ /* The clone BIO may already have been completed and failed */
+ if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
+ return BLK_STS_IOERR;
+
+ /* Update the zone wp offset */
+ switch (bio_op(orig_bio)) {
+ case REQ_OP_ZONE_RESET:
+ WRITE_ONCE(md->zwp_offset[zno], 0);
+ return BLK_STS_OK;
+ case REQ_OP_ZONE_FINISH:
+ WRITE_ONCE(md->zwp_offset[zno],
+ blk_queue_zone_sectors(md->queue));
+ return BLK_STS_OK;
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE:
+ WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
+ return BLK_STS_OK;
+ case REQ_OP_ZONE_APPEND:
+ /*
+ * Check that the target did not truncate the write operation
+ * emulating a zone append.
+ */
+ if (nr_sectors != bio_sectors(orig_bio)) {
+ DMWARN_LIMIT("Truncated write for zone append");
+ return BLK_STS_IOERR;
+ }
+ WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
+ return BLK_STS_OK;
+ default:
+ DMWARN_LIMIT("Invalid BIO operation");
+ return BLK_STS_IOERR;
+ }
+}
+
+static inline void dm_zone_lock(struct request_queue *q,
+ unsigned int zno, struct bio *clone)
+{
+ if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
+ return;
+
+ wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
+ bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
+}
+
+static inline void dm_zone_unlock(struct request_queue *q,
+ unsigned int zno, struct bio *clone)
+{
+ if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
+ return;
+
+ WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock));
+ clear_bit_unlock(zno, q->seq_zones_wlock);
+ smp_mb__after_atomic();
+ wake_up_bit(q->seq_zones_wlock, zno);
+
+ bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
+}
+
+static bool dm_need_zone_wp_tracking(struct bio *orig_bio)
+{
+ /*
+ * Special processing is not needed for operations that do not need the
+ * zone write lock, that is, all operations that target conventional
+ * zones and all operations that do not modify directly a sequential
+ * zone write pointer.
+ */
+ if (op_is_flush(orig_bio->bi_opf) && !bio_sectors(orig_bio))
+ return false;
+ switch (bio_op(orig_bio)) {
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_FINISH:
+ case REQ_OP_ZONE_APPEND:
+ return bio_zone_is_seq(orig_bio);
+ default:
+ return false;
+ }
+}
+
+/*
+ * Special IO mapping for targets needing zone append emulation.
+ */
+int dm_zone_map_bio(struct dm_target_io *tio)
+{
+ struct dm_io *io = tio->io;
+ struct dm_target *ti = tio->ti;
+ struct mapped_device *md = io->md;
+ struct request_queue *q = md->queue;
+ struct bio *orig_bio = io->orig_bio;
+ struct bio *clone = &tio->clone;
+ unsigned int zno;
+ blk_status_t sts;
+ int r;
+
+ /*
+ * IOs that do not change a zone write pointer do not need
+ * any additional special processing.
+ */
+ if (!dm_need_zone_wp_tracking(orig_bio))
+ return ti->type->map(ti, clone);
+
+ /* Lock the target zone */
+ zno = bio_zone_no(orig_bio);
+ dm_zone_lock(q, zno, clone);
+
+ /*
+ * Check that the bio and the target zone write pointer offset are
+ * both valid, and if the bio is a zone append, remap it to a write.
+ */
+ if (!dm_zone_map_bio_begin(md, orig_bio, clone)) {
+ dm_zone_unlock(q, zno, clone);
+ return DM_MAPIO_KILL;
+ }
+
+ /*
+ * The target map function may issue and complete the IO quickly.
+ * Take an extra reference on the IO to make sure it does disappear
+ * until we run dm_zone_map_bio_end().
+ */
+ dm_io_inc_pending(io);
+
+ /* Let the target do its work */
+ r = ti->type->map(ti, clone);
+ switch (r) {
+ case DM_MAPIO_SUBMITTED:
+ /*
+ * The target submitted the clone BIO. The target zone will
+ * be unlocked on completion of the clone.
+ */
+ sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
+ break;
+ case DM_MAPIO_REMAPPED:
+ /*
+ * The target only remapped the clone BIO. In case of error,
+ * unlock the target zone here as the clone will not be
+ * submitted.
+ */
+ sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
+ if (sts != BLK_STS_OK)
+ dm_zone_unlock(q, zno, clone);
+ break;
+ case DM_MAPIO_REQUEUE:
+ case DM_MAPIO_KILL:
+ default:
+ dm_zone_unlock(q, zno, clone);
+ sts = BLK_STS_IOERR;
+ break;
+ }
+
+ /* Drop the extra reference on the IO */
+ dm_io_dec_pending(io, sts);
+
+ if (sts != BLK_STS_OK)
+ return DM_MAPIO_KILL;
+
+ return r;
+}
+
+/*
+ * IO completion callback called from clone_endio().
+ */
+void dm_zone_endio(struct dm_io *io, struct bio *clone)
+{
+ struct mapped_device *md = io->md;
+ struct request_queue *q = md->queue;
+ struct bio *orig_bio = io->orig_bio;
+ unsigned int zwp_offset;
+ unsigned int zno;
+
+ /*
+ * For targets that do not emulate zone append, we only need to
+ * handle native zone-append bios.
+ */
+ if (!dm_emulate_zone_append(md)) {
+ /*
+ * Get the offset within the zone of the written sector
+ * and add that to the original bio sector position.
+ */
+ if (clone->bi_status == BLK_STS_OK &&
+ bio_op(clone) == REQ_OP_ZONE_APPEND) {
+ sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1;
+
+ orig_bio->bi_iter.bi_sector +=
+ clone->bi_iter.bi_sector & mask;
+ }
+
+ return;
+ }
+
+ /*
+ * For targets that do emulate zone append, if the clone BIO does not
+ * own the target zone write lock, we have nothing to do.
+ */
+ if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
+ return;
+
+ zno = bio_zone_no(orig_bio);
+
+ if (clone->bi_status != BLK_STS_OK) {
+ /*
+ * BIOs that modify a zone write pointer may leave the zone
+ * in an unknown state in case of failure (e.g. the write
+ * pointer was only partially advanced). In this case, set
+ * the target zone write pointer as invalid unless it is
+ * already being updated.
+ */
+ WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
+ } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
+ /*
+ * Get the written sector for zone append operation that were
+ * emulated using regular write operations.
+ */
+ zwp_offset = READ_ONCE(md->zwp_offset[zno]);
+ if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
+ WRITE_ONCE(md->zwp_offset[zno],
+ DM_ZONE_INVALID_WP_OFST);
+ else
+ orig_bio->bi_iter.bi_sector +=
+ zwp_offset - bio_sectors(orig_bio);
+ }
+
+ dm_zone_unlock(q, zno, clone);
+}
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 039d17b28938..ee4626d08557 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1390,6 +1390,13 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
return -ENXIO;
}
+ /*
+ * Devices that have zones with a capacity smaller than the zone size
+ * (e.g. NVMe zoned namespaces) are not supported.
+ */
+ if (blkz->capacity != blkz->len)
+ return -ENXIO;
+
switch (blkz->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
set_bit(DMZ_RND, &zone->flags);
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index 9c0ecc9568a4..d58db9a27e6c 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -134,7 +134,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
dst_zone_block = dmz_start_block(zmd, dst_zone);
if (dmz_is_seq(dst_zone))
- set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
+ flags |= BIT(DM_KCOPYD_WRITE_SEQ);
while (block < end_block) {
if (src_zone->dev->flags & DMZ_BDEV_DYING)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ca2aedd8ee7d..2c5f9e585211 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -74,38 +74,6 @@ struct clone_info {
unsigned sector_count;
};
-/*
- * One of these is allocated per clone bio.
- */
-#define DM_TIO_MAGIC 7282014
-struct dm_target_io {
- unsigned magic;
- struct dm_io *io;
- struct dm_target *ti;
- unsigned target_bio_nr;
- unsigned *len_ptr;
- bool inside_dm_io;
- struct bio clone;
-};
-
-/*
- * One of these is allocated per original bio.
- * It contains the first clone used for that original.
- */
-#define DM_IO_MAGIC 5191977
-struct dm_io {
- unsigned magic;
- struct mapped_device *md;
- blk_status_t status;
- atomic_t io_count;
- struct bio *orig_bio;
- unsigned long start_time;
- spinlock_t endio_lock;
- struct dm_stats_aux stats_aux;
- /* last member of dm_target_io is 'struct bio' */
- struct dm_target_io tio;
-};
-
#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
@@ -137,19 +105,6 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
#define MINOR_ALLOCED ((void *)-1)
-/*
- * Bits for the md->flags field.
- */
-#define DMF_BLOCK_IO_FOR_SUSPEND 0
-#define DMF_SUSPENDED 1
-#define DMF_FROZEN 2
-#define DMF_FREEING 3
-#define DMF_DELETING 4
-#define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_DEFERRED_REMOVE 6
-#define DMF_SUSPENDED_INTERNALLY 7
-#define DMF_POST_SUSPENDING 8
-
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;
@@ -444,84 +399,6 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
-#ifdef CONFIG_BLK_DEV_ZONED
-int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
-{
- struct dm_report_zones_args *args = data;
- sector_t sector_diff = args->tgt->begin - args->start;
-
- /*
- * Ignore zones beyond the target range.
- */
- if (zone->start >= args->start + args->tgt->len)
- return 0;
-
- /*
- * Remap the start sector and write pointer position of the zone
- * to match its position in the target range.
- */
- zone->start += sector_diff;
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
- if (zone->cond == BLK_ZONE_COND_FULL)
- zone->wp = zone->start + zone->len;
- else if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->wp = zone->start;
- else
- zone->wp += sector_diff;
- }
-
- args->next_sector = zone->start + zone->len;
- return args->orig_cb(zone, args->zone_idx++, args->orig_data);
-}
-EXPORT_SYMBOL_GPL(dm_report_zones_cb);
-
-static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
-{
- struct mapped_device *md = disk->private_data;
- struct dm_table *map;
- int srcu_idx, ret;
- struct dm_report_zones_args args = {
- .next_sector = sector,
- .orig_data = data,
- .orig_cb = cb,
- };
-
- if (dm_suspended_md(md))
- return -EAGAIN;
-
- map = dm_get_live_table(md, &srcu_idx);
- if (!map) {
- ret = -EIO;
- goto out;
- }
-
- do {
- struct dm_target *tgt;
-
- tgt = dm_table_find_target(map, args.next_sector);
- if (WARN_ON_ONCE(!tgt->type->report_zones)) {
- ret = -EIO;
- goto out;
- }
-
- args.tgt = tgt;
- ret = tgt->type->report_zones(tgt, &args,
- nr_zones - args.zone_idx);
- if (ret < 0)
- goto out;
- } while (args.zone_idx < nr_zones &&
- args.next_sector < get_capacity(disk));
-
- ret = args.zone_idx;
-out:
- dm_put_live_table(md, srcu_idx);
- return ret;
-}
-#else
-#define dm_blk_report_zones NULL
-#endif /* CONFIG_BLK_DEV_ZONED */
-
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
{
@@ -903,7 +780,7 @@ static int __noflush_suspending(struct mapped_device *md)
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
*/
-static void dec_pending(struct dm_io *io, blk_status_t error)
+void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
{
unsigned long flags;
blk_status_t io_error;
@@ -919,22 +796,27 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
}
if (atomic_dec_and_test(&io->io_count)) {
+ bio = io->orig_bio;
if (io->status == BLK_STS_DM_REQUEUE) {
/*
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
- if (__noflush_suspending(md))
+ if (__noflush_suspending(md) &&
+ !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
/* NOTE early return due to BLK_STS_DM_REQUEUE below */
- bio_list_add_head(&md->deferred, io->orig_bio);
- else
- /* noflush suspend was interrupted. */
+ bio_list_add_head(&md->deferred, bio);
+ } else {
+ /*
+ * noflush suspend was interrupted or this is
+ * a write to a zoned target.
+ */
io->status = BLK_STS_IOERR;
+ }
spin_unlock_irqrestore(&md->deferred_lock, flags);
}
io_error = io->status;
- bio = io->orig_bio;
end_io_acct(io);
free_io(md, io);
@@ -994,7 +876,6 @@ static void clone_endio(struct bio *bio)
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
- struct bio *orig_bio = io->orig_bio;
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
if (unlikely(error == BLK_STS_TARGET)) {
@@ -1009,23 +890,22 @@ static void clone_endio(struct bio *bio)
disable_write_zeroes(md);
}
- /*
- * For zone-append bios get offset in zone of the written
- * sector and add that to the original bio sector pos.
- */
- if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
- sector_t written_sector = bio->bi_iter.bi_sector;
- struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue;
- u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
-
- orig_bio->bi_iter.bi_sector += written_sector & mask;
- }
+ if (blk_queue_is_zoned(q))
+ dm_zone_endio(io, bio);
if (endio) {
int r = endio(tio->ti, bio, &error);
switch (r) {
case DM_ENDIO_REQUEUE:
- error = BLK_STS_DM_REQUEUE;
+ /*
+ * Requeuing writes to a sequential zone of a zoned
+ * target will break the sequential write pattern:
+ * fail such IO.
+ */
+ if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
+ error = BLK_STS_IOERR;
+ else
+ error = BLK_STS_DM_REQUEUE;
fallthrough;
case DM_ENDIO_DONE:
break;
@@ -1044,7 +924,7 @@ static void clone_endio(struct bio *bio)
}
free_tio(tio);
- dec_pending(io, error);
+ dm_io_dec_pending(io, error);
}
/*
@@ -1237,8 +1117,8 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
- * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
- * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH.
+ * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
+ * operations and REQ_OP_ZONE_APPEND (zone append writes).
*
* dm_accept_partial_bio informs the dm that the target only wants to process
* additional n_sectors sectors of the bio and the rest of the data should be
@@ -1268,9 +1148,13 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+
BUG_ON(bio->bi_opf & REQ_PREFLUSH);
+ BUG_ON(op_is_zone_mgmt(bio_op(bio)));
+ BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
BUG_ON(bi_size > *tio->len_ptr);
BUG_ON(n_sectors > bi_size);
+
*tio->len_ptr -= bi_size - n_sectors;
bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
@@ -1308,7 +1192,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
* anything, the target has assumed ownership of
* this io.
*/
- atomic_inc(&io->io_count);
+ dm_io_inc_pending(io);
sector = clone->bi_iter.bi_sector;
if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1319,7 +1203,16 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
down(&md->swap_bios_semaphore);
}
- r = ti->type->map(ti, clone);
+ /*
+ * Check if the IO needs a special mapping due to zone append emulation
+ * on zoned target. In this case, dm_zone_map_bio() calls the target
+ * map operation.
+ */
+ if (dm_emulate_zone_append(io->md))
+ r = dm_zone_map_bio(tio);
+ else
+ r = ti->type->map(ti, clone);
+
switch (r) {
case DM_MAPIO_SUBMITTED:
break;
@@ -1334,7 +1227,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
up(&md->swap_bios_semaphore);
}
free_tio(tio);
- dec_pending(io, BLK_STS_IOERR);
+ dm_io_dec_pending(io, BLK_STS_IOERR);
break;
case DM_MAPIO_REQUEUE:
if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1342,7 +1235,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
up(&md->swap_bios_semaphore);
}
free_tio(tio);
- dec_pending(io, BLK_STS_DM_REQUEUE);
+ dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
break;
default:
DMWARN("unimplemented target map return value: %d", r);
@@ -1631,7 +1524,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
if (bio->bi_opf & REQ_PREFLUSH) {
error = __send_empty_flush(&ci);
- /* dec_pending submits any data associated with flush */
+ /* dm_io_dec_pending submits any data associated with flush */
} else if (op_is_zone_mgmt(bio_op(bio))) {
ci.bio = bio;
ci.sector_count = 0;
@@ -1672,7 +1565,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
}
/* drop the extra reference count */
- dec_pending(ci.io, errno_to_blk_status(error));
+ dm_io_dec_pending(ci.io, errno_to_blk_status(error));
return ret;
}
@@ -1801,13 +1694,13 @@ static void cleanup_mapped_device(struct mapped_device *md)
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
del_gendisk(md->disk);
- put_disk(md->disk);
}
- if (md->queue) {
+ if (md->queue)
dm_queue_destroy_keyslot_manager(md->queue);
- blk_cleanup_queue(md->queue);
- }
+
+ if (md->disk)
+ blk_cleanup_disk(md->disk);
cleanup_srcu_struct(&md->io_barrier);
@@ -1817,6 +1710,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
mutex_destroy(&md->swap_bios_lock);
dm_mq_cleanup_mapped_device(md);
+ dm_cleanup_zoned_dev(md);
}
/*
@@ -1869,13 +1763,10 @@ static struct mapped_device *alloc_dev(int minor)
* established. If request-based table is loaded: blk-mq will
* override accordingly.
*/
- md->queue = blk_alloc_queue(numa_node_id);
- if (!md->queue)
- goto bad;
-
- md->disk = alloc_disk_node(1, md->numa_node_id);
+ md->disk = blk_alloc_disk(md->numa_node_id);
if (!md->disk)
goto bad;
+ md->queue = md->disk->queue;
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
@@ -1888,6 +1779,7 @@ static struct mapped_device *alloc_dev(int minor)
md->disk->major = _major;
md->disk->first_minor = minor;
+ md->disk->minors = 1;
md->disk->fops = &dm_blk_dops;
md->disk->queue = md->queue;
md->disk->private_data = md;
@@ -2062,11 +1954,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
goto out;
}
+ ret = dm_table_set_restrictions(t, q, limits);
+ if (ret) {
+ old_map = ERR_PTR(ret);
+ goto out;
+ }
+
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, (void *)t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
- dm_table_set_restrictions(t, q, limits);
if (old_map)
dm_sync_table(md);
@@ -2185,7 +2082,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
DMERR("Cannot calculate initial queue limits");
return r;
}
- dm_table_set_restrictions(t, md->queue, &limits);
+ r = dm_table_set_restrictions(t, md->queue, &limits);
+ if (r)
+ return r;
+
blk_register_queue(md->disk);
return 0;
@@ -2328,7 +2228,7 @@ static bool md_in_flight_bios(struct mapped_device *md)
return sum != 0;
}
-static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
+static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
{
int r = 0;
DEFINE_WAIT(wait);
@@ -2351,7 +2251,7 @@ static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state
return r;
}
-static int dm_wait_for_completion(struct mapped_device *md, long task_state)
+static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
{
int r = 0;
@@ -2478,7 +2378,7 @@ static void unlock_fs(struct mapped_device *md)
* are being added to md->deferred list.
*/
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
- unsigned suspend_flags, long task_state,
+ unsigned suspend_flags, unsigned int task_state,
int dmf_suspended_flag)
{
bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b441ad772c18..742d9c80efe1 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -45,6 +45,8 @@ struct dm_dev_internal {
struct dm_table;
struct dm_md_mempools;
+struct dm_target_io;
+struct dm_io;
/*-----------------------------------------------------------------
* Internal table functions.
@@ -56,8 +58,8 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
bool dm_table_has_no_data_devices(struct dm_table *table);
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits);
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
- struct queue_limits *limits);
+int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+ struct queue_limits *limits);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_presuspend_undo_targets(struct dm_table *t);
@@ -100,6 +102,30 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
*/
#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t))
+/*
+ * Zoned targets related functions.
+ */
+int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
+void dm_zone_endio(struct dm_io *io, struct bio *clone);
+#ifdef CONFIG_BLK_DEV_ZONED
+void dm_cleanup_zoned_dev(struct mapped_device *md);
+int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data);
+bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
+int dm_zone_map_bio(struct dm_target_io *io);
+#else
+static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {}
+#define dm_blk_report_zones NULL
+static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
+{
+ return false;
+}
+static inline int dm_zone_map_bio(struct dm_target_io *tio)
+{
+ return DM_MAPIO_KILL;
+}
+#endif
+
/*-----------------------------------------------------------------
* A registry of target types.
*---------------------------------------------------------------*/
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ea3130e11680..e29c6298ef5c 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2616,7 +2616,7 @@ static struct attribute *md_bitmap_attrs[] = {
&max_backlog_used.attr,
NULL
};
-struct attribute_group md_bitmap_group = {
+const struct attribute_group md_bitmap_group = {
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index fda4cb3f936f..c0dc6f2ef4a3 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -357,7 +357,7 @@ static void raid_exit(void)
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Fault injection personality for MD");
+MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)");
MODULE_ALIAS("md-personality-10"); /* faulty */
MODULE_ALIAS("md-faulty");
MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 63ed8329a98d..1ff51647a682 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -312,7 +312,7 @@ static void linear_exit (void)
module_init(linear_init);
module_exit(linear_exit);
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Linear device concatenation personality for MD");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
MODULE_ALIAS("md-linear");
MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 776bbe542db5..e7d6486f090f 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -471,7 +471,7 @@ static void __exit multipath_exit (void)
module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("simple multi-path personality for MD");
+MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)");
MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
MODULE_ALIAS("md-multipath");
MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 49f897fbb89b..ae8fe54ea358 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -441,30 +441,6 @@ check_suspended:
}
EXPORT_SYMBOL(md_handle_request);
-struct md_io {
- struct mddev *mddev;
- bio_end_io_t *orig_bi_end_io;
- void *orig_bi_private;
- struct block_device *orig_bi_bdev;
- unsigned long start_time;
-};
-
-static void md_end_io(struct bio *bio)
-{
- struct md_io *md_io = bio->bi_private;
- struct mddev *mddev = md_io->mddev;
-
- bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev);
-
- bio->bi_end_io = md_io->orig_bi_end_io;
- bio->bi_private = md_io->orig_bi_private;
-
- mempool_free(md_io, &mddev->md_io_pool);
-
- if (bio->bi_end_io)
- bio->bi_end_io(bio);
-}
-
static blk_qc_t md_submit_bio(struct bio *bio)
{
const int rw = bio_data_dir(bio);
@@ -489,21 +465,6 @@ static blk_qc_t md_submit_bio(struct bio *bio)
return BLK_QC_T_NONE;
}
- if (bio->bi_end_io != md_end_io) {
- struct md_io *md_io;
-
- md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO);
- md_io->mddev = mddev;
- md_io->orig_bi_end_io = bio->bi_end_io;
- md_io->orig_bi_private = bio->bi_private;
- md_io->orig_bi_bdev = bio->bi_bdev;
-
- bio->bi_end_io = md_end_io;
- bio->bi_private = md_io;
-
- md_io->start_time = bio_start_io_acct(bio);
- }
-
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
@@ -824,7 +785,7 @@ out_free_new:
return ERR_PTR(error);
}
-static struct attribute_group md_redundancy_group;
+static const struct attribute_group md_redundancy_group;
void mddev_unlock(struct mddev *mddev)
{
@@ -841,7 +802,7 @@ void mddev_unlock(struct mddev *mddev)
* test it under the same mutex to ensure its correct value
* is seen.
*/
- struct attribute_group *to_remove = mddev->to_remove;
+ const struct attribute_group *to_remove = mddev->to_remove;
mddev->to_remove = NULL;
mddev->sysfs_active = 1;
mutex_unlock(&mddev->reconfig_mutex);
@@ -2379,7 +2340,15 @@ int md_integrity_register(struct mddev *mddev)
bdev_get_integrity(reference->bdev));
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
- if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
+ if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
+ (mddev->level != 1 && mddev->level != 10 &&
+ bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
+ /*
+ * No need to handle the failure of bioset_integrity_create,
+ * because the function is called by md_run() -> pers->run(),
+ * md_run calls bioset_exit -> bioset_integrity_free in case
+ * of failure case.
+ */
pr_err("md: failed to create integrity pool for %s\n",
mdname(mddev));
return -EINVAL;
@@ -5538,7 +5507,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_degraded.attr,
NULL,
};
-static struct attribute_group md_redundancy_group = {
+static const struct attribute_group md_redundancy_group = {
.name = NULL,
.attrs = md_redundancy_attrs,
};
@@ -5598,17 +5567,16 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_level)
sysfs_put(mddev->sysfs_level);
- if (mddev->gendisk)
+ if (mddev->gendisk) {
del_gendisk(mddev->gendisk);
- if (mddev->queue)
- blk_cleanup_queue(mddev->queue);
- if (mddev->gendisk)
- put_disk(mddev->gendisk);
+ blk_cleanup_disk(mddev->gendisk);
+ }
percpu_ref_exit(&mddev->writes_pending);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
- mempool_exit(&mddev->md_io_pool);
+ if (mddev->level != 1 && mddev->level != 10)
+ bioset_exit(&mddev->io_acct_set);
kfree(mddev);
}
@@ -5705,26 +5673,14 @@ static int md_alloc(dev_t dev, char *name)
*/
mddev->hold_active = UNTIL_STOP;
- error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
- sizeof(struct md_io));
- if (error)
- goto abort;
-
error = -ENOMEM;
- mddev->queue = blk_alloc_queue(NUMA_NO_NODE);
- if (!mddev->queue)
+ disk = blk_alloc_disk(NUMA_NO_NODE);
+ if (!disk)
goto abort;
- blk_set_stacking_limits(&mddev->queue->limits);
-
- disk = alloc_disk(1 << shift);
- if (!disk) {
- blk_cleanup_queue(mddev->queue);
- mddev->queue = NULL;
- goto abort;
- }
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
+ disk->minors = 1 << shift;
if (name)
strcpy(disk->disk_name, name);
else if (partitioned)
@@ -5733,7 +5689,9 @@ static int md_alloc(dev_t dev, char *name)
sprintf(disk->disk_name, "md%d", unit);
disk->fops = &md_fops;
disk->private_data = mddev;
- disk->queue = mddev->queue;
+
+ mddev->queue = disk->queue;
+ blk_set_stacking_limits(&mddev->queue->limits);
blk_queue_write_cache(mddev->queue, true, true);
/* Allow extended partitions. This makes the
* 'mdp' device redundant, but we can't really
@@ -5907,7 +5865,14 @@ int md_run(struct mddev *mddev)
if (!bioset_initialized(&mddev->sync_set)) {
err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
if (err)
- return err;
+ goto exit_bio_set;
+ }
+ if (mddev->level != 1 && mddev->level != 10 &&
+ !bioset_initialized(&mddev->io_acct_set)) {
+ err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
+ offsetof(struct md_io_acct, bio_clone), 0);
+ if (err)
+ goto exit_sync_set;
}
spin_lock(&pers_lock);
@@ -6035,6 +6000,7 @@ int md_run(struct mddev *mddev)
blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
else
blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
+ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue);
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
@@ -6084,8 +6050,12 @@ bitmap_abort:
module_put(pers->owner);
md_bitmap_destroy(mddev);
abort:
- bioset_exit(&mddev->bio_set);
+ if (mddev->level != 1 && mddev->level != 10)
+ bioset_exit(&mddev->io_acct_set);
+exit_sync_set:
bioset_exit(&mddev->sync_set);
+exit_bio_set:
+ bioset_exit(&mddev->bio_set);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -6309,6 +6279,8 @@ void md_stop(struct mddev *mddev)
__md_stop(mddev);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
+ if (mddev->level != 1 && mddev->level != 10)
+ bioset_exit(&mddev->io_acct_set);
}
EXPORT_SYMBOL_GPL(md_stop);
@@ -8613,6 +8585,41 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+static void md_end_io_acct(struct bio *bio)
+{
+ struct md_io_acct *md_io_acct = bio->bi_private;
+ struct bio *orig_bio = md_io_acct->orig_bio;
+
+ orig_bio->bi_status = bio->bi_status;
+
+ bio_end_io_acct(orig_bio, md_io_acct->start_time);
+ bio_put(bio);
+ bio_endio(orig_bio);
+}
+
+/*
+ * Used by personalities that don't already clone the bio and thus can't
+ * easily add the timestamp to their extended bio structure.
+ */
+void md_account_bio(struct mddev *mddev, struct bio **bio)
+{
+ struct md_io_acct *md_io_acct;
+ struct bio *clone;
+
+ if (!blk_queue_io_stat((*bio)->bi_bdev->bd_disk->queue))
+ return;
+
+ clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set);
+ md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
+ md_io_acct->orig_bio = *bio;
+ md_io_acct->start_time = bio_start_io_acct(*bio);
+
+ clone->bi_end_io = md_end_io_acct;
+ clone->bi_private = md_io_acct;
+ *bio = clone;
+}
+EXPORT_SYMBOL_GPL(md_account_bio);
+
/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fb7eab58cfd5..832547cf038f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -395,10 +395,10 @@ struct mddev {
* that we are never stopping an array while it is open.
* 'reconfig_mutex' protects all other reconfiguration.
* These locks are separate due to conflicting interactions
- * with bdev->bd_mutex.
+ * with disk->open_mutex.
* Lock ordering is:
- * reconfig_mutex -> bd_mutex
- * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
+ * reconfig_mutex -> disk->open_mutex
+ * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open
*/
struct mutex open_mutex;
struct mutex reconfig_mutex;
@@ -481,13 +481,13 @@ struct mddev {
atomic_t max_corr_read_errors; /* max read retries */
struct list_head all_mddevs;
- struct attribute_group *to_remove;
+ const struct attribute_group *to_remove;
struct bio_set bio_set;
struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
- mempool_t md_io_pool;
+ struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
@@ -613,7 +613,7 @@ struct md_sysfs_entry {
ssize_t (*show)(struct mddev *, char *);
ssize_t (*store)(struct mddev *, const char *, size_t);
};
-extern struct attribute_group md_bitmap_group;
+extern const struct attribute_group md_bitmap_group;
static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
{
@@ -684,6 +684,12 @@ struct md_thread {
void *private;
};
+struct md_io_acct {
+ struct bio *orig_bio;
+ unsigned long start_time;
+ struct bio bio_clone;
+};
+
#define THREAD_WAKEUP 0
static inline void safe_put_page(struct page *p)
@@ -715,6 +721,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
+void md_account_bio(struct mddev *mddev, struct bio **bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 185dc60360b5..3a963d783a86 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -108,12 +108,10 @@ static void *element_at(struct dm_array_info *info, struct array_block *ab,
* in an array block.
*/
static void on_entries(struct dm_array_info *info, struct array_block *ab,
- void (*fn)(void *, const void *))
+ void (*fn)(void *, const void *, unsigned))
{
- unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
-
- for (i = 0; i < nr_entries; i++)
- fn(info->value_type.context, element_at(info, ab, i));
+ unsigned nr_entries = le32_to_cpu(ab->nr_entries);
+ fn(info->value_type.context, element_at(info, ab, 0), nr_entries);
}
/*
@@ -175,19 +173,18 @@ static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
const void *value, unsigned new_nr)
{
- unsigned i;
- uint32_t nr_entries;
+ uint32_t nr_entries, delta, i;
struct dm_btree_value_type *vt = &info->value_type;
BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
nr_entries = le32_to_cpu(ab->nr_entries);
- for (i = nr_entries; i < new_nr; i++) {
- if (vt->inc)
- vt->inc(vt->context, value);
+ delta = new_nr - nr_entries;
+ if (vt->inc)
+ vt->inc(vt->context, value, delta);
+ for (i = nr_entries; i < new_nr; i++)
memcpy(element_at(info, ab, i), value, vt->size);
- }
ab->nr_entries = cpu_to_le32(new_nr);
}
@@ -199,17 +196,16 @@ static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
unsigned new_nr)
{
- unsigned i;
- uint32_t nr_entries;
+ uint32_t nr_entries, delta;
struct dm_btree_value_type *vt = &info->value_type;
BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
nr_entries = le32_to_cpu(ab->nr_entries);
- for (i = nr_entries; i > new_nr; i--)
- if (vt->dec)
- vt->dec(vt->context, element_at(info, ab, i - 1));
+ delta = nr_entries - new_nr;
+ if (vt->dec)
+ vt->dec(vt->context, element_at(info, ab, new_nr - 1), delta);
ab->nr_entries = cpu_to_le32(new_nr);
}
@@ -573,16 +569,17 @@ static int grow(struct resize *resize)
* These are the value_type functions for the btree elements, which point
* to array blocks.
*/
-static void block_inc(void *context, const void *value)
+static void block_inc(void *context, const void *value, unsigned count)
{
- __le64 block_le;
+ const __le64 *block_le = value;
struct dm_array_info *info = context;
+ unsigned i;
- memcpy(&block_le, value, sizeof(block_le));
- dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
+ for (i = 0; i < count; i++, block_le++)
+ dm_tm_inc(info->btree_info.tm, le64_to_cpu(*block_le));
}
-static void block_dec(void *context, const void *value)
+static void __block_dec(void *context, const void *value)
{
int r;
uint64_t b;
@@ -621,6 +618,13 @@ static void block_dec(void *context, const void *value)
dm_tm_dec(info->btree_info.tm, b);
}
+static void block_dec(void *context, const void *value, unsigned count)
+{
+ unsigned i;
+ for (i = 0; i < count; i++, value += sizeof(__le64))
+ __block_dec(context, value);
+}
+
static int block_equal(void *context, const void *value1, const void *value2)
{
return !memcmp(value1, value2, sizeof(__le64));
@@ -711,7 +715,7 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_
return r;
if (vt->inc)
- vt->inc(vt->context, element_at(info, ab, i));
+ vt->inc(vt->context, element_at(info, ab, i), 1);
}
ab->nr_entries = cpu_to_le32(new_nr);
@@ -822,9 +826,9 @@ static int array_set_value(struct dm_array_info *info, dm_block_t root,
old_value = element_at(info, ab, entry);
if (vt->dec &&
(!vt->equal || !vt->equal(vt->context, old_value, value))) {
- vt->dec(vt->context, old_value);
+ vt->dec(vt->context, old_value, 1);
if (vt->inc)
- vt->inc(vt->context, value);
+ vt->inc(vt->context, value, 1);
}
memcpy(old_value, value, info->value_type.size);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index b1788853a355..893edb426dba 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -144,4 +144,17 @@ extern struct dm_block_validator btree_node_validator;
extern void init_le64_type(struct dm_transaction_manager *tm,
struct dm_btree_value_type *vt);
+/*
+ * This returns a shadowed btree leaf that you may modify. In practise
+ * this means overwrites only, since an insert could cause a node to
+ * be split. Useful if you need access to the old value to calculate the
+ * new one.
+ *
+ * This only works with single level btrees. The given key must be present in
+ * the tree, otherwise -EINVAL will be returned.
+ */
+int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
+ uint64_t key, int *index,
+ dm_block_t *new_root, struct dm_block **leaf);
+
#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index eff04fa23dfa..70532335c7c7 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -544,12 +544,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
if (info->value_type.dec)
info->value_type.dec(info->value_type.context,
- value_ptr(n, index));
+ value_ptr(n, index), 1);
delete_at(n, index);
}
- *new_root = shadow_root(&spine);
+ if (!r)
+ *new_root = shadow_root(&spine);
exit_shadow_spine(&spine);
return r;
@@ -653,7 +654,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
if (k >= keys[last_level] && k < end_key) {
if (info->value_type.dec)
info->value_type.dec(info->value_type.context,
- value_ptr(n, index));
+ value_ptr(n, index), 1);
delete_at(n, index);
keys[last_level] = k + 1ull;
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index 2061ab865567..f5bd76ed8fe6 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -236,22 +236,14 @@ dm_block_t shadow_root(struct shadow_spine *s)
return s->root;
}
-static void le64_inc(void *context, const void *value_le)
+static void le64_inc(void *context, const void *value_le, unsigned count)
{
- struct dm_transaction_manager *tm = context;
- __le64 v_le;
-
- memcpy(&v_le, value_le, sizeof(v_le));
- dm_tm_inc(tm, le64_to_cpu(v_le));
+ dm_tm_with_runs(context, value_le, count, dm_tm_inc_range);
}
-static void le64_dec(void *context, const void *value_le)
+static void le64_dec(void *context, const void *value_le, unsigned count)
{
- struct dm_transaction_manager *tm = context;
- __le64 v_le;
-
- memcpy(&v_le, value_le, sizeof(v_le));
- dm_tm_dec(tm, le64_to_cpu(v_le));
+ dm_tm_with_runs(context, value_le, count, dm_tm_dec_range);
}
static int le64_equal(void *context, const void *value1_le, const void *value2_le)
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index ef6e78d45d5b..0703ca7a7d9a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -71,15 +71,13 @@ static int upper_bound(struct btree_node *n, uint64_t key)
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt)
{
- unsigned i;
uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
- for (i = 0; i < nr_entries; i++)
- dm_tm_inc(tm, value64(n, i));
+ dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);
+
else if (vt->inc)
- for (i = 0; i < nr_entries; i++)
- vt->inc(vt->context, value_ptr(n, i));
+ vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
@@ -318,13 +316,9 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
goto out;
} else {
- if (info->value_type.dec) {
- unsigned i;
-
- for (i = 0; i < f->nr_children; i++)
- info->value_type.dec(info->value_type.context,
- value_ptr(f->n, i));
- }
+ if (info->value_type.dec)
+ info->value_type.dec(info->value_type.context,
+ value_ptr(f->n, 0), f->nr_children);
pop_frame(s);
}
}
@@ -500,6 +494,122 @@ out:
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
+/*----------------------------------------------------------------*/
+
+/*
+ * Copies entries from one region of a btree node to another. The regions
+ * must not overlap.
+ */
+static void copy_entries(struct btree_node *dest, unsigned dest_offset,
+ struct btree_node *src, unsigned src_offset,
+ unsigned count)
+{
+ size_t value_size = le32_to_cpu(dest->header.value_size);
+ memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
+ memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
+}
+
+/*
+ * Moves entries from one region fo a btree node to another. The regions
+ * may overlap.
+ */
+static void move_entries(struct btree_node *dest, unsigned dest_offset,
+ struct btree_node *src, unsigned src_offset,
+ unsigned count)
+{
+ size_t value_size = le32_to_cpu(dest->header.value_size);
+ memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
+ memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
+}
+
+/*
+ * Erases the first 'count' entries of a btree node, shifting following
+ * entries down into their place.
+ */
+static void shift_down(struct btree_node *n, unsigned count)
+{
+ move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
+}
+
+/*
+ * Moves entries in a btree node up 'count' places, making space for
+ * new entries at the start of the node.
+ */
+static void shift_up(struct btree_node *n, unsigned count)
+{
+ move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
+}
+
+/*
+ * Redistributes entries between two btree nodes to make them
+ * have similar numbers of entries.
+ */
+static void redistribute2(struct btree_node *left, struct btree_node *right)
+{
+ unsigned nr_left = le32_to_cpu(left->header.nr_entries);
+ unsigned nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned total = nr_left + nr_right;
+ unsigned target_left = total / 2;
+ unsigned target_right = total - target_left;
+
+ if (nr_left < target_left) {
+ unsigned delta = target_left - nr_left;
+ copy_entries(left, nr_left, right, 0, delta);
+ shift_down(right, delta);
+ } else if (nr_left > target_left) {
+ unsigned delta = nr_left - target_left;
+ if (nr_right)
+ shift_up(right, delta);
+ copy_entries(right, 0, left, target_left, delta);
+ }
+
+ left->header.nr_entries = cpu_to_le32(target_left);
+ right->header.nr_entries = cpu_to_le32(target_right);
+}
+
+/*
+ * Redistribute entries between three nodes. Assumes the central
+ * node is empty.
+ */
+static void redistribute3(struct btree_node *left, struct btree_node *center,
+ struct btree_node *right)
+{
+ unsigned nr_left = le32_to_cpu(left->header.nr_entries);
+ unsigned nr_center = le32_to_cpu(center->header.nr_entries);
+ unsigned nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned total, target_left, target_center, target_right;
+
+ BUG_ON(nr_center);
+
+ total = nr_left + nr_right;
+ target_left = total / 3;
+ target_center = (total - target_left) / 2;
+ target_right = (total - target_left - target_center);
+
+ if (nr_left < target_left) {
+ unsigned left_short = target_left - nr_left;
+ copy_entries(left, nr_left, right, 0, left_short);
+ copy_entries(center, 0, right, left_short, target_center);
+ shift_down(right, nr_right - target_right);
+
+ } else if (nr_left < (target_left + target_center)) {
+ unsigned left_to_center = nr_left - target_left;
+ copy_entries(center, 0, left, target_left, left_to_center);
+ copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
+ shift_down(right, nr_right - target_right);
+
+ } else {
+ unsigned right_short = target_right - nr_right;
+ shift_up(right, right_short);
+ copy_entries(right, 0, left, nr_left - right_short, right_short);
+ copy_entries(center, 0, left, target_left, nr_left - target_left);
+ }
+
+ left->header.nr_entries = cpu_to_le32(target_left);
+ center->header.nr_entries = cpu_to_le32(target_center);
+ right->header.nr_entries = cpu_to_le32(target_right);
+}
+
/*
* Splits a node by creating a sibling node and shifting half the nodes
* contents across. Assumes there is a parent node, and it has room for
@@ -530,12 +640,10 @@ EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
*
* Where A* is a shadow of A.
*/
-static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
- uint64_t key)
+static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
+ struct dm_btree_value_type *vt, uint64_t key)
{
int r;
- size_t size;
- unsigned nr_left, nr_right;
struct dm_block *left, *right, *parent;
struct btree_node *ln, *rn, *pn;
__le64 location;
@@ -549,36 +657,18 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
ln = dm_block_data(left);
rn = dm_block_data(right);
- nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
- nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
-
- ln->header.nr_entries = cpu_to_le32(nr_left);
-
rn->header.flags = ln->header.flags;
- rn->header.nr_entries = cpu_to_le32(nr_right);
+ rn->header.nr_entries = cpu_to_le32(0);
rn->header.max_entries = ln->header.max_entries;
rn->header.value_size = ln->header.value_size;
- memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
-
- size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
- sizeof(uint64_t) : s->info->value_type.size;
- memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
- size * nr_right);
+ redistribute2(ln, rn);
- /*
- * Patch up the parent
- */
+ /* patch up the parent */
parent = shadow_parent(s);
-
pn = dm_block_data(parent);
- location = cpu_to_le64(dm_block_location(left));
- __dm_bless_for_disk(&location);
- memcpy_disk(value_ptr(pn, parent_index),
- &location, sizeof(__le64));
location = cpu_to_le64(dm_block_location(right));
__dm_bless_for_disk(&location);
-
r = insert_at(sizeof(__le64), pn, parent_index + 1,
le64_to_cpu(rn->keys[0]), &location);
if (r) {
@@ -586,6 +676,7 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
return r;
}
+ /* patch up the spine */
if (key < le64_to_cpu(rn->keys[0])) {
unlock_block(s->info, right);
s->nodes[1] = left;
@@ -598,6 +689,121 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
}
/*
+ * We often need to modify a sibling node. This function shadows a particular
+ * child of the given parent node. Making sure to update the parent to point
+ * to the new shadow.
+ */
+static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
+ struct btree_node *parent, unsigned index,
+ struct dm_block **result)
+{
+ int r, inc;
+ dm_block_t root;
+ struct btree_node *node;
+
+ root = value64(parent, index);
+
+ r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
+ result, &inc);
+ if (r)
+ return r;
+
+ node = dm_block_data(*result);
+
+ if (inc)
+ inc_children(info->tm, node, vt);
+
+ *((__le64 *) value_ptr(parent, index)) =
+ cpu_to_le64(dm_block_location(*result));
+
+ return 0;
+}
+
+/*
+ * Splits two nodes into three. This is more work, but results in fuller
+ * nodes, so saves metadata space.
+ */
+static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
+ struct dm_btree_value_type *vt, uint64_t key)
+{
+ int r;
+ unsigned middle_index;
+ struct dm_block *left, *middle, *right, *parent;
+ struct btree_node *ln, *rn, *mn, *pn;
+ __le64 location;
+
+ parent = shadow_parent(s);
+ pn = dm_block_data(parent);
+
+ if (parent_index == 0) {
+ middle_index = 1;
+ left = shadow_current(s);
+ r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
+ if (r)
+ return r;
+ } else {
+ middle_index = parent_index;
+ right = shadow_current(s);
+ r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
+ if (r)
+ return r;
+ }
+
+ r = new_block(s->info, &middle);
+ if (r < 0)
+ return r;
+
+ ln = dm_block_data(left);
+ mn = dm_block_data(middle);
+ rn = dm_block_data(right);
+
+ mn->header.nr_entries = cpu_to_le32(0);
+ mn->header.flags = ln->header.flags;
+ mn->header.max_entries = ln->header.max_entries;
+ mn->header.value_size = ln->header.value_size;
+
+ redistribute3(ln, mn, rn);
+
+ /* patch up the parent */
+ pn->keys[middle_index] = rn->keys[0];
+ location = cpu_to_le64(dm_block_location(middle));
+ __dm_bless_for_disk(&location);
+ r = insert_at(sizeof(__le64), pn, middle_index,
+ le64_to_cpu(mn->keys[0]), &location);
+ if (r) {
+ if (shadow_current(s) != left)
+ unlock_block(s->info, left);
+
+ unlock_block(s->info, middle);
+
+ if (shadow_current(s) != right)
+ unlock_block(s->info, right);
+
+ return r;
+ }
+
+
+ /* patch up the spine */
+ if (key < le64_to_cpu(mn->keys[0])) {
+ unlock_block(s->info, middle);
+ unlock_block(s->info, right);
+ s->nodes[1] = left;
+ } else if (key < le64_to_cpu(rn->keys[0])) {
+ unlock_block(s->info, left);
+ unlock_block(s->info, right);
+ s->nodes[1] = middle;
+ } else {
+ unlock_block(s->info, left);
+ unlock_block(s->info, middle);
+ s->nodes[1] = right;
+ }
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
* Splits a node by creating two new children beneath the given node.
*
* Before:
@@ -690,6 +896,186 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
return 0;
}
+/*----------------------------------------------------------------*/
+
+/*
+ * Redistributes a node's entries with its left sibling.
+ */
+static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
+ unsigned parent_index, uint64_t key)
+{
+ int r;
+ struct dm_block *sib;
+ struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
+
+ r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
+ if (r)
+ return r;
+
+ left = dm_block_data(sib);
+ right = dm_block_data(shadow_current(s));
+ redistribute2(left, right);
+ *key_ptr(parent, parent_index) = right->keys[0];
+
+ if (key < le64_to_cpu(right->keys[0])) {
+ unlock_block(s->info, s->nodes[1]);
+ s->nodes[1] = sib;
+ } else {
+ unlock_block(s->info, sib);
+ }
+
+ return 0;
+}
+
+/*
+ * Redistributes a nodes entries with its right sibling.
+ */
+static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
+ unsigned parent_index, uint64_t key)
+{
+ int r;
+ struct dm_block *sib;
+ struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
+
+ r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
+ if (r)
+ return r;
+
+ left = dm_block_data(shadow_current(s));
+ right = dm_block_data(sib);
+ redistribute2(left, right);
+ *key_ptr(parent, parent_index + 1) = right->keys[0];
+
+ if (key < le64_to_cpu(right->keys[0])) {
+ unlock_block(s->info, sib);
+ } else {
+ unlock_block(s->info, s->nodes[1]);
+ s->nodes[1] = sib;
+ }
+
+ return 0;
+}
+
+/*
+ * Returns the number of spare entries in a node.
+ */
+static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
+{
+ int r;
+ unsigned nr_entries;
+ struct dm_block *block;
+ struct btree_node *node;
+
+ r = bn_read_lock(info, b, &block);
+ if (r)
+ return r;
+
+ node = dm_block_data(block);
+ nr_entries = le32_to_cpu(node->header.nr_entries);
+ *space = le32_to_cpu(node->header.max_entries) - nr_entries;
+
+ unlock_block(info, block);
+ return 0;
+}
+
+/*
+ * Make space in a node, either by moving some entries to a sibling,
+ * or creating a new sibling node. SPACE_THRESHOLD defines the minimum
+ * number of free entries that must be in the sibling to make the move
+ * worth while. If the siblings are shared (eg, part of a snapshot),
+ * then they are not touched, since this break sharing and so consume
+ * more space than we save.
+ */
+#define SPACE_THRESHOLD 8
+static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
+ unsigned parent_index, uint64_t key)
+{
+ int r;
+ struct btree_node *parent = dm_block_data(shadow_parent(s));
+ unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
+ unsigned free_space;
+ int left_shared = 0, right_shared = 0;
+
+ /* Should we move entries to the left sibling? */
+ if (parent_index > 0) {
+ dm_block_t left_b = value64(parent, parent_index - 1);
+ r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
+ if (r)
+ return r;
+
+ if (!left_shared) {
+ r = get_node_free_space(s->info, left_b, &free_space);
+ if (r)
+ return r;
+
+ if (free_space >= SPACE_THRESHOLD)
+ return rebalance_left(s, vt, parent_index, key);
+ }
+ }
+
+ /* Should we move entries to the right sibling? */
+ if (parent_index < (nr_parent - 1)) {
+ dm_block_t right_b = value64(parent, parent_index + 1);
+ r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
+ if (r)
+ return r;
+
+ if (!right_shared) {
+ r = get_node_free_space(s->info, right_b, &free_space);
+ if (r)
+ return r;
+
+ if (free_space >= SPACE_THRESHOLD)
+ return rebalance_right(s, vt, parent_index, key);
+ }
+ }
+
+ /*
+ * We need to split the node, normally we split two nodes
+ * into three. But when inserting a sequence that is either
+ * monotonically increasing or decreasing it's better to split
+ * a single node into two.
+ */
+ if (left_shared || right_shared || (nr_parent <= 2) ||
+ (parent_index == 0) || (parent_index + 1 == nr_parent)) {
+ return split_one_into_two(s, parent_index, vt, key);
+ } else {
+ return split_two_into_three(s, parent_index, vt, key);
+ }
+}
+
+/*
+ * Does the node contain a particular key?
+ */
+static bool contains_key(struct btree_node *node, uint64_t key)
+{
+ int i = lower_bound(node, key);
+
+ if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
+ return true;
+
+ return false;
+}
+
+/*
+ * In general we preemptively make sure there's a free entry in every
+ * node on the spine when doing an insert. But we can avoid that with
+ * leaf nodes if we know it's an overwrite.
+ */
+static bool has_space_for_insert(struct btree_node *node, uint64_t key)
+{
+ if (node->header.nr_entries == node->header.max_entries) {
+ if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
+ /* we don't need space if it's an overwrite */
+ return contains_key(node, key);
+ }
+
+ return false;
+ }
+
+ return true;
+}
+
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
struct dm_btree_value_type *vt,
uint64_t key, unsigned *index)
@@ -719,17 +1105,18 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
node = dm_block_data(shadow_current(s));
- if (node->header.nr_entries == node->header.max_entries) {
+ if (!has_space_for_insert(node, key)) {
if (top)
r = btree_split_beneath(s, key);
else
- r = btree_split_sibling(s, i, key);
+ r = rebalance_or_split(s, vt, i, key);
if (r < 0)
return r;
- }
- node = dm_block_data(shadow_current(s));
+ /* making space can cause the current node to change */
+ node = dm_block_data(shadow_current(s));
+ }
i = lower_bound(node, key);
@@ -753,6 +1140,77 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
return 0;
}
+static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
+ uint64_t key, int *index)
+{
+ int r, i = -1;
+ struct btree_node *node;
+
+ *index = 0;
+ for (;;) {
+ r = shadow_step(s, root, &s->info->value_type);
+ if (r < 0)
+ return r;
+
+ node = dm_block_data(shadow_current(s));
+
+ /*
+ * We have to patch up the parent node, ugly, but I don't
+ * see a way to do this automatically as part of the spine
+ * op.
+ */
+ if (shadow_has_parent(s) && i >= 0) {
+ __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+
+ __dm_bless_for_disk(&location);
+ memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
+ &location, sizeof(__le64));
+ }
+
+ node = dm_block_data(shadow_current(s));
+ i = lower_bound(node, key);
+
+ BUG_ON(i < 0);
+ BUG_ON(i >= le32_to_cpu(node->header.nr_entries));
+
+ if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
+ if (key != le64_to_cpu(node->keys[i]))
+ return -EINVAL;
+ break;
+ }
+
+ root = value64(node, i);
+ }
+
+ *index = i;
+ return 0;
+}
+
+int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
+ uint64_t key, int *index,
+ dm_block_t *new_root, struct dm_block **leaf)
+{
+ int r;
+ struct shadow_spine spine;
+
+ BUG_ON(info->levels > 1);
+ init_shadow_spine(&spine, info);
+ r = __btree_get_overwrite_leaf(&spine, root, key, index);
+ if (!r) {
+ *new_root = shadow_root(&spine);
+ *leaf = shadow_current(&spine);
+
+ /*
+ * Decrement the count so exit_shadow_spine() doesn't
+ * unlock the leaf.
+ */
+ spine.count--;
+ }
+ exit_shadow_spine(&spine);
+
+ return r;
+}
+
static bool need_insert(struct btree_node *node, uint64_t *keys,
unsigned level, unsigned index)
{
@@ -829,7 +1287,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
value_ptr(n, index),
value))) {
info->value_type.dec(info->value_type.context,
- value_ptr(n, index));
+ value_ptr(n, index), 1);
}
memcpy_disk(value_ptr(n, index),
value, info->value_type.size);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 3dc5bb1a4748..d2ae5aa4d00b 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -51,21 +51,21 @@ struct dm_btree_value_type {
*/
/*
- * The btree is making a duplicate of the value, for instance
+ * The btree is making a duplicate of a run of values, for instance
* because previously-shared btree nodes have now diverged.
* @value argument is the new copy that the copy function may modify.
* (Probably it just wants to increment a reference count
* somewhere.) This method is _not_ called for insertion of a new
* value: It is assumed the ref count is already 1.
*/
- void (*inc)(void *context, const void *value);
+ void (*inc)(void *context, const void *value, unsigned count);
/*
- * This value is being deleted. The btree takes care of freeing
+ * These values are being deleted. The btree takes care of freeing
* the memory pointed to by @value. Often the del function just
- * needs to decrement a reference count somewhere.
+ * needs to decrement a reference counts somewhere.
*/
- void (*dec)(void *context, const void *value);
+ void (*dec)(void *context, const void *value, unsigned count);
/*
* A test for equality between two values. When a value is
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index a213bf11738f..4a6a2a9b4eb4 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -6,6 +6,8 @@
#include "dm-space-map-common.h"
#include "dm-transaction-manager.h"
+#include "dm-btree-internal.h"
+#include "dm-persistent-data-internal.h"
#include <linux/bitops.h>
#include <linux/device-mapper.h>
@@ -409,12 +411,13 @@ int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
return r;
}
-static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
- int (*mutator)(void *context, uint32_t old, uint32_t *new),
- void *context, enum allocation_event *ev)
+/*----------------------------------------------------------------*/
+
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
+ uint32_t ref_count, int32_t *nr_allocations)
{
int r;
- uint32_t bit, old, ref_count;
+ uint32_t bit, old;
struct dm_block *nb;
dm_block_t index = b;
struct disk_index_entry ie_disk;
@@ -433,10 +436,9 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
return r;
}
ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
-
bm_le = dm_bitmap_data(nb);
- old = sm_lookup_bitmap(bm_le, bit);
+ old = sm_lookup_bitmap(bm_le, bit);
if (old > 2) {
r = sm_ll_lookup_big_ref_count(ll, b, &old);
if (r < 0) {
@@ -445,7 +447,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
}
}
- r = mutator(context, old, &ref_count);
if (r) {
dm_tm_unlock(ll->tm, nb);
return r;
@@ -453,7 +454,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
if (ref_count <= 2) {
sm_set_bitmap(bm_le, bit, ref_count);
-
dm_tm_unlock(ll->tm, nb);
if (old > 2) {
@@ -480,62 +480,459 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
}
if (ref_count && !old) {
- *ev = SM_ALLOC;
+ *nr_allocations = 1;
ll->nr_allocated++;
le32_add_cpu(&ie_disk.nr_free, -1);
if (le32_to_cpu(ie_disk.none_free_before) == bit)
ie_disk.none_free_before = cpu_to_le32(bit + 1);
} else if (old && !ref_count) {
- *ev = SM_FREE;
+ *nr_allocations = -1;
ll->nr_allocated--;
le32_add_cpu(&ie_disk.nr_free, 1);
ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
} else
- *ev = SM_NONE;
+ *nr_allocations = 0;
return ll->save_ie(ll, index, &ie_disk);
}
-static int set_ref_count(void *context, uint32_t old, uint32_t *new)
+/*----------------------------------------------------------------*/
+
+/*
+ * Holds useful intermediate results for the range based inc and dec
+ * operations.
+ */
+struct inc_context {
+ struct disk_index_entry ie_disk;
+ struct dm_block *bitmap_block;
+ void *bitmap;
+
+ struct dm_block *overflow_leaf;
+};
+
+static inline void init_inc_context(struct inc_context *ic)
+{
+ ic->bitmap_block = NULL;
+ ic->bitmap = NULL;
+ ic->overflow_leaf = NULL;
+}
+
+static inline void exit_inc_context(struct ll_disk *ll, struct inc_context *ic)
+{
+ if (ic->bitmap_block)
+ dm_tm_unlock(ll->tm, ic->bitmap_block);
+ if (ic->overflow_leaf)
+ dm_tm_unlock(ll->tm, ic->overflow_leaf);
+}
+
+static inline void reset_inc_context(struct ll_disk *ll, struct inc_context *ic)
+{
+ exit_inc_context(ll, ic);
+ init_inc_context(ic);
+}
+
+/*
+ * Confirms a btree node contains a particular key at an index.
+ */
+static bool contains_key(struct btree_node *n, uint64_t key, int index)
+{
+ return index >= 0 &&
+ index < le32_to_cpu(n->header.nr_entries) &&
+ le64_to_cpu(n->keys[index]) == key;
+}
+
+static int __sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic)
{
- *new = *((uint32_t *) context);
+ int r;
+ int index;
+ struct btree_node *n;
+ __le32 *v_ptr;
+ uint32_t rc;
+
+ /*
+ * bitmap_block needs to be unlocked because getting the
+ * overflow_leaf may need to allocate, and thus use the space map.
+ */
+ reset_inc_context(ll, ic);
+
+ r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root,
+ b, &index, &ll->ref_count_root, &ic->overflow_leaf);
+ if (r < 0)
+ return r;
+
+ n = dm_block_data(ic->overflow_leaf);
+
+ if (!contains_key(n, b, index)) {
+ DMERR("overflow btree is missing an entry");
+ return -EINVAL;
+ }
+
+ v_ptr = value_ptr(n, index);
+ rc = le32_to_cpu(*v_ptr) + 1;
+ *v_ptr = cpu_to_le32(rc);
+
return 0;
}
-int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
- uint32_t ref_count, enum allocation_event *ev)
+static int sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic)
+{
+ int index;
+ struct btree_node *n;
+ __le32 *v_ptr;
+ uint32_t rc;
+
+ /*
+ * Do we already have the correct overflow leaf?
+ */
+ if (ic->overflow_leaf) {
+ n = dm_block_data(ic->overflow_leaf);
+ index = lower_bound(n, b);
+ if (contains_key(n, b, index)) {
+ v_ptr = value_ptr(n, index);
+ rc = le32_to_cpu(*v_ptr) + 1;
+ *v_ptr = cpu_to_le32(rc);
+
+ return 0;
+ }
+ }
+
+ return __sm_ll_inc_overflow(ll, b, ic);
+}
+
+static inline int shadow_bitmap(struct ll_disk *ll, struct inc_context *ic)
+{
+ int r, inc;
+ r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ic->ie_disk.blocknr),
+ &dm_sm_bitmap_validator, &ic->bitmap_block, &inc);
+ if (r < 0) {
+ DMERR("dm_tm_shadow_block() failed");
+ return r;
+ }
+ ic->ie_disk.blocknr = cpu_to_le64(dm_block_location(ic->bitmap_block));
+ ic->bitmap = dm_bitmap_data(ic->bitmap_block);
+ return 0;
+}
+
+/*
+ * Once shadow_bitmap has been called, which always happens at the start of inc/dec,
+ * we can reopen the bitmap with a simple write lock, rather than re calling
+ * dm_tm_shadow_block().
+ */
+static inline int ensure_bitmap(struct ll_disk *ll, struct inc_context *ic)
+{
+ if (!ic->bitmap_block) {
+ int r = dm_bm_write_lock(dm_tm_get_bm(ll->tm), le64_to_cpu(ic->ie_disk.blocknr),
+ &dm_sm_bitmap_validator, &ic->bitmap_block);
+ if (r) {
+ DMERR("unable to re-get write lock for bitmap");
+ return r;
+ }
+ ic->bitmap = dm_bitmap_data(ic->bitmap_block);
+ }
+
+ return 0;
+}
+
+/*
+ * Loops round incrementing entries in a single bitmap.
+ */
+static inline int sm_ll_inc_bitmap(struct ll_disk *ll, dm_block_t b,
+ uint32_t bit, uint32_t bit_end,
+ int32_t *nr_allocations, dm_block_t *new_b,
+ struct inc_context *ic)
+{
+ int r;
+ __le32 le_rc;
+ uint32_t old;
+
+ for (; bit != bit_end; bit++, b++) {
+ /*
+ * We only need to drop the bitmap if we need to find a new btree
+ * leaf for the overflow. So if it was dropped last iteration,
+ * we now re-get it.
+ */
+ r = ensure_bitmap(ll, ic);
+ if (r)
+ return r;
+
+ old = sm_lookup_bitmap(ic->bitmap, bit);
+ switch (old) {
+ case 0:
+ /* inc bitmap, adjust nr_allocated */
+ sm_set_bitmap(ic->bitmap, bit, 1);
+ (*nr_allocations)++;
+ ll->nr_allocated++;
+ le32_add_cpu(&ic->ie_disk.nr_free, -1);
+ if (le32_to_cpu(ic->ie_disk.none_free_before) == bit)
+ ic->ie_disk.none_free_before = cpu_to_le32(bit + 1);
+ break;
+
+ case 1:
+ /* inc bitmap */
+ sm_set_bitmap(ic->bitmap, bit, 2);
+ break;
+
+ case 2:
+ /* inc bitmap and insert into overflow */
+ sm_set_bitmap(ic->bitmap, bit, 3);
+ reset_inc_context(ll, ic);
+
+ le_rc = cpu_to_le32(3);
+ __dm_bless_for_disk(&le_rc);
+ r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
+ &b, &le_rc, &ll->ref_count_root);
+ if (r < 0) {
+ DMERR("ref count insert failed");
+ return r;
+ }
+ break;
+
+ default:
+ /*
+ * inc within the overflow tree only.
+ */
+ r = sm_ll_inc_overflow(ll, b, ic);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ *new_b = b;
+ return 0;
+}
+
+/*
+ * Finds a bitmap that contains entries in the block range, and increments
+ * them.
+ */
+static int __sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e,
+ int32_t *nr_allocations, dm_block_t *new_b)
{
- return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
+ int r;
+ struct inc_context ic;
+ uint32_t bit, bit_end;
+ dm_block_t index = b;
+
+ init_inc_context(&ic);
+
+ bit = do_div(index, ll->entries_per_block);
+ r = ll->load_ie(ll, index, &ic.ie_disk);
+ if (r < 0)
+ return r;
+
+ r = shadow_bitmap(ll, &ic);
+ if (r)
+ return r;
+
+ bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block);
+ r = sm_ll_inc_bitmap(ll, b, bit, bit_end, nr_allocations, new_b, &ic);
+
+ exit_inc_context(ll, &ic);
+
+ if (r)
+ return r;
+
+ return ll->save_ie(ll, index, &ic.ie_disk);
}
-static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e,
+ int32_t *nr_allocations)
{
- *new = old + 1;
+ *nr_allocations = 0;
+ while (b != e) {
+ int r = __sm_ll_inc(ll, b, e, nr_allocations, &b);
+ if (r)
+ return r;
+ }
+
return 0;
}
-int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+/*----------------------------------------------------------------*/
+
+static int __sm_ll_del_overflow(struct ll_disk *ll, dm_block_t b,
+ struct inc_context *ic)
{
- return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
+ reset_inc_context(ll, ic);
+ return dm_btree_remove(&ll->ref_count_info, ll->ref_count_root,
+ &b, &ll->ref_count_root);
}
-static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
+static int __sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b,
+ struct inc_context *ic, uint32_t *old_rc)
{
- if (!old) {
- DMERR_LIMIT("unable to decrement a reference count below 0");
+ int r;
+ int index = -1;
+ struct btree_node *n;
+ __le32 *v_ptr;
+ uint32_t rc;
+
+ reset_inc_context(ll, ic);
+ r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root,
+ b, &index, &ll->ref_count_root, &ic->overflow_leaf);
+ if (r < 0)
+ return r;
+
+ n = dm_block_data(ic->overflow_leaf);
+
+ if (!contains_key(n, b, index)) {
+ DMERR("overflow btree is missing an entry");
return -EINVAL;
}
- *new = old - 1;
+ v_ptr = value_ptr(n, index);
+ rc = le32_to_cpu(*v_ptr);
+ *old_rc = rc;
+
+ if (rc == 3) {
+ return __sm_ll_del_overflow(ll, b, ic);
+ } else {
+ rc--;
+ *v_ptr = cpu_to_le32(rc);
+ return 0;
+ }
+}
+
+static int sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b,
+ struct inc_context *ic, uint32_t *old_rc)
+{
+ /*
+ * Do we already have the correct overflow leaf?
+ */
+ if (ic->overflow_leaf) {
+ int index;
+ struct btree_node *n;
+ __le32 *v_ptr;
+ uint32_t rc;
+
+ n = dm_block_data(ic->overflow_leaf);
+ index = lower_bound(n, b);
+ if (contains_key(n, b, index)) {
+ v_ptr = value_ptr(n, index);
+ rc = le32_to_cpu(*v_ptr);
+ *old_rc = rc;
+
+ if (rc > 3) {
+ rc--;
+ *v_ptr = cpu_to_le32(rc);
+ return 0;
+ } else {
+ return __sm_ll_del_overflow(ll, b, ic);
+ }
+
+ }
+ }
+
+ return __sm_ll_dec_overflow(ll, b, ic, old_rc);
+}
+
+/*
+ * Loops round incrementing entries in a single bitmap.
+ */
+static inline int sm_ll_dec_bitmap(struct ll_disk *ll, dm_block_t b,
+ uint32_t bit, uint32_t bit_end,
+ struct inc_context *ic,
+ int32_t *nr_allocations, dm_block_t *new_b)
+{
+ int r;
+ uint32_t old;
+
+ for (; bit != bit_end; bit++, b++) {
+ /*
+ * We only need to drop the bitmap if we need to find a new btree
+ * leaf for the overflow. So if it was dropped last iteration,
+ * we now re-get it.
+ */
+ r = ensure_bitmap(ll, ic);
+ if (r)
+ return r;
+
+ old = sm_lookup_bitmap(ic->bitmap, bit);
+ switch (old) {
+ case 0:
+ DMERR("unable to decrement block");
+ return -EINVAL;
+
+ case 1:
+ /* dec bitmap */
+ sm_set_bitmap(ic->bitmap, bit, 0);
+ (*nr_allocations)--;
+ ll->nr_allocated--;
+ le32_add_cpu(&ic->ie_disk.nr_free, 1);
+ ic->ie_disk.none_free_before =
+ cpu_to_le32(min(le32_to_cpu(ic->ie_disk.none_free_before), bit));
+ break;
+
+ case 2:
+ /* dec bitmap and insert into overflow */
+ sm_set_bitmap(ic->bitmap, bit, 1);
+ break;
+
+ case 3:
+ r = sm_ll_dec_overflow(ll, b, ic, &old);
+ if (r < 0)
+ return r;
+
+ if (old == 3) {
+ r = ensure_bitmap(ll, ic);
+ if (r)
+ return r;
+
+ sm_set_bitmap(ic->bitmap, bit, 2);
+ }
+ break;
+ }
+ }
+
+ *new_b = b;
return 0;
}
-int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+static int __sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e,
+ int32_t *nr_allocations, dm_block_t *new_b)
+{
+ int r;
+ uint32_t bit, bit_end;
+ struct inc_context ic;
+ dm_block_t index = b;
+
+ init_inc_context(&ic);
+
+ bit = do_div(index, ll->entries_per_block);
+ r = ll->load_ie(ll, index, &ic.ie_disk);
+ if (r < 0)
+ return r;
+
+ r = shadow_bitmap(ll, &ic);
+ if (r)
+ return r;
+
+ bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block);
+ r = sm_ll_dec_bitmap(ll, b, bit, bit_end, &ic, nr_allocations, new_b);
+ exit_inc_context(ll, &ic);
+
+ if (r)
+ return r;
+
+ return ll->save_ie(ll, index, &ic.ie_disk);
+}
+
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e,
+ int32_t *nr_allocations)
{
- return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev);
+ *nr_allocations = 0;
+ while (b != e) {
+ int r = __sm_ll_dec(ll, b, e, nr_allocations, &b);
+ if (r)
+ return r;
+ }
+
+ return 0;
}
+/*----------------------------------------------------------------*/
+
int sm_ll_commit(struct ll_disk *ll)
{
int r = 0;
@@ -687,28 +1084,92 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
/*----------------------------------------------------------------*/
+static inline int ie_cache_writeback(struct ll_disk *ll, struct ie_cache *iec)
+{
+ iec->dirty = false;
+ __dm_bless_for_disk(iec->ie);
+ return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
+ &iec->index, &iec->ie, &ll->bitmap_root);
+}
+
+static inline unsigned hash_index(dm_block_t index)
+{
+ return dm_hash_block(index, IE_CACHE_MASK);
+}
+
static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
struct disk_index_entry *ie)
{
- return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
+ int r;
+ unsigned h = hash_index(index);
+ struct ie_cache *iec = ll->ie_cache + h;
+
+ if (iec->valid) {
+ if (iec->index == index) {
+ memcpy(ie, &iec->ie, sizeof(*ie));
+ return 0;
+ }
+
+ if (iec->dirty) {
+ r = ie_cache_writeback(ll, iec);
+ if (r)
+ return r;
+ }
+ }
+
+ r = dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
+ if (!r) {
+ iec->valid = true;
+ iec->dirty = false;
+ iec->index = index;
+ memcpy(&iec->ie, ie, sizeof(*ie));
+ }
+
+ return r;
}
static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
struct disk_index_entry *ie)
{
- __dm_bless_for_disk(ie);
- return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
- &index, ie, &ll->bitmap_root);
+ int r;
+ unsigned h = hash_index(index);
+ struct ie_cache *iec = ll->ie_cache + h;
+
+ ll->bitmap_index_changed = true;
+ if (iec->valid) {
+ if (iec->index == index) {
+ memcpy(&iec->ie, ie, sizeof(*ie));
+ iec->dirty = true;
+ return 0;
+ }
+
+ if (iec->dirty) {
+ r = ie_cache_writeback(ll, iec);
+ if (r)
+ return r;
+ }
+ }
+
+ iec->valid = true;
+ iec->dirty = true;
+ iec->index = index;
+ memcpy(&iec->ie, ie, sizeof(*ie));
+ return 0;
}
static int disk_ll_init_index(struct ll_disk *ll)
{
+ unsigned i;
+ for (i = 0; i < IE_CACHE_SIZE; i++) {
+ struct ie_cache *iec = ll->ie_cache + i;
+ iec->valid = false;
+ iec->dirty = false;
+ }
return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
}
static int disk_ll_open(struct ll_disk *ll)
{
- /* nothing to do */
return 0;
}
@@ -719,7 +1180,16 @@ static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
static int disk_ll_commit(struct ll_disk *ll)
{
- return 0;
+ int r = 0;
+ unsigned i;
+
+ for (i = 0; i < IE_CACHE_SIZE; i++) {
+ struct ie_cache *iec = ll->ie_cache + i;
+ if (iec->valid && iec->dirty)
+ r = ie_cache_writeback(ll, iec);
+ }
+
+ return r;
}
int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
index 87e17909ef52..706ceb85d680 100644
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -54,6 +54,20 @@ typedef int (*open_index_fn)(struct ll_disk *ll);
typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
typedef int (*commit_fn)(struct ll_disk *ll);
+/*
+ * A lot of time can be wasted reading and writing the same
+ * index entry. So we cache a few entries.
+ */
+#define IE_CACHE_SIZE 64
+#define IE_CACHE_MASK (IE_CACHE_SIZE - 1)
+
+struct ie_cache {
+ bool valid;
+ bool dirty;
+ dm_block_t index;
+ struct disk_index_entry ie;
+};
+
struct ll_disk {
struct dm_transaction_manager *tm;
struct dm_btree_info bitmap_info;
@@ -79,6 +93,8 @@ struct ll_disk {
max_index_entries_fn max_entries;
commit_fn commit;
bool bitmap_index_changed:1;
+
+ struct ie_cache ie_cache[IE_CACHE_SIZE];
};
struct disk_sm_root {
@@ -96,12 +112,6 @@ struct disk_bitmap_header {
__le64 blocknr;
} __attribute__ ((packed, aligned(8)));
-enum allocation_event {
- SM_NONE,
- SM_ALLOC,
- SM_FREE,
-};
-
/*----------------------------------------------------------------*/
int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
@@ -111,9 +121,15 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result);
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
dm_block_t begin, dm_block_t end, dm_block_t *result);
-int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
-int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
-int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
+
+/*
+ * The next three functions return (via nr_allocations) the net number of
+ * allocations that were made. This number may be negative if there were
+ * more frees than allocs.
+ */
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, int32_t *nr_allocations);
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations);
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations);
int sm_ll_commit(struct ll_disk *ll);
int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index 61f56909e00b..d0a8d5e73c28 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -87,76 +87,39 @@ static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
uint32_t count)
{
int r;
- uint32_t old_count;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- r = sm_ll_insert(&smd->ll, b, count, &ev);
+ r = sm_ll_insert(&smd->ll, b, count, &nr_allocations);
if (!r) {
- switch (ev) {
- case SM_NONE:
- break;
-
- case SM_ALLOC:
- /*
- * This _must_ be free in the prior transaction
- * otherwise we've lost atomicity.
- */
- smd->nr_allocated_this_transaction++;
- break;
-
- case SM_FREE:
- /*
- * It's only free if it's also free in the last
- * transaction.
- */
- r = sm_ll_lookup(&smd->old_ll, b, &old_count);
- if (r)
- return r;
-
- if (!old_count)
- smd->nr_allocated_this_transaction--;
- break;
- }
+ smd->nr_allocated_this_transaction += nr_allocations;
}
return r;
}
-static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
+static int sm_disk_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- r = sm_ll_inc(&smd->ll, b, &ev);
- if (!r && (ev == SM_ALLOC))
- /*
- * This _must_ be free in the prior transaction
- * otherwise we've lost atomicity.
- */
- smd->nr_allocated_this_transaction++;
+ r = sm_ll_inc(&smd->ll, b, e, &nr_allocations);
+ if (!r)
+ smd->nr_allocated_this_transaction += nr_allocations;
return r;
}
-static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
+static int sm_disk_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r;
- uint32_t old_count;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- r = sm_ll_dec(&smd->ll, b, &ev);
- if (!r && (ev == SM_FREE)) {
- /*
- * It's only free if it's also free in the last
- * transaction.
- */
- r = sm_ll_lookup(&smd->old_ll, b, &old_count);
- if (!r && !old_count)
- smd->nr_allocated_this_transaction--;
- }
+ r = sm_ll_dec(&smd->ll, b, e, &nr_allocations);
+ if (!r)
+ smd->nr_allocated_this_transaction += nr_allocations;
return r;
}
@@ -164,21 +127,28 @@ static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
{
int r;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
+ if (r == -ENOSPC) {
+ /*
+ * There's no free block between smd->begin and the end of the metadata device.
+ * We search before smd->begin in case something has been freed.
+ */
+ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b);
+ }
+
if (r)
return r;
smd->begin = *b + 1;
- r = sm_ll_inc(&smd->ll, *b, &ev);
+ r = sm_ll_inc(&smd->ll, *b, *b + 1, &nr_allocations);
if (!r) {
- BUG_ON(ev != SM_ALLOC);
- smd->nr_allocated_this_transaction++;
+ smd->nr_allocated_this_transaction += nr_allocations;
}
return r;
@@ -194,7 +164,6 @@ static int sm_disk_commit(struct dm_space_map *sm)
return r;
memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
- smd->begin = 0;
smd->nr_allocated_this_transaction = 0;
return 0;
@@ -235,8 +204,8 @@ static struct dm_space_map ops = {
.get_count = sm_disk_get_count,
.count_is_more_than_one = sm_disk_count_is_more_than_one,
.set_count = sm_disk_set_count,
- .inc_block = sm_disk_inc_block,
- .dec_block = sm_disk_dec_block,
+ .inc_blocks = sm_disk_inc_blocks,
+ .dec_blocks = sm_disk_dec_blocks,
.new_block = sm_disk_new_block,
.commit = sm_disk_commit,
.root_size = sm_disk_root_size,
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 9e3c64ec2026..392ae26134a4 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -89,7 +89,8 @@ enum block_op_type {
struct block_op {
enum block_op_type type;
- dm_block_t block;
+ dm_block_t b;
+ dm_block_t e;
};
struct bop_ring_buffer {
@@ -116,7 +117,7 @@ static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
}
static int brb_push(struct bop_ring_buffer *brb,
- enum block_op_type type, dm_block_t b)
+ enum block_op_type type, dm_block_t b, dm_block_t e)
{
struct block_op *bop;
unsigned next = brb_next(brb, brb->end);
@@ -130,7 +131,8 @@ static int brb_push(struct bop_ring_buffer *brb,
bop = brb->bops + brb->end;
bop->type = type;
- bop->block = b;
+ bop->b = b;
+ bop->e = e;
brb->end = next;
@@ -145,9 +147,7 @@ static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result)
return -ENODATA;
bop = brb->bops + brb->begin;
- result->type = bop->type;
- result->block = bop->block;
-
+ memcpy(result, bop, sizeof(*result));
return 0;
}
@@ -178,10 +178,9 @@ struct sm_metadata {
struct threshold threshold;
};
-static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
+static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b, dm_block_t e)
{
- int r = brb_push(&smm->uncommitted, type, b);
-
+ int r = brb_push(&smm->uncommitted, type, b, e);
if (r) {
DMERR("too many recursive allocations");
return -ENOMEM;
@@ -193,15 +192,15 @@ static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t
static int commit_bop(struct sm_metadata *smm, struct block_op *op)
{
int r = 0;
- enum allocation_event ev;
+ int32_t nr_allocations;
switch (op->type) {
case BOP_INC:
- r = sm_ll_inc(&smm->ll, op->block, &ev);
+ r = sm_ll_inc(&smm->ll, op->b, op->e, &nr_allocations);
break;
case BOP_DEC:
- r = sm_ll_dec(&smm->ll, op->block, &ev);
+ r = sm_ll_dec(&smm->ll, op->b, op->e, &nr_allocations);
break;
}
@@ -314,7 +313,7 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
i = brb_next(&smm->uncommitted, i)) {
struct block_op *op = smm->uncommitted.bops + i;
- if (op->block != b)
+ if (b < op->b || b >= op->e)
continue;
switch (op->type) {
@@ -355,7 +354,7 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
struct block_op *op = smm->uncommitted.bops + i;
- if (op->block != b)
+ if (b < op->b || b >= op->e)
continue;
switch (op->type) {
@@ -393,7 +392,7 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
uint32_t count)
{
int r, r2;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
if (smm->recursion_count) {
@@ -402,40 +401,42 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
}
in(smm);
- r = sm_ll_insert(&smm->ll, b, count, &ev);
+ r = sm_ll_insert(&smm->ll, b, count, &nr_allocations);
r2 = out(smm);
return combine_errors(r, r2);
}
-static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
+static int sm_metadata_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r, r2 = 0;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- if (recursing(smm))
- r = add_bop(smm, BOP_INC, b);
- else {
+ if (recursing(smm)) {
+ r = add_bop(smm, BOP_INC, b, e);
+ if (r)
+ return r;
+ } else {
in(smm);
- r = sm_ll_inc(&smm->ll, b, &ev);
+ r = sm_ll_inc(&smm->ll, b, e, &nr_allocations);
r2 = out(smm);
}
return combine_errors(r, r2);
}
-static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
+static int sm_metadata_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r, r2 = 0;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
if (recursing(smm))
- r = add_bop(smm, BOP_DEC, b);
+ r = add_bop(smm, BOP_DEC, b, e);
else {
in(smm);
- r = sm_ll_dec(&smm->ll, b, &ev);
+ r = sm_ll_dec(&smm->ll, b, e, &nr_allocations);
r2 = out(smm);
}
@@ -445,23 +446,31 @@ static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
{
int r, r2 = 0;
- enum allocation_event ev;
+ int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
+ if (r == -ENOSPC) {
+ /*
+ * There's no free block between smm->begin and the end of the metadata device.
+ * We search before smm->begin in case something has been freed.
+ */
+ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b);
+ }
+
if (r)
return r;
smm->begin = *b + 1;
if (recursing(smm))
- r = add_bop(smm, BOP_INC, *b);
+ r = add_bop(smm, BOP_INC, *b, *b + 1);
else {
in(smm);
- r = sm_ll_inc(&smm->ll, *b, &ev);
+ r = sm_ll_inc(&smm->ll, *b, *b + 1, &nr_allocations);
r2 = out(smm);
}
@@ -503,7 +512,6 @@ static int sm_metadata_commit(struct dm_space_map *sm)
return r;
memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
- smm->begin = 0;
smm->allocated_this_transaction = 0;
return 0;
@@ -556,8 +564,8 @@ static const struct dm_space_map ops = {
.get_count = sm_metadata_get_count,
.count_is_more_than_one = sm_metadata_count_is_more_than_one,
.set_count = sm_metadata_set_count,
- .inc_block = sm_metadata_inc_block,
- .dec_block = sm_metadata_dec_block,
+ .inc_blocks = sm_metadata_inc_blocks,
+ .dec_blocks = sm_metadata_dec_blocks,
.new_block = sm_metadata_new_block,
.commit = sm_metadata_commit,
.root_size = sm_metadata_root_size,
@@ -641,18 +649,28 @@ static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
return 0;
}
-static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
+static int sm_bootstrap_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
+ int r;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- return add_bop(smm, BOP_INC, b);
+ r = add_bop(smm, BOP_INC, b, e);
+ if (r)
+ return r;
+
+ return 0;
}
-static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
+static int sm_bootstrap_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
+ int r;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- return add_bop(smm, BOP_DEC, b);
+ r = add_bop(smm, BOP_DEC, b, e);
+ if (r)
+ return r;
+
+ return 0;
}
static int sm_bootstrap_commit(struct dm_space_map *sm)
@@ -683,8 +701,8 @@ static const struct dm_space_map bootstrap_ops = {
.get_count = sm_bootstrap_get_count,
.count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
.set_count = sm_bootstrap_set_count,
- .inc_block = sm_bootstrap_inc_block,
- .dec_block = sm_bootstrap_dec_block,
+ .inc_blocks = sm_bootstrap_inc_blocks,
+ .dec_blocks = sm_bootstrap_dec_blocks,
.new_block = sm_bootstrap_new_block,
.commit = sm_bootstrap_commit,
.root_size = sm_bootstrap_root_size,
@@ -696,7 +714,7 @@ static const struct dm_space_map bootstrap_ops = {
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
{
- int r, i;
+ int r;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
dm_block_t old_len = smm->ll.nr_blocks;
@@ -718,9 +736,7 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
* allocate any new blocks.
*/
do {
- for (i = old_len; !r && i < smm->begin; i++)
- r = add_bop(smm, BOP_INC, i);
-
+ r = add_bop(smm, BOP_INC, old_len, smm->begin);
if (r)
goto out;
@@ -767,7 +783,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
dm_block_t superblock)
{
int r;
- dm_block_t i;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
smm->begin = superblock + 1;
@@ -792,9 +807,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
* Now we need to update the newly created data structures with the
* allocated blocks that they were built from.
*/
- for (i = superblock; !r && i < smm->begin; i++)
- r = add_bop(smm, BOP_INC, i);
-
+ r = add_bop(smm, BOP_INC, superblock, smm->begin);
if (r)
return r;
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
index 3e6d1153b7c4..a015cd11f6e9 100644
--- a/drivers/md/persistent-data/dm-space-map.h
+++ b/drivers/md/persistent-data/dm-space-map.h
@@ -46,8 +46,8 @@ struct dm_space_map {
int (*commit)(struct dm_space_map *sm);
- int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
- int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
+ int (*inc_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e);
+ int (*dec_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e);
/*
* new_block will increment the returned block.
@@ -117,14 +117,24 @@ static inline int dm_sm_commit(struct dm_space_map *sm)
return sm->commit(sm);
}
+static inline int dm_sm_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
+{
+ return sm->inc_blocks(sm, b, e);
+}
+
static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
{
- return sm->inc_block(sm, b);
+ return dm_sm_inc_blocks(sm, b, b + 1);
+}
+
+static inline int dm_sm_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
+{
+ return sm->dec_blocks(sm, b, e);
}
static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
{
- return sm->dec_block(sm, b);
+ return dm_sm_dec_blocks(sm, b, b + 1);
}
static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index abe2c5dd0993..16643fc974e8 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -359,6 +359,17 @@ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
}
EXPORT_SYMBOL_GPL(dm_tm_inc);
+void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e)
+{
+ /*
+ * The non-blocking clone doesn't support this.
+ */
+ BUG_ON(tm->is_clone);
+
+ dm_sm_inc_blocks(tm->sm, b, e);
+}
+EXPORT_SYMBOL_GPL(dm_tm_inc_range);
+
void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
{
/*
@@ -370,6 +381,47 @@ void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
}
EXPORT_SYMBOL_GPL(dm_tm_dec);
+void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e)
+{
+ /*
+ * The non-blocking clone doesn't support this.
+ */
+ BUG_ON(tm->is_clone);
+
+ dm_sm_dec_blocks(tm->sm, b, e);
+}
+EXPORT_SYMBOL_GPL(dm_tm_dec_range);
+
+void dm_tm_with_runs(struct dm_transaction_manager *tm,
+ const __le64 *value_le, unsigned count, dm_tm_run_fn fn)
+{
+ uint64_t b, begin, end;
+ bool in_run = false;
+ unsigned i;
+
+ for (i = 0; i < count; i++, value_le++) {
+ b = le64_to_cpu(*value_le);
+
+ if (in_run) {
+ if (b == end)
+ end++;
+ else {
+ fn(tm, begin, end);
+ begin = b;
+ end = b + 1;
+ }
+ } else {
+ in_run = true;
+ begin = b;
+ end = b + 1;
+ }
+ }
+
+ if (in_run)
+ fn(tm, begin, end);
+}
+EXPORT_SYMBOL_GPL(dm_tm_with_runs);
+
int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
uint32_t *result)
{
@@ -379,6 +431,15 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
return dm_sm_get_count(tm->sm, b, result);
}
+int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b,
+ int *result)
+{
+ if (tm->is_clone)
+ return -EWOULDBLOCK;
+
+ return dm_sm_count_is_more_than_one(tm->sm, b, result);
+}
+
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
{
return tm->bm;
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index f3a18be68f30..906c02ed0365 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -100,11 +100,27 @@ void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
* Functions for altering the reference count of a block directly.
*/
void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
-
+void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e);
void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
+void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e);
+
+/*
+ * Builds up runs of adjacent blocks, and then calls the given fn
+ * (typically dm_tm_inc/dec). Very useful when you have to perform
+ * the same tm operation on all values in a btree leaf.
+ */
+typedef void (*dm_tm_run_fn)(struct dm_transaction_manager *, dm_block_t, dm_block_t);
+void dm_tm_with_runs(struct dm_transaction_manager *tm,
+ const __le64 *value_le, unsigned count, dm_tm_run_fn fn);
-int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
- uint32_t *result);
+int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result);
+
+/*
+ * Finds out if a given block is shared (ie. has a reference count higher
+ * than one).
+ */
+int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b,
+ int *result);
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e5d7411cba9b..62c8b6adac70 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -546,6 +546,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ if (bio->bi_pool != &mddev->bio_set)
+ md_account_bio(mddev, &bio);
+
orig_sector = sector;
zone = find_zone(mddev->private, &sector);
switch (conf->layout) {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ced076ba560e..51f2547c2007 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -300,6 +300,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_status = BLK_STS_IOERR;
+ if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ bio_end_io_acct(bio, r1_bio->start_time);
bio_endio(bio);
}
@@ -1210,7 +1212,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
int max_sectors;
int rdisk;
- bool print_msg = !!r1_bio;
+ bool r1bio_existed = !!r1_bio;
char b[BDEVNAME_SIZE];
/*
@@ -1220,7 +1222,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
*/
gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
- if (print_msg) {
+ if (r1bio_existed) {
/* Need to get the block device name carefully */
struct md_rdev *rdev;
rcu_read_lock();
@@ -1252,7 +1254,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
if (rdisk < 0) {
/* couldn't find anywhere to read from */
- if (print_msg) {
+ if (r1bio_existed) {
pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
mdname(mddev),
b,
@@ -1263,7 +1265,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
mirror = conf->mirrors + rdisk;
- if (print_msg)
+ if (r1bio_existed)
pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
mdname(mddev),
(unsigned long long)r1_bio->sector,
@@ -1292,6 +1294,9 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
r1_bio->read_disk = rdisk;
+ if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ r1_bio->start_time = bio_start_io_acct(bio);
+
read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r1_bio->bios[rdisk] = read_bio;
@@ -1461,6 +1466,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sectors = max_sectors;
}
+ if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ r1_bio->start_time = bio_start_io_acct(bio);
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index b7eb09e8c025..ccf10e59b116 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -158,6 +158,7 @@ struct r1bio {
sector_t sector;
int sectors;
unsigned long state;
+ unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 13f5e6b2a73d..16977e8e075d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -297,6 +297,8 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR;
+ if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ bio_end_io_acct(bio, r10_bio->start_time);
bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
@@ -1184,6 +1186,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
+ if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ r10_bio->start_time = bio_start_io_acct(bio);
read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
@@ -1483,6 +1487,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->master_bio = bio;
}
+ if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ r10_bio->start_time = bio_start_io_acct(bio);
atomic_set(&r10_bio->remaining, 1);
md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1461fd55311b..c34bb196790e 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -124,6 +124,7 @@ struct r10bio {
sector_t sector; /* virtual sector number */
int sectors;
unsigned long state;
+ unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 841e1c1aa5e6..b8436e4930ed 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5311,8 +5311,6 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
unsigned int chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
- WARN_ON_ONCE(bio->bi_bdev->bd_partno);
-
chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
@@ -5364,11 +5362,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
*/
static void raid5_align_endio(struct bio *bi)
{
- struct bio* raid_bi = bi->bi_private;
+ struct md_io_acct *md_io_acct = bi->bi_private;
+ struct bio *raid_bi = md_io_acct->orig_bio;
struct mddev *mddev;
struct r5conf *conf;
struct md_rdev *rdev;
blk_status_t error = bi->bi_status;
+ unsigned long start_time = md_io_acct->start_time;
bio_put(bi);
@@ -5380,6 +5380,8 @@ static void raid5_align_endio(struct bio *bi)
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
+ if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
+ bio_end_io_acct(raid_bi, start_time);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -5398,6 +5400,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct md_rdev *rdev;
sector_t sector, end_sector, first_bad;
int bad_sectors, dd_idx;
+ struct md_io_acct *md_io_acct;
+ bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
pr_debug("%s: non aligned\n", __func__);
@@ -5427,29 +5431,46 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
- align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
- bio_set_dev(align_bio, rdev->bdev);
- align_bio->bi_end_io = raid5_align_endio;
- align_bio->bi_private = raid_bio;
- align_bio->bi_iter.bi_sector = sector;
-
- raid_bio->bi_next = (void *)rdev;
-
- if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad,
+ if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
&bad_sectors)) {
- bio_put(align_bio);
+ bio_put(raid_bio);
rdev_dec_pending(rdev, mddev);
return 0;
}
+ align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set);
+ md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+ raid_bio->bi_next = (void *)rdev;
+ if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
+ md_io_acct->start_time = bio_start_io_acct(raid_bio);
+ md_io_acct->orig_bio = raid_bio;
+
+ bio_set_dev(align_bio, rdev->bdev);
+ align_bio->bi_end_io = raid5_align_endio;
+ align_bio->bi_private = md_io_acct;
+ align_bio->bi_iter.bi_sector = sector;
+
/* No reshape active, so we can trust rdev->data_offset */
align_bio->bi_iter.bi_sector += rdev->data_offset;
- spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
- conf->device_lock);
- atomic_inc(&conf->active_aligned_reads);
- spin_unlock_irq(&conf->device_lock);
+ did_inc = false;
+ if (conf->quiesce == 0) {
+ atomic_inc(&conf->active_aligned_reads);
+ did_inc = true;
+ }
+ /* need a memory barrier to detect the race with raid5_quiesce() */
+ if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
+ /* quiesce is in progress, so we need to undo io activation and wait
+ * for it to finish
+ */
+ if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_quiescent);
+ spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
+ conf->device_lock);
+ atomic_inc(&conf->active_aligned_reads);
+ spin_unlock_irq(&conf->device_lock);
+ }
if (mddev->gendisk)
trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -5798,6 +5819,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
+ md_account_bio(mddev, &bi);
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
int previous;
@@ -6930,7 +6952,7 @@ static struct attribute *raid5_attrs[] = {
&ppl_write_hint.attr,
NULL,
};
-static struct attribute_group raid5_attrs_group = {
+static const struct attribute_group raid5_attrs_group = {
.name = NULL,
.attrs = raid5_attrs,
};
@@ -8336,7 +8358,10 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
* active stripes can drain
*/
r5c_flush_cache(conf, INT_MAX);
- conf->quiesce = 2;
+ /* need a memory barrier to make sure read_one_chunk() sees
+ * quiesce started and reverts to slow (locked) path.
+ */
+ smp_store_release(&conf->quiesce, 2);
wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,