diff options
Diffstat (limited to 'drivers/md')
55 files changed, 2635 insertions, 805 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index f2014385d48b..0602e82a9516 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -47,7 +47,7 @@ config MD_AUTODETECT If unsure, say Y. config MD_LINEAR - tristate "Linear (append) mode" + tristate "Linear (append) mode (deprecated)" depends on BLK_DEV_MD help If you say Y here, then your multiple devices driver will be able to @@ -158,7 +158,7 @@ config MD_RAID456 If unsure, say Y. config MD_MULTIPATH - tristate "Multipath I/O support" + tristate "Multipath I/O support (deprecated)" depends on BLK_DEV_MD help MD_MULTIPATH provides a simple multi-path personality for use @@ -169,7 +169,7 @@ config MD_MULTIPATH If unsure, say N. config MD_FAULTY - tristate "Faulty test module for MD" + tristate "Faulty test module for MD (deprecated)" depends on BLK_DEV_MD help The "faulty" module allows for a block device that occasionally returns diff --git a/drivers/md/Makefile b/drivers/md/Makefile index ef7ddc27685c..a74aaf8b1445 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif +ifeq ($(CONFIG_BLK_DEV_ZONED),y) +dm-mod-objs += dm-zone.o +endif + ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0a4551e165ab..5fc989a6d452 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -364,7 +364,6 @@ struct cached_dev { /* The rest of this all shows up in sysfs */ unsigned int sequential_cutoff; - unsigned int readahead; unsigned int io_disable:1; unsigned int verify:1; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 29c231758293..6d1de889baeb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -880,9 +880,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned int sectors) { int ret = MAP_CONTINUE; - unsigned int reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; + unsigned int size_limit; s->cache_missed = 1; @@ -892,14 +892,10 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_submit; } - if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & (REQ_META|REQ_PRIO)) && - s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) - reada = min_t(sector_t, dc->readahead >> 9, - get_capacity(bio->bi_bdev->bd_disk) - - bio_end_sector(bio)); - - s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); + /* Limitation for valid replace key size and cache_bio bvecs number */ + size_limit = min_t(unsigned int, BIO_MAX_VECS * PAGE_SECTORS, + (1 << KEY_SIZE_BITS) - 1); + s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio)); s->iop.replace_key = KEY(s->iop.inode, bio->bi_iter.bi_sector + s->insert_bio_sectors, @@ -911,7 +907,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->iop.replace = true; - miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); + miss = bio_next_split(bio, s->insert_bio_sectors, GFP_NOIO, + &s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ ret = miss == bio ? MAP_DONE : -EINTR; @@ -933,9 +930,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; - if (reada) - bch_mark_cache_readahead(s->iop.c, s->d); - s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 503aafe188dc..4c7ee5fedb9d 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -46,7 +46,6 @@ read_attribute(cache_misses); read_attribute(cache_bypass_hits); read_attribute(cache_bypass_misses); read_attribute(cache_hit_ratio); -read_attribute(cache_readaheads); read_attribute(cache_miss_collisions); read_attribute(bypassed); @@ -64,7 +63,6 @@ SHOW(bch_stats) DIV_SAFE(var(cache_hits) * 100, var(cache_hits) + var(cache_misses))); - var_print(cache_readaheads); var_print(cache_miss_collisions); sysfs_hprint(bypassed, var(sectors_bypassed) << 9); #undef var @@ -86,7 +84,6 @@ static struct attribute *bch_stats_files[] = { &sysfs_cache_bypass_hits, &sysfs_cache_bypass_misses, &sysfs_cache_hit_ratio, - &sysfs_cache_readaheads, &sysfs_cache_miss_collisions, &sysfs_bypassed, NULL @@ -113,7 +110,6 @@ void bch_cache_accounting_clear(struct cache_accounting *acc) acc->total.cache_misses = 0; acc->total.cache_bypass_hits = 0; acc->total.cache_bypass_misses = 0; - acc->total.cache_readaheads = 0; acc->total.cache_miss_collisions = 0; acc->total.sectors_bypassed = 0; } @@ -145,7 +141,6 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) scale_stat(&stats->cache_misses); scale_stat(&stats->cache_bypass_hits); scale_stat(&stats->cache_bypass_misses); - scale_stat(&stats->cache_readaheads); scale_stat(&stats->cache_miss_collisions); scale_stat(&stats->sectors_bypassed); } @@ -168,7 +163,6 @@ static void scale_accounting(struct timer_list *t) move_stat(cache_misses); move_stat(cache_bypass_hits); move_stat(cache_bypass_misses); - move_stat(cache_readaheads); move_stat(cache_miss_collisions); move_stat(sectors_bypassed); @@ -209,14 +203,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, mark_cache_stats(&c->accounting.collector, hit, bypass); } -void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) -{ - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - - atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&c->accounting.collector.cache_readaheads); -} - void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index abfaabf7e7fc..ca4f435f7216 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -7,7 +7,6 @@ struct cache_stat_collector { atomic_t cache_misses; atomic_t cache_bypass_hits; atomic_t cache_bypass_misses; - atomic_t cache_readaheads; atomic_t cache_miss_collisions; atomic_t sectors_bypassed; }; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index bea8c4429ae8..185246a0d855 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -890,13 +890,9 @@ static void bcache_device_free(struct bcache_device *d) if (disk_added) del_gendisk(disk); - if (disk->queue) - blk_cleanup_queue(disk->queue); - + blk_cleanup_disk(disk); ida_simple_remove(&bcache_device_idx, first_minor_to_idx(disk->first_minor)); - if (disk_added) - put_disk(disk); } bioset_exit(&d->bio_split); @@ -946,7 +942,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto err; - d->disk = alloc_disk(BCACHE_MINORS); + d->disk = blk_alloc_disk(NUMA_NO_NODE); if (!d->disk) goto err; @@ -955,14 +951,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->major = bcache_major; d->disk->first_minor = idx_to_first_minor(idx); + d->disk->minors = BCACHE_MINORS; d->disk->fops = ops; d->disk->private_data = d; - q = blk_alloc_queue(NUMA_NO_NODE); - if (!q) - return -ENOMEM; - - d->disk->queue = q; + q = d->disk->queue; q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index cc89f3156d1a..05ac1d6fbbf3 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -137,7 +137,6 @@ rw_attribute(io_disable); rw_attribute(discard); rw_attribute(running); rw_attribute(label); -rw_attribute(readahead); rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); @@ -260,7 +259,6 @@ SHOW(__bch_cached_dev) var_printf(partial_stripes_expensive, "%u"); var_hprint(sequential_cutoff); - var_hprint(readahead); sysfs_print(running, atomic_read(&dc->running)); sysfs_print(state, states[BDEV_STATE(&dc->sb)]); @@ -365,7 +363,6 @@ STORE(__cached_dev) sysfs_strtoul_clamp(sequential_cutoff, dc->sequential_cutoff, 0, UINT_MAX); - d_strtoi_h(readahead); if (attr == &sysfs_clear_stats) bch_cache_accounting_clear(&dc->accounting); @@ -538,7 +535,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_running, &sysfs_state, &sysfs_label, - &sysfs_readahead, #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, &sysfs_bypass_torture_test, diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 6ab01ff25747..8e4ced5a2516 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -8,6 +8,7 @@ #include "dm-bio-prison-v2.h" #include "dm-bio-record.h" #include "dm-cache-metadata.h" +#include "dm-io-tracker.h" #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> @@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, /*----------------------------------------------------------------*/ -struct io_tracker { - spinlock_t lock; - - /* - * Sectors of in-flight IO. - */ - sector_t in_flight; - - /* - * The time, in jiffies, when this device became idle (if it is - * indeed idle). - */ - unsigned long idle_time; - unsigned long last_update_time; -}; - -static void iot_init(struct io_tracker *iot) -{ - spin_lock_init(&iot->lock); - iot->in_flight = 0ul; - iot->idle_time = 0ul; - iot->last_update_time = jiffies; -} - -static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) -{ - if (iot->in_flight) - return false; - - return time_after(jiffies, iot->idle_time + jifs); -} - -static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) -{ - bool r; - - spin_lock_irq(&iot->lock); - r = __iot_idle_for(iot, jifs); - spin_unlock_irq(&iot->lock); - - return r; -} - -static void iot_io_begin(struct io_tracker *iot, sector_t len) -{ - spin_lock_irq(&iot->lock); - iot->in_flight += len; - spin_unlock_irq(&iot->lock); -} - -static void __iot_io_end(struct io_tracker *iot, sector_t len) -{ - if (!len) - return; - - iot->in_flight -= len; - if (!iot->in_flight) - iot->idle_time = jiffies; -} - -static void iot_io_end(struct io_tracker *iot, sector_t len) -{ - unsigned long flags; - - spin_lock_irqsave(&iot->lock, flags); - __iot_io_end(iot, len); - spin_unlock_irqrestore(&iot->lock, flags); -} - -/*----------------------------------------------------------------*/ - /* * Represents a chunk of future work. 'input' allows continuations to pass * values between themselves, typically error values. @@ -470,7 +400,7 @@ struct cache { struct batcher committer; struct work_struct commit_ws; - struct io_tracker tracker; + struct dm_io_tracker tracker; mempool_t migration_pool; @@ -866,7 +796,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio) if (accountable_bio(cache, bio)) { pb = get_per_bio_data(bio); pb->len = bio_sectors(bio); - iot_io_begin(&cache->tracker, pb->len); + dm_iot_io_begin(&cache->tracker, pb->len); } } @@ -874,7 +804,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio) { struct per_bio_data *pb = get_per_bio_data(bio); - iot_io_end(&cache->tracker, pb->len); + dm_iot_io_end(&cache->tracker, pb->len); } static void accounted_request(struct cache *cache, struct bio *bio) @@ -1642,7 +1572,7 @@ enum busy { static enum busy spare_migration_bandwidth(struct cache *cache) { - bool idle = iot_idle_for(&cache->tracker, HZ); + bool idle = dm_iot_idle_for(&cache->tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; @@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) batcher_init(&cache->committer, commit_op, cache, issue_op, cache, cache->wq); - iot_init(&cache->tracker); + dm_iot_init(&cache->tracker); init_rwsem(&cache->background_work_lock); prevent_background_work(cache); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 5953ff2bd260..edc1553c4eea 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -114,8 +114,27 @@ struct mapped_device { bool init_tio_pdu:1; struct srcu_struct io_barrier; + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_zones; + unsigned int *zwp_offset; +#endif }; +/* + * Bits for the flags field of struct mapped_device. + */ +#define DMF_BLOCK_IO_FOR_SUSPEND 0 +#define DMF_SUSPENDED 1 +#define DMF_FROZEN 2 +#define DMF_FREEING 3 +#define DMF_DELETING 4 +#define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_DEFERRED_REMOVE 6 +#define DMF_SUSPENDED_INTERNALLY 7 +#define DMF_POST_SUSPENDING 8 +#define DMF_EMULATE_ZONE_APPEND 9 + void disable_discard(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); @@ -130,6 +149,13 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md) return &md->stats; } +static inline bool dm_emulate_zone_append(struct mapped_device *md) +{ + if (blk_queue_is_zoned(md->queue)) + return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + return false; +} + #define DM_TABLE_MAX_DEPTH 16 struct dm_table { @@ -173,6 +199,45 @@ struct dm_table { #endif }; +/* + * One of these is allocated per clone bio. + */ +#define DM_TIO_MAGIC 7282014 +struct dm_target_io { + unsigned int magic; + struct dm_io *io; + struct dm_target *ti; + unsigned int target_bio_nr; + unsigned int *len_ptr; + bool inside_dm_io; + struct bio clone; +}; + +/* + * One of these is allocated per original bio. + * It contains the first clone used for that original. + */ +#define DM_IO_MAGIC 5191977 +struct dm_io { + unsigned int magic; + struct mapped_device *md; + blk_status_t status; + atomic_t io_count; + struct bio *orig_bio; + unsigned long start_time; + spinlock_t endio_lock; + struct dm_stats_aux stats_aux; + /* last member of dm_target_io is 'struct bio' */ + struct dm_target_io tio; +}; + +static inline void dm_io_inc_pending(struct dm_io *io) +{ + atomic_inc(&io->io_count); +} + +void dm_io_dec_pending(struct dm_io *io, blk_status_t error); + static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b0ab080f2567..50f4cbd600d5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3138,11 +3138,10 @@ static int crypt_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { struct crypt_config *cc = ti->private; - sector_t sector = cc->start + dm_target_offset(ti, args->next_sector); - args->start = cc->start; - return blkdev_report_zones(cc->dev->bdev, sector, nr_zones, - dm_report_zones_cb, args); + return dm_report_zones(cc->dev->bdev, cc->start, + cc->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); } #else #define crypt_report_zones NULL @@ -3281,14 +3280,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; - /* - * For zoned block devices, we need to preserve the issuer write - * ordering. To do so, disable write workqueues and force inline - * encryption completion. - */ if (bdev_is_zoned(cc->dev->bdev)) { + /* + * For zoned block devices, we need to preserve the issuer write + * ordering. To do so, disable write workqueues and force inline + * encryption completion. + */ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags); + + /* + * All zone append writes to a zone of a zoned block device will + * have the same BIO sector, the start of the zone. When the + * cypher IV mode uses sector values, all data targeting a + * zone will be encrypted using the first sector numbers of the + * zone. This will not result in write errors but will + * cause most reads to fail as reads will use the sector values + * for the actual data locations, resulting in IV mismatch. + * To avoid this problem, ask DM core to emulate zone append + * operations with regular writes. + */ + DMDEBUG("Zone append operations will be emulated"); + ti->emulate_zone_append = true; } if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index d9ac7372108c..3b748393fca5 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -363,28 +363,32 @@ static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata core->root = le64_to_cpu(disk->root); } -static void ws_inc(void *context, const void *value) +static void ws_inc(void *context, const void *value, unsigned count) { struct era_metadata *md = context; struct writeset_disk ws_d; dm_block_t b; + unsigned i; - memcpy(&ws_d, value, sizeof(ws_d)); - b = le64_to_cpu(ws_d.root); - - dm_tm_inc(md->tm, b); + for (i = 0; i < count; i++) { + memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + dm_tm_inc(md->tm, b); + } } -static void ws_dec(void *context, const void *value) +static void ws_dec(void *context, const void *value, unsigned count) { struct era_metadata *md = context; struct writeset_disk ws_d; dm_block_t b; + unsigned i; - memcpy(&ws_d, value, sizeof(ws_d)); - b = le64_to_cpu(ws_d.root); - - dm_bitset_del(&md->bitset_info, b); + for (i = 0; i < count; i++) { + memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + dm_bitset_del(&md->bitset_info, b); + } } static int ws_eq(void *context, const void *value1, const void *value2) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b7fee9936f05..5877220c01ed 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -463,11 +463,10 @@ static int flakey_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { struct flakey_c *fc = ti->private; - sector_t sector = flakey_map_sector(ti, args->next_sector); - args->start = fc->start; - return blkdev_report_zones(fc->dev->bdev, sector, nr_zones, - dm_report_zones_cb, args); + return dm_report_zones(fc->dev->bdev, fc->start, + flakey_map_sector(ti, args->next_sector), + args, nr_zones); } #else #define flakey_report_zones NULL diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 781942aeddd1..20f2510db1f6 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -66,14 +66,14 @@ struct superblock { __u8 magic[8]; __u8 version; __u8 log2_interleave_sectors; - __u16 integrity_tag_size; - __u32 journal_sections; - __u64 provided_data_sectors; /* userspace uses this value */ - __u32 flags; + __le16 integrity_tag_size; + __le32 journal_sections; + __le64 provided_data_sectors; /* userspace uses this value */ + __le32 flags; __u8 log2_sectors_per_block; __u8 log2_blocks_per_bitmap_bit; __u8 pad[2]; - __u64 recalc_sector; + __le64 recalc_sector; __u8 pad2[8]; __u8 salt[SALT_SIZE]; }; @@ -86,16 +86,16 @@ struct superblock { #define JOURNAL_ENTRY_ROUNDUP 8 -typedef __u64 commit_id_t; +typedef __le64 commit_id_t; #define JOURNAL_MAC_PER_SECTOR 8 struct journal_entry { union { struct { - __u32 sector_lo; - __u32 sector_hi; + __le32 sector_lo; + __le32 sector_hi; } s; - __u64 sector; + __le64 sector; } u; commit_id_t last_bytes[]; /* __u8 tag[0]; */ @@ -806,7 +806,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result } if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { - uint64_t section_le; + __le64 section_le; r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE); if (unlikely(r < 0)) { @@ -1640,7 +1640,7 @@ static void integrity_end_io(struct bio *bio) static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, const char *data, char *result) { - __u64 sector_le = cpu_to_le64(sector); + __le64 sector_le = cpu_to_le64(sector); SHASH_DESC_ON_STACK(req, ic->internal_hash); int r; unsigned digest_size; @@ -2689,30 +2689,26 @@ next_chunk: if (unlikely(dm_integrity_failed(ic))) goto err; - if (!ic->discard) { - io_req.bi_op = REQ_OP_READ; - io_req.bi_op_flags = 0; - io_req.mem.type = DM_IO_VMA; - io_req.mem.ptr.addr = ic->recalc_buffer; - io_req.notify.fn = NULL; - io_req.client = ic->io; - io_loc.bdev = ic->dev->bdev; - io_loc.sector = get_data_sector(ic, area, offset); - io_loc.count = n_sectors; + io_req.bi_op = REQ_OP_READ; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.addr = ic->recalc_buffer; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = get_data_sector(ic, area, offset); + io_loc.count = n_sectors; - r = dm_io(&io_req, 1, &io_loc, NULL); - if (unlikely(r)) { - dm_integrity_io_error(ic, "reading data", r); - goto err; - } + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading data", r); + goto err; + } - t = ic->recalc_tags; - for (i = 0; i < n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); - t += ic->tag_size; - } - } else { - t = ic->recalc_tags + (n_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; + t = ic->recalc_tags; + for (i = 0; i < n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + t += ic->tag_size; } metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); @@ -3826,7 +3822,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) for (i = 0; i < ic->journal_sections; i++) { struct scatterlist sg; struct skcipher_request *section_req; - __u32 section_le = cpu_to_le32(i); + __le32 section_le = cpu_to_le32(i); memset(crypt_iv, 0x00, ivsize); memset(crypt_data, 0x00, crypt_len); @@ -4368,13 +4364,11 @@ try_smaller_buffer: goto bad; } INIT_WORK(&ic->recalc_work, integrity_recalc); - if (!ic->discard) { - ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); - if (!ic->recalc_buffer) { - ti->error = "Cannot allocate buffer for recalculating"; - r = -ENOMEM; - goto bad; - } + ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); + if (!ic->recalc_buffer) { + ti->error = "Cannot allocate buffer for recalculating"; + r = -ENOMEM; + goto bad; } ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block, ic->tag_size, GFP_KERNEL); @@ -4383,9 +4377,6 @@ try_smaller_buffer: r = -ENOMEM; goto bad; } - if (ic->discard) - memset(ic->recalc_tags, DISCARD_FILLER, - (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size); } else { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { ti->error = "Recalculate can only be specified with internal_hash"; @@ -4579,7 +4570,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-io-tracker.h b/drivers/md/dm-io-tracker.h new file mode 100644 index 000000000000..bdcc6273ebf0 --- /dev/null +++ b/drivers/md/dm-io-tracker.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2021 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_IO_TRACKER_H +#define DM_IO_TRACKER_H + +#include <linux/jiffies.h> + +struct dm_io_tracker { + spinlock_t lock; + + /* + * Sectors of in-flight IO. + */ + sector_t in_flight; + + /* + * The time, in jiffies, when this device became idle + * (if it is indeed idle). + */ + unsigned long idle_time; + unsigned long last_update_time; +}; + +static inline void dm_iot_init(struct dm_io_tracker *iot) +{ + spin_lock_init(&iot->lock); + iot->in_flight = 0ul; + iot->idle_time = 0ul; + iot->last_update_time = jiffies; +} + +static inline bool dm_iot_idle_for(struct dm_io_tracker *iot, unsigned long j) +{ + bool r = false; + + spin_lock_irq(&iot->lock); + if (!iot->in_flight) + r = time_after(jiffies, iot->idle_time + j); + spin_unlock_irq(&iot->lock); + + return r; +} + +static inline unsigned long dm_iot_idle_time(struct dm_io_tracker *iot) +{ + unsigned long r = 0; + + spin_lock_irq(&iot->lock); + if (!iot->in_flight) + r = jiffies - iot->idle_time; + spin_unlock_irq(&iot->lock); + + return r; +} + +static inline void dm_iot_io_begin(struct dm_io_tracker *iot, sector_t len) +{ + spin_lock_irq(&iot->lock); + iot->in_flight += len; + spin_unlock_irq(&iot->lock); +} + +static inline void dm_iot_io_end(struct dm_io_tracker *iot, sector_t len) +{ + unsigned long flags; + + if (!len) + return; + + spin_lock_irqsave(&iot->lock, flags); + iot->in_flight -= len; + if (!iot->in_flight) + iot->idle_time = jiffies; + spin_unlock_irqrestore(&iot->lock, flags); +} + +#endif diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 1bbe4a34ef4c..37b03ab7e5c9 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -341,7 +341,7 @@ static void client_free_pages(struct dm_kcopyd_client *kc) struct kcopyd_job { struct dm_kcopyd_client *kc; struct list_head list; - unsigned long flags; + unsigned flags; /* * Error state of the job. @@ -418,7 +418,7 @@ static struct kcopyd_job *pop_io_job(struct list_head *jobs, * constraint and sequential writes that are at the right position. */ list_for_each_entry(job, jobs, list) { - if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) { + if (job->rw == READ || !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { list_del(&job->list); return job; } @@ -437,9 +437,8 @@ static struct kcopyd_job *pop(struct list_head *jobs, struct dm_kcopyd_client *kc) { struct kcopyd_job *job = NULL; - unsigned long flags; - spin_lock_irqsave(&kc->job_lock, flags); + spin_lock_irq(&kc->job_lock); if (!list_empty(jobs)) { if (jobs == &kc->io_jobs) @@ -449,7 +448,7 @@ static struct kcopyd_job *pop(struct list_head *jobs, list_del(&job->list); } } - spin_unlock_irqrestore(&kc->job_lock, flags); + spin_unlock_irq(&kc->job_lock); return job; } @@ -467,12 +466,11 @@ static void push(struct list_head *jobs, struct kcopyd_job *job) static void push_head(struct list_head *jobs, struct kcopyd_job *job) { - unsigned long flags; struct dm_kcopyd_client *kc = job->kc; - spin_lock_irqsave(&kc->job_lock, flags); + spin_lock_irq(&kc->job_lock); list_add(&job->list, jobs); - spin_unlock_irqrestore(&kc->job_lock, flags); + spin_unlock_irq(&kc->job_lock); } /* @@ -525,7 +523,7 @@ static void complete_io(unsigned long error, void *context) else job->read_err = 1; - if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { + if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) { push(&kc->complete_jobs, job); wake(kc); return; @@ -565,7 +563,7 @@ static int run_io_job(struct kcopyd_job *job) * If we need to write sequentially and some reads or writes failed, * no point in continuing. */ - if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && + if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && job->master_job->write_err) { job->write_err = job->master_job->write_err; return -EIO; @@ -648,7 +646,6 @@ static void do_work(struct work_struct *work) struct dm_kcopyd_client *kc = container_of(work, struct dm_kcopyd_client, kcopyd_work); struct blk_plug plug; - unsigned long flags; /* * The order that these are called is *very* important. @@ -657,9 +654,9 @@ static void do_work(struct work_struct *work) * list. io jobs call wake when they complete and it all * starts again. */ - spin_lock_irqsave(&kc->job_lock, flags); + spin_lock_irq(&kc->job_lock); list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs); - spin_unlock_irqrestore(&kc->job_lock, flags); + spin_unlock_irq(&kc->job_lock); blk_start_plug(&plug); process_jobs(&kc->complete_jobs, kc, run_complete_job); @@ -709,7 +706,7 @@ static void segment_complete(int read_err, unsigned long write_err, * Only dispatch more work if there hasn't been an error. */ if ((!job->read_err && !job->write_err) || - test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { + job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) { /* get the next chunk of work */ progress = job->progress; count = job->source.count - progress; @@ -801,10 +798,10 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, * we need to write sequentially. If one of the destination is a * host-aware device, then leave it to the caller to choose what to do. */ - if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) { + if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { for (i = 0; i < job->num_dests; i++) { if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) { - set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags); + job->flags |= BIT(DM_KCOPYD_WRITE_SEQ); break; } } @@ -813,9 +810,9 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, /* * If we need to write sequentially, errors cannot be ignored. */ - if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && - test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) - clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags); + if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && + job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) + job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR); if (from) { job->source = *from; @@ -983,3 +980,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) kfree(kc); } EXPORT_SYMBOL(dm_kcopyd_client_destroy); + +void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc) +{ + flush_workqueue(kc->kcopyd_wq); +} +EXPORT_SYMBOL(dm_kcopyd_client_flush); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 92db0f5e7f28..c91f1e2e2f65 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -140,11 +140,10 @@ static int linear_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { struct linear_c *lc = ti->private; - sector_t sector = linear_map_sector(ti, args->next_sector); - args->start = lc->start; - return blkdev_report_zones(lc->dev->bdev, sector, nr_zones, - dm_report_zones_cb, args); + return dm_report_zones(lc->dev->bdev, lc->start, + linear_map_sector(ti, args->next_sector), + args, nr_zones); } #else #define linear_report_zones NULL diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c index 077655cd4fae..cb8e83bfb1a7 100644 --- a/drivers/md/dm-ps-io-affinity.c +++ b/drivers/md/dm-ps-io-affinity.c @@ -91,7 +91,6 @@ static int ioa_add_path(struct path_selector *ps, struct dm_path *path, cpumask_set_cpu(cpu, s->path_mask); s->path_map[cpu] = pi; refcount_inc(&pi->refcount); - continue; } if (refcount_dec_and_test(&pi->refcount)) { diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index b0a82f29a2e4..ebb4810cc3b4 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -364,7 +364,7 @@ static void recover(struct mirror_set *ms, struct dm_region *reg) /* hand to kcopyd */ if (!errors_handled(ms)) - set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + flags |= BIT(DM_KCOPYD_IGNORE_ERROR); dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, recovery_complete, reg); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9c3bc3711b33..0dbd48cbdff9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -530,7 +530,6 @@ static const struct blk_mq_ops dm_mq_ops = { int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) { - struct request_queue *q; struct dm_target *immutable_tgt; int err; @@ -557,12 +556,10 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) if (err) goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true); - if (IS_ERR(q)) { - err = PTR_ERR(q); + err = blk_mq_init_allocated_queue(md->tag_set, md->queue); + if (err) goto out_tag_set; - } - + elevator_init_mq(md->queue); return 0; out_tag_set: diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a2acb014c13a..751ec5ea1dbb 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -855,7 +855,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) static uint32_t __minimum_chunk_size(struct origin *o) { struct dm_snapshot *snap; - unsigned chunk_size = 0; + unsigned chunk_size = rounddown_pow_of_two(UINT_MAX); if (o) list_for_each_entry(snap, &o->snapshots, list) @@ -1409,6 +1409,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!s->store->chunk_size) { ti->error = "Chunk size not set"; + r = -EINVAL; goto bad_read_metadata; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ee47a332b462..0543cdf89e92 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -249,7 +249,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * If the target is mapped to zoned block device(s), check * that the zones are not partially mapped. */ - if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) { + if (bdev_is_zoned(bdev)) { unsigned int zone_sectors = bdev_zone_sectors(bdev); if (start & (zone_sectors - 1)) { @@ -1244,7 +1244,7 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, return args.err; } -static struct blk_ksm_ll_ops dm_ksm_ll_ops = { +static const struct blk_ksm_ll_ops dm_ksm_ll_ops = { .keyslot_evict = dm_keyslot_evict, }; @@ -1981,11 +1981,12 @@ static int device_requires_stable_pages(struct dm_target *ti, return blk_queue_stable_writes(q); } -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, - struct queue_limits *limits) +int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits) { bool wc = false, fua = false; int page_size = PAGE_SIZE; + int r; /* * Copy table's limits to the DM device's request_queue @@ -2065,19 +2066,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); /* - * For a zoned target, the number of zones should be updated for the - * correct value to be exposed in sysfs queue/nr_zones. For a BIO based - * target, this is all that is needed. + * For a zoned target, setup the zones related queue attributes + * and resources necessary for zone append emulation if necessary. */ -#ifdef CONFIG_BLK_DEV_ZONED if (blk_queue_is_zoned(q)) { - WARN_ON_ONCE(queue_is_mq(q)); - q->nr_zones = blkdev_nr_zones(t->md->disk); + r = dm_set_zones_restrictions(t, q); + if (r) + return r; } -#endif dm_update_keyslot_manager(q, t); blk_queue_update_readahead(q); + + return 0; } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index e75b20480e46..c88ed14d49e6 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -311,28 +311,53 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) *t = v & ((1 << 24) - 1); } -static void data_block_inc(void *context, const void *value_le) +/* + * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as + * possible. 'with_runs' reads contiguous runs of blocks, and calls the + * given sm function. + */ +typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t); + +static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn) { - struct dm_space_map *sm = context; - __le64 v_le; - uint64_t b; + uint64_t b, begin, end; uint32_t t; + bool in_run = false; + unsigned i; - memcpy(&v_le, value_le, sizeof(v_le)); - unpack_block_time(le64_to_cpu(v_le), &b, &t); - dm_sm_inc_block(sm, b); + for (i = 0; i < count; i++, value_le++) { + /* We know value_le is 8 byte aligned */ + unpack_block_time(le64_to_cpu(*value_le), &b, &t); + + if (in_run) { + if (b == end) { + end++; + } else { + fn(sm, begin, end); + begin = b; + end = b + 1; + } + } else { + in_run = true; + begin = b; + end = b + 1; + } + } + + if (in_run) + fn(sm, begin, end); } -static void data_block_dec(void *context, const void *value_le) +static void data_block_inc(void *context, const void *value_le, unsigned count) { - struct dm_space_map *sm = context; - __le64 v_le; - uint64_t b; - uint32_t t; + with_runs((struct dm_space_map *) context, + (const __le64 *) value_le, count, dm_sm_inc_blocks); +} - memcpy(&v_le, value_le, sizeof(v_le)); - unpack_block_time(le64_to_cpu(v_le), &b, &t); - dm_sm_dec_block(sm, b); +static void data_block_dec(void *context, const void *value_le, unsigned count) +{ + with_runs((struct dm_space_map *) context, + (const __le64 *) value_le, count, dm_sm_dec_blocks); } static int data_block_equal(void *context, const void *value1_le, const void *value2_le) @@ -349,27 +374,25 @@ static int data_block_equal(void *context, const void *value1_le, const void *va return b1 == b2; } -static void subtree_inc(void *context, const void *value) +static void subtree_inc(void *context, const void *value, unsigned count) { struct dm_btree_info *info = context; - __le64 root_le; - uint64_t root; + const __le64 *root_le = value; + unsigned i; - memcpy(&root_le, value, sizeof(root_le)); - root = le64_to_cpu(root_le); - dm_tm_inc(info->tm, root); + for (i = 0; i < count; i++, root_le++) + dm_tm_inc(info->tm, le64_to_cpu(*root_le)); } -static void subtree_dec(void *context, const void *value) +static void subtree_dec(void *context, const void *value, unsigned count) { struct dm_btree_info *info = context; - __le64 root_le; - uint64_t root; + const __le64 *root_le = value; + unsigned i; - memcpy(&root_le, value, sizeof(root_le)); - root = le64_to_cpu(root_le); - if (dm_btree_del(info, root)) - DMERR("btree delete failed"); + for (i = 0; i < count; i++, root_le++) + if (dm_btree_del(info, le64_to_cpu(*root_le))) + DMERR("btree delete failed"); } static int subtree_equal(void *context, const void *value1_le, const void *value2_le) @@ -1761,11 +1784,7 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ int r = 0; pmd_write_lock(pmd); - for (; b != e; b++) { - r = dm_sm_inc_block(pmd->data_sm, b); - if (r) - break; - } + r = dm_sm_inc_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; @@ -1776,11 +1795,7 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ int r = 0; pmd_write_lock(pmd); - for (; b != e; b++) { - r = dm_sm_dec_block(pmd->data_sm, b); - if (r) - break; - } + r = dm_sm_dec_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index 29385dc470d5..db61a1f43ae9 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -15,7 +15,7 @@ #define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s static bool require_signatures; -module_param(require_signatures, bool, false); +module_param(require_signatures, bool, 0444); MODULE_PARM_DESC(require_signatures, "Verify the roothash of dm-verity hash tree"); diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index aecc246ade26..e21e29e81bbf 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -15,6 +15,8 @@ #include <linux/dax.h> #include <linux/pfn_t.h> #include <linux/libnvdimm.h> +#include <linux/delay.h> +#include "dm-io-tracker.h" #define DM_MSG_PREFIX "writecache" @@ -28,6 +30,7 @@ #define AUTOCOMMIT_MSEC 1000 #define MAX_AGE_DIV 16 #define MAX_AGE_UNSPECIFIED -1UL +#define PAUSE_WRITEBACK (HZ * 3) #define BITMAP_GRANULARITY 65536 #if BITMAP_GRANULARITY < PAGE_SIZE @@ -123,6 +126,7 @@ struct dm_writecache { size_t freelist_high_watermark; size_t freelist_low_watermark; unsigned long max_age; + unsigned long pause; unsigned uncommitted_blocks; unsigned autocommit_blocks; @@ -171,17 +175,22 @@ struct dm_writecache { bool flush_on_suspend:1; bool cleaner:1; bool cleaner_set:1; + bool metadata_only:1; + bool pause_set:1; unsigned high_wm_percent_value; unsigned low_wm_percent_value; unsigned autocommit_time_value; unsigned max_age_value; + unsigned pause_value; unsigned writeback_all; struct workqueue_struct *writeback_wq; struct work_struct writeback_work; struct work_struct flush_work; + struct dm_io_tracker iot; + struct dm_io_client *dm_io; raw_spinlock_t endio_list_lock; @@ -532,7 +541,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) region.bdev = wc->ssd_dev->bdev; region.sector = 0; - region.count = PAGE_SIZE >> SECTOR_SHIFT; + region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; if (unlikely(region.sector + region.count > wc->metadata_sectors)) region.count = wc->metadata_sectors - region.sector; @@ -1301,8 +1310,12 @@ static int writecache_map(struct dm_target *ti, struct bio *bio) writecache_flush(wc); if (writecache_has_error(wc)) goto unlock_error; + if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) + goto unlock_remap_origin; goto unlock_submit; } else { + if (dm_bio_get_target_bio_nr(bio)) + goto unlock_remap_origin; writecache_offload_bio(wc, bio); goto unlock_return; } @@ -1360,24 +1373,29 @@ read_next_block: } else { do { bool found_entry = false; + bool search_used = false; if (writecache_has_error(wc)) goto unlock_error; e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); if (e) { - if (!writecache_entry_is_committed(wc, e)) + if (!writecache_entry_is_committed(wc, e)) { + search_used = true; goto bio_copy; + } if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { wc->overwrote_committed = true; + search_used = true; goto bio_copy; } found_entry = true; } else { - if (unlikely(wc->cleaner)) + if (unlikely(wc->cleaner) || + (wc->metadata_only && !(bio->bi_opf & REQ_META))) goto direct_write; } e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { - if (!found_entry) { + if (!WC_MODE_PMEM(wc) && !found_entry) { direct_write: e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); if (e) { @@ -1404,13 +1422,31 @@ bio_copy: sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); while (bio_size < bio->bi_iter.bi_size) { - struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); - if (!f) - break; - write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + - (bio_size >> SECTOR_SHIFT), wc->seq_count); - writecache_insert_entry(wc, f); - wc->uncommitted_blocks++; + if (!search_used) { + struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); + if (!f) + break; + write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + + (bio_size >> SECTOR_SHIFT), wc->seq_count); + writecache_insert_entry(wc, f); + wc->uncommitted_blocks++; + } else { + struct wc_entry *f; + struct rb_node *next = rb_next(&e->rb_node); + if (!next) + break; + f = container_of(next, struct wc_entry, rb_node); + if (f != e + 1) + break; + if (read_original_sector(wc, f) != + read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) + break; + if (unlikely(f->write_in_progress)) + break; + if (writecache_entry_is_committed(wc, f)) + wc->overwrote_committed = true; + e = f; + } bio_size += wc->block_size; current_cache_sec += wc->block_size >> SECTOR_SHIFT; } @@ -1438,6 +1474,12 @@ bio_copy: } unlock_remap_origin: + if (likely(wc->pause != 0)) { + if (bio_op(bio) == REQ_OP_WRITE) { + dm_iot_io_begin(&wc->iot, 1); + bio->bi_private = (void *)2; + } + } bio_set_dev(bio, wc->dev->bdev); wc_unlock(wc); return DM_MAPIO_REMAPPED; @@ -1468,11 +1510,13 @@ static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t { struct dm_writecache *wc = ti->private; - if (bio->bi_private != NULL) { + if (bio->bi_private == (void *)1) { int dir = bio_data_dir(bio); if (atomic_dec_and_test(&wc->bio_in_progress[dir])) if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) wake_up(&wc->bio_in_progress_wait[dir]); + } else if (bio->bi_private == (void *)2) { + dm_iot_io_end(&wc->iot, 1); } return 0; } @@ -1642,7 +1686,7 @@ pop_from_list: return 0; } -static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp) +static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) { struct dm_writecache *wc = wb->wc; unsigned block_size = wc->block_size; @@ -1703,7 +1747,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba max_pages = WB_LIST_INLINE; } - BUG_ON(!wc_add_block(wb, e, GFP_NOIO)); + BUG_ON(!wc_add_block(wb, e)); wb->wc_list[0] = e; wb->wc_list_n = 1; @@ -1713,7 +1757,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba if (read_original_sector(wc, f) != read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) break; - if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN)) + if (!wc_add_block(wb, f)) break; wbl->size--; list_del(&f->lru); @@ -1794,6 +1838,27 @@ static void writecache_writeback(struct work_struct *work) struct writeback_list wbl; unsigned long n_walked; + if (!WC_MODE_PMEM(wc)) { + /* Wait for any active kcopyd work on behalf of ssd writeback */ + dm_kcopyd_client_flush(wc->dm_kcopyd); + } + + if (likely(wc->pause != 0)) { + while (1) { + unsigned long idle; + if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) || + unlikely(dm_suspended(wc->ti))) + break; + idle = dm_iot_idle_time(&wc->iot); + if (idle >= wc->pause) + break; + idle = wc->pause - idle; + if (idle > HZ) + idle = HZ; + schedule_timeout_idle(idle); + } + } + wc_lock(wc); restart: if (writecache_has_error(wc)) { @@ -1822,8 +1887,9 @@ restart: n_walked++; if (unlikely(n_walked > WRITEBACK_LATENCY) && - likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) { - queue_work(wc->writeback_wq, &wc->writeback_work); + likely(!wc->writeback_all)) { + if (likely(!dm_suspended(wc->ti))) + queue_work(wc->writeback_wq, &wc->writeback_work); break; } @@ -1845,15 +1911,13 @@ restart: if (unlikely(read_original_sector(wc, f) == read_original_sector(wc, e))) { BUG_ON(!f->write_in_progress); - list_del(&e->lru); - list_add(&e->lru, &skipped); + list_move(&e->lru, &skipped); cond_resched(); continue; } } wc->writeback_size++; - list_del(&e->lru); - list_add(&e->lru, &wbl.list); + list_move(&e->lru, &wbl.list); wbl.size++; e->write_in_progress = true; e->wc_list_contiguous = 1; @@ -1888,8 +1952,7 @@ restart: // break; wc->writeback_size++; - list_del(&g->lru); - list_add(&g->lru, &wbl.list); + list_move(&g->lru, &wbl.list); wbl.size++; g->write_in_progress = true; g->wc_list_contiguous = BIO_MAX_VECS; @@ -2065,7 +2128,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) struct wc_memory_superblock s; static struct dm_arg _args[] = { - {0, 16, "Invalid number of feature args"}, + {0, 18, "Invalid number of feature args"}, }; as.argc = argc; @@ -2109,6 +2172,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) INIT_WORK(&wc->writeback_work, writecache_writeback); INIT_WORK(&wc->flush_work, writecache_flush_work); + dm_iot_init(&wc->iot); + raw_spin_lock_init(&wc->endio_list_lock); INIT_LIST_HEAD(&wc->endio_list); wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio"); @@ -2156,6 +2221,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } } else { + wc->pause = PAUSE_WRITEBACK; r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); if (r) { ti->error = "Could not allocate mempool"; @@ -2292,6 +2358,20 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) wc->writeback_fua = false; wc->writeback_fua_set = true; } else goto invalid_optional; + } else if (!strcasecmp(string, "metadata_only")) { + wc->metadata_only = true; + } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) { + unsigned pause_msecs; + if (WC_MODE_PMEM(wc)) + goto invalid_optional; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1) + goto invalid_optional; + if (pause_msecs > 60000) + goto invalid_optional; + wc->pause = msecs_to_jiffies(pause_msecs); + wc->pause_set = true; + wc->pause_value = pause_msecs; } else { invalid_optional: r = -EINVAL; @@ -2463,7 +2543,7 @@ overflow: goto bad; } - ti->num_flush_bios = 1; + ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2; ti->flush_supported = true; ti->num_discard_bios = 1; @@ -2515,6 +2595,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args++; if (wc->writeback_fua_set) extra_args++; + if (wc->metadata_only) + extra_args++; + if (wc->pause_set) + extra_args += 2; DMEMIT("%u", extra_args); if (wc->start_sector_set) @@ -2535,13 +2619,17 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT(" cleaner"); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); + if (wc->metadata_only) + DMEMIT(" metadata_only"); + if (wc->pause_set) + DMEMIT(" pause_writeback %u", wc->pause_value); break; } } static struct target_type writecache_target = { .name = "writecache", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c new file mode 100644 index 000000000000..6d82a34438c8 --- /dev/null +++ b/drivers/md/dm-zone.c @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/slab.h> + +#include "dm-core.h" + +#define DM_MSG_PREFIX "zone" + +#define DM_ZONE_INVALID_WP_OFST UINT_MAX + +/* + * For internal zone reports bypassing the top BIO submission path. + */ +static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + report_zones_cb cb, void *data) +{ + struct gendisk *disk = md->disk; + int ret; + struct dm_report_zones_args args = { + .next_sector = sector, + .orig_data = data, + .orig_cb = cb, + }; + + do { + struct dm_target *tgt; + + tgt = dm_table_find_target(t, args.next_sector); + if (WARN_ON_ONCE(!tgt->type->report_zones)) + return -EIO; + + args.tgt = tgt; + ret = tgt->type->report_zones(tgt, &args, + nr_zones - args.zone_idx); + if (ret < 0) + return ret; + } while (args.zone_idx < nr_zones && + args.next_sector < get_capacity(disk)); + + return args.zone_idx; +} + +/* + * User facing dm device block device report zone operation. This calls the + * report_zones operation for each target of a device table. This operation is + * generally implemented by targets using dm_report_zones(). + */ +int dm_blk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct mapped_device *md = disk->private_data; + struct dm_table *map; + int srcu_idx, ret; + + if (dm_suspended_md(md)) + return -EAGAIN; + + map = dm_get_live_table(md, &srcu_idx); + if (!map) + return -EIO; + + ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); + + dm_put_live_table(md, srcu_idx); + + return ret; +} + +static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct dm_report_zones_args *args = data; + sector_t sector_diff = args->tgt->begin - args->start; + + /* + * Ignore zones beyond the target range. + */ + if (zone->start >= args->start + args->tgt->len) + return 0; + + /* + * Remap the start sector and write pointer position of the zone + * to match its position in the target range. + */ + zone->start += sector_diff; + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + if (zone->cond == BLK_ZONE_COND_FULL) + zone->wp = zone->start + zone->len; + else if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->wp = zone->start; + else + zone->wp += sector_diff; + } + + args->next_sector = zone->start + zone->len; + return args->orig_cb(zone, args->zone_idx++, args->orig_data); +} + +/* + * Helper for drivers of zoned targets to implement struct target_type + * report_zones operation. + */ +int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + /* + * Set the target mapping start sector first so that + * dm_report_zones_cb() can correctly remap zone information. + */ + args->start = start; + + return blkdev_report_zones(bdev, sector, nr_zones, + dm_report_zones_cb, args); +} +EXPORT_SYMBOL_GPL(dm_report_zones); + +bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) +{ + struct request_queue *q = md->queue; + + if (!blk_queue_is_zoned(q)) + return false; + + switch (bio_op(bio)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return !op_is_flush(bio->bi_opf) && bio_sectors(bio); + default: + return false; + } +} + +void dm_cleanup_zoned_dev(struct mapped_device *md) +{ + struct request_queue *q = md->queue; + + if (q) { + kfree(q->conv_zones_bitmap); + q->conv_zones_bitmap = NULL; + kfree(q->seq_zones_wlock); + q->seq_zones_wlock = NULL; + } + + kvfree(md->zwp_offset); + md->zwp_offset = NULL; + md->nr_zones = 0; +} + +static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. Use 0 as for an empty zone. + */ + return 0; + } +} + +static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct mapped_device *md = data; + struct request_queue *q = md->queue; + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!q->conv_zones_bitmap) { + q->conv_zones_bitmap = + kcalloc(BITS_TO_LONGS(q->nr_zones), + sizeof(unsigned long), GFP_NOIO); + if (!q->conv_zones_bitmap) + return -ENOMEM; + } + set_bit(idx, q->conv_zones_bitmap); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + if (!q->seq_zones_wlock) { + q->seq_zones_wlock = + kcalloc(BITS_TO_LONGS(q->nr_zones), + sizeof(unsigned long), GFP_NOIO); + if (!q->seq_zones_wlock) + return -ENOMEM; + } + if (!md->zwp_offset) { + md->zwp_offset = + kvcalloc(q->nr_zones, sizeof(unsigned int), + GFP_KERNEL); + if (!md->zwp_offset) + return -ENOMEM; + } + md->zwp_offset[idx] = dm_get_zone_wp_offset(zone); + + break; + default: + DMERR("Invalid zone type 0x%x at sectors %llu", + (int)zone->type, zone->start); + return -ENODEV; + } + + return 0; +} + +/* + * Revalidate the zones of a mapped device to initialize resource necessary + * for zone append emulation. Note that we cannot simply use the block layer + * blk_revalidate_disk_zones() function here as the mapped device is suspended + * (this is called from __bind() context). + */ +static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) +{ + struct request_queue *q = md->queue; + unsigned int noio_flag; + int ret; + + /* + * Check if something changed. If yes, cleanup the current resources + * and reallocate everything. + */ + if (!q->nr_zones || q->nr_zones != md->nr_zones) + dm_cleanup_zoned_dev(md); + if (md->nr_zones) + return 0; + + /* + * Scan all zones to initialize everything. Ensure that all vmalloc + * operations in this context are done as if GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones, + dm_zone_revalidate_cb, md); + memalloc_noio_restore(noio_flag); + if (ret < 0) + goto err; + if (ret != q->nr_zones) { + ret = -EIO; + goto err; + } + + md->nr_zones = q->nr_zones; + + return 0; + +err: + DMERR("Revalidate zones failed %d", ret); + dm_cleanup_zoned_dev(md); + return ret; +} + +static int device_not_zone_append_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + return !blk_queue_is_zoned(bdev_get_queue(dev->bdev)); +} + +static bool dm_table_supports_zone_append(struct dm_table *t) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->emulate_zone_append) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL)) + return false; + } + + return true; +} + +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) +{ + struct mapped_device *md = t->md; + + /* + * For a zoned target, the number of zones should be updated for the + * correct value to be exposed in sysfs queue/nr_zones. + */ + WARN_ON_ONCE(queue_is_mq(q)); + q->nr_zones = blkdev_nr_zones(md->disk); + + /* Check if zone append is natively supported */ + if (dm_table_supports_zone_append(t)) { + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + dm_cleanup_zoned_dev(md); + return 0; + } + + /* + * Mark the mapped device as needing zone append emulation and + * initialize the emulation resources once the capacity is set. + */ + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + if (!get_capacity(md->disk)) + return 0; + + return dm_revalidate_zones(md, t); +} + +static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + unsigned int *wp_offset = data; + + *wp_offset = dm_get_zone_wp_offset(zone); + + return 0; +} + +static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, + unsigned int *wp_ofst) +{ + sector_t sector = zno * blk_queue_zone_sectors(md->queue); + unsigned int noio_flag; + struct dm_table *t; + int srcu_idx, ret; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + return -EIO; + + /* + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = dm_blk_do_report_zones(md, t, sector, 1, + dm_update_zone_wp_offset_cb, wp_ofst); + memalloc_noio_restore(noio_flag); + + dm_put_live_table(md, srcu_idx); + + if (ret != 1) + return -EIO; + + return 0; +} + +/* + * First phase of BIO mapping for targets with zone append emulation: + * check all BIO that change a zone writer pointer and change zone + * append operations into regular write operations. + */ +static bool dm_zone_map_bio_begin(struct mapped_device *md, + struct bio *orig_bio, struct bio *clone) +{ + sector_t zsectors = blk_queue_zone_sectors(md->queue); + unsigned int zno = bio_zone_no(orig_bio); + unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); + + /* + * If the target zone is in an error state, recover by inspecting the + * zone to get its current write pointer position. Note that since the + * target zone is already locked, a BIO issuing context should never + * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. + */ + if (zwp_offset == DM_ZONE_INVALID_WP_OFST) { + if (dm_update_zone_wp_offset(md, zno, &zwp_offset)) + return false; + WRITE_ONCE(md->zwp_offset[zno], zwp_offset); + } + + switch (bio_op(orig_bio)) { + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_FINISH: + return true; + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + /* Writes must be aligned to the zone write pointer */ + if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) + return false; + break; + case REQ_OP_ZONE_APPEND: + /* + * Change zone append operations into a non-mergeable regular + * writes directed at the current write pointer position of the + * target zone. + */ + clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | + (orig_bio->bi_opf & (~REQ_OP_MASK)); + clone->bi_iter.bi_sector = + orig_bio->bi_iter.bi_sector + zwp_offset; + break; + default: + DMWARN_LIMIT("Invalid BIO operation"); + return false; + } + + /* Cannot write to a full zone */ + if (zwp_offset >= zsectors) + return false; + + return true; +} + +/* + * Second phase of BIO mapping for targets with zone append emulation: + * update the zone write pointer offset array to account for the additional + * data written to a zone. Note that at this point, the remapped clone BIO + * may already have completed, so we do not touch it. + */ +static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, + struct bio *orig_bio, + unsigned int nr_sectors) +{ + unsigned int zno = bio_zone_no(orig_bio); + unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); + + /* The clone BIO may already have been completed and failed */ + if (zwp_offset == DM_ZONE_INVALID_WP_OFST) + return BLK_STS_IOERR; + + /* Update the zone wp offset */ + switch (bio_op(orig_bio)) { + case REQ_OP_ZONE_RESET: + WRITE_ONCE(md->zwp_offset[zno], 0); + return BLK_STS_OK; + case REQ_OP_ZONE_FINISH: + WRITE_ONCE(md->zwp_offset[zno], + blk_queue_zone_sectors(md->queue)); + return BLK_STS_OK; + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); + return BLK_STS_OK; + case REQ_OP_ZONE_APPEND: + /* + * Check that the target did not truncate the write operation + * emulating a zone append. + */ + if (nr_sectors != bio_sectors(orig_bio)) { + DMWARN_LIMIT("Truncated write for zone append"); + return BLK_STS_IOERR; + } + WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); + return BLK_STS_OK; + default: + DMWARN_LIMIT("Invalid BIO operation"); + return BLK_STS_IOERR; + } +} + +static inline void dm_zone_lock(struct request_queue *q, + unsigned int zno, struct bio *clone) +{ + if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) + return; + + wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); + bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); +} + +static inline void dm_zone_unlock(struct request_queue *q, + unsigned int zno, struct bio *clone) +{ + if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) + return; + + WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock)); + clear_bit_unlock(zno, q->seq_zones_wlock); + smp_mb__after_atomic(); + wake_up_bit(q->seq_zones_wlock, zno); + + bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); +} + +static bool dm_need_zone_wp_tracking(struct bio *orig_bio) +{ + /* + * Special processing is not needed for operations that do not need the + * zone write lock, that is, all operations that target conventional + * zones and all operations that do not modify directly a sequential + * zone write pointer. + */ + if (op_is_flush(orig_bio->bi_opf) && !bio_sectors(orig_bio)) + return false; + switch (bio_op(orig_bio)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_APPEND: + return bio_zone_is_seq(orig_bio); + default: + return false; + } +} + +/* + * Special IO mapping for targets needing zone append emulation. + */ +int dm_zone_map_bio(struct dm_target_io *tio) +{ + struct dm_io *io = tio->io; + struct dm_target *ti = tio->ti; + struct mapped_device *md = io->md; + struct request_queue *q = md->queue; + struct bio *orig_bio = io->orig_bio; + struct bio *clone = &tio->clone; + unsigned int zno; + blk_status_t sts; + int r; + + /* + * IOs that do not change a zone write pointer do not need + * any additional special processing. + */ + if (!dm_need_zone_wp_tracking(orig_bio)) + return ti->type->map(ti, clone); + + /* Lock the target zone */ + zno = bio_zone_no(orig_bio); + dm_zone_lock(q, zno, clone); + + /* + * Check that the bio and the target zone write pointer offset are + * both valid, and if the bio is a zone append, remap it to a write. + */ + if (!dm_zone_map_bio_begin(md, orig_bio, clone)) { + dm_zone_unlock(q, zno, clone); + return DM_MAPIO_KILL; + } + + /* + * The target map function may issue and complete the IO quickly. + * Take an extra reference on the IO to make sure it does disappear + * until we run dm_zone_map_bio_end(). + */ + dm_io_inc_pending(io); + + /* Let the target do its work */ + r = ti->type->map(ti, clone); + switch (r) { + case DM_MAPIO_SUBMITTED: + /* + * The target submitted the clone BIO. The target zone will + * be unlocked on completion of the clone. + */ + sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr); + break; + case DM_MAPIO_REMAPPED: + /* + * The target only remapped the clone BIO. In case of error, + * unlock the target zone here as the clone will not be + * submitted. + */ + sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr); + if (sts != BLK_STS_OK) + dm_zone_unlock(q, zno, clone); + break; + case DM_MAPIO_REQUEUE: + case DM_MAPIO_KILL: + default: + dm_zone_unlock(q, zno, clone); + sts = BLK_STS_IOERR; + break; + } + + /* Drop the extra reference on the IO */ + dm_io_dec_pending(io, sts); + + if (sts != BLK_STS_OK) + return DM_MAPIO_KILL; + + return r; +} + +/* + * IO completion callback called from clone_endio(). + */ +void dm_zone_endio(struct dm_io *io, struct bio *clone) +{ + struct mapped_device *md = io->md; + struct request_queue *q = md->queue; + struct bio *orig_bio = io->orig_bio; + unsigned int zwp_offset; + unsigned int zno; + + /* + * For targets that do not emulate zone append, we only need to + * handle native zone-append bios. + */ + if (!dm_emulate_zone_append(md)) { + /* + * Get the offset within the zone of the written sector + * and add that to the original bio sector position. + */ + if (clone->bi_status == BLK_STS_OK && + bio_op(clone) == REQ_OP_ZONE_APPEND) { + sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1; + + orig_bio->bi_iter.bi_sector += + clone->bi_iter.bi_sector & mask; + } + + return; + } + + /* + * For targets that do emulate zone append, if the clone BIO does not + * own the target zone write lock, we have nothing to do. + */ + if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) + return; + + zno = bio_zone_no(orig_bio); + + if (clone->bi_status != BLK_STS_OK) { + /* + * BIOs that modify a zone write pointer may leave the zone + * in an unknown state in case of failure (e.g. the write + * pointer was only partially advanced). In this case, set + * the target zone write pointer as invalid unless it is + * already being updated. + */ + WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST); + } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { + /* + * Get the written sector for zone append operation that were + * emulated using regular write operations. + */ + zwp_offset = READ_ONCE(md->zwp_offset[zno]); + if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio))) + WRITE_ONCE(md->zwp_offset[zno], + DM_ZONE_INVALID_WP_OFST); + else + orig_bio->bi_iter.bi_sector += + zwp_offset - bio_sectors(orig_bio); + } + + dm_zone_unlock(q, zno, clone); +} diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 039d17b28938..ee4626d08557 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1390,6 +1390,13 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) return -ENXIO; } + /* + * Devices that have zones with a capacity smaller than the zone size + * (e.g. NVMe zoned namespaces) are not supported. + */ + if (blkz->capacity != blkz->len) + return -ENXIO; + switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 9c0ecc9568a4..d58db9a27e6c 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -134,7 +134,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, dst_zone_block = dmz_start_block(zmd, dst_zone); if (dmz_is_seq(dst_zone)) - set_bit(DM_KCOPYD_WRITE_SEQ, &flags); + flags |= BIT(DM_KCOPYD_WRITE_SEQ); while (block < end_block) { if (src_zone->dev->flags & DMZ_BDEV_DYING) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ca2aedd8ee7d..2c5f9e585211 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -74,38 +74,6 @@ struct clone_info { unsigned sector_count; }; -/* - * One of these is allocated per clone bio. - */ -#define DM_TIO_MAGIC 7282014 -struct dm_target_io { - unsigned magic; - struct dm_io *io; - struct dm_target *ti; - unsigned target_bio_nr; - unsigned *len_ptr; - bool inside_dm_io; - struct bio clone; -}; - -/* - * One of these is allocated per original bio. - * It contains the first clone used for that original. - */ -#define DM_IO_MAGIC 5191977 -struct dm_io { - unsigned magic; - struct mapped_device *md; - blk_status_t status; - atomic_t io_count; - struct bio *orig_bio; - unsigned long start_time; - spinlock_t endio_lock; - struct dm_stats_aux stats_aux; - /* last member of dm_target_io is 'struct bio' */ - struct dm_target_io tio; -}; - #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) #define DM_IO_BIO_OFFSET \ (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) @@ -137,19 +105,6 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define MINOR_ALLOCED ((void *)-1) -/* - * Bits for the md->flags field. - */ -#define DMF_BLOCK_IO_FOR_SUSPEND 0 -#define DMF_SUSPENDED 1 -#define DMF_FROZEN 2 -#define DMF_FREEING 3 -#define DMF_DELETING 4 -#define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_DEFERRED_REMOVE 6 -#define DMF_SUSPENDED_INTERNALLY 7 -#define DMF_POST_SUSPENDING 8 - #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; @@ -444,84 +399,6 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -#ifdef CONFIG_BLK_DEV_ZONED -int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data) -{ - struct dm_report_zones_args *args = data; - sector_t sector_diff = args->tgt->begin - args->start; - - /* - * Ignore zones beyond the target range. - */ - if (zone->start >= args->start + args->tgt->len) - return 0; - - /* - * Remap the start sector and write pointer position of the zone - * to match its position in the target range. - */ - zone->start += sector_diff; - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { - if (zone->cond == BLK_ZONE_COND_FULL) - zone->wp = zone->start + zone->len; - else if (zone->cond == BLK_ZONE_COND_EMPTY) - zone->wp = zone->start; - else - zone->wp += sector_diff; - } - - args->next_sector = zone->start + zone->len; - return args->orig_cb(zone, args->zone_idx++, args->orig_data); -} -EXPORT_SYMBOL_GPL(dm_report_zones_cb); - -static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) -{ - struct mapped_device *md = disk->private_data; - struct dm_table *map; - int srcu_idx, ret; - struct dm_report_zones_args args = { - .next_sector = sector, - .orig_data = data, - .orig_cb = cb, - }; - - if (dm_suspended_md(md)) - return -EAGAIN; - - map = dm_get_live_table(md, &srcu_idx); - if (!map) { - ret = -EIO; - goto out; - } - - do { - struct dm_target *tgt; - - tgt = dm_table_find_target(map, args.next_sector); - if (WARN_ON_ONCE(!tgt->type->report_zones)) { - ret = -EIO; - goto out; - } - - args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, - nr_zones - args.zone_idx); - if (ret < 0) - goto out; - } while (args.zone_idx < nr_zones && - args.next_sector < get_capacity(disk)); - - ret = args.zone_idx; -out: - dm_put_live_table(md, srcu_idx); - return ret; -} -#else -#define dm_blk_report_zones NULL -#endif /* CONFIG_BLK_DEV_ZONED */ - static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) { @@ -903,7 +780,7 @@ static int __noflush_suspending(struct mapped_device *md) * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -static void dec_pending(struct dm_io *io, blk_status_t error) +void dm_io_dec_pending(struct dm_io *io, blk_status_t error) { unsigned long flags; blk_status_t io_error; @@ -919,22 +796,27 @@ static void dec_pending(struct dm_io *io, blk_status_t error) } if (atomic_dec_and_test(&io->io_count)) { + bio = io->orig_bio; if (io->status == BLK_STS_DM_REQUEUE) { /* * Target requested pushing back the I/O. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) + if (__noflush_suspending(md) && + !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { /* NOTE early return due to BLK_STS_DM_REQUEUE below */ - bio_list_add_head(&md->deferred, io->orig_bio); - else - /* noflush suspend was interrupted. */ + bio_list_add_head(&md->deferred, bio); + } else { + /* + * noflush suspend was interrupted or this is + * a write to a zoned target. + */ io->status = BLK_STS_IOERR; + } spin_unlock_irqrestore(&md->deferred_lock, flags); } io_error = io->status; - bio = io->orig_bio; end_io_acct(io); free_io(md, io); @@ -994,7 +876,6 @@ static void clone_endio(struct bio *bio) struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - struct bio *orig_bio = io->orig_bio; struct request_queue *q = bio->bi_bdev->bd_disk->queue; if (unlikely(error == BLK_STS_TARGET)) { @@ -1009,23 +890,22 @@ static void clone_endio(struct bio *bio) disable_write_zeroes(md); } - /* - * For zone-append bios get offset in zone of the written - * sector and add that to the original bio sector pos. - */ - if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { - sector_t written_sector = bio->bi_iter.bi_sector; - struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue; - u64 mask = (u64)blk_queue_zone_sectors(q) - 1; - - orig_bio->bi_iter.bi_sector += written_sector & mask; - } + if (blk_queue_is_zoned(q)) + dm_zone_endio(io, bio); if (endio) { int r = endio(tio->ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: - error = BLK_STS_DM_REQUEUE; + /* + * Requeuing writes to a sequential zone of a zoned + * target will break the sequential write pattern: + * fail such IO. + */ + if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) + error = BLK_STS_IOERR; + else + error = BLK_STS_DM_REQUEUE; fallthrough; case DM_ENDIO_DONE: break; @@ -1044,7 +924,7 @@ static void clone_endio(struct bio *bio) } free_tio(tio); - dec_pending(io, error); + dm_io_dec_pending(io, error); } /* @@ -1237,8 +1117,8 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET, - * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH. + * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management + * operations and REQ_OP_ZONE_APPEND (zone append writes). * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1268,9 +1148,13 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + BUG_ON(bio->bi_opf & REQ_PREFLUSH); + BUG_ON(op_is_zone_mgmt(bio_op(bio))); + BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); BUG_ON(bi_size > *tio->len_ptr); BUG_ON(n_sectors > bi_size); + *tio->len_ptr -= bi_size - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; } @@ -1308,7 +1192,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) * anything, the target has assumed ownership of * this io. */ - atomic_inc(&io->io_count); + dm_io_inc_pending(io); sector = clone->bi_iter.bi_sector; if (unlikely(swap_bios_limit(ti, clone))) { @@ -1319,7 +1203,16 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) down(&md->swap_bios_semaphore); } - r = ti->type->map(ti, clone); + /* + * Check if the IO needs a special mapping due to zone append emulation + * on zoned target. In this case, dm_zone_map_bio() calls the target + * map operation. + */ + if (dm_emulate_zone_append(io->md)) + r = dm_zone_map_bio(tio); + else + r = ti->type->map(ti, clone); + switch (r) { case DM_MAPIO_SUBMITTED: break; @@ -1334,7 +1227,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) up(&md->swap_bios_semaphore); } free_tio(tio); - dec_pending(io, BLK_STS_IOERR); + dm_io_dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: if (unlikely(swap_bios_limit(ti, clone))) { @@ -1342,7 +1235,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) up(&md->swap_bios_semaphore); } free_tio(tio); - dec_pending(io, BLK_STS_DM_REQUEUE); + dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); break; default: DMWARN("unimplemented target map return value: %d", r); @@ -1631,7 +1524,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, if (bio->bi_opf & REQ_PREFLUSH) { error = __send_empty_flush(&ci); - /* dec_pending submits any data associated with flush */ + /* dm_io_dec_pending submits any data associated with flush */ } else if (op_is_zone_mgmt(bio_op(bio))) { ci.bio = bio; ci.sector_count = 0; @@ -1672,7 +1565,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, } /* drop the extra reference count */ - dec_pending(ci.io, errno_to_blk_status(error)); + dm_io_dec_pending(ci.io, errno_to_blk_status(error)); return ret; } @@ -1801,13 +1694,13 @@ static void cleanup_mapped_device(struct mapped_device *md) md->disk->private_data = NULL; spin_unlock(&_minor_lock); del_gendisk(md->disk); - put_disk(md->disk); } - if (md->queue) { + if (md->queue) dm_queue_destroy_keyslot_manager(md->queue); - blk_cleanup_queue(md->queue); - } + + if (md->disk) + blk_cleanup_disk(md->disk); cleanup_srcu_struct(&md->io_barrier); @@ -1817,6 +1710,7 @@ static void cleanup_mapped_device(struct mapped_device *md) mutex_destroy(&md->swap_bios_lock); dm_mq_cleanup_mapped_device(md); + dm_cleanup_zoned_dev(md); } /* @@ -1869,13 +1763,10 @@ static struct mapped_device *alloc_dev(int minor) * established. If request-based table is loaded: blk-mq will * override accordingly. */ - md->queue = blk_alloc_queue(numa_node_id); - if (!md->queue) - goto bad; - - md->disk = alloc_disk_node(1, md->numa_node_id); + md->disk = blk_alloc_disk(md->numa_node_id); if (!md->disk) goto bad; + md->queue = md->disk->queue; init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); @@ -1888,6 +1779,7 @@ static struct mapped_device *alloc_dev(int minor) md->disk->major = _major; md->disk->first_minor = minor; + md->disk->minors = 1; md->disk->fops = &dm_blk_dops; md->disk->queue = md->queue; md->disk->private_data = md; @@ -2062,11 +1954,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, goto out; } + ret = dm_table_set_restrictions(t, q, limits); + if (ret) { + old_map = ERR_PTR(ret); + goto out; + } + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, (void *)t); md->immutable_target_type = dm_table_get_immutable_target_type(t); - dm_table_set_restrictions(t, q, limits); if (old_map) dm_sync_table(md); @@ -2185,7 +2082,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) DMERR("Cannot calculate initial queue limits"); return r; } - dm_table_set_restrictions(t, md->queue, &limits); + r = dm_table_set_restrictions(t, md->queue, &limits); + if (r) + return r; + blk_register_queue(md->disk); return 0; @@ -2328,7 +2228,7 @@ static bool md_in_flight_bios(struct mapped_device *md) return sum != 0; } -static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state) +static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) { int r = 0; DEFINE_WAIT(wait); @@ -2351,7 +2251,7 @@ static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state return r; } -static int dm_wait_for_completion(struct mapped_device *md, long task_state) +static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) { int r = 0; @@ -2478,7 +2378,7 @@ static void unlock_fs(struct mapped_device *md) * are being added to md->deferred list. */ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, - unsigned suspend_flags, long task_state, + unsigned suspend_flags, unsigned int task_state, int dmf_suspended_flag) { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index b441ad772c18..742d9c80efe1 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -45,6 +45,8 @@ struct dm_dev_internal { struct dm_table; struct dm_md_mempools; +struct dm_target_io; +struct dm_io; /*----------------------------------------------------------------- * Internal table functions. @@ -56,8 +58,8 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, - struct queue_limits *limits); +int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_presuspend_undo_targets(struct dm_table *t); @@ -100,6 +102,30 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); */ #define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t)) +/* + * Zoned targets related functions. + */ +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); +void dm_zone_endio(struct dm_io *io, struct bio *clone); +#ifdef CONFIG_BLK_DEV_ZONED +void dm_cleanup_zoned_dev(struct mapped_device *md); +int dm_blk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); +int dm_zone_map_bio(struct dm_target_io *io); +#else +static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {} +#define dm_blk_report_zones NULL +static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) +{ + return false; +} +static inline int dm_zone_map_bio(struct dm_target_io *tio) +{ + return DM_MAPIO_KILL; +} +#endif + /*----------------------------------------------------------------- * A registry of target types. *---------------------------------------------------------------*/ diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index ea3130e11680..e29c6298ef5c 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2616,7 +2616,7 @@ static struct attribute *md_bitmap_attrs[] = { &max_backlog_used.attr, NULL }; -struct attribute_group md_bitmap_group = { +const struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index fda4cb3f936f..c0dc6f2ef4a3 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -357,7 +357,7 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Fault injection personality for MD"); +MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)"); MODULE_ALIAS("md-personality-10"); /* faulty */ MODULE_ALIAS("md-faulty"); MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 63ed8329a98d..1ff51647a682 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -312,7 +312,7 @@ static void linear_exit (void) module_init(linear_init); module_exit(linear_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Linear device concatenation personality for MD"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)"); MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ MODULE_ALIAS("md-linear"); MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 776bbe542db5..e7d6486f090f 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -471,7 +471,7 @@ static void __exit multipath_exit (void) module_init(multipath_init); module_exit(multipath_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("simple multi-path personality for MD"); +MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)"); MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ MODULE_ALIAS("md-multipath"); MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/md.c b/drivers/md/md.c index 49f897fbb89b..ae8fe54ea358 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -441,30 +441,6 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); -struct md_io { - struct mddev *mddev; - bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; - struct block_device *orig_bi_bdev; - unsigned long start_time; -}; - -static void md_end_io(struct bio *bio) -{ - struct md_io *md_io = bio->bi_private; - struct mddev *mddev = md_io->mddev; - - bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev); - - bio->bi_end_io = md_io->orig_bi_end_io; - bio->bi_private = md_io->orig_bi_private; - - mempool_free(md_io, &mddev->md_io_pool); - - if (bio->bi_end_io) - bio->bi_end_io(bio); -} - static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); @@ -489,21 +465,6 @@ static blk_qc_t md_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_end_io != md_end_io) { - struct md_io *md_io; - - md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO); - md_io->mddev = mddev; - md_io->orig_bi_end_io = bio->bi_end_io; - md_io->orig_bi_private = bio->bi_private; - md_io->orig_bi_bdev = bio->bi_bdev; - - bio->bi_end_io = md_end_io; - bio->bi_private = md_io; - - md_io->start_time = bio_start_io_acct(bio); - } - /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; @@ -824,7 +785,7 @@ out_free_new: return ERR_PTR(error); } -static struct attribute_group md_redundancy_group; +static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) { @@ -841,7 +802,7 @@ void mddev_unlock(struct mddev *mddev) * test it under the same mutex to ensure its correct value * is seen. */ - struct attribute_group *to_remove = mddev->to_remove; + const struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); @@ -2379,7 +2340,15 @@ int md_integrity_register(struct mddev *mddev) bdev_get_integrity(reference->bdev)); pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) { + if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || + (mddev->level != 1 && mddev->level != 10 && + bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { + /* + * No need to handle the failure of bioset_integrity_create, + * because the function is called by md_run() -> pers->run(), + * md_run calls bioset_exit -> bioset_integrity_free in case + * of failure case. + */ pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; @@ -5538,7 +5507,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_degraded.attr, NULL, }; -static struct attribute_group md_redundancy_group = { +static const struct attribute_group md_redundancy_group = { .name = NULL, .attrs = md_redundancy_attrs, }; @@ -5598,17 +5567,16 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) + if (mddev->gendisk) { del_gendisk(mddev->gendisk); - if (mddev->queue) - blk_cleanup_queue(mddev->queue); - if (mddev->gendisk) - put_disk(mddev->gendisk); + blk_cleanup_disk(mddev->gendisk); + } percpu_ref_exit(&mddev->writes_pending); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - mempool_exit(&mddev->md_io_pool); + if (mddev->level != 1 && mddev->level != 10) + bioset_exit(&mddev->io_acct_set); kfree(mddev); } @@ -5705,26 +5673,14 @@ static int md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; - error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE, - sizeof(struct md_io)); - if (error) - goto abort; - error = -ENOMEM; - mddev->queue = blk_alloc_queue(NUMA_NO_NODE); - if (!mddev->queue) + disk = blk_alloc_disk(NUMA_NO_NODE); + if (!disk) goto abort; - blk_set_stacking_limits(&mddev->queue->limits); - - disk = alloc_disk(1 << shift); - if (!disk) { - blk_cleanup_queue(mddev->queue); - mddev->queue = NULL; - goto abort; - } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; + disk->minors = 1 << shift; if (name) strcpy(disk->disk_name, name); else if (partitioned) @@ -5733,7 +5689,9 @@ static int md_alloc(dev_t dev, char *name) sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; - disk->queue = mddev->queue; + + mddev->queue = disk->queue; + blk_set_stacking_limits(&mddev->queue->limits); blk_queue_write_cache(mddev->queue, true, true); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really @@ -5907,7 +5865,14 @@ int md_run(struct mddev *mddev) if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - return err; + goto exit_bio_set; + } + if (mddev->level != 1 && mddev->level != 10 && + !bioset_initialized(&mddev->io_acct_set)) { + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, + offsetof(struct md_io_acct, bio_clone), 0); + if (err) + goto exit_sync_set; } spin_lock(&pers_lock); @@ -6035,6 +6000,7 @@ int md_run(struct mddev *mddev) blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); else blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); } if (pers->sync_request) { if (mddev->kobj.sd && @@ -6084,8 +6050,12 @@ bitmap_abort: module_put(pers->owner); md_bitmap_destroy(mddev); abort: - bioset_exit(&mddev->bio_set); + if (mddev->level != 1 && mddev->level != 10) + bioset_exit(&mddev->io_acct_set); +exit_sync_set: bioset_exit(&mddev->sync_set); +exit_bio_set: + bioset_exit(&mddev->bio_set); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6309,6 +6279,8 @@ void md_stop(struct mddev *mddev) __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + if (mddev->level != 1 && mddev->level != 10) + bioset_exit(&mddev->io_acct_set); } EXPORT_SYMBOL_GPL(md_stop); @@ -8613,6 +8585,41 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); +static void md_end_io_acct(struct bio *bio) +{ + struct md_io_acct *md_io_acct = bio->bi_private; + struct bio *orig_bio = md_io_acct->orig_bio; + + orig_bio->bi_status = bio->bi_status; + + bio_end_io_acct(orig_bio, md_io_acct->start_time); + bio_put(bio); + bio_endio(orig_bio); +} + +/* + * Used by personalities that don't already clone the bio and thus can't + * easily add the timestamp to their extended bio structure. + */ +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ + struct md_io_acct *md_io_acct; + struct bio *clone; + + if (!blk_queue_io_stat((*bio)->bi_bdev->bd_disk->queue)) + return; + + clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set); + md_io_acct = container_of(clone, struct md_io_acct, bio_clone); + md_io_acct->orig_bio = *bio; + md_io_acct->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_io_acct; + clone->bi_private = md_io_acct; + *bio = clone; +} +EXPORT_SYMBOL_GPL(md_account_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before diff --git a/drivers/md/md.h b/drivers/md/md.h index fb7eab58cfd5..832547cf038f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -395,10 +395,10 @@ struct mddev { * that we are never stopping an array while it is open. * 'reconfig_mutex' protects all other reconfiguration. * These locks are separate due to conflicting interactions - * with bdev->bd_mutex. + * with disk->open_mutex. * Lock ordering is: - * reconfig_mutex -> bd_mutex - * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open + * reconfig_mutex -> disk->open_mutex + * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open */ struct mutex open_mutex; struct mutex reconfig_mutex; @@ -481,13 +481,13 @@ struct mddev { atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; - struct attribute_group *to_remove; + const struct attribute_group *to_remove; struct bio_set bio_set; struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ - mempool_t md_io_pool; + struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ /* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -613,7 +613,7 @@ struct md_sysfs_entry { ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; -extern struct attribute_group md_bitmap_group; +extern const struct attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { @@ -684,6 +684,12 @@ struct md_thread { void *private; }; +struct md_io_acct { + struct bio *orig_bio; + unsigned long start_time; + struct bio bio_clone; +}; + #define THREAD_WAKEUP 0 static inline void safe_put_page(struct page *p) @@ -715,6 +721,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); +void md_account_bio(struct mddev *mddev, struct bio **bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 185dc60360b5..3a963d783a86 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -108,12 +108,10 @@ static void *element_at(struct dm_array_info *info, struct array_block *ab, * in an array block. */ static void on_entries(struct dm_array_info *info, struct array_block *ab, - void (*fn)(void *, const void *)) + void (*fn)(void *, const void *, unsigned)) { - unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); - - for (i = 0; i < nr_entries; i++) - fn(info->value_type.context, element_at(info, ab, i)); + unsigned nr_entries = le32_to_cpu(ab->nr_entries); + fn(info->value_type.context, element_at(info, ab, 0), nr_entries); } /* @@ -175,19 +173,18 @@ static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, static void fill_ablock(struct dm_array_info *info, struct array_block *ab, const void *value, unsigned new_nr) { - unsigned i; - uint32_t nr_entries; + uint32_t nr_entries, delta, i; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); nr_entries = le32_to_cpu(ab->nr_entries); - for (i = nr_entries; i < new_nr; i++) { - if (vt->inc) - vt->inc(vt->context, value); + delta = new_nr - nr_entries; + if (vt->inc) + vt->inc(vt->context, value, delta); + for (i = nr_entries; i < new_nr; i++) memcpy(element_at(info, ab, i), value, vt->size); - } ab->nr_entries = cpu_to_le32(new_nr); } @@ -199,17 +196,16 @@ static void fill_ablock(struct dm_array_info *info, struct array_block *ab, static void trim_ablock(struct dm_array_info *info, struct array_block *ab, unsigned new_nr) { - unsigned i; - uint32_t nr_entries; + uint32_t nr_entries, delta; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); nr_entries = le32_to_cpu(ab->nr_entries); - for (i = nr_entries; i > new_nr; i--) - if (vt->dec) - vt->dec(vt->context, element_at(info, ab, i - 1)); + delta = nr_entries - new_nr; + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, new_nr - 1), delta); ab->nr_entries = cpu_to_le32(new_nr); } @@ -573,16 +569,17 @@ static int grow(struct resize *resize) * These are the value_type functions for the btree elements, which point * to array blocks. */ -static void block_inc(void *context, const void *value) +static void block_inc(void *context, const void *value, unsigned count) { - __le64 block_le; + const __le64 *block_le = value; struct dm_array_info *info = context; + unsigned i; - memcpy(&block_le, value, sizeof(block_le)); - dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); + for (i = 0; i < count; i++, block_le++) + dm_tm_inc(info->btree_info.tm, le64_to_cpu(*block_le)); } -static void block_dec(void *context, const void *value) +static void __block_dec(void *context, const void *value) { int r; uint64_t b; @@ -621,6 +618,13 @@ static void block_dec(void *context, const void *value) dm_tm_dec(info->btree_info.tm, b); } +static void block_dec(void *context, const void *value, unsigned count) +{ + unsigned i; + for (i = 0; i < count; i++, value += sizeof(__le64)) + __block_dec(context, value); +} + static int block_equal(void *context, const void *value1, const void *value2) { return !memcmp(value1, value2, sizeof(__le64)); @@ -711,7 +715,7 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_ return r; if (vt->inc) - vt->inc(vt->context, element_at(info, ab, i)); + vt->inc(vt->context, element_at(info, ab, i), 1); } ab->nr_entries = cpu_to_le32(new_nr); @@ -822,9 +826,9 @@ static int array_set_value(struct dm_array_info *info, dm_block_t root, old_value = element_at(info, ab, entry); if (vt->dec && (!vt->equal || !vt->equal(vt->context, old_value, value))) { - vt->dec(vt->context, old_value); + vt->dec(vt->context, old_value, 1); if (vt->inc) - vt->inc(vt->context, value); + vt->inc(vt->context, value, 1); } memcpy(old_value, value, info->value_type.size); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index b1788853a355..893edb426dba 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -144,4 +144,17 @@ extern struct dm_block_validator btree_node_validator; extern void init_le64_type(struct dm_transaction_manager *tm, struct dm_btree_value_type *vt); +/* + * This returns a shadowed btree leaf that you may modify. In practise + * this means overwrites only, since an insert could cause a node to + * be split. Useful if you need access to the old value to calculate the + * new one. + * + * This only works with single level btrees. The given key must be present in + * the tree, otherwise -EINVAL will be returned. + */ +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf); + #endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index eff04fa23dfa..70532335c7c7 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -544,12 +544,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); delete_at(n, index); } - *new_root = shadow_root(&spine); + if (!r) + *new_root = shadow_root(&spine); exit_shadow_spine(&spine); return r; @@ -653,7 +654,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, if (k >= keys[last_level] && k < end_key) { if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); delete_at(n, index); keys[last_level] = k + 1ull; diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 2061ab865567..f5bd76ed8fe6 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -236,22 +236,14 @@ dm_block_t shadow_root(struct shadow_spine *s) return s->root; } -static void le64_inc(void *context, const void *value_le) +static void le64_inc(void *context, const void *value_le, unsigned count) { - struct dm_transaction_manager *tm = context; - __le64 v_le; - - memcpy(&v_le, value_le, sizeof(v_le)); - dm_tm_inc(tm, le64_to_cpu(v_le)); + dm_tm_with_runs(context, value_le, count, dm_tm_inc_range); } -static void le64_dec(void *context, const void *value_le) +static void le64_dec(void *context, const void *value_le, unsigned count) { - struct dm_transaction_manager *tm = context; - __le64 v_le; - - memcpy(&v_le, value_le, sizeof(v_le)); - dm_tm_dec(tm, le64_to_cpu(v_le)); + dm_tm_with_runs(context, value_le, count, dm_tm_dec_range); } static int le64_equal(void *context, const void *value1_le, const void *value2_le) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index ef6e78d45d5b..0703ca7a7d9a 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -71,15 +71,13 @@ static int upper_bound(struct btree_node *n, uint64_t key) void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt) { - unsigned i; uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) - for (i = 0; i < nr_entries; i++) - dm_tm_inc(tm, value64(n, i)); + dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range); + else if (vt->inc) - for (i = 0; i < nr_entries; i++) - vt->inc(vt->context, value_ptr(n, i)); + vt->inc(vt->context, value_ptr(n, 0), nr_entries); } static int insert_at(size_t value_size, struct btree_node *node, unsigned index, @@ -318,13 +316,9 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) goto out; } else { - if (info->value_type.dec) { - unsigned i; - - for (i = 0; i < f->nr_children; i++) - info->value_type.dec(info->value_type.context, - value_ptr(f->n, i)); - } + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(f->n, 0), f->nr_children); pop_frame(s); } } @@ -500,6 +494,122 @@ out: EXPORT_SYMBOL_GPL(dm_btree_lookup_next); +/*----------------------------------------------------------------*/ + +/* + * Copies entries from one region of a btree node to another. The regions + * must not overlap. + */ +static void copy_entries(struct btree_node *dest, unsigned dest_offset, + struct btree_node *src, unsigned src_offset, + unsigned count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Moves entries from one region fo a btree node to another. The regions + * may overlap. + */ +static void move_entries(struct btree_node *dest, unsigned dest_offset, + struct btree_node *src, unsigned src_offset, + unsigned count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Erases the first 'count' entries of a btree node, shifting following + * entries down into their place. + */ +static void shift_down(struct btree_node *n, unsigned count) +{ + move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count); +} + +/* + * Moves entries in a btree node up 'count' places, making space for + * new entries at the start of the node. + */ +static void shift_up(struct btree_node *n, unsigned count) +{ + move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries)); +} + +/* + * Redistributes entries between two btree nodes to make them + * have similar numbers of entries. + */ +static void redistribute2(struct btree_node *left, struct btree_node *right) +{ + unsigned nr_left = le32_to_cpu(left->header.nr_entries); + unsigned nr_right = le32_to_cpu(right->header.nr_entries); + unsigned total = nr_left + nr_right; + unsigned target_left = total / 2; + unsigned target_right = total - target_left; + + if (nr_left < target_left) { + unsigned delta = target_left - nr_left; + copy_entries(left, nr_left, right, 0, delta); + shift_down(right, delta); + } else if (nr_left > target_left) { + unsigned delta = nr_left - target_left; + if (nr_right) + shift_up(right, delta); + copy_entries(right, 0, left, target_left, delta); + } + + left->header.nr_entries = cpu_to_le32(target_left); + right->header.nr_entries = cpu_to_le32(target_right); +} + +/* + * Redistribute entries between three nodes. Assumes the central + * node is empty. + */ +static void redistribute3(struct btree_node *left, struct btree_node *center, + struct btree_node *right) +{ + unsigned nr_left = le32_to_cpu(left->header.nr_entries); + unsigned nr_center = le32_to_cpu(center->header.nr_entries); + unsigned nr_right = le32_to_cpu(right->header.nr_entries); + unsigned total, target_left, target_center, target_right; + + BUG_ON(nr_center); + + total = nr_left + nr_right; + target_left = total / 3; + target_center = (total - target_left) / 2; + target_right = (total - target_left - target_center); + + if (nr_left < target_left) { + unsigned left_short = target_left - nr_left; + copy_entries(left, nr_left, right, 0, left_short); + copy_entries(center, 0, right, left_short, target_center); + shift_down(right, nr_right - target_right); + + } else if (nr_left < (target_left + target_center)) { + unsigned left_to_center = nr_left - target_left; + copy_entries(center, 0, left, target_left, left_to_center); + copy_entries(center, left_to_center, right, 0, target_center - left_to_center); + shift_down(right, nr_right - target_right); + + } else { + unsigned right_short = target_right - nr_right; + shift_up(right, right_short); + copy_entries(right, 0, left, nr_left - right_short, right_short); + copy_entries(center, 0, left, target_left, nr_left - target_left); + } + + left->header.nr_entries = cpu_to_le32(target_left); + center->header.nr_entries = cpu_to_le32(target_center); + right->header.nr_entries = cpu_to_le32(target_right); +} + /* * Splits a node by creating a sibling node and shifting half the nodes * contents across. Assumes there is a parent node, and it has room for @@ -530,12 +640,10 @@ EXPORT_SYMBOL_GPL(dm_btree_lookup_next); * * Where A* is a shadow of A. */ -static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, - uint64_t key) +static int split_one_into_two(struct shadow_spine *s, unsigned parent_index, + struct dm_btree_value_type *vt, uint64_t key) { int r; - size_t size; - unsigned nr_left, nr_right; struct dm_block *left, *right, *parent; struct btree_node *ln, *rn, *pn; __le64 location; @@ -549,36 +657,18 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, ln = dm_block_data(left); rn = dm_block_data(right); - nr_left = le32_to_cpu(ln->header.nr_entries) / 2; - nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left; - - ln->header.nr_entries = cpu_to_le32(nr_left); - rn->header.flags = ln->header.flags; - rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.nr_entries = cpu_to_le32(0); rn->header.max_entries = ln->header.max_entries; rn->header.value_size = ln->header.value_size; - memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0])); - - size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? - sizeof(uint64_t) : s->info->value_type.size; - memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), - size * nr_right); + redistribute2(ln, rn); - /* - * Patch up the parent - */ + /* patch up the parent */ parent = shadow_parent(s); - pn = dm_block_data(parent); - location = cpu_to_le64(dm_block_location(left)); - __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(pn, parent_index), - &location, sizeof(__le64)); location = cpu_to_le64(dm_block_location(right)); __dm_bless_for_disk(&location); - r = insert_at(sizeof(__le64), pn, parent_index + 1, le64_to_cpu(rn->keys[0]), &location); if (r) { @@ -586,6 +676,7 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, return r; } + /* patch up the spine */ if (key < le64_to_cpu(rn->keys[0])) { unlock_block(s->info, right); s->nodes[1] = left; @@ -598,6 +689,121 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, } /* + * We often need to modify a sibling node. This function shadows a particular + * child of the given parent node. Making sure to update the parent to point + * to the new shadow. + */ +static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, + struct dm_block **result) +{ + int r, inc; + dm_block_t root; + struct btree_node *node; + + root = value64(parent, index); + + r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, + result, &inc); + if (r) + return r; + + node = dm_block_data(*result); + + if (inc) + inc_children(info->tm, node, vt); + + *((__le64 *) value_ptr(parent, index)) = + cpu_to_le64(dm_block_location(*result)); + + return 0; +} + +/* + * Splits two nodes into three. This is more work, but results in fuller + * nodes, so saves metadata space. + */ +static int split_two_into_three(struct shadow_spine *s, unsigned parent_index, + struct dm_btree_value_type *vt, uint64_t key) +{ + int r; + unsigned middle_index; + struct dm_block *left, *middle, *right, *parent; + struct btree_node *ln, *rn, *mn, *pn; + __le64 location; + + parent = shadow_parent(s); + pn = dm_block_data(parent); + + if (parent_index == 0) { + middle_index = 1; + left = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index + 1, &right); + if (r) + return r; + } else { + middle_index = parent_index; + right = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index - 1, &left); + if (r) + return r; + } + + r = new_block(s->info, &middle); + if (r < 0) + return r; + + ln = dm_block_data(left); + mn = dm_block_data(middle); + rn = dm_block_data(right); + + mn->header.nr_entries = cpu_to_le32(0); + mn->header.flags = ln->header.flags; + mn->header.max_entries = ln->header.max_entries; + mn->header.value_size = ln->header.value_size; + + redistribute3(ln, mn, rn); + + /* patch up the parent */ + pn->keys[middle_index] = rn->keys[0]; + location = cpu_to_le64(dm_block_location(middle)); + __dm_bless_for_disk(&location); + r = insert_at(sizeof(__le64), pn, middle_index, + le64_to_cpu(mn->keys[0]), &location); + if (r) { + if (shadow_current(s) != left) + unlock_block(s->info, left); + + unlock_block(s->info, middle); + + if (shadow_current(s) != right) + unlock_block(s->info, right); + + return r; + } + + + /* patch up the spine */ + if (key < le64_to_cpu(mn->keys[0])) { + unlock_block(s->info, middle); + unlock_block(s->info, right); + s->nodes[1] = left; + } else if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, left); + unlock_block(s->info, right); + s->nodes[1] = middle; + } else { + unlock_block(s->info, left); + unlock_block(s->info, middle); + s->nodes[1] = right; + } + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* * Splits a node by creating two new children beneath the given node. * * Before: @@ -690,6 +896,186 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) return 0; } +/*----------------------------------------------------------------*/ + +/* + * Redistributes a node's entries with its left sibling. + */ +static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index - 1, &sib); + if (r) + return r; + + left = dm_block_data(sib); + right = dm_block_data(shadow_current(s)); + redistribute2(left, right); + *key_ptr(parent, parent_index) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } else { + unlock_block(s->info, sib); + } + + return 0; +} + +/* + * Redistributes a nodes entries with its right sibling. + */ +static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index + 1, &sib); + if (r) + return r; + + left = dm_block_data(shadow_current(s)); + right = dm_block_data(sib); + redistribute2(left, right); + *key_ptr(parent, parent_index + 1) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, sib); + } else { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } + + return 0; +} + +/* + * Returns the number of spare entries in a node. + */ +static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space) +{ + int r; + unsigned nr_entries; + struct dm_block *block; + struct btree_node *node; + + r = bn_read_lock(info, b, &block); + if (r) + return r; + + node = dm_block_data(block); + nr_entries = le32_to_cpu(node->header.nr_entries); + *space = le32_to_cpu(node->header.max_entries) - nr_entries; + + unlock_block(info, block); + return 0; +} + +/* + * Make space in a node, either by moving some entries to a sibling, + * or creating a new sibling node. SPACE_THRESHOLD defines the minimum + * number of free entries that must be in the sibling to make the move + * worth while. If the siblings are shared (eg, part of a snapshot), + * then they are not touched, since this break sharing and so consume + * more space than we save. + */ +#define SPACE_THRESHOLD 8 +static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct btree_node *parent = dm_block_data(shadow_parent(s)); + unsigned nr_parent = le32_to_cpu(parent->header.nr_entries); + unsigned free_space; + int left_shared = 0, right_shared = 0; + + /* Should we move entries to the left sibling? */ + if (parent_index > 0) { + dm_block_t left_b = value64(parent, parent_index - 1); + r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared); + if (r) + return r; + + if (!left_shared) { + r = get_node_free_space(s->info, left_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_left(s, vt, parent_index, key); + } + } + + /* Should we move entries to the right sibling? */ + if (parent_index < (nr_parent - 1)) { + dm_block_t right_b = value64(parent, parent_index + 1); + r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared); + if (r) + return r; + + if (!right_shared) { + r = get_node_free_space(s->info, right_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_right(s, vt, parent_index, key); + } + } + + /* + * We need to split the node, normally we split two nodes + * into three. But when inserting a sequence that is either + * monotonically increasing or decreasing it's better to split + * a single node into two. + */ + if (left_shared || right_shared || (nr_parent <= 2) || + (parent_index == 0) || (parent_index + 1 == nr_parent)) { + return split_one_into_two(s, parent_index, vt, key); + } else { + return split_two_into_three(s, parent_index, vt, key); + } +} + +/* + * Does the node contain a particular key? + */ +static bool contains_key(struct btree_node *node, uint64_t key) +{ + int i = lower_bound(node, key); + + if (i >= 0 && le64_to_cpu(node->keys[i]) == key) + return true; + + return false; +} + +/* + * In general we preemptively make sure there's a free entry in every + * node on the spine when doing an insert. But we can avoid that with + * leaf nodes if we know it's an overwrite. + */ +static bool has_space_for_insert(struct btree_node *node, uint64_t key) +{ + if (node->header.nr_entries == node->header.max_entries) { + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + /* we don't need space if it's an overwrite */ + return contains_key(node, key); + } + + return false; + } + + return true; +} + static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, struct dm_btree_value_type *vt, uint64_t key, unsigned *index) @@ -719,17 +1105,18 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, node = dm_block_data(shadow_current(s)); - if (node->header.nr_entries == node->header.max_entries) { + if (!has_space_for_insert(node, key)) { if (top) r = btree_split_beneath(s, key); else - r = btree_split_sibling(s, i, key); + r = rebalance_or_split(s, vt, i, key); if (r < 0) return r; - } - node = dm_block_data(shadow_current(s)); + /* making space can cause the current node to change */ + node = dm_block_data(shadow_current(s)); + } i = lower_bound(node, key); @@ -753,6 +1140,77 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, return 0; } +static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root, + uint64_t key, int *index) +{ + int r, i = -1; + struct btree_node *node; + + *index = 0; + for (;;) { + r = shadow_step(s, root, &s->info->value_type); + if (r < 0) + return r; + + node = dm_block_data(shadow_current(s)); + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s) && i >= 0) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + node = dm_block_data(shadow_current(s)); + i = lower_bound(node, key); + + BUG_ON(i < 0); + BUG_ON(i >= le32_to_cpu(node->header.nr_entries)); + + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + if (key != le64_to_cpu(node->keys[i])) + return -EINVAL; + break; + } + + root = value64(node, i); + } + + *index = i; + return 0; +} + +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf) +{ + int r; + struct shadow_spine spine; + + BUG_ON(info->levels > 1); + init_shadow_spine(&spine, info); + r = __btree_get_overwrite_leaf(&spine, root, key, index); + if (!r) { + *new_root = shadow_root(&spine); + *leaf = shadow_current(&spine); + + /* + * Decrement the count so exit_shadow_spine() doesn't + * unlock the leaf. + */ + spine.count--; + } + exit_shadow_spine(&spine); + + return r; +} + static bool need_insert(struct btree_node *node, uint64_t *keys, unsigned level, unsigned index) { @@ -829,7 +1287,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, value_ptr(n, index), value))) { info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); } memcpy_disk(value_ptr(n, index), value, info->value_type.size); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index 3dc5bb1a4748..d2ae5aa4d00b 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -51,21 +51,21 @@ struct dm_btree_value_type { */ /* - * The btree is making a duplicate of the value, for instance + * The btree is making a duplicate of a run of values, for instance * because previously-shared btree nodes have now diverged. * @value argument is the new copy that the copy function may modify. * (Probably it just wants to increment a reference count * somewhere.) This method is _not_ called for insertion of a new * value: It is assumed the ref count is already 1. */ - void (*inc)(void *context, const void *value); + void (*inc)(void *context, const void *value, unsigned count); /* - * This value is being deleted. The btree takes care of freeing + * These values are being deleted. The btree takes care of freeing * the memory pointed to by @value. Often the del function just - * needs to decrement a reference count somewhere. + * needs to decrement a reference counts somewhere. */ - void (*dec)(void *context, const void *value); + void (*dec)(void *context, const void *value, unsigned count); /* * A test for equality between two values. When a value is diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index a213bf11738f..4a6a2a9b4eb4 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -6,6 +6,8 @@ #include "dm-space-map-common.h" #include "dm-transaction-manager.h" +#include "dm-btree-internal.h" +#include "dm-persistent-data-internal.h" #include <linux/bitops.h> #include <linux/device-mapper.h> @@ -409,12 +411,13 @@ int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, return r; } -static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, - int (*mutator)(void *context, uint32_t old, uint32_t *new), - void *context, enum allocation_event *ev) +/*----------------------------------------------------------------*/ + +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, int32_t *nr_allocations) { int r; - uint32_t bit, old, ref_count; + uint32_t bit, old; struct dm_block *nb; dm_block_t index = b; struct disk_index_entry ie_disk; @@ -433,10 +436,9 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, return r; } ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); - bm_le = dm_bitmap_data(nb); - old = sm_lookup_bitmap(bm_le, bit); + old = sm_lookup_bitmap(bm_le, bit); if (old > 2) { r = sm_ll_lookup_big_ref_count(ll, b, &old); if (r < 0) { @@ -445,7 +447,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, } } - r = mutator(context, old, &ref_count); if (r) { dm_tm_unlock(ll->tm, nb); return r; @@ -453,7 +454,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); - dm_tm_unlock(ll->tm, nb); if (old > 2) { @@ -480,62 +480,459 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, } if (ref_count && !old) { - *ev = SM_ALLOC; + *nr_allocations = 1; ll->nr_allocated++; le32_add_cpu(&ie_disk.nr_free, -1); if (le32_to_cpu(ie_disk.none_free_before) == bit) ie_disk.none_free_before = cpu_to_le32(bit + 1); } else if (old && !ref_count) { - *ev = SM_FREE; + *nr_allocations = -1; ll->nr_allocated--; le32_add_cpu(&ie_disk.nr_free, 1); ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); } else - *ev = SM_NONE; + *nr_allocations = 0; return ll->save_ie(ll, index, &ie_disk); } -static int set_ref_count(void *context, uint32_t old, uint32_t *new) +/*----------------------------------------------------------------*/ + +/* + * Holds useful intermediate results for the range based inc and dec + * operations. + */ +struct inc_context { + struct disk_index_entry ie_disk; + struct dm_block *bitmap_block; + void *bitmap; + + struct dm_block *overflow_leaf; +}; + +static inline void init_inc_context(struct inc_context *ic) +{ + ic->bitmap_block = NULL; + ic->bitmap = NULL; + ic->overflow_leaf = NULL; +} + +static inline void exit_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + if (ic->bitmap_block) + dm_tm_unlock(ll->tm, ic->bitmap_block); + if (ic->overflow_leaf) + dm_tm_unlock(ll->tm, ic->overflow_leaf); +} + +static inline void reset_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + exit_inc_context(ll, ic); + init_inc_context(ic); +} + +/* + * Confirms a btree node contains a particular key at an index. + */ +static bool contains_key(struct btree_node *n, uint64_t key, int index) +{ + return index >= 0 && + index < le32_to_cpu(n->header.nr_entries) && + le64_to_cpu(n->keys[index]) == key; +} + +static int __sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) { - *new = *((uint32_t *) context); + int r; + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * bitmap_block needs to be unlocked because getting the + * overflow_leaf may need to allocate, and thus use the space map. + */ + reset_inc_context(ll, ic); + + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); + return -EINVAL; + } + + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + return 0; } -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, - uint32_t ref_count, enum allocation_event *ev) +static int sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) +{ + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + + return 0; + } + } + + return __sm_ll_inc_overflow(ll, b, ic); +} + +static inline int shadow_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + int r, inc; + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ic->ie_disk.blocknr = cpu_to_le64(dm_block_location(ic->bitmap_block)); + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + return 0; +} + +/* + * Once shadow_bitmap has been called, which always happens at the start of inc/dec, + * we can reopen the bitmap with a simple write lock, rather than re calling + * dm_tm_shadow_block(). + */ +static inline int ensure_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + if (!ic->bitmap_block) { + int r = dm_bm_write_lock(dm_tm_get_bm(ll->tm), le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block); + if (r) { + DMERR("unable to re-get write lock for bitmap"); + return r; + } + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + } + + return 0; +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_inc_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + int32_t *nr_allocations, dm_block_t *new_b, + struct inc_context *ic) +{ + int r; + __le32 le_rc; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + /* inc bitmap, adjust nr_allocated */ + sm_set_bitmap(ic->bitmap, bit, 1); + (*nr_allocations)++; + ll->nr_allocated++; + le32_add_cpu(&ic->ie_disk.nr_free, -1); + if (le32_to_cpu(ic->ie_disk.none_free_before) == bit) + ic->ie_disk.none_free_before = cpu_to_le32(bit + 1); + break; + + case 1: + /* inc bitmap */ + sm_set_bitmap(ic->bitmap, bit, 2); + break; + + case 2: + /* inc bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 3); + reset_inc_context(ll, ic); + + le_rc = cpu_to_le32(3); + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + break; + + default: + /* + * inc within the overflow tree only. + */ + r = sm_ll_inc_overflow(ll, b, ic); + if (r < 0) + return r; + } + } + + *new_b = b; + return 0; +} + +/* + * Finds a bitmap that contains entries in the block range, and increments + * them. + */ +static int __sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) { - return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); + int r; + struct inc_context ic; + uint32_t bit, bit_end; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_inc_bitmap(ll, b, bit, bit_end, nr_allocations, new_b, &ic); + + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); } -static int inc_ref_count(void *context, uint32_t old, uint32_t *new) +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) { - *new = old + 1; + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_inc(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + return 0; } -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +/*----------------------------------------------------------------*/ + +static int __sm_ll_del_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic) { - return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); + reset_inc_context(ll, ic); + return dm_btree_remove(&ll->ref_count_info, ll->ref_count_root, + &b, &ll->ref_count_root); } -static int dec_ref_count(void *context, uint32_t old, uint32_t *new) +static int __sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) { - if (!old) { - DMERR_LIMIT("unable to decrement a reference count below 0"); + int r; + int index = -1; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + reset_inc_context(ll, ic); + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); return -EINVAL; } - *new = old - 1; + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc == 3) { + return __sm_ll_del_overflow(ll, b, ic); + } else { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } +} + +static int sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) +{ + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc > 3) { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } else { + return __sm_ll_del_overflow(ll, b, ic); + } + + } + } + + return __sm_ll_dec_overflow(ll, b, ic, old_rc); +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_dec_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + struct inc_context *ic, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + DMERR("unable to decrement block"); + return -EINVAL; + + case 1: + /* dec bitmap */ + sm_set_bitmap(ic->bitmap, bit, 0); + (*nr_allocations)--; + ll->nr_allocated--; + le32_add_cpu(&ic->ie_disk.nr_free, 1); + ic->ie_disk.none_free_before = + cpu_to_le32(min(le32_to_cpu(ic->ie_disk.none_free_before), bit)); + break; + + case 2: + /* dec bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 1); + break; + + case 3: + r = sm_ll_dec_overflow(ll, b, ic, &old); + if (r < 0) + return r; + + if (old == 3) { + r = ensure_bitmap(ll, ic); + if (r) + return r; + + sm_set_bitmap(ic->bitmap, bit, 2); + } + break; + } + } + + *new_b = b; return 0; } -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +static int __sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t bit, bit_end; + struct inc_context ic; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_dec_bitmap(ll, b, bit, bit_end, &ic, nr_allocations, new_b); + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) { - return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev); + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_dec(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + + return 0; } +/*----------------------------------------------------------------*/ + int sm_ll_commit(struct ll_disk *ll) { int r = 0; @@ -687,28 +1084,92 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, /*----------------------------------------------------------------*/ +static inline int ie_cache_writeback(struct ll_disk *ll, struct ie_cache *iec) +{ + iec->dirty = false; + __dm_bless_for_disk(iec->ie); + return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, + &iec->index, &iec->ie, &ll->bitmap_root); +} + +static inline unsigned hash_index(dm_block_t index) +{ + return dm_hash_block(index, IE_CACHE_MASK); +} + static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie) { - return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); + int r; + unsigned h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + if (iec->valid) { + if (iec->index == index) { + memcpy(ie, &iec->ie, sizeof(*ie)); + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + r = dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); + if (!r) { + iec->valid = true; + iec->dirty = false; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + } + + return r; } static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie) { - __dm_bless_for_disk(ie); - return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, - &index, ie, &ll->bitmap_root); + int r; + unsigned h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + ll->bitmap_index_changed = true; + if (iec->valid) { + if (iec->index == index) { + memcpy(&iec->ie, ie, sizeof(*ie)); + iec->dirty = true; + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + iec->valid = true; + iec->dirty = true; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + return 0; } static int disk_ll_init_index(struct ll_disk *ll) { + unsigned i; + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + iec->valid = false; + iec->dirty = false; + } return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); } static int disk_ll_open(struct ll_disk *ll) { - /* nothing to do */ return 0; } @@ -719,7 +1180,16 @@ static dm_block_t disk_ll_max_entries(struct ll_disk *ll) static int disk_ll_commit(struct ll_disk *ll) { - return 0; + int r = 0; + unsigned i; + + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + if (iec->valid && iec->dirty) + r = ie_cache_writeback(ll, iec); + } + + return r; } int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 87e17909ef52..706ceb85d680 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -54,6 +54,20 @@ typedef int (*open_index_fn)(struct ll_disk *ll); typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); typedef int (*commit_fn)(struct ll_disk *ll); +/* + * A lot of time can be wasted reading and writing the same + * index entry. So we cache a few entries. + */ +#define IE_CACHE_SIZE 64 +#define IE_CACHE_MASK (IE_CACHE_SIZE - 1) + +struct ie_cache { + bool valid; + bool dirty; + dm_block_t index; + struct disk_index_entry ie; +}; + struct ll_disk { struct dm_transaction_manager *tm; struct dm_btree_info bitmap_info; @@ -79,6 +93,8 @@ struct ll_disk { max_index_entries_fn max_entries; commit_fn commit; bool bitmap_index_changed:1; + + struct ie_cache ie_cache[IE_CACHE_SIZE]; }; struct disk_sm_root { @@ -96,12 +112,6 @@ struct disk_bitmap_header { __le64 blocknr; } __attribute__ ((packed, aligned(8))); -enum allocation_event { - SM_NONE, - SM_ALLOC, - SM_FREE, -}; - /*----------------------------------------------------------------*/ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); @@ -111,9 +121,15 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result); int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, dm_block_t begin, dm_block_t end, dm_block_t *result); -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); + +/* + * The next three functions return (via nr_allocations) the net number of + * allocations that were made. This number may be negative if there were + * more frees than allocs. + */ +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, int32_t *nr_allocations); +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); int sm_ll_commit(struct ll_disk *ll); int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index 61f56909e00b..d0a8d5e73c28 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -87,76 +87,39 @@ static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { int r; - uint32_t old_count; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_insert(&smd->ll, b, count, &ev); + r = sm_ll_insert(&smd->ll, b, count, &nr_allocations); if (!r) { - switch (ev) { - case SM_NONE: - break; - - case SM_ALLOC: - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; - break; - - case SM_FREE: - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - break; - } + smd->nr_allocated_this_transaction += nr_allocations; } return r; } -static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_disk_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_inc(&smd->ll, b, &ev); - if (!r && (ev == SM_ALLOC)) - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; + r = sm_ll_inc(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; return r; } -static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_disk_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r; - uint32_t old_count; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (!r && !old_count) - smd->nr_allocated_this_transaction--; - } + r = sm_ll_dec(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; return r; } @@ -164,21 +127,28 @@ static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) { int r; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); /* * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smd->begin and the end of the metadata device. + * We search before smd->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b); + } + if (r) return r; smd->begin = *b + 1; - r = sm_ll_inc(&smd->ll, *b, &ev); + r = sm_ll_inc(&smd->ll, *b, *b + 1, &nr_allocations); if (!r) { - BUG_ON(ev != SM_ALLOC); - smd->nr_allocated_this_transaction++; + smd->nr_allocated_this_transaction += nr_allocations; } return r; @@ -194,7 +164,6 @@ static int sm_disk_commit(struct dm_space_map *sm) return r; memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); - smd->begin = 0; smd->nr_allocated_this_transaction = 0; return 0; @@ -235,8 +204,8 @@ static struct dm_space_map ops = { .get_count = sm_disk_get_count, .count_is_more_than_one = sm_disk_count_is_more_than_one, .set_count = sm_disk_set_count, - .inc_block = sm_disk_inc_block, - .dec_block = sm_disk_dec_block, + .inc_blocks = sm_disk_inc_blocks, + .dec_blocks = sm_disk_dec_blocks, .new_block = sm_disk_new_block, .commit = sm_disk_commit, .root_size = sm_disk_root_size, diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 9e3c64ec2026..392ae26134a4 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -89,7 +89,8 @@ enum block_op_type { struct block_op { enum block_op_type type; - dm_block_t block; + dm_block_t b; + dm_block_t e; }; struct bop_ring_buffer { @@ -116,7 +117,7 @@ static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) } static int brb_push(struct bop_ring_buffer *brb, - enum block_op_type type, dm_block_t b) + enum block_op_type type, dm_block_t b, dm_block_t e) { struct block_op *bop; unsigned next = brb_next(brb, brb->end); @@ -130,7 +131,8 @@ static int brb_push(struct bop_ring_buffer *brb, bop = brb->bops + brb->end; bop->type = type; - bop->block = b; + bop->b = b; + bop->e = e; brb->end = next; @@ -145,9 +147,7 @@ static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result) return -ENODATA; bop = brb->bops + brb->begin; - result->type = bop->type; - result->block = bop->block; - + memcpy(result, bop, sizeof(*result)); return 0; } @@ -178,10 +178,9 @@ struct sm_metadata { struct threshold threshold; }; -static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) +static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b, dm_block_t e) { - int r = brb_push(&smm->uncommitted, type, b); - + int r = brb_push(&smm->uncommitted, type, b, e); if (r) { DMERR("too many recursive allocations"); return -ENOMEM; @@ -193,15 +192,15 @@ static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t static int commit_bop(struct sm_metadata *smm, struct block_op *op) { int r = 0; - enum allocation_event ev; + int32_t nr_allocations; switch (op->type) { case BOP_INC: - r = sm_ll_inc(&smm->ll, op->block, &ev); + r = sm_ll_inc(&smm->ll, op->b, op->e, &nr_allocations); break; case BOP_DEC: - r = sm_ll_dec(&smm->ll, op->block, &ev); + r = sm_ll_dec(&smm->ll, op->b, op->e, &nr_allocations); break; } @@ -314,7 +313,7 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, i = brb_next(&smm->uncommitted, i)) { struct block_op *op = smm->uncommitted.bops + i; - if (op->block != b) + if (b < op->b || b >= op->e) continue; switch (op->type) { @@ -355,7 +354,7 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, struct block_op *op = smm->uncommitted.bops + i; - if (op->block != b) + if (b < op->b || b >= op->e) continue; switch (op->type) { @@ -393,7 +392,7 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { int r, r2; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); if (smm->recursion_count) { @@ -402,40 +401,42 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, } in(smm); - r = sm_ll_insert(&smm->ll, b, count, &ev); + r = sm_ll_insert(&smm->ll, b, count, &nr_allocations); r2 = out(smm); return combine_errors(r, r2); } -static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_metadata_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - if (recursing(smm)) - r = add_bop(smm, BOP_INC, b); - else { + if (recursing(smm)) { + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + } else { in(smm); - r = sm_ll_inc(&smm->ll, b, &ev); + r = sm_ll_inc(&smm->ll, b, e, &nr_allocations); r2 = out(smm); } return combine_errors(r, r2); } -static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_metadata_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); if (recursing(smm)) - r = add_bop(smm, BOP_DEC, b); + r = add_bop(smm, BOP_DEC, b, e); else { in(smm); - r = sm_ll_dec(&smm->ll, b, &ev); + r = sm_ll_dec(&smm->ll, b, e, &nr_allocations); r2 = out(smm); } @@ -445,23 +446,31 @@ static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); /* * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smm->begin and the end of the metadata device. + * We search before smm->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b); + } + if (r) return r; smm->begin = *b + 1; if (recursing(smm)) - r = add_bop(smm, BOP_INC, *b); + r = add_bop(smm, BOP_INC, *b, *b + 1); else { in(smm); - r = sm_ll_inc(&smm->ll, *b, &ev); + r = sm_ll_inc(&smm->ll, *b, *b + 1, &nr_allocations); r2 = out(smm); } @@ -503,7 +512,6 @@ static int sm_metadata_commit(struct dm_space_map *sm) return r; memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); - smm->begin = 0; smm->allocated_this_transaction = 0; return 0; @@ -556,8 +564,8 @@ static const struct dm_space_map ops = { .get_count = sm_metadata_get_count, .count_is_more_than_one = sm_metadata_count_is_more_than_one, .set_count = sm_metadata_set_count, - .inc_block = sm_metadata_inc_block, - .dec_block = sm_metadata_dec_block, + .inc_blocks = sm_metadata_inc_blocks, + .dec_blocks = sm_metadata_dec_blocks, .new_block = sm_metadata_new_block, .commit = sm_metadata_commit, .root_size = sm_metadata_root_size, @@ -641,18 +649,28 @@ static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b) return 0; } -static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_bootstrap_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return add_bop(smm, BOP_INC, b); + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + + return 0; } -static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_bootstrap_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return add_bop(smm, BOP_DEC, b); + r = add_bop(smm, BOP_DEC, b, e); + if (r) + return r; + + return 0; } static int sm_bootstrap_commit(struct dm_space_map *sm) @@ -683,8 +701,8 @@ static const struct dm_space_map bootstrap_ops = { .get_count = sm_bootstrap_get_count, .count_is_more_than_one = sm_bootstrap_count_is_more_than_one, .set_count = sm_bootstrap_set_count, - .inc_block = sm_bootstrap_inc_block, - .dec_block = sm_bootstrap_dec_block, + .inc_blocks = sm_bootstrap_inc_blocks, + .dec_blocks = sm_bootstrap_dec_blocks, .new_block = sm_bootstrap_new_block, .commit = sm_bootstrap_commit, .root_size = sm_bootstrap_root_size, @@ -696,7 +714,7 @@ static const struct dm_space_map bootstrap_ops = { static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { - int r, i; + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); dm_block_t old_len = smm->ll.nr_blocks; @@ -718,9 +736,7 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) * allocate any new blocks. */ do { - for (i = old_len; !r && i < smm->begin; i++) - r = add_bop(smm, BOP_INC, i); - + r = add_bop(smm, BOP_INC, old_len, smm->begin); if (r) goto out; @@ -767,7 +783,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm, dm_block_t superblock) { int r; - dm_block_t i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); smm->begin = superblock + 1; @@ -792,9 +807,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, * Now we need to update the newly created data structures with the * allocated blocks that they were built from. */ - for (i = superblock; !r && i < smm->begin; i++) - r = add_bop(smm, BOP_INC, i); - + r = add_bop(smm, BOP_INC, superblock, smm->begin); if (r) return r; diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index 3e6d1153b7c4..a015cd11f6e9 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -46,8 +46,8 @@ struct dm_space_map { int (*commit)(struct dm_space_map *sm); - int (*inc_block)(struct dm_space_map *sm, dm_block_t b); - int (*dec_block)(struct dm_space_map *sm, dm_block_t b); + int (*inc_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); + int (*dec_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); /* * new_block will increment the returned block. @@ -117,14 +117,24 @@ static inline int dm_sm_commit(struct dm_space_map *sm) return sm->commit(sm); } +static inline int dm_sm_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->inc_blocks(sm, b, e); +} + static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b) { - return sm->inc_block(sm, b); + return dm_sm_inc_blocks(sm, b, b + 1); +} + +static inline int dm_sm_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->dec_blocks(sm, b, e); } static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b) { - return sm->dec_block(sm, b); + return dm_sm_dec_blocks(sm, b, b + 1); } static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b) diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index abe2c5dd0993..16643fc974e8 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -359,6 +359,17 @@ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b) } EXPORT_SYMBOL_GPL(dm_tm_inc); +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_inc_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_inc_range); + void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) { /* @@ -370,6 +381,47 @@ void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) } EXPORT_SYMBOL_GPL(dm_tm_dec); +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_dec_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_dec_range); + +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned count, dm_tm_run_fn fn) +{ + uint64_t b, begin, end; + bool in_run = false; + unsigned i; + + for (i = 0; i < count; i++, value_le++) { + b = le64_to_cpu(*value_le); + + if (in_run) { + if (b == end) + end++; + else { + fn(tm, begin, end); + begin = b; + end = b + 1; + } + } else { + in_run = true; + begin = b; + end = b + 1; + } + } + + if (in_run) + fn(tm, begin, end); +} +EXPORT_SYMBOL_GPL(dm_tm_with_runs); + int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result) { @@ -379,6 +431,15 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, return dm_sm_get_count(tm->sm, b, result); } +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + return dm_sm_count_is_more_than_one(tm->sm, b, result); +} + struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) { return tm->bm; diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index f3a18be68f30..906c02ed0365 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -100,11 +100,27 @@ void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); * Functions for altering the reference count of a block directly. */ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b); - +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b); +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); + +/* + * Builds up runs of adjacent blocks, and then calls the given fn + * (typically dm_tm_inc/dec). Very useful when you have to perform + * the same tm operation on all values in a btree leaf. + */ +typedef void (*dm_tm_run_fn)(struct dm_transaction_manager *, dm_block_t, dm_block_t); +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned count, dm_tm_run_fn fn); -int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, - uint32_t *result); +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result); + +/* + * Finds out if a given block is shared (ie. has a reference count higher + * than one). + */ +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result); struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e5d7411cba9b..62c8b6adac70 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -546,6 +546,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + if (bio->bi_pool != &mddev->bio_set) + md_account_bio(mddev, &bio); + orig_sector = sector; zone = find_zone(mddev->private, §or); switch (conf->layout) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ced076ba560e..51f2547c2007 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -300,6 +300,8 @@ static void call_bio_endio(struct r1bio *r1_bio) if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR; + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + bio_end_io_acct(bio, r1_bio->start_time); bio_endio(bio); } @@ -1210,7 +1212,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); int max_sectors; int rdisk; - bool print_msg = !!r1_bio; + bool r1bio_existed = !!r1_bio; char b[BDEVNAME_SIZE]; /* @@ -1220,7 +1222,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, */ gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO; - if (print_msg) { + if (r1bio_existed) { /* Need to get the block device name carefully */ struct md_rdev *rdev; rcu_read_lock(); @@ -1252,7 +1254,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (rdisk < 0) { /* couldn't find anywhere to read from */ - if (print_msg) { + if (r1bio_existed) { pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", mdname(mddev), b, @@ -1263,7 +1265,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } mirror = conf->mirrors + rdisk; - if (print_msg) + if (r1bio_existed) pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", mdname(mddev), (unsigned long long)r1_bio->sector, @@ -1292,6 +1294,9 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, r1_bio->read_disk = rdisk; + if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); + read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); r1_bio->bios[rdisk] = read_bio; @@ -1461,6 +1466,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sectors = max_sectors; } + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index b7eb09e8c025..ccf10e59b116 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -158,6 +158,7 @@ struct r1bio { sector_t sector; int sectors; unsigned long state; + unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 13f5e6b2a73d..16977e8e075d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -297,6 +297,8 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1184,6 +1186,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; @@ -1483,6 +1487,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1461fd55311b..c34bb196790e 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -124,6 +124,7 @@ struct r10bio { sector_t sector; /* virtual sector number */ int sectors; unsigned long state; + unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 841e1c1aa5e6..b8436e4930ed 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5311,8 +5311,6 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - WARN_ON_ONCE(bio->bi_bdev->bd_partno); - chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); @@ -5364,11 +5362,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct bio* raid_bi = bi->bi_private; + struct md_io_acct *md_io_acct = bi->bi_private; + struct bio *raid_bi = md_io_acct->orig_bio; struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; blk_status_t error = bi->bi_status; + unsigned long start_time = md_io_acct->start_time; bio_put(bi); @@ -5380,6 +5380,8 @@ static void raid5_align_endio(struct bio *bi) rdev_dec_pending(rdev, conf->mddev); if (!error) { + if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue)) + bio_end_io_acct(raid_bi, start_time); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5398,6 +5400,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct md_rdev *rdev; sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; + struct md_io_acct *md_io_acct; + bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); @@ -5427,29 +5431,46 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); - bio_set_dev(align_bio, rdev->bdev); - align_bio->bi_end_io = raid5_align_endio; - align_bio->bi_private = raid_bio; - align_bio->bi_iter.bi_sector = sector; - - raid_bio->bi_next = (void *)rdev; - - if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad, + if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, &bad_sectors)) { - bio_put(align_bio); + bio_put(raid_bio); rdev_dec_pending(rdev, mddev); return 0; } + align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set); + md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); + raid_bio->bi_next = (void *)rdev; + if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) + md_io_acct->start_time = bio_start_io_acct(raid_bio); + md_io_acct->orig_bio = raid_bio; + + bio_set_dev(align_bio, rdev->bdev); + align_bio->bi_end_io = raid5_align_endio; + align_bio->bi_private = md_io_acct; + align_bio->bi_iter.bi_sector = sector; + /* No reshape active, so we can trust rdev->data_offset */ align_bio->bi_iter.bi_sector += rdev->data_offset; - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, - conf->device_lock); - atomic_inc(&conf->active_aligned_reads); - spin_unlock_irq(&conf->device_lock); + did_inc = false; + if (conf->quiesce == 0) { + atomic_inc(&conf->active_aligned_reads); + did_inc = true; + } + /* need a memory barrier to detect the race with raid5_quiesce() */ + if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) { + /* quiesce is in progress, so we need to undo io activation and wait + * for it to finish + */ + if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_quiescent); + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, + conf->device_lock); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + } if (mddev->gendisk) trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), @@ -5798,6 +5819,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL; + md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous; @@ -6930,7 +6952,7 @@ static struct attribute *raid5_attrs[] = { &ppl_write_hint.attr, NULL, }; -static struct attribute_group raid5_attrs_group = { +static const struct attribute_group raid5_attrs_group = { .name = NULL, .attrs = raid5_attrs, }; @@ -8336,7 +8358,10 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce) * active stripes can drain */ r5c_flush_cache(conf, INT_MAX); - conf->quiesce = 2; + /* need a memory barrier to make sure read_one_chunk() sees + * quiesce started and reverts to slow (locked) path. + */ + smp_store_release(&conf->quiesce, 2); wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, |