From 27f5411a718c431c20007e3a2fbba6589942d04f Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 20 Apr 2020 16:46:59 +0300 Subject: dm crypt: support using encrypted keys Allow one to use "encrypted" in addition to "user" and "logon" key types for device encryption. Signed-off-by: Dmitry Baryshkov Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 1 + drivers/md/dm-crypt.c | 76 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 58 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d6d5ab23c088..8b0c646d2f59 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -269,6 +269,7 @@ config DM_UNSTRIPED config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM + depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3df90daba89e..91787cde369b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -34,7 +34,9 @@ #include #include #include /* for struct rtattr and RTA macros only */ +#include #include +#include #include @@ -2215,12 +2217,47 @@ static bool contains_whitespace(const char *str) return false; } +static int set_key_user(struct crypt_config *cc, struct key *key) +{ + const struct user_key_payload *ukp; + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (cc->key_size != ukp->datalen) + return -EINVAL; + + memcpy(cc->key, ukp->data, cc->key_size); + + return 0; +} + +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static int set_key_encrypted(struct crypt_config *cc, struct key *key) +{ + const struct encrypted_key_payload *ekp; + + ekp = key->payload.data[0]; + if (!ekp) + return -EKEYREVOKED; + + if (cc->key_size != ekp->decrypted_datalen) + return -EINVAL; + + memcpy(cc->key, ekp->decrypted_data, cc->key_size); + + return 0; +} +#endif /* CONFIG_ENCRYPTED_KEYS */ + static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) { char *new_key_string, *key_desc; int ret; + struct key_type *type; struct key *key; - const struct user_key_payload *ukp; + int (*set_key)(struct crypt_config *cc, struct key *key); /* * Reject key_string with whitespace. dm core currently lacks code for @@ -2236,16 +2273,26 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string if (!key_desc || key_desc == key_string || !strlen(key_desc + 1)) return -EINVAL; - if (strncmp(key_string, "logon:", key_desc - key_string + 1) && - strncmp(key_string, "user:", key_desc - key_string + 1)) + if (!strncmp(key_string, "logon:", key_desc - key_string + 1)) { + type = &key_type_logon; + set_key = set_key_user; + } else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) { + type = &key_type_user; + set_key = set_key_user; +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) + } else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { + type = &key_type_encrypted; + set_key = set_key_encrypted; +#endif + } else { return -EINVAL; + } new_key_string = kstrdup(key_string, GFP_KERNEL); if (!new_key_string) return -ENOMEM; - key = request_key(key_string[0] == 'l' ? 
&key_type_logon : &key_type_user, - key_desc + 1, NULL); + key = request_key(type, key_desc + 1, NULL); if (IS_ERR(key)) { kzfree(new_key_string); return PTR_ERR(key); @@ -2253,23 +2300,14 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string down_read(&key->sem); - ukp = user_key_payload_locked(key); - if (!ukp) { - up_read(&key->sem); - key_put(key); - kzfree(new_key_string); - return -EKEYREVOKED; - } - - if (cc->key_size != ukp->datalen) { + ret = set_key(cc, key); + if (ret < 0) { up_read(&key->sem); key_put(key); kzfree(new_key_string); - return -EINVAL; + return ret; } - memcpy(cc->key, ukp->data, cc->key_size); - up_read(&key->sem); key_put(key); @@ -2323,7 +2361,7 @@ static int get_key_size(char **key_string) return (*key_string[0] == ':') ? -EINVAL : strlen(*key_string) >> 1; } -#endif +#endif /* CONFIG_KEYS */ static int crypt_set_key(struct crypt_config *cc, char *key) { @@ -3282,7 +3320,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 20, 0}, + .version = {1, 21, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -- cgit v1.2.3-59-g8ed1b From 2361ae595352dec015d14292f1b539242d8446d6 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 20 Apr 2020 22:29:09 +0200 Subject: dm mpath: switch paths in dm_blk_ioctl() code path SCSI LUN passthrough code such as qemu's "scsi-block" device model pass every IO to the host via SG_IO ioctls. Currently, dm-multipath calls choose_pgpath() only in the block IO code path, not in the ioctl code path (unless current_pgpath is NULL). This has the effect that no path switching and thus no load balancing is done for SCSI-passthrough IO, unless the active path fails. Fix this by using the same logic in multipath_prepare_ioctl() as in multipath_clone_and_map(). Note: The allegedly best path selection algorithm, service-time, still wouldn't work perfectly, because the io size of the current request is always set to 0. Changing that for the IO passthrough case would require the ioctl cmd and arg to be passed to dm's prepare_ioctl() method. Signed-off-by: Martin Wilck Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 3e500098132f..e0c800cf87a9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1918,7 +1918,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, int r; current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath) + if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) current_pgpath = choose_pgpath(m, 0); if (current_pgpath) { -- cgit v1.2.3-59-g8ed1b From d3c7b35c20d60650bac8b55c17b194adda03a979 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Mon, 9 Mar 2020 23:26:38 +0100 Subject: dm: add emulated block size target This new target is similar to the linear target except that it emulates a smaller logical block size on a device with a larger logical block size. Its main purpose is to emulate 512 byte sectors on 4K native disks (i.e. 512e). See Documentation/admin-guide/device-mapper/dm-ebs.rst for details. 
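Conceptually, emulating a smaller logical block size turns a sub-block write into a
read-modify-write of the larger native block. The sketch below is purely illustrative,
assuming a 4 KiB-native device; read_block() and write_block() are placeholder helpers,
not kernel APIs, and the real target implements this via dm-bufio as the patch below shows:

/*
 * Illustrative read-modify-write for one 512-byte emulated sector on a
 * 4 KiB-native device.  read_block()/write_block() are placeholders.
 */
static int emulated_write_512(void *dev, sector_t sector, const void *data)
{
        char buf[4096];
        sector_t native_block = sector >> 3;         /* 8 x 512 B per 4 KiB block */
        unsigned int offset = (sector & 7) << 9;     /* byte offset inside the block */
        int r;

        r = read_block(dev, native_block, buf);      /* fetch the whole native block */
        if (r)
                return r;
        memcpy(buf + offset, data, 512);             /* patch the emulated sector */
        return write_block(dev, native_block, buf);  /* write the native block back */
}

Writes that cover a full, aligned native block can skip the read entirely, which is the
dm_bufio_new() optimisation visible in __ebs_rw_bvec() in the patch below.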
Reviewed-by: Damien Le Moal Signed-off-by: Heinz Mauelshagen Signed-off-by: Randy Dunlap [Kconfig fixes] Signed-off-by: Zheng Bin [static fixes] Signed-off-by: Mike Snitzer --- Documentation/admin-guide/device-mapper/dm-ebs.rst | 51 +++ drivers/md/Kconfig | 8 + drivers/md/Makefile | 2 + drivers/md/dm-ebs-target.c | 444 +++++++++++++++++++++ 4 files changed, 505 insertions(+) create mode 100644 Documentation/admin-guide/device-mapper/dm-ebs.rst create mode 100644 drivers/md/dm-ebs-target.c (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-ebs.rst b/Documentation/admin-guide/device-mapper/dm-ebs.rst new file mode 100644 index 000000000000..534fa38e8862 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-ebs.rst @@ -0,0 +1,51 @@ +====== +dm-ebs +====== + + +This target is similar to the linear target except that it emulates +a smaller logical block size on a device with a larger logical block +size. Its main purpose is to provide emulation of 512 byte sectors on +devices that do not provide this emulation (i.e. 4K native disks). + +Supported emulated logical block sizes 512, 1024, 2048 and 4096. + +Underlying block size can be set to > 4K to test buffering larger units. + + +Table parameters +---------------- + [] + +Mandatory parameters: + + : + Full pathname to the underlying block-device, + or a "major:minor" device-number. + : + Starting sector within the device; + has to be a multiple of . + : + Number of sectors defining the logical block size to be emulated; + 1, 2, 4, 8 sectors of 512 bytes supported. + +Optional parameter: + + : + Number of sectors defining the logical block size of . + 2^N supported, e.g. 8 = emulate 8 sectors of 512 bytes = 4KiB. + If not provided, the logical block size of will be used. + + +Examples: + +Emulate 1 sector = 512 bytes logical block size on /dev/sda starting at +offset 1024 sectors with underlying devices block size automatically set: + +ebs /dev/sda 1024 1 + +Emulate 2 sector = 1KiB logical block size on /dev/sda starting at +offset 128 sectors, enforce 2KiB underlying device block size. +This presumes 2KiB logical blocksize on /dev/sda or less to work: + +ebs /dev/sda 128 2 4 diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8b0c646d2f59..6cb6188a61df 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -337,6 +337,14 @@ config DM_WRITECACHE The writecache target doesn't cache reads because reads are supposed to be cached in standard RAM. +config DM_EBS + tristate "Emulated block size target (EXPERIMENTAL)" + depends on BLK_DEV_DM + select DM_BUFIO + help + dm-ebs emulates smaller logical block size on backing devices + with larger ones (e.g. 512 byte sectors on 4K native disks). 
+ config DM_ERA tristate "Era target (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d91a7edcd2ab..9a2d673f94bc 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -17,6 +17,7 @@ dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o +dm-ebs-y += dm-ebs-target.o dm-era-y += dm-era-target.o dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o @@ -65,6 +66,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o +obj-$(CONFIG_DM_EBS) += dm-ebs.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_CLONE) += dm-clone.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c new file mode 100644 index 000000000000..c2bd21fa7002 --- /dev/null +++ b/drivers/md/dm-ebs-target.c @@ -0,0 +1,444 @@ +/* + * Copyright (C) 2020 Red Hat GmbH + * + * This file is released under the GPL. + * + * Device-mapper target to emulate smaller logical block + * size on backing devices exposing (natively) larger ones. + * + * E.g. 512 byte sector emulation on 4K native disks. + */ + +#include "dm.h" +#include +#include +#include + +#define DM_MSG_PREFIX "ebs" + +static void ebs_dtr(struct dm_target *ti); + +/* Emulated block size context. */ +struct ebs_c { + struct dm_dev *dev; /* Underlying device to emulate block size on. */ + struct dm_bufio_client *bufio; /* Use dm-bufio for read and read-modify-write processing. */ + struct workqueue_struct *wq; /* Workqueue for ^ processing of bios. */ + struct work_struct ws; /* Work item used for ^. */ + struct bio_list bios_in; /* Worker bios input list. */ + spinlock_t lock; /* Guard bios input list above. */ + sector_t start; /* table line argument, see ebs_ctr below. */ + unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */ + unsigned int u_bs; /* Underlying block size in sectors retrievd from/set on lower layer device. */ + unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */ + bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */ +}; + +static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector) +{ + return sector >> ec->block_shift; +} + +static inline sector_t __block_mod(sector_t sector, unsigned int bs) +{ + return sector & (bs - 1); +} + +/* Return number of blocks for a bio, accounting for misalignement of start and end sectors. */ +static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio) +{ + sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio); + + return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0); +} + +static inline bool __ebs_check_bs(unsigned int bs) +{ + return bs && is_power_of_2(bs); +} + +/* + * READ/WRITE: + * + * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. 
+ */ +static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter) +{ + int r = 0; + unsigned char *ba, *pa; + unsigned int cur_len; + unsigned int bv_len = bv->bv_len; + unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs)); + sector_t block = __sector_to_block(ec, iter->bi_sector); + struct dm_buffer *b; + + if (unlikely(!bv->bv_page || !bv_len)) + return -EIO; + + pa = page_address(bv->bv_page) + bv->bv_offset; + + /* Handle overlapping page <-> blocks */ + while (bv_len) { + cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len); + + /* Avoid reading for writes in case bio vector's page overwrites block completely. */ + if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) + ba = dm_bufio_read(ec->bufio, block, &b); + else + ba = dm_bufio_new(ec->bufio, block, &b); + + if (unlikely(IS_ERR(ba))) { + /* + * Carry on with next buffer, if any, to issue all possible + * data but return error. + */ + r = PTR_ERR(ba); + } else { + /* Copy data to/from bio to buffer if read/new was successful above. */ + ba += buf_off; + if (rw == READ) { + memcpy(pa, ba, cur_len); + flush_dcache_page(bv->bv_page); + } else { + flush_dcache_page(bv->bv_page); + memcpy(ba, pa, cur_len); + dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); + } + + dm_bufio_release(b); + } + + pa += cur_len; + bv_len -= cur_len; + buf_off = 0; + block++; + } + + return r; +} + +/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */ +static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio) +{ + int r = 0, rr; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_bvec(bv, bio, iter) { + rr = __ebs_rw_bvec(ec, rw, &bv, &iter); + if (rr) + r = rr; + } + + return r; +} + +/* 'Discard' blocks, i.e. release them from the bufio cache. */ +static int __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t blocks, sector = bio->bi_iter.bi_sector; + + blocks = __nr_blocks(ec, bio); + for (; blocks--; sector += ec->u_bs) + dm_bufio_forget(ec->bufio, __sector_to_block(ec, sector)); + + return 0; +} + +/* Worker funtion to process incoming bios. */ +static void __ebs_process_bios(struct work_struct *ws) +{ + int r; + bool write = false; + sector_t block1, block2; + struct ebs_c *ec = container_of(ws, struct ebs_c, ws); + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irq(&ec->lock); + bios = ec->bios_in; + bio_list_init(&ec->bios_in); + spin_unlock_irq(&ec->lock); + + /* Prefetch all read and any mis-aligned write buffers */ + bio_list_for_each(bio, &bios) { + block1 = __sector_to_block(ec, bio->bi_iter.bi_sector); + if (bio_op(bio) == REQ_OP_READ) + dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio)); + else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) { + block2 = __sector_to_block(ec, bio_end_sector(bio)); + if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs)) + dm_bufio_prefetch(ec->bufio, block1, 1); + if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1) + dm_bufio_prefetch(ec->bufio, block2, 1); + } + } + + bio_list_for_each(bio, &bios) { + r = -EIO; + if (bio_op(bio) == REQ_OP_READ) + r = __ebs_rw_bio(ec, READ, bio); + else if (bio_op(bio) == REQ_OP_WRITE) { + write = true; + r = __ebs_rw_bio(ec, WRITE, bio); + } else if (bio_op(bio) == REQ_OP_DISCARD) { + /* FIXME: (optionally) call dm_bufio_discard_buffers() once upstream. 
*/ + r = __ebs_forget_bio(ec, bio); + } + + if (r < 0) + bio->bi_status = errno_to_blk_status(r); + } + + /* + * We write dirty buffers after processing I/O on them + * but before we endio thus addressing REQ_FUA/REQ_SYNC. + */ + r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0; + + while ((bio = bio_list_pop(&bios))) { + /* Any other request is endioed. */ + if (unlikely(r && bio_op(bio) == REQ_OP_WRITE)) + bio_io_error(bio); + else + bio_endio(bio); + } +} + +/* + * Construct an emulated block size mapping: [] + * + * : path of the underlying device + * : offset in 512 bytes sectors into + * : emulated block size in units of 512 bytes exposed to the upper layer + * []: underlying block size in units of 512 bytes imposed on the lower layer; + * optional, if not supplied, retrieve logical block size from underlying device + */ +static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned short tmp1; + unsigned long long tmp; + char dummy; + struct ebs_c *ec; + + if (argc < 3 || argc > 4) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL); + if (!ec) { + ti->error = "Cannot allocate ebs context"; + return -ENOMEM; + } + + r = -EINVAL; + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || + tmp != (sector_t)tmp || + (sector_t)tmp >= ti->len) { + ti->error = "Invalid device offset sector"; + goto bad; + } + ec->start = tmp; + + if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 || + !__ebs_check_bs(tmp1) || + to_bytes(tmp1) > PAGE_SIZE) { + ti->error = "Invalid emulated block size"; + goto bad; + } + ec->e_bs = tmp1; + + if (argc > 3) { + if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) { + ti->error = "Invalid underlying block size"; + goto bad; + } + ec->u_bs = tmp1; + ec->u_bs_set = true; + } else + ec->u_bs_set = false; + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev); + if (r) { + ti->error = "Device lookup failed"; + ec->dev = NULL; + goto bad; + } + + r = -EINVAL; + if (!ec->u_bs_set) { + ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev)); + if (!__ebs_check_bs(ec->u_bs)) { + ti->error = "Invalid retrieved underlying block size"; + goto bad; + } + } + + if (!ec->u_bs_set && ec->e_bs == ec->u_bs) + DMINFO("Emulation superfluous: emulated equal to underlying block size"); + + if (__block_mod(ec->start, ec->u_bs)) { + ti->error = "Device offset must be multiple of underlying block size"; + goto bad; + } + + ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL); + if (IS_ERR(ec->bufio)) { + ti->error = "Cannot create dm bufio client"; + r = PTR_ERR(ec->bufio); + ec->bufio = NULL; + goto bad; + } + + ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!ec->wq) { + ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue"; + r = -ENOMEM; + goto bad; + } + + ec->block_shift = __ffs(ec->u_bs); + INIT_WORK(&ec->ws, &__ebs_process_bios); + bio_list_init(&ec->bios_in); + spin_lock_init(&ec->lock); + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_secure_erase_bios = 0; + ti->num_write_same_bios = 0; + ti->num_write_zeroes_bios = 0; + return 0; +bad: + ebs_dtr(ti); + return r; +} + +static void ebs_dtr(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + + if (ec->wq) + destroy_workqueue(ec->wq); + if (ec->bufio) + dm_bufio_client_destroy(ec->bufio); + if (ec->dev) + dm_put_device(ti, ec->dev); + kfree(ec); +} + +static int ebs_map(struct 
dm_target *ti, struct bio *bio) +{ + struct ebs_c *ec = ti->private; + + bio_set_dev(bio, ec->dev->bdev); + bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (unlikely(bio->bi_opf & REQ_OP_FLUSH)) + return DM_MAPIO_REMAPPED; + /* + * Only queue for bufio processing in case of partial or overlapping buffers + * -or- + * emulation with ebs == ubs aiming for tests of dm-bufio overhead. + */ + if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) || + __block_mod(bio_end_sector(bio), ec->u_bs) || + ec->e_bs == ec->u_bs)) { + spin_lock_irq(&ec->lock); + bio_list_add(&ec->bios_in, bio); + spin_unlock_irq(&ec->lock); + + queue_work(ec->wq, &ec->ws); + + return DM_MAPIO_SUBMITTED; + } + + /* Forget any buffer content relative to this direct backing device I/O. */ + __ebs_forget_bio(ec, bio); + + return DM_MAPIO_REMAPPED; +} + +static void ebs_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct ebs_c *ec = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + *result = '\0'; + break; + case STATUSTYPE_TABLE: + snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u", + ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs); + break; + } +} + +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct ebs_c *ec = ti->private; + struct dm_dev *dev = ec->dev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + *bdev = dev->bdev; + return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT); +} + +static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct ebs_c *ec = ti->private; + + limits->logical_block_size = to_bytes(ec->e_bs); + limits->physical_block_size = to_bytes(ec->u_bs); + limits->alignment_offset = limits->physical_block_size; + blk_limits_io_min(limits, limits->logical_block_size); +} + +static int ebs_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct ebs_c *ec = ti->private; + + return fn(ti, ec->dev, ec->start, ti->len, data); +} + +static struct target_type ebs_target = { + .name = "ebs", + .version = {1, 0, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, + .module = THIS_MODULE, + .ctr = ebs_ctr, + .dtr = ebs_dtr, + .map = ebs_map, + .status = ebs_status, + .io_hints = ebs_io_hints, + .prepare_ioctl = ebs_prepare_ioctl, + .iterate_devices = ebs_iterate_devices, +}; + +static int __init dm_ebs_init(void) +{ + int r = dm_register_target(&ebs_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void dm_ebs_exit(void) +{ + dm_unregister_target(&ebs_target); +} + +module_init(dm_ebs_init); +module_exit(dm_ebs_exit); + +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_DESCRIPTION(DM_NAME " emulated block size target"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From 6fbeb0048e6b93f7b7f195864f3ddc876ac4d42e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 7 Feb 2020 15:59:25 -0500 Subject: dm bufio: implement discard Add functions dm_bufio_issue_discard and dm_bufio_discard_buffers. dm_bufio_issue_discard sends discard request to the underlying device. dm_bufio_discard_buffers frees buffers in the range and then calls dm_bufio_issue_discard. Also, factor out block_to_sector for reuse in dm_bufio_issue_discard. 
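For orientation, here is a minimal sketch of how a dm-bufio client could drive the new
calls when handling a discarded sector range. The wrapper function and its block_shift
argument are illustrative only and are not part of this patch; only
dm_bufio_discard_buffers() and its behaviour come from the changes below:

#include <linux/dm-bufio.h>

/*
 * Illustrative wrapper: convert a sector range to bufio blocks and
 * discard it.  Only dm_bufio_discard_buffers() comes from this patch.
 */
static int example_discard_range(struct dm_bufio_client *bufio,
                                 sector_t start_sector, sector_t nr_sectors,
                                 unsigned char block_shift)
{
        sector_t block = start_sector >> block_shift;
        sector_t count = nr_sectors >> block_shift;

        if (!count)
                return 0;

        /*
         * Drops cached buffers in the range (dirty ones are discarded
         * without writeback), then issues the discard to the device.
         */
        return dm_bufio_discard_buffers(bufio, block, count);
}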
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 69 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/dm-bufio.h | 12 +++++++++ 2 files changed, 76 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2d519c223562..bf289be1ee3a 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -631,6 +631,19 @@ dmio: submit_bio(bio); } +static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block) +{ + sector_t sector; + + if (likely(c->sectors_per_block_bits >= 0)) + sector = block << c->sectors_per_block_bits; + else + sector = block * (c->block_size >> SECTOR_SHIFT); + sector += c->start; + + return sector; +} + static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t)) { unsigned n_sectors; @@ -639,11 +652,7 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buff b->end_io = end_io; - if (likely(b->c->sectors_per_block_bits >= 0)) - sector = b->block << b->c->sectors_per_block_bits; - else - sector = b->block * (b->c->block_size >> SECTOR_SHIFT); - sector += b->c->start; + sector = block_to_sector(b->c, b->block); if (rw != REQ_OP_WRITE) { n_sectors = b->c->block_size >> SECTOR_SHIFT; @@ -1325,6 +1334,56 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); +/* + * Use dm-io to send a discard request to flush the device. + */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + struct dm_io_request io_req = { + .bi_op = REQ_OP_DISCARD, + .bi_op_flags = REQ_SYNC, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = block_to_sector(c, block), + .count = block_to_sector(c, count), + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); + +/* + * Free the specified range of buffers. If a buffer is held by other process, it + * is not freed. If a buffer is dirty, it is discarded without writeback. + * Finally, send the discard request to the device. + */ +int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + sector_t i; + + for (i = block; i < block + count; i++) { + struct dm_buffer *b; + dm_bufio_lock(c); + b = __find(c, i); + if (b && likely(!b->hold_count)) { + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); + __unlink_buffer(b); + __free_buffer_wake(b); + } + dm_bufio_unlock(c); + } + + return dm_bufio_issue_discard(c, block, count); +} +EXPORT_SYMBOL_GPL(dm_bufio_discard_buffers); + /* * We first delete any other buffer that may be at that new location. * diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 3c8b7d274bd9..07e1f163e299 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -118,6 +118,18 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c); */ int dm_bufio_issue_flush(struct dm_bufio_client *c); +/* + * Send a discard request to the underlying device. + */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count); + +/* + * Free the specified range of buffers. If a buffer is held by other process, it + * is not freed. If a buffer is dirty, it is discarded without writeback. + * Finally, send the discard request to the device. 
+ */ +int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count); + /* * Like dm_bufio_release but also move the buffer to the new * block. dm_bufio_write_dirty_buffers is needed to commit the new block. -- cgit v1.2.3-59-g8ed1b From a5089a95d84c1e861e2d1f549ae368c8e89e3674 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 29 Apr 2020 16:47:03 +0200 Subject: dm ebs: pass discards down to underlying device Make use of dm_bufio_issue_discard() to pass discards down to the underlying device. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-ebs-target.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index c2bd21fa7002..ae3f5fad3b39 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -132,16 +132,43 @@ static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio) return r; } -/* 'Discard' blocks, i.e. release them from the bufio cache. */ -static int __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) +/* + * Discard bio's blocks, i.e. pass discards down. + * + * Avoid discarding partial blocks at beginning and end; + * return 0 in case no blocks can be discarded as a result. + */ +static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t block, blocks, sector = bio->bi_iter.bi_sector; + + block = __sector_to_block(ec, sector); + blocks = __nr_blocks(ec, bio); + + /* + * Partial first underlying block (__nr_blocks() may have + * resulted in one block). + */ + if (__block_mod(sector, ec->u_bs)) { + block++; + blocks--; + } + + /* Partial last underlying block if any. */ + if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs)) + blocks--; + + return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0; +} + +/* Release blocks them from the bufio cache. */ +static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) { sector_t blocks, sector = bio->bi_iter.bi_sector; blocks = __nr_blocks(ec, bio); for (; blocks--; sector += ec->u_bs) dm_bufio_forget(ec->bufio, __sector_to_block(ec, sector)); - - return 0; } /* Worker funtion to process incoming bios. */ @@ -183,8 +210,8 @@ static void __ebs_process_bios(struct work_struct *ws) write = true; r = __ebs_rw_bio(ec, WRITE, bio); } else if (bio_op(bio) == REQ_OP_DISCARD) { - /* FIXME: (optionally) call dm_bufio_discard_buffers() once upstream. */ - r = __ebs_forget_bio(ec, bio); + __ebs_forget_bio(ec, bio); + r = __ebs_discard_bio(ec, bio); } if (r < 0) @@ -409,7 +436,7 @@ static int ebs_iterate_devices(struct dm_target *ti, static struct target_type ebs_target = { .name = "ebs", - .version = {1, 0, 0}, + .version = {1, 0, 1}, .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = ebs_ctr, -- cgit v1.2.3-59-g8ed1b From a86fe8be514534363c8fb12a3a38bdba6354316b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 8 Apr 2020 07:29:48 +0000 Subject: dm integrity: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: drivers/md/dm-integrity.c: In function 'integrity_metadata': drivers/md/dm-integrity.c:1557:12: warning: variable 'save_metadata_offset' set but not used [-Wunused-but-set-variable] drivers/md/dm-integrity.c:1556:12: warning: variable 'save_metadata_block' set but not used [-Wunused-but-set-variable] They are never used, so remove it. 
Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4094c47eca7f..3726b987151e 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1553,8 +1553,6 @@ static void integrity_metadata(struct work_struct *w) char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; sector_t sector; unsigned sectors_to_process; - sector_t save_metadata_block; - unsigned save_metadata_offset; if (unlikely(ic->mode == 'R')) goto skip_io; @@ -1605,8 +1603,6 @@ static void integrity_metadata(struct work_struct *w) goto skip_io; } - save_metadata_block = dio->metadata_block; - save_metadata_offset = dio->metadata_offset; sector = dio->range.logical_sector; sectors_to_process = dio->range.n_sectors; -- cgit v1.2.3-59-g8ed1b From 9431cf6efc3659eaa1cdd591e02a09045bc9983f Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Wed, 15 Apr 2020 19:57:31 +0800 Subject: dm persistent data: switch exit_ro_spine to return void In commit 4c7da06f5a78 ("dm persistent data: eliminate unnecessary return values"), r value in exit_ro_spine will not change, so exit_ro_spine doesn't need a return value. Signed-off-by: Zhiqiang Liu Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-btree-internal.h | 2 +- drivers/md/persistent-data/dm-btree-spine.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index a240990a7f33..55a4096f1334 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -68,7 +68,7 @@ struct ro_spine { }; void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); -int exit_ro_spine(struct ro_spine *s); +void exit_ro_spine(struct ro_spine *s); int ro_step(struct ro_spine *s, dm_block_t new_child); void ro_pop(struct ro_spine *s); struct btree_node *ro_node(struct ro_spine *s); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index b27b8091a1ca..e03cb9e48773 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -132,15 +132,13 @@ void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info) s->nodes[1] = NULL; } -int exit_ro_spine(struct ro_spine *s) +void exit_ro_spine(struct ro_spine *s) { - int r = 0, i; + int i; for (i = 0; i < s->count; i++) { unlock_block(s->info, s->nodes[i]); } - - return r; } int ro_step(struct ro_spine *s, dm_block_t new_child) -- cgit v1.2.3-59-g8ed1b From 499c18045eab16656ef4159c35b05865038f9f25 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 19 Apr 2020 04:34:00 -0400 Subject: dm writecache: remove superfluous test in persistent_memory_claim Remove superfluous test if dax_dev is NULL - dax_direct_access already does this test. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 613c171b1b6d..d29d3e234e01 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -234,10 +234,6 @@ static int persistent_memory_claim(struct dm_writecache *wc) wc->memory_vmapped = false; - if (!wc->ssd_dev->dax_dev) { - r = -EOPNOTSUPP; - goto err1; - } s = wc->memory_map_size; p = s >> PAGE_SHIFT; if (!p) { -- cgit v1.2.3-59-g8ed1b From 48338daaa00e6137a43fa5d0e54b763aa34f450b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 29 Apr 2020 12:30:03 -0400 Subject: dm writecache: improve performance on DDR persistent memory (Optane) When testing the dm-writecache target on a real DDR persistent memory (Intel Optane), it turned out that explicit cache flushing using the clflushopt instruction performs better than non-temporal stores for block sizes 1k, 2k and 4k. The dm-writecache target is singlethreaded (all the copying is done while holding the writecache lock), so it benefits from clwb, see: http://lore.kernel.org/r/alpine.LRH.2.02.2004160411460.7833@file01.intranet.prod.int.rdu2.redhat.com Add a new function memcpy_flushcache_optimized() that tests if clflushopt is present - and if it is, we use it instead of memcpy_flushcache. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d29d3e234e01..74f3c506f084 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1139,6 +1139,42 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, return r; } +static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) +{ + /* + * clflushopt performs better with block size 1024, 2048, 4096 + * non-temporal stores perform better with block size 512 + * + * block size 512 1024 2048 4096 + * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s + * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s + * + * We see that movnti performs better for 512-byte blocks, and + * clflushopt performs better for 1024-byte and larger blocks. So, we + * prefer clflushopt for sizes >= 768. + * + * NOTE: this happens to be the case now (with dm-writecache's single + * threaded model) but re-evaluate this once memcpy_flushcache() is + * enabled to use movdir64b which might invalidate this performance + * advantage seen with cache-allocating-writes plus flushing. 
+ */ +#ifdef CONFIG_X86 + if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && + likely(boot_cpu_data.x86_clflush_size == 64) && + likely(size >= 768)) { + do { + memcpy((void *)dest, (void *)source, 64); + clflushopt((void *)dest); + dest += 64; + source += 64; + size -= 64; + } while (size >= 64); + return; + } +#endif + memcpy_flushcache(dest, source, size); +} + static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) { void *buf; @@ -1164,7 +1200,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data } } else { flush_dcache_page(bio_page(bio)); - memcpy_flushcache(data, buf, size); + memcpy_flushcache_optimized(data, buf, size); } bvec_kunmap_irq(buf, &flags); -- cgit v1.2.3-59-g8ed1b From 087615bf3acdafd0ba7c7c9ed5286e7b7c80fe1b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 30 Apr 2020 16:48:29 -0400 Subject: dm mpath: pass IO start time to path selector The HST path selector needs this information to perform path prediction. For request-based mpath, struct request's io_start_time_ns is used, while for bio-based, use the start_time stored in dm_io. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 9 ++++++--- drivers/md/dm-path-selector.h | 2 +- drivers/md/dm-queue-length.c | 2 +- drivers/md/dm-service-time.c | 2 +- drivers/md/dm.c | 9 +++++++++ include/linux/device-mapper.h | 2 ++ 6 files changed, 20 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e0c800cf87a9..74246d7c7d68 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -567,7 +567,8 @@ static void multipath_release_clone(struct request *clone, if (pgpath && pgpath->pg->ps.type->end_io) pgpath->pg->ps.type->end_io(&pgpath->pg->ps, &pgpath->path, - mpio->nr_bytes); + mpio->nr_bytes, + clone->io_start_time_ns); } blk_put_request(clone); @@ -1617,7 +1618,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, struct path_selector *ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + clone->io_start_time_ns); } return r; @@ -1661,7 +1663,8 @@ done: struct path_selector *ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + dm_start_time_ns_from_clone(clone)); } return r; diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index b6eb5365b1a4..c47bc0e20275 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -74,7 +74,7 @@ struct path_selector_type { int (*start_io) (struct path_selector *ps, struct dm_path *path, size_t nr_bytes); int (*end_io) (struct path_selector *ps, struct dm_path *path, - size_t nr_bytes); + size_t nr_bytes, u64 start_time); }; /* Register a path selector */ diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 969c4f1a3633..5fd018d18418 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -227,7 +227,7 @@ static int ql_start_io(struct path_selector *ps, struct dm_path *path, } static int ql_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, u64 start_time) { struct path_info *pi = path->pscontext; diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index f006a9005593..9cfda665e9eb 100644 --- a/drivers/md/dm-service-time.c +++ 
b/drivers/md/dm-service-time.c @@ -309,7 +309,7 @@ static int st_start_io(struct path_selector *ps, struct dm_path *path, } static int st_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, u64 start_time) { struct path_info *pi = path->pscontext; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index db9e46114653..2fcb932eb4bd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -675,6 +675,15 @@ static bool md_in_flight(struct mapped_device *md) return md_in_flight_bios(md); } +u64 dm_start_time_ns_from_clone(struct bio *bio) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + struct dm_io *io = tio->io; + + return jiffies_to_nsecs(io->start_time); +} +EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index af48d9da3916..934037d938b9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -332,6 +332,8 @@ void *dm_per_bio_data(struct bio *bio, size_t data_size); struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size); unsigned dm_bio_get_target_bio_nr(const struct bio *bio); +u64 dm_start_time_ns_from_clone(struct bio *bio); + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); -- cgit v1.2.3-59-g8ed1b From 2613eab11996c8d1439c2a44fbca52807be7faa6 Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Thu, 30 Apr 2020 16:48:30 -0400 Subject: dm mpath: add Historical Service Time Path Selector This new selector keeps an exponential moving average of the service time for each path (losely defined as delta between start_io and end_io), and uses this along with the number of inflight requests to estimate future service time for a path. Since we don't have a prober to account for temporally slow paths, re-try "slow" paths every once in a while (num_paths * historical_service_time). To account for fast paths transitioning to slow, if a path has not completed any request within (num_paths * historical_service_time), limit the number of outstanding requests. To account for low volume situations where number of inflight IOs would be zero, the last finish time of each path is factored in. Signed-off-by: Khazhismel Kumykov Co-developed-by: Gabriel Krisman Bertazi Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 11 + drivers/md/Makefile | 1 + drivers/md/dm-historical-service-time.c | 561 ++++++++++++++++++++++++++++++++ 3 files changed, 573 insertions(+) create mode 100644 drivers/md/dm-historical-service-time.c (limited to 'drivers') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 6cb6188a61df..6665b56865b7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -452,6 +452,17 @@ config DM_MULTIPATH_ST If unsure, say N. +config DM_MULTIPATH_HST + tristate "I/O Path Selector based on historical service time" + depends on DM_MULTIPATH + help + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time by comparing estimated service time (based on historical + service time). + + If unsure, say N. 
+ config DM_DELAY tristate "I/O delaying target" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 9a2d673f94bc..31840f95cd40 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-historical-service-time.c new file mode 100644 index 000000000000..186f91e2752c --- /dev/null +++ b/drivers/md/dm-historical-service-time.c @@ -0,0 +1,561 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Historical Service Time + * + * Keeps a time-weighted exponential moving average of the historical + * service time. Estimates future service time based on the historical + * service time and the number of outstanding requests. + * + * Marks paths stale if they have not finished within hst * + * num_paths. If a path is stale and unused, we will send a single + * request to probe in case the path has improved. This situation + * generally arises if the path is so much worse than others that it + * will never have the best estimated service time, or if the entire + * multipath device is unused. If a path is stale and in use, limit the + * number of requests it can receive with the assumption that the path + * has become degraded. + * + * To avoid repeatedly calculating exponents for time weighting, times + * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) + * ns, and the weighting is pre-calculated. + * + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include +#include +#include + + +#define DM_MSG_PREFIX "multipath historical-service-time" +#define HST_MIN_IO 1 +#define HST_VERSION "0.1.1" + +#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ +#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) +#define HST_FIXED_1 (1 << HST_FIXED_SHIFT) +#define HST_FIXED_95 972 +#define HST_MAX_INFLIGHT HST_FIXED_1 +#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ +#define HST_WEIGHT_COUNT 64ULL + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + int valid_count; + spinlock_t lock; + + unsigned int weights[HST_WEIGHT_COUNT]; + unsigned int threshold_multiplier; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; + + spinlock_t lock; + + u64 historical_service_time; /* Fixed point */ + + u64 stale_after; + u64 last_finish; + + u64 outstanding; +}; + +/** + * fixed_power - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. 
+ * + * (see: kernel/sched/loadavg.c) + */ +static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * Calculate the next value of an exponential moving average + * a_1 = a_0 * e + a * (1 - e) + * + * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @weight: [0, HST_FIXED_1] + * + * Note: + * To account for multiple periods in the same calculation, + * a_n = a_0 * e^n + a * (1 - e^n), + * so call fixed_ema(last, next, pow(weight, N)) + */ +static u64 fixed_ema(u64 last, u64 next, u64 weight) +{ + last *= weight; + last += next * (HST_FIXED_1 - weight); + last += 1ULL << (HST_FIXED_SHIFT - 1); + return last >> HST_FIXED_SHIFT; +} + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + s->valid_count = 0; + } + + return s; +} + +/* + * Get the weight for a given time span. + */ +static u64 hst_weight(struct path_selector *ps, u64 delta) +{ + struct selector *s = ps->context; + int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, + HST_WEIGHT_COUNT - 1); + + return s->weights[bucket]; +} + +/* + * Set up the weights array. + * + * weights[len-1] = 0 + * weights[n] = base ^ (n + 1) + */ +static void hst_set_weights(struct path_selector *ps, unsigned int base) +{ + struct selector *s = ps->context; + int i; + + if (base >= HST_FIXED_1) + return; + + for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) + s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); + s->weights[HST_WEIGHT_COUNT - 1] = 0; +} + +static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s; + unsigned int base_weight = HST_FIXED_95; + unsigned int threshold_multiplier = 0; + char dummy; + + /* + * Arguments: [ []] + * : Base weight for ema [0, 1024) 10-bit fixed point. A + * value of 0 will completely ignore any history. + * If not given, default (HST_FIXED_95) is used. + * : Minimum threshold multiplier for paths to + * be considered different. That is, a path is + * considered different iff (p1 > N * p2) where p1 + * is the path with higher service time. A threshold + * of 1 or 0 has no effect. Defaults to 0. 
+ */ + if (argc > 2) + return -EINVAL; + + if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || + base_weight >= HST_FIXED_1)) { + return -EINVAL; + } + + if (argc > 1 && (sscanf(argv[1], "%u%c", + &threshold_multiplier, &dummy) != 1)) { + return -EINVAL; + } + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + + hst_set_weights(ps, base_weight); + s->threshold_multiplier = threshold_multiplier; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void hst_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int hst_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct path_info *pi; + + if (!path) { + struct selector *s = ps->context; + + DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); + } else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%llu %llu %llu ", pi->historical_service_time, + pi->outstanding, pi->stale_after); + break; + case STATUSTYPE_TABLE: + DMEMIT("0 "); + break; + } + } + + return sz; +} + +static int hst_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned int repeat_count = HST_MIN_IO; + char dummy; + unsigned long flags; + + /* + * Arguments: [] + * : The number of I/Os before switching path. + * If not given, default (HST_MIN_IO) is used. + */ + if (argc > 1) { + *error = "historical-service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "historical-service-time ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "historical-service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + pi->historical_service_time = HST_FIXED_1; + + spin_lock_init(&pi->lock); + pi->outstanding = 0; + + pi->stale_after = 0; + pi->last_finish = 0; + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + s->valid_count--; + spin_unlock_irqrestore(&s->lock, flags); +} + +static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fill_compare(struct path_info *pi, u64 *hst, + u64 *out, u64 *stale) +{ + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + *hst = pi->historical_service_time; + *out = pi->outstanding; + *stale = pi->stale_after; + 
spin_unlock_irqrestore(&pi->lock, flags); +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + */ +static long long hst_compare(struct path_info *pi1, struct path_info *pi2, + u64 time_now, struct path_selector *ps) +{ + struct selector *s = ps->context; + u64 hst1, hst2; + long long out1, out2, stale1, stale2; + int pi2_better, over_threshold; + + hst_fill_compare(pi1, &hst1, &out1, &stale1); + hst_fill_compare(pi2, &hst2, &out2, &stale2); + + /* Check here if estimated latency for two paths are too similar. + * If this is the case, we skip extra calculation and just compare + * outstanding requests. In this case, any unloaded paths will + * be preferred. + */ + if (hst1 > hst2) + over_threshold = hst1 > (s->threshold_multiplier * hst2); + else + over_threshold = hst2 > (s->threshold_multiplier * hst1); + + if (!over_threshold) + return out1 - out2; + + /* + * If an unloaded path is stale, choose it. If both paths are unloaded, + * choose path that is the most stale. + * (If one path is loaded, choose the other) + */ + if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || + (!out1 && !out2)) + return (!out2 * stale1) - (!out1 * stale2); + + /* Compare estimated service time. If outstanding is the same, we + * don't need to multiply + */ + if (out1 == out2) { + pi2_better = hst1 > hst2; + } else { + /* Potential overflow with out >= 1024 */ + if (unlikely(out1 >= HST_MAX_INFLIGHT || + out2 >= HST_MAX_INFLIGHT)) { + /* If over 1023 in-flights, we may overflow if hst + * is at max. (With this shift we still overflow at + * 1048576 in-flights, which is high enough). + */ + hst1 >>= HST_FIXED_SHIFT; + hst2 >>= HST_FIXED_SHIFT; + } + pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; + } + + /* In the case that the 'winner' is stale, limit to equal usage. */ + if (pi2_better) { + if (stale2 < time_now) + return out1 - out2; + return 1; + } + if (stale1 < time_now) + return out1 - out2; + return -1; +} + +static struct dm_path *hst_select_path(struct path_selector *ps, + size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + u64 time_now = sched_clock(); + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || (hst_compare(pi, best, time_now, ps) < 0)) + best = pi; + } + + if (!best) + goto out; + + /* Move last used path to end (least preferred in case of ties) */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; + +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int hst_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + pi->outstanding++; + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static u64 path_service_time(struct path_info *pi, u64 start_time) +{ + u64 sched_now = ktime_get_ns(); + + /* if a previous disk request has finished after this IO was + * sent to the hardware, pretend the submission happened + * serially. 
+ */ + if (time_after64(pi->last_finish, start_time)) + start_time = pi->last_finish; + + pi->last_finish = sched_now; + if (time_before64(sched_now, start_time)) + return 0; + + return sched_now - start_time; +} + +static int hst_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + struct selector *s = ps->context; + unsigned long flags; + u64 st; + + spin_lock_irqsave(&pi->lock, flags); + + st = path_service_time(pi, start_time); + pi->outstanding--; + pi->historical_service_time = + fixed_ema(pi->historical_service_time, + min(st * HST_FIXED_1, HST_FIXED_MAX), + hst_weight(ps, st)); + + /* + * On request end, mark path as fresh. If a path hasn't + * finished any requests within the fresh period, the estimated + * service time is considered too optimistic and we limit the + * maximum requests on that path. + */ + pi->stale_after = pi->last_finish + + (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); + + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static struct path_selector_type hst_ps = { + .name = "historical-service-time", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 3, + .create = hst_create, + .destroy = hst_destroy, + .status = hst_status, + .add_path = hst_add_path, + .fail_path = hst_fail_path, + .reinstate_path = hst_reinstate_path, + .select_path = hst_select_path, + .start_io = hst_start_io, + .end_io = hst_end_io, +}; + +static int __init dm_hst_init(void) +{ + int r = dm_register_path_selector(&hst_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " HST_VERSION " loaded"); + + return r; +} + +static void __exit dm_hst_exit(void) +{ + int r = dm_unregister_path_selector(&hst_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_hst_init); +module_exit(dm_hst_exit); + +MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); +MODULE_AUTHOR("Khazhismel Kumykov "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From bc3d5717d242a37d2e9ea85d7e7b2e3569324d24 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:16 +0200 Subject: dm zoned: add 'status' callback Add callback to supply information for 'dmsetup status' and 'dmsetup table'. The output for 'dmsetup status' is 0 zoned zones / random / sequential where is the number of unmapped (ie free) random zones, the total number of random zones, the number of unmapped sequential zones, and the total number of sequential zones. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-zoned.rst | 16 +++++++++++++ drivers/md/dm-zoned-metadata.c | 15 +++++++++++++ drivers/md/dm-zoned-target.c | 26 ++++++++++++++++++++++ drivers/md/dm-zoned.h | 3 +++ 4 files changed, 60 insertions(+) (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst index 07f56ebc1730..4165fbf1aeb6 100644 --- a/Documentation/admin-guide/device-mapper/dm-zoned.rst +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -144,3 +144,19 @@ underlying zoned block device name. 
Ex:: echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ dmsetup create dmz-`basename ${dev}` + +Information about the internal layout and current usage of the zones can +be obtained with the 'status' callback from dmsetup: + +Ex:: + + dmsetup status /dev/dm-X + +will return a line + + 0 zoned zones / random / sequential + +where is the total number of zones, is the number +of unmapped (ie free) random zones, the total number of zones, + the number of unmapped sequential zones, and the +total number of sequential zones. diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 369de15c4e80..c8787560fa9f 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -202,6 +202,11 @@ sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift; } +unsigned int dmz_nr_zones(struct dmz_metadata *zmd) +{ + return zmd->dev->nr_zones; +} + unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) { return zmd->nr_chunks; @@ -217,6 +222,16 @@ unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_rnd); } +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_seq; +} + +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) +{ + return atomic_read(&zmd->unmap_nr_seq); +} + /* * Lock/unlock mapping table. * The map lock also protects all the zone lists. diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index f4f83d39b3dc..0b4b27d280fb 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -965,6 +965,31 @@ static int dmz_iterate_devices(struct dm_target *ti, return fn(ti, dmz->ddev, 0, capacity, data); } +static void dmz_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + struct dmz_target *dmz = ti->private; + ssize_t sz = 0; + char buf[BDEVNAME_SIZE]; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%u zones %u/%u random %u/%u sequential", + dmz_nr_zones(dmz->metadata), + dmz_nr_unmap_rnd_zones(dmz->metadata), + dmz_nr_rnd_zones(dmz->metadata), + dmz_nr_unmap_seq_zones(dmz->metadata), + dmz_nr_seq_zones(dmz->metadata)); + break; + case STATUSTYPE_TABLE: + format_dev_t(buf, dmz->dev->bdev->bd_dev); + DMEMIT("%s", buf); + break; + } + return; +} + static struct target_type dmz_type = { .name = "zoned", .version = {1, 1, 0}, @@ -978,6 +1003,7 @@ static struct target_type dmz_type = { .postsuspend = dmz_suspend, .resume = dmz_resume, .iterate_devices = dmz_iterate_devices, + .status = dmz_status, }; static int __init dmz_init(void) diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 5b5e493d479c..884c0e586082 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -190,8 +190,11 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone); void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int chunk); void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); +unsigned int dmz_nr_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd); /* * Activate a zone (increment its reference count). 
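Spelled out with the DMEMIT format string used by dmz_status() above, the target-specific part of the STATUSTYPE_INFO line is "<nr_zones> zones <nr_unmap_rnd>/<nr_rnd> random <nr_unmap_seq>/<nr_seq> sequential". As a rough illustration of how that line might be consumed, the sketch below parses one line of 'dmsetup status' output; dmz_parse_status() is a hypothetical userspace helper (it is not part of dmsetup or the kernel), and the leading "<start> <length> zoned" prefix is the generic dm status header that precedes the target-specific fields.

    #include <stdio.h>

    /*
     * Hypothetical helper: parse one line of "dmsetup status <dev>" for a
     * dm-zoned target. The field order follows the DMEMIT format string
     * "%u zones %u/%u random %u/%u sequential" emitted by dmz_status().
     */
    static int dmz_parse_status(const char *line, unsigned int *nr_zones,
                                unsigned int *free_rnd, unsigned int *nr_rnd,
                                unsigned int *free_seq, unsigned int *nr_seq)
    {
            /* Skip the generic "<start> <length> zoned" prefix. */
            if (sscanf(line,
                       "%*u %*u zoned %u zones %u/%u random %u/%u sequential",
                       nr_zones, free_rnd, nr_rnd, free_seq, nr_seq) != 5)
                    return -1;
            return 0;
    }

The five parsed fields map one-to-one onto the DMEMIT arguments above: dmz_nr_zones(), then dmz_nr_unmap_rnd_zones()/dmz_nr_rnd_zones(), then dmz_nr_unmap_seq_zones()/dmz_nr_seq_zones().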
-- cgit v1.2.3-59-g8ed1b From 90b39d58f39e1f3f3147caee6fb2a71528db74a2 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:17 +0200 Subject: dm zoned: add 'message' callback Add callback for 'dmsetup message' to allow the reclaim process to be triggered manually. Eg. dmsetup message /dev/dm-X 0 message will start the reclaim process even if the default threshold of 50 percent of free random zones is not reached. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- Documentation/admin-guide/device-mapper/dm-zoned.rst | 12 ++++++++++++ drivers/md/dm-zoned-target.c | 15 +++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst index 4165fbf1aeb6..7547ce635161 100644 --- a/Documentation/admin-guide/device-mapper/dm-zoned.rst +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -160,3 +160,15 @@ where is the total number of zones, is the number of unmapped (ie free) random zones, the total number of zones, the number of unmapped sequential zones, and the total number of sequential zones. + +Normally the reclaim process will be started once there are less than 50 +percent free random zones. In order to start the reclaim process manually +even before reaching this threshold the 'dmsetup message' function can be +used: + +Ex:: + + dmsetup message /dev/dm-X 0 reclaim + +will start the reclaim process and random zones will be moved to sequential +zones. diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 0b4b27d280fb..0bfe34162dbb 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -990,6 +990,20 @@ static void dmz_status(struct dm_target *ti, status_type_t type, return; } +static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct dmz_target *dmz = ti->private; + int r = -EINVAL; + + if (!strcasecmp(argv[0], "reclaim")) { + dmz_schedule_reclaim(dmz->reclaim); + r = 0; + } else + DMERR("unrecognized message %s", argv[0]); + return r; +} + static struct target_type dmz_type = { .name = "zoned", .version = {1, 1, 0}, @@ -1004,6 +1018,7 @@ static struct target_type dmz_type = { .resume = dmz_resume, .iterate_devices = dmz_iterate_devices, .status = dmz_status, + .message = dmz_message, }; static int __init dmz_init(void) -- cgit v1.2.3-59-g8ed1b From b71228739851a9b384a59ba0467259eba508b408 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:18 +0200 Subject: dm zoned: store zone id within the zone structure and kill dmz_id() Instead of calculating the zone index by the offset within the zone array store the index within the structure itself. With that the helper dmz_id() is pointless and can be replaced with accessing the ->id value directly. 
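As a simplified before/after sketch of this change (types abridged to the two members needed here, not the full kernel definitions), the zone number moves from being recomputed on every call to being stored once at initialization:

    /* Abridged types (only the members relevant to this change). */
    struct dm_zone {
            unsigned int id;        /* new: set once in dmz_init_zone() */
    };

    struct dmz_metadata {
            struct dm_zone *zones;  /* zone information array */
    };

    /* Before: the zone number was recomputed from the array offset. */
    static inline unsigned int dmz_id(struct dmz_metadata *zmd,
                                      struct dm_zone *zone)
    {
            return (unsigned int)(zone - zmd->zones);
    }

    /* After: dmz_init_zone() does "zone->id = idx;" and every former
     * dmz_id(zmd, zone) call site simply reads zone->id. */

The diff below shows dmz_init_zone() storing the index and all former dmz_id() call sites switching to zone->id.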
Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 40 +++++++++++++++++----------------------- drivers/md/dm-zoned-reclaim.c | 17 ++++++++--------- drivers/md/dm-zoned-target.c | 6 +++--- drivers/md/dm-zoned.h | 4 +++- 4 files changed, 31 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index c8787560fa9f..1993eeb26bc1 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -187,19 +187,14 @@ struct dmz_metadata { /* * Various accessors */ -unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone) -{ - return ((unsigned int)(zone - zmd->zones)); -} - sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift; + return (sector_t)zone->id << zmd->dev->zone_nr_sectors_shift; } sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift; + return (sector_t)zone->id << zmd->dev->zone_nr_blocks_shift; } unsigned int dmz_nr_zones(struct dmz_metadata *zmd) @@ -1119,6 +1114,7 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) INIT_LIST_HEAD(&zone->link); atomic_set(&zone->refcount, 0); + zone->id = idx; zone->chunk = DMZ_MAP_UNMAPPED; switch (blkz->type) { @@ -1246,7 +1242,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) ret = -EIO; if (ret < 0) { dmz_dev_err(zmd->dev, "Get zone %u report failed", - dmz_id(zmd, zone)); + zone->id); dmz_check_bdev(zmd->dev); return ret; } @@ -1270,7 +1266,7 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, return ret; dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)", - dmz_id(zmd, zone), zone->wp_block, wp); + zone->id, zone->wp_block, wp); if (zone->wp_block < wp) { dmz_invalidate_blocks(zmd, zone, zone->wp_block, @@ -1309,7 +1305,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", - dmz_id(zmd, zone), ret); + zone->id, ret); return ret; } } @@ -1757,8 +1753,7 @@ again: } /* Update the chunk mapping */ - dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone), - dmz_id(zmd, bzone)); + dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id); set_bit(DMZ_BUF, &bzone->flags); bzone->chunk = dzone->chunk; @@ -1810,7 +1805,7 @@ again: atomic_dec(&zmd->unmap_nr_seq); if (dmz_is_offline(zone)) { - dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone)); + dmz_dev_warn(zmd->dev, "Zone %u is offline", zone->id); zone = NULL; goto again; } @@ -1852,7 +1847,7 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, unsigned int chunk) { /* Set the chunk mapping */ - dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone), + dmz_set_chunk_mapping(zmd, chunk, dzone->id, DMZ_MAP_UNMAPPED); dzone->chunk = chunk; if (dmz_is_rnd(dzone)) @@ -1880,7 +1875,7 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone) * Unmapping the chunk buffer zone: clear only * the chunk buffer mapping */ - dzone_id = dmz_id(zmd, zone->bzone); + dzone_id = zone->bzone->id; zone->bzone->bzone = NULL; zone->bzone = NULL; @@ -1942,7 +1937,7 @@ static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd, sector_t chunk_block) { sector_t bitmap_block = 1 + 
zmd->nr_map_blocks + - (sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) + + (sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) + (chunk_block >> DMZ_BLOCK_SHIFT_BITS); return dmz_get_mblock(zmd, bitmap_block); @@ -2022,7 +2017,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int n = 0; dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks", - dmz_id(zmd, zone), (unsigned long long)chunk_block, + zone->id, (unsigned long long)chunk_block, nr_blocks); WARN_ON(chunk_block + nr_blocks > zone_nr_blocks); @@ -2052,7 +2047,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, zone->weight += n; else { dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u", - dmz_id(zmd, zone), zone->weight, + zone->id, zone->weight, zone_nr_blocks - n); zone->weight = zone_nr_blocks; } @@ -2102,7 +2097,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int n = 0; dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", - dmz_id(zmd, zone), (u64)chunk_block, nr_blocks); + zone->id, (u64)chunk_block, nr_blocks); WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); @@ -2132,7 +2127,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, zone->weight -= n; else { dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u", - dmz_id(zmd, zone), zone->weight, n); + zone->id, zone->weight, n); zone->weight = 0; } @@ -2378,7 +2373,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) { struct dmz_metadata *zmd; - unsigned int i, zid; + unsigned int i; struct dm_zone *zone; int ret; @@ -2419,9 +2414,8 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) goto err; /* Set metadata zones starting from sb_zone */ - zid = dmz_id(zmd, zmd->sb_zone); for (i = 0; i < zmd->nr_meta_zones << 1; i++) { - zone = dmz_get(zmd, zid + i); + zone = dmz_get(zmd, zmd->sb_zone->id + i); if (!dmz_is_rnd(zone)) goto err; set_bit(DMZ_META, &zone->flags); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index e7ace908a9b7..7f57c4299a2f 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -80,7 +80,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, if (ret) { dmz_dev_err(zrc->dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", - dmz_id(zmd, zone), (unsigned long long)wp_block, + zone->id, (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); dmz_check_bdev(zrc->dev); return ret; @@ -196,8 +196,8 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_dev_debug(zrc->dev, "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", - dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone), - dmz_id(zmd, dzone), dmz_weight(dzone)); + dzone->chunk, bzone->id, dmz_weight(bzone), + dzone->id, dmz_weight(dzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, bzone, dzone); @@ -235,8 +235,8 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_dev_debug(zrc->dev, "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", - chunk, dmz_id(zmd, dzone), dmz_weight(dzone), - dmz_id(zmd, bzone), dmz_weight(bzone)); + chunk, dzone->id, dmz_weight(dzone), + bzone->id, dmz_weight(bzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, dzone, bzone); @@ 
-287,8 +287,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_dev_debug(zrc->dev, "Chunk %u, move rnd zone %u (weight %u) to seq zone %u", - chunk, dmz_id(zmd, dzone), dmz_weight(dzone), - dmz_id(zmd, szone)); + chunk, dzone->id, dmz_weight(dzone), szone->id); /* Flush the random data zone into the sequential zone */ ret = dmz_reclaim_copy(zrc, dzone, szone); @@ -403,12 +402,12 @@ out: if (ret) { dmz_dev_debug(zrc->dev, "Metadata flush for zone %u failed, err %d\n", - dmz_id(zmd, rzone), ret); + rzone->id, ret); return ret; } dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms", - dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start)); + rzone->id, jiffies_to_msecs(jiffies - start)); return 0; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 0bfe34162dbb..859ccc30ba7f 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -180,7 +180,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks", (unsigned long long)dmz_bio_chunk(dmz->dev, bio), (dmz_is_rnd(zone) ? "RND" : "SEQ"), - dmz_id(dmz->metadata, zone), + zone->id, (unsigned long long)chunk_block, nr_blocks); /* Check block validity to determine the read location */ @@ -317,7 +317,7 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", (unsigned long long)dmz_bio_chunk(dmz->dev, bio), (dmz_is_rnd(zone) ? "RND" : "SEQ"), - dmz_id(dmz->metadata, zone), + zone->id, (unsigned long long)chunk_block, nr_blocks); if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) { @@ -357,7 +357,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks", (unsigned long long)dmz_bio_chunk(dmz->dev, bio), - dmz_id(zmd, zone), + zone->id, (unsigned long long)chunk_block, nr_blocks); /* diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 884c0e586082..30781646741a 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -87,6 +87,9 @@ struct dm_zone { /* Zone activation reference count */ atomic_t refcount; + /* Zone id */ + unsigned int id; + /* Zone write pointer block (relative to the zone start block) */ unsigned int wp_block; @@ -176,7 +179,6 @@ void dmz_lock_flush(struct dmz_metadata *zmd); void dmz_unlock_flush(struct dmz_metadata *zmd); int dmz_flush_metadata(struct dmz_metadata *zmd); -unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); -- cgit v1.2.3-59-g8ed1b From 735bd7e4cd16270b7b67cb82ff4ba2811bfd8d7b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:19 +0200 Subject: dm zoned: use array for superblock zones Instead of storing just the first superblock zone and calculate the secondary relative to that we should be using an array for holding the superblock zones. 
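A simplified sketch of the resulting layout follows (the typedef and forward declarations are stand-ins so the fragment is self-contained; other members of both structures are omitted): each superblock slot now records the zone holding it, instead of the secondary location being derived from a single sb_zone pointer.

    typedef unsigned long long sector_t;    /* stand-in for the kernel type */
    struct dmz_mblock;
    struct dmz_super;
    struct dm_zone;

    struct dmz_sb {
            sector_t block;
            struct dmz_mblock *mblk;
            struct dmz_super *sb;
            struct dm_zone *zone;           /* new: zone holding this copy */
    };

    struct dmz_metadata_abridged {
            /* ... */
            struct dmz_sb sb[2];            /* sb[0] primary, sb[1] secondary */
            /* struct dm_zone *sb_zone;        removed by this patch */
    };

With sb[1].zone tracked explicitly, dmz_lookup_secondary_sb() and dmz_recover_mblocks() can work from the per-copy zone rather than recomputing offsets from the primary, as the diff below shows.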
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 1993eeb26bc1..900b1c1224f5 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -124,6 +124,7 @@ struct dmz_sb { sector_t block; struct dmz_mblock *mblk; struct dmz_super *sb; + struct dm_zone *zone; }; /* @@ -150,7 +151,6 @@ struct dmz_metadata { /* Zone information array */ struct dm_zone *zones; - struct dm_zone *sb_zone; struct dmz_sb sb[2]; unsigned int mblk_primary; u64 sb_gen; @@ -839,8 +839,9 @@ err: /* * Check super block. */ -static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb) +static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) { + struct dmz_super *sb = zmd->sb[set].sb; unsigned int nr_meta_zones, nr_data_zones; struct dmz_dev *dev = zmd->dev; u32 crc, stored_crc; @@ -932,16 +933,20 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; + zmd->sb[1].zone = zmd->sb[0].zone + 1; for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { if (dmz_read_sb(zmd, 1) != 0) break; - if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) + if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) { + zmd->sb[1].zone += i; return 0; + } zmd->sb[1].block += zone_nr_blocks; } dmz_free_mblock(zmd, mblk); zmd->sb[1].mblk = NULL; + zmd->sb[1].zone = NULL; return -EIO; } @@ -985,11 +990,9 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set); if (dst_set == 0) - zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); - else { - zmd->sb[1].block = zmd->sb[0].block + - (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); - } + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + else + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); page = alloc_page(GFP_NOIO); if (!page) @@ -1033,21 +1036,27 @@ static int dmz_load_sb(struct dmz_metadata *zmd) u64 sb_gen[2] = {0, 0}; int ret; + if (!zmd->sb[0].zone) { + dmz_dev_err(zmd->dev, "Primary super block zone not set"); + return -ENXIO; + } + /* Read and check the primary super block */ - zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); ret = dmz_get_sb(zmd, 0); if (ret) { dmz_dev_err(zmd->dev, "Read primary super block failed"); return ret; } - ret = dmz_check_sb(zmd, zmd->sb[0].sb); + ret = dmz_check_sb(zmd, 0); /* Read and check secondary super block */ if (ret == 0) { sb_good[0] = true; - zmd->sb[1].block = zmd->sb[0].block + - (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); + if (!zmd->sb[1].zone) + zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); ret = dmz_get_sb(zmd, 1); } else ret = dmz_lookup_secondary_sb(zmd); @@ -1057,7 +1066,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) return ret; } - ret = dmz_check_sb(zmd, zmd->sb[1].sb); + ret = dmz_check_sb(zmd, 1); if (ret == 0) sb_good[1] = true; @@ -1142,9 +1151,9 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) { zmd->nr_rnd_zones++; - if (!zmd->sb_zone) { + if 
(!zmd->sb[0].zone) { /* Super block zone */ - zmd->sb_zone = zone; + zmd->sb[0].zone = zone; } } } @@ -2415,7 +2424,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { - zone = dmz_get(zmd, zmd->sb_zone->id + i); + zone = dmz_get(zmd, zmd->sb[0].zone->id + i); if (!dmz_is_rnd(zone)) goto err; set_bit(DMZ_META, &zone->flags); -- cgit v1.2.3-59-g8ed1b From bf28a3ba098676831bde49e8bc47849727d532a5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:20 +0200 Subject: dm zoned: store device in struct dmz_sb Store the device together with the superblock so that we don't have to recur to the metadata to find it. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 90 +++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 900b1c1224f5..def836e12dd9 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -122,6 +122,7 @@ enum { */ struct dmz_sb { sector_t block; + struct dmz_dev *dev; struct dmz_mblock *mblk; struct dmz_super *sb; struct dm_zone *zone; @@ -197,6 +198,11 @@ sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) return (sector_t)zone->id << zmd->dev->zone_nr_blocks_shift; } +struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + return &zmd->dev[0]; +} + unsigned int dmz_nr_zones(struct dmz_metadata *zmd) { return zmd->dev->nr_zones; @@ -412,9 +418,10 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, { struct dmz_mblock *mblk, *m; sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; struct bio *bio; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return ERR_PTR(-EIO); /* Get a new block and a BIO to read it */ @@ -450,7 +457,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, /* Submit read BIO */ bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); @@ -547,6 +554,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, sector_t mblk_no) { struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; /* Check rbtree */ spin_lock(&zmd->mblk_lock); @@ -565,7 +573,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ERR_PTR(-EIO); } @@ -589,10 +597,11 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, unsigned int set) { + struct dmz_dev *dev = zmd->sb[set].dev; sector_t block = zmd->sb[set].block + mblk->no; struct bio *bio; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return -EIO; bio = bio_alloc(GFP_NOIO, 1); @@ -604,7 +613,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - 
bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); @@ -617,13 +626,13 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, /* * Read/write a metadata block. */ -static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, - struct page *page) +static int dmz_rdwr_block(struct dmz_dev *dev, int op, + sector_t block, struct page *page) { struct bio *bio; int ret; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return -EIO; bio = bio_alloc(GFP_NOIO, 1); @@ -631,14 +640,14 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, return -ENOMEM; bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); bio_put(bio); if (ret) - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ret; } @@ -650,6 +659,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) sector_t block = zmd->sb[set].block; struct dmz_mblock *mblk = zmd->sb[set].mblk; struct dmz_super *sb = zmd->sb[set].sb; + struct dmz_dev *dev = zmd->sb[set].dev; u64 sb_gen = zmd->sb_gen + 1; int ret; @@ -669,9 +679,9 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) sb->crc = 0; sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); - ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); + ret = dmz_rdwr_block(dev, REQ_OP_WRITE, block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); return ret; } @@ -684,6 +694,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, unsigned int set) { struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[set].dev; struct blk_plug plug; int ret = 0, nr_mblks_submitted = 0; @@ -705,7 +716,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); ret = -EIO; } nr_mblks_submitted--; @@ -713,7 +724,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); return ret; } @@ -750,6 +761,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) { struct dmz_mblock *mblk; struct list_head write_list; + struct dmz_dev *dev; int ret; if (WARN_ON(!zmd)) @@ -763,6 +775,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) * from modifying metadata. */ down_write(&zmd->mblk_sem); + dev = zmd->sb[zmd->mblk_primary].dev; /* * This is called from the target flush work and reclaim work. 
@@ -770,7 +783,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ dmz_lock_flush(zmd); - if (dmz_bdev_is_dying(zmd->dev)) { + if (dmz_bdev_is_dying(dev)) { ret = -EIO; goto out; } @@ -782,7 +795,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); goto err; } @@ -831,7 +844,7 @@ err: list_splice(&write_list, &zmd->mblk_dirty_list); spin_unlock(&zmd->mblk_lock); } - if (!dmz_check_bdev(zmd->dev)) + if (!dmz_check_bdev(dev)) ret = -EIO; goto out; } @@ -842,8 +855,8 @@ err: static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) { struct dmz_super *sb = zmd->sb[set].sb; + struct dmz_dev *dev = zmd->sb[set].dev; unsigned int nr_meta_zones, nr_data_zones; - struct dmz_dev *dev = zmd->dev; u32 crc, stored_crc; u64 gen; @@ -908,8 +921,8 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) */ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) { - return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block, - zmd->sb[set].mblk->page); + return dmz_rdwr_block(zmd->sb[set].dev, REQ_OP_READ, + zmd->sb[set].block, zmd->sb[set].mblk->page); } /* @@ -934,6 +947,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; zmd->sb[1].zone = zmd->sb[0].zone + 1; + zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { if (dmz_read_sb(zmd, 1) != 0) break; @@ -942,11 +956,13 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) return 0; } zmd->sb[1].block += zone_nr_blocks; + zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone + i); } dmz_free_mblock(zmd, mblk); zmd->sb[1].mblk = NULL; zmd->sb[1].zone = NULL; + zmd->sb[1].dev = NULL; return -EIO; } @@ -987,7 +1003,8 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) struct page *page; int i, ret; - dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set); + dmz_dev_warn(zmd->sb[dst_set].dev, + "Metadata set %u invalid: recovering", dst_set); if (dst_set == 0) zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); @@ -1000,11 +1017,11 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) /* Copy metadata blocks */ for (i = 1; i < zmd->nr_meta_blocks; i++) { - ret = dmz_rdwr_block(zmd, REQ_OP_READ, + ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ, zmd->sb[src_set].block + i, page); if (ret) goto out; - ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, + ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE, zmd->sb[dst_set].block + i, page); if (ret) goto out; @@ -1043,9 +1060,10 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check the primary super block */ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + zmd->sb[0].dev = dmz_zone_to_dev(zmd, zmd->sb[0].zone); ret = dmz_get_sb(zmd, 0); if (ret) { - dmz_dev_err(zmd->dev, "Read primary super block failed"); + dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); return ret; } @@ -1057,12 +1075,13 @@ static int dmz_load_sb(struct dmz_metadata *zmd) if (!zmd->sb[1].zone) zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); ret = dmz_get_sb(zmd, 1); 
} else ret = dmz_lookup_secondary_sb(zmd); if (ret) { - dmz_dev_err(zmd->dev, "Read secondary super block failed"); + dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed"); return ret; } @@ -1078,17 +1097,25 @@ static int dmz_load_sb(struct dmz_metadata *zmd) if (sb_good[0]) sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen); - else + else { ret = dmz_recover_mblocks(zmd, 0); + if (ret) { + dmz_dev_err(zmd->sb[0].dev, + "Recovery of superblock 0 failed"); + return -EIO; + } + } if (sb_good[1]) sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen); - else + else { ret = dmz_recover_mblocks(zmd, 1); - if (ret) { - dmz_dev_err(zmd->dev, "Recovery failed"); - return -EIO; + if (ret) { + dmz_dev_err(zmd->sb[1].dev, + "Recovery of superblock 1 failed"); + return -EIO; + } } if (sb_gen[0] >= sb_gen[1]) { @@ -1099,7 +1126,8 @@ static int dmz_load_sb(struct dmz_metadata *zmd) zmd->mblk_primary = 1; } - dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)", + dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev, + "Using super block %u (gen %llu)", zmd->mblk_primary, zmd->sb_gen); return 0; -- cgit v1.2.3-59-g8ed1b From 368205601375bbfb41b07ec8295eab208b6fced5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:21 +0200 Subject: dm zoned: move fields from struct dmz_dev to dmz_metadata Move fields from the device structure into the metadata structure and provide accessor functions. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 88 ++++++++++++++++++++++++++++-------------- drivers/md/dm-zoned-reclaim.c | 8 ++-- drivers/md/dm-zoned-target.c | 48 +++++++++++------------ drivers/md/dm-zoned.h | 14 +++---- 4 files changed, 95 insertions(+), 63 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index def836e12dd9..b844ff02ae7b 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -138,9 +138,16 @@ struct dmz_metadata { unsigned int zone_nr_bitmap_blocks; unsigned int zone_bits_per_mblk; + sector_t zone_nr_blocks; + sector_t zone_nr_blocks_shift; + + sector_t zone_nr_sectors; + sector_t zone_nr_sectors_shift; + unsigned int nr_bitmap_blocks; unsigned int nr_map_blocks; + unsigned int nr_zones; unsigned int nr_useable_zones; unsigned int nr_meta_blocks; unsigned int nr_meta_zones; @@ -190,12 +197,12 @@ struct dmz_metadata { */ sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->dev->zone_nr_sectors_shift; + return (sector_t)zone->id << zmd->zone_nr_sectors_shift; } sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->dev->zone_nr_blocks_shift; + return (sector_t)zone->id << zmd->zone_nr_blocks_shift; } struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) @@ -203,9 +210,29 @@ struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) return &zmd->dev[0]; } +unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks; +} + +unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks_shift; +} + +unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_sectors; +} + +unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_sectors_shift; +} + unsigned int dmz_nr_zones(struct dmz_metadata *zmd) { - return zmd->dev->nr_zones; 
+ return zmd->nr_zones; } unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) @@ -882,8 +909,8 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) return -ENXIO; } - nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1) - >> dev->zone_nr_blocks_shift; + nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) + >> zmd->zone_nr_blocks_shift; if (!nr_meta_zones || nr_meta_zones >= zmd->nr_rnd_zones) { dmz_dev_err(dev, "Invalid number of metadata blocks"); @@ -932,7 +959,7 @@ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) */ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) { - unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; + unsigned int zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; int i; @@ -1143,7 +1170,7 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) struct dmz_dev *dev = zmd->dev; /* Ignore the eventual last runt (smaller) zone */ - if (blkz->len != dev->zone_nr_sectors) { + if (blkz->len != zmd->zone_nr_sectors) { if (blkz->start + blkz->len == dev->capacity) return 0; return -ENXIO; @@ -1208,19 +1235,24 @@ static int dmz_init_zones(struct dmz_metadata *zmd) int ret; /* Init */ - zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; + zmd->zone_nr_sectors = dev->zone_nr_sectors; + zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors); + zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors); + zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks); + zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3; zmd->zone_nr_bitmap_blocks = max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); - zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks, + zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks, DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ - zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); + zmd->nr_zones = dev->nr_zones; + zmd->zones = kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); if (!zmd->zones) return -ENOMEM; dmz_dev_info(dev, "Using %zu B for zone information", - sizeof(struct dm_zone) * dev->nr_zones); + sizeof(struct dm_zone) * zmd->nr_zones); /* * Get zone information and initialize zone descriptors. At the same @@ -1339,7 +1371,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_NOIO); + zmd->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", zone->id, ret); @@ -1393,7 +1425,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (dzone_id == DMZ_MAP_UNMAPPED) goto next; - if (dzone_id >= dev->nr_zones) { + if (dzone_id >= zmd->nr_zones) { dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u", chunk, dzone_id); return -EIO; @@ -1414,7 +1446,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (bzone_id == DMZ_MAP_UNMAPPED) goto next; - if (bzone_id >= dev->nr_zones) { + if (bzone_id >= zmd->nr_zones) { dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u", chunk, bzone_id); return -EIO; @@ -1446,7 +1478,7 @@ next: * fully initialized. All remaining zones are unmapped data * zones. Finish initializing those here. 
*/ - for (i = 0; i < dev->nr_zones; i++) { + for (i = 0; i < zmd->nr_zones; i++) { dzone = dmz_get(zmd, i); if (dmz_is_meta(dzone)) continue; @@ -1990,7 +2022,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, sector_t chunk_block = 0; /* Get the zones bitmap blocks */ - while (chunk_block < zmd->dev->zone_nr_blocks) { + while (chunk_block < zmd->zone_nr_blocks) { from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block); if (IS_ERR(from_mblk)) return PTR_ERR(from_mblk); @@ -2025,7 +2057,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, int ret; /* Get the zones bitmap blocks */ - while (chunk_block < zmd->dev->zone_nr_blocks) { + while (chunk_block < zmd->zone_nr_blocks) { /* Get a valid region from the source zone */ ret = dmz_first_valid_block(zmd, from_zone, &chunk_block); if (ret <= 0) @@ -2049,7 +2081,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, sector_t chunk_block, unsigned int nr_blocks) { unsigned int count, bit, nr_bits; - unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; + unsigned int zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; unsigned int n = 0; @@ -2136,7 +2168,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", zone->id, (u64)chunk_block, nr_blocks); - WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); + WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); while (nr_blocks) { /* Get bitmap block */ @@ -2180,7 +2212,7 @@ static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone, struct dmz_mblock *mblk; int ret; - WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks); + WARN_ON(chunk_block >= zmd->zone_nr_blocks); /* Get bitmap block */ mblk = dmz_get_bitmap(zmd, zone, chunk_block); @@ -2210,7 +2242,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned long *bitmap; int n = 0; - WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); + WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); while (nr_blocks) { /* Get bitmap block */ @@ -2254,7 +2286,7 @@ int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone, /* The block is valid: get the number of valid blocks from block */ return dmz_to_next_set_block(zmd, zone, chunk_block, - zmd->dev->zone_nr_blocks - chunk_block, 0); + zmd->zone_nr_blocks - chunk_block, 0); } /* @@ -2270,7 +2302,7 @@ int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, int ret; ret = dmz_to_next_set_block(zmd, zone, start_block, - zmd->dev->zone_nr_blocks - start_block, 1); + zmd->zone_nr_blocks - start_block, 1); if (ret < 0) return ret; @@ -2278,7 +2310,7 @@ int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, *chunk_block = start_block; return dmz_to_next_set_block(zmd, zone, start_block, - zmd->dev->zone_nr_blocks - start_block, 0); + zmd->zone_nr_blocks - start_block, 0); } /* @@ -2317,7 +2349,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) struct dmz_mblock *mblk; sector_t chunk_block = 0; unsigned int bit, nr_bits; - unsigned int nr_blocks = zmd->dev->zone_nr_blocks; + unsigned int nr_blocks = zmd->zone_nr_blocks; void *bitmap; int n = 0; @@ -2488,7 +2520,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) dmz_dev_info(dev, " %llu 512-byte logical sectors", (u64)dev->capacity); dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", - 
dev->nr_zones, (u64)dev->zone_nr_sectors); + zmd->nr_zones, (u64)zmd->zone_nr_sectors); dmz_dev_info(dev, " %u metadata zones", zmd->nr_meta_zones * 2); dmz_dev_info(dev, " %u data zones for %u chunks", @@ -2541,7 +2573,7 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) int ret; /* Check zones */ - for (i = 0; i < dev->nr_zones; i++) { + for (i = 0; i < zmd->nr_zones; i++) { zone = dmz_get(zmd, i); if (!zone) { dmz_dev_err(dev, "Unable to get zone %u", i); @@ -2569,7 +2601,7 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) i, (u64)zone->wp_block, (u64)wp_block); zone->wp_block = wp_block; dmz_invalidate_blocks(zmd, zone, zone->wp_block, - dev->zone_nr_blocks - zone->wp_block); + zmd->zone_nr_blocks - zone->wp_block); } } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 7f57c4299a2f..5aa5e5130fe8 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -128,7 +128,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, if (dmz_is_seq(src_zone)) end_block = src_zone->wp_block; else - end_block = dev->zone_nr_blocks; + end_block = dmz_zone_nr_blocks(zmd); src_zone_block = dmz_start_block(zmd, src_zone); dst_zone_block = dmz_start_block(zmd, dst_zone); @@ -210,7 +210,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block); if (ret == 0) { /* Free the buffer zone */ - dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks); + dmz_invalidate_blocks(zmd, bzone, 0, dmz_zone_nr_blocks(zmd)); dmz_lock_map(zmd); dmz_unmap_zone(zmd, bzone); dmz_unlock_zone_reclaim(dzone); @@ -252,7 +252,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) * Free the data zone and remap the chunk to * the buffer zone. */ - dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks); + dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); dmz_lock_map(zmd); dmz_unmap_zone(zmd, bzone); dmz_unmap_zone(zmd, dzone); @@ -305,7 +305,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_map(zmd); } else { /* Free the data zone and remap the chunk */ - dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks); + dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd)); dmz_lock_map(zmd); dmz_unmap_zone(zmd, dzone); dmz_unlock_zone_reclaim(dzone); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 859ccc30ba7f..68c5684d7b01 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -165,7 +165,8 @@ static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio, static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, struct bio *bio) { - sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio)); + struct dmz_metadata *zmd = dmz->metadata; + sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio)); unsigned int nr_blocks = dmz_bio_blocks(bio); sector_t end_block = chunk_block + nr_blocks; struct dm_zone *rzone, *bzone; @@ -178,7 +179,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, } dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), + (unsigned long long)dmz_bio_chunk(zmd, bio), (dmz_is_rnd(zone) ? 
"RND" : "SEQ"), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -189,7 +190,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, nr_blocks = 0; if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) { /* Test block validity in the data zone */ - ret = dmz_block_valid(dmz->metadata, zone, chunk_block); + ret = dmz_block_valid(zmd, zone, chunk_block); if (ret < 0) return ret; if (ret > 0) { @@ -204,7 +205,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, * Check the buffer zone, if there is one. */ if (!nr_blocks && bzone) { - ret = dmz_block_valid(dmz->metadata, bzone, chunk_block); + ret = dmz_block_valid(zmd, bzone, chunk_block); if (ret < 0) return ret; if (ret > 0) { @@ -308,14 +309,15 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz, static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, struct bio *bio) { - sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio)); + struct dmz_metadata *zmd = dmz->metadata; + sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio)); unsigned int nr_blocks = dmz_bio_blocks(bio); if (!zone) return -ENOSPC; dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), + (unsigned long long)dmz_bio_chunk(zmd, bio), (dmz_is_rnd(zone) ? "RND" : "SEQ"), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -345,7 +347,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, struct dmz_metadata *zmd = dmz->metadata; sector_t block = dmz_bio_block(bio); unsigned int nr_blocks = dmz_bio_blocks(bio); - sector_t chunk_block = dmz_chunk_block(dmz->dev, block); + sector_t chunk_block = dmz_chunk_block(zmd, block); int ret = 0; /* For unmapped chunks, there is nothing to do */ @@ -356,7 +358,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, return -EROFS; dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), + (unsigned long long)dmz_bio_chunk(zmd, bio), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -402,7 +404,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, * mapping for read and discard. If a mapping is obtained, + the zone returned will be set to active state. 
*/ - zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio), + zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio), bio_op(bio)); if (IS_ERR(zone)) { ret = PTR_ERR(zone); @@ -525,7 +527,7 @@ static void dmz_flush_work(struct work_struct *work) */ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) { - unsigned int chunk = dmz_bio_chunk(dmz->dev, bio); + unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio); struct dm_chunk_work *cw; int ret = 0; @@ -618,6 +620,7 @@ bool dmz_check_bdev(struct dmz_dev *dmz_dev) static int dmz_map(struct dm_target *ti, struct bio *bio) { struct dmz_target *dmz = ti->private; + struct dmz_metadata *zmd = dmz->metadata; struct dmz_dev *dev = dmz->dev; struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); sector_t sector = bio->bi_iter.bi_sector; @@ -630,8 +633,8 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", bio_op(bio), (unsigned long long)sector, nr_sectors, - (unsigned long long)dmz_bio_chunk(dmz->dev, bio), - (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), (unsigned int)dmz_bio_blocks(bio)); bio_set_dev(bio, dev->bdev); @@ -659,16 +662,16 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) } /* Split zone BIOs to fit entirely into a zone */ - chunk_sector = sector & (dev->zone_nr_sectors - 1); - if (chunk_sector + nr_sectors > dev->zone_nr_sectors) - dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector); + chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1); + if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd)) + dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector); /* Now ready to handle this BIO */ ret = dmz_queue_chunk_work(dmz, bio); if (ret) { dmz_dev_debug(dmz->dev, "BIO op %d, can't process chunk %llu, err %i\n", - bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio), + bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), ret); return DM_MAPIO_REQUEUE; } @@ -722,10 +725,6 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) } dev->zone_nr_sectors = blk_queue_zone_sectors(q); - dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors); - - dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); - dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks); dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); @@ -790,7 +789,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Set target (no write same support) */ - ti->max_io_len = dev->zone_nr_sectors << 9; + ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata) << 9; ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_zeroes_bios = 1; @@ -799,7 +798,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->discards_supported = true; /* The exposed capacity is the number of chunks that can be mapped */ - ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift; + ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << + dmz_zone_nr_sectors_shift(dmz->metadata); /* Zone BIO */ ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0); @@ -895,7 +895,7 @@ static void dmz_dtr(struct dm_target *ti) static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dmz_target *dmz = ti->private; - unsigned int chunk_sectors = dmz->dev->zone_nr_sectors; + unsigned int chunk_sectors = 
dmz_zone_nr_sectors(dmz->metadata); limits->logical_block_size = DMZ_BLOCK_SIZE; limits->physical_block_size = DMZ_BLOCK_SIZE; @@ -960,7 +960,7 @@ static int dmz_iterate_devices(struct dm_target *ti, { struct dmz_target *dmz = ti->private; struct dmz_dev *dev = dmz->dev; - sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1); + sector_t capacity = dev->capacity & ~(dmz_zone_nr_sectors(dmz->metadata) - 1); return fn(ti, dmz->ddev, 0, capacity, data); } diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 30781646741a..f997ad62c7b4 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -60,15 +60,11 @@ struct dmz_dev { unsigned int flags; sector_t zone_nr_sectors; - unsigned int zone_nr_sectors_shift; - - sector_t zone_nr_blocks; - sector_t zone_nr_blocks_shift; }; -#define dmz_bio_chunk(dev, bio) ((bio)->bi_iter.bi_sector >> \ - (dev)->zone_nr_sectors_shift) -#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1)) +#define dmz_bio_chunk(zmd, bio) ((bio)->bi_iter.bi_sector >> \ + dmz_zone_nr_sectors_shift(zmd)) +#define dmz_chunk_block(zmd, b) ((b) & (dmz_zone_nr_blocks(zmd) - 1)) /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) @@ -197,6 +193,10 @@ unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd); +unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd); /* * Activate a zone (increment its reference count). -- cgit v1.2.3-59-g8ed1b From 2234e7321dc61f116de1dc913f3ffa7efff02068 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:22 +0200 Subject: dm zoned: introduce dmz_metadata_label() to format device name Introduce dmz_metadata_label() to format the device-mapper device name and use it instead of the device name of the underlying device. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 11 ++++++- drivers/md/dm-zoned-reclaim.c | 15 +++++---- drivers/md/dm-zoned-target.c | 74 +++++++++++++++++++++++------------------- drivers/md/dm-zoned.h | 4 ++- 4 files changed, 62 insertions(+), 42 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b844ff02ae7b..7cda48683c0b 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -134,6 +134,8 @@ struct dmz_sb { struct dmz_metadata { struct dmz_dev *dev; + char devname[BDEVNAME_SIZE]; + sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; unsigned int zone_bits_per_mblk; @@ -260,6 +262,11 @@ unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_seq); } +const char *dmz_metadata_label(struct dmz_metadata *zmd) +{ + return (const char *)zmd->devname; +} + /* * Lock/unlock mapping table. * The map lock also protects all the zone lists. @@ -2439,7 +2446,8 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) /* * Initialize the zoned metadata. 
*/ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, + const char *devname) { struct dmz_metadata *zmd; unsigned int i; @@ -2450,6 +2458,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) if (!zmd) return -ENOMEM; + strcpy(zmd->devname, devname); zmd->dev = dev; zmd->mblk_rbtree = RB_ROOT; init_rwsem(&zmd->mblk_sem); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 5aa5e5130fe8..699c4145306e 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -480,15 +480,16 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2); } - dmz_dev_debug(zrc->dev, - "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)", - zrc->kc_throttle.throttle, - (dmz_target_idle(zrc) ? "Idle" : "Busy"), - p_unmap_rnd, nr_unmap_rnd, nr_rnd); + DMDEBUG("(%s): Reclaim (%u): %s, %u%% free rnd zones (%u/%u)", + dmz_metadata_label(zmd), + zrc->kc_throttle.throttle, + (dmz_target_idle(zrc) ? "Idle" : "Busy"), + p_unmap_rnd, nr_unmap_rnd, nr_rnd); ret = dmz_do_reclaim(zrc); if (ret) { - dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); + DMDEBUG("(%s): Reclaim error %d\n", + dmz_metadata_label(zmd), ret); if (!dmz_check_bdev(zrc->dev)) return; } @@ -524,7 +525,7 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd, /* Reclaim work */ INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work); zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM, - dev->name); + dmz_metadata_label(zmd)); if (!zrc->wq) { ret = -ENOMEM; goto err; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 68c5684d7b01..ba5b8c507c98 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -178,11 +178,12 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, return 0; } - dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), - zone->id, - (unsigned long long)chunk_block, nr_blocks); + DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks", + dmz_metadata_label(zmd), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (dmz_is_rnd(zone) ? "RND" : "SEQ"), + zone->id, + (unsigned long long)chunk_block, nr_blocks); /* Check block validity to determine the read location */ bzone = zone->bzone; @@ -316,11 +317,12 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, if (!zone) return -ENOSPC; - dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), - zone->id, - (unsigned long long)chunk_block, nr_blocks); + DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", + dmz_metadata_label(zmd), + (unsigned long long)dmz_bio_chunk(zmd, bio), + (dmz_is_rnd(zone) ? 
"RND" : "SEQ"), + zone->id, + (unsigned long long)chunk_block, nr_blocks); if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) { /* @@ -357,10 +359,11 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, if (dmz_is_readonly(zone)) return -EROFS; - dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks", - (unsigned long long)dmz_bio_chunk(zmd, bio), - zone->id, - (unsigned long long)chunk_block, nr_blocks); + DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks", + dmz_metadata_label(dmz->metadata), + (unsigned long long)dmz_bio_chunk(zmd, bio), + zone->id, + (unsigned long long)chunk_block, nr_blocks); /* * Invalidate blocks in the data zone and its @@ -429,8 +432,8 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, ret = dmz_handle_discard(dmz, zone, bio); break; default: - dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x", - bio_op(bio)); + DMERR("(%s): Unsupported BIO operation 0x%x", + dmz_metadata_label(dmz->metadata), bio_op(bio)); ret = -EIO; } @@ -504,7 +507,8 @@ static void dmz_flush_work(struct work_struct *work) /* Flush dirty metadata blocks */ ret = dmz_flush_metadata(dmz->metadata); if (ret) - dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret); + DMDEBUG("(%s): Metadata flush failed, rc=%d\n", + dmz_metadata_label(dmz->metadata), ret); /* Process queued flush requests */ while (1) { @@ -631,11 +635,12 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) if (dmz_bdev_is_dying(dmz->dev)) return DM_MAPIO_KILL; - dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", - bio_op(bio), (unsigned long long)sector, nr_sectors, - (unsigned long long)dmz_bio_chunk(zmd, bio), - (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), - (unsigned int)dmz_bio_blocks(bio)); + DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", + dmz_metadata_label(zmd), + bio_op(bio), (unsigned long long)sector, nr_sectors, + (unsigned long long)dmz_bio_chunk(zmd, bio), + (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), + (unsigned int)dmz_bio_blocks(bio)); bio_set_dev(bio, dev->bdev); @@ -669,10 +674,10 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) /* Now ready to handle this BIO */ ret = dmz_queue_chunk_work(dmz, bio); if (ret) { - dmz_dev_debug(dmz->dev, - "BIO op %d, can't process chunk %llu, err %i\n", - bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), - ret); + DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i\n", + dmz_metadata_label(zmd), + bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), + ret); return DM_MAPIO_REQUEUE; } @@ -782,7 +787,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Initialize metadata */ dev = dmz->dev; - ret = dmz_ctr_metadata(dev, &dmz->metadata); + ret = dmz_ctr_metadata(dev, &dmz->metadata, + dm_table_device_name(ti->table)); if (ret) { ti->error = "Metadata initialization failed"; goto err_dev; @@ -811,8 +817,9 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Chunk BIO work */ mutex_init(&dmz->chunk_lock); INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO); - dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0, dev->name); + dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0, + dmz_metadata_label(dmz->metadata)); if (!dmz->chunk_wq) { ti->error = "Create chunk workqueue failed"; ret = -ENOMEM; @@ -824,7 +831,7 @@ static int dmz_ctr(struct dm_target *ti, 
unsigned int argc, char **argv) bio_list_init(&dmz->flush_list); INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work); dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM, - dev->name); + dmz_metadata_label(dmz->metadata)); if (!dmz->flush_wq) { ti->error = "Create flush workqueue failed"; ret = -ENOMEM; @@ -839,9 +846,10 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_fwq; } - dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)", - (unsigned long long)ti->len, - (unsigned long long)dmz_sect2blk(ti->len)); + DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)", + dmz_metadata_label(dmz->metadata), + (unsigned long long)ti->len, + (unsigned long long)dmz_sect2blk(ti->len)); return 0; err_fwq: diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index f997ad62c7b4..dd768dc60341 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -163,7 +163,8 @@ struct dmz_reclaim; /* * Functions defined in dm-zoned-metadata.c */ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd); +int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd, + const char *devname); void dmz_dtr_metadata(struct dmz_metadata *zmd); int dmz_resume_metadata(struct dmz_metadata *zmd); @@ -174,6 +175,7 @@ void dmz_unlock_metadata(struct dmz_metadata *zmd); void dmz_lock_flush(struct dmz_metadata *zmd); void dmz_unlock_flush(struct dmz_metadata *zmd); int dmz_flush_metadata(struct dmz_metadata *zmd); +const char *dmz_metadata_label(struct dmz_metadata *zmd); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); -- cgit v1.2.3-59-g8ed1b From d0e21ce40c7a41df43b70b863cc64395c7787abd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:23 +0200 Subject: dm zoned: Introduce dmz_dev_is_dying() and dmz_check_dev() Introduce accessors dmz_dev_is_dying() and dmz_check_dev() to avoid having to reference the devices directly. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 14 ++++++++++++-- drivers/md/dm-zoned-reclaim.c | 4 ++-- drivers/md/dm-zoned-target.c | 2 +- drivers/md/dm-zoned.h | 3 +++ 4 files changed, 18 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 7cda48683c0b..426af738f1ca 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -267,6 +267,16 @@ const char *dmz_metadata_label(struct dmz_metadata *zmd) return (const char *)zmd->devname; } +bool dmz_check_dev(struct dmz_metadata *zmd) +{ + return dmz_check_bdev(&zmd->dev[0]); +} + +bool dmz_dev_is_dying(struct dmz_metadata *zmd) +{ + return dmz_bdev_is_dying(&zmd->dev[0]); +} + /* * Lock/unlock mapping table. * The map lock also protects all the zone lists. 
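/*
 * At this point in the series both new helpers simply forward to the single
 * underlying device, zmd->dev[0]; the metadata version 2 patch further down
 * in this log generalizes them to loop over all zmd->nr_devs devices.
 */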
@@ -1719,7 +1729,7 @@ again: /* Allocate a random zone */ dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!dzone) { - if (dmz_bdev_is_dying(zmd->dev)) { + if (dmz_dev_is_dying(zmd)) { dzone = ERR_PTR(-EIO); goto out; } @@ -1820,7 +1830,7 @@ again: /* Allocate a random zone */ bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!bzone) { - if (dmz_bdev_is_dying(zmd->dev)) { + if (dmz_dev_is_dying(zmd)) { bzone = ERR_PTR(-EIO); goto out; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 699c4145306e..5daede0daf92 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -455,7 +455,7 @@ static void dmz_reclaim_work(struct work_struct *work) unsigned int p_unmap_rnd; int ret; - if (dmz_bdev_is_dying(zrc->dev)) + if (dmz_dev_is_dying(zmd)) return; if (!dmz_should_reclaim(zrc)) { @@ -490,7 +490,7 @@ static void dmz_reclaim_work(struct work_struct *work) if (ret) { DMDEBUG("(%s): Reclaim error %d\n", dmz_metadata_label(zmd), ret); - if (!dmz_check_bdev(zrc->dev)) + if (!dmz_check_dev(zmd)) return; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index ba5b8c507c98..b32e791b8a5c 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -632,7 +632,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) sector_t chunk_sector; int ret; - if (dmz_bdev_is_dying(dmz->dev)) + if (dmz_dev_is_dying(zmd)) return DM_MAPIO_KILL; DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index dd768dc60341..e0883df8a903 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -181,6 +181,9 @@ sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); +bool dmz_check_dev(struct dmz_metadata *zmd); +bool dmz_dev_is_dying(struct dmz_metadata *zmd); + #define DMZ_ALLOC_RND 0x01 #define DMZ_ALLOC_RECLAIM 0x02 -- cgit v1.2.3-59-g8ed1b From 6c805f77f161d65364cfc4e4734f7057a621fee4 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:24 +0200 Subject: dm zoned: remove 'dev' argument from reclaim Use the dmz_zone_to_dev() mapping function to remove the 'dev' argument from reclaim. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 56 ++++++++++++++++++++++--------------------- drivers/md/dm-zoned-target.c | 2 +- drivers/md/dm-zoned.h | 4 ++-- 3 files changed, 32 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 5daede0daf92..3c8847d49e5a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -13,7 +13,6 @@ struct dmz_reclaim { struct dmz_metadata *metadata; - struct dmz_dev *dev; struct delayed_work work; struct workqueue_struct *wq; @@ -59,6 +58,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, sector_t block) { struct dmz_metadata *zmd = zrc->metadata; + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); sector_t wp_block = zone->wp_block; unsigned int nr_blocks; int ret; @@ -74,15 +74,15 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, * pointer and the requested position. 
*/ nr_blocks = block - wp_block; - ret = blkdev_issue_zeroout(zrc->dev->bdev, + ret = blkdev_issue_zeroout(dev->bdev, dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), dmz_blk2sect(nr_blocks), GFP_NOIO, 0); if (ret) { - dmz_dev_err(zrc->dev, + dmz_dev_err(dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", zone->id, (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); - dmz_check_bdev(zrc->dev); + dmz_check_bdev(dev); return ret; } @@ -116,7 +116,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, struct dm_zone *src_zone, struct dm_zone *dst_zone) { struct dmz_metadata *zmd = zrc->metadata; - struct dmz_dev *dev = zrc->dev; + struct dmz_dev *src_dev, *dst_dev; struct dm_io_region src, dst; sector_t block = 0, end_block; sector_t nr_blocks; @@ -130,13 +130,17 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, else end_block = dmz_zone_nr_blocks(zmd); src_zone_block = dmz_start_block(zmd, src_zone); + src_dev = dmz_zone_to_dev(zmd, src_zone); dst_zone_block = dmz_start_block(zmd, dst_zone); + dst_dev = dmz_zone_to_dev(zmd, dst_zone); if (dmz_is_seq(dst_zone)) set_bit(DM_KCOPYD_WRITE_SEQ, &flags); while (block < end_block) { - if (dev->flags & DMZ_BDEV_DYING) + if (src_dev->flags & DMZ_BDEV_DYING) + return -EIO; + if (dst_dev->flags & DMZ_BDEV_DYING) return -EIO; /* Get a valid region from the source zone */ @@ -156,11 +160,11 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, return ret; } - src.bdev = dev->bdev; + src.bdev = src_dev->bdev; src.sector = dmz_blk2sect(src_zone_block + block); src.count = dmz_blk2sect(nr_blocks); - dst.bdev = dev->bdev; + dst.bdev = dst_dev->bdev; dst.sector = dmz_blk2sect(dst_zone_block + block); dst.count = src.count; @@ -194,10 +198,10 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret; - dmz_dev_debug(zrc->dev, - "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", - dzone->chunk, bzone->id, dmz_weight(bzone), - dzone->id, dmz_weight(dzone)); + DMDEBUG("(%s): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", + dmz_metadata_label(zmd), + dzone->chunk, bzone->id, dmz_weight(bzone), + dzone->id, dmz_weight(dzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, bzone, dzone); @@ -233,10 +237,10 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret = 0; - dmz_dev_debug(zrc->dev, - "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", - chunk, dzone->id, dmz_weight(dzone), - bzone->id, dmz_weight(bzone)); + DMDEBUG("(%s): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", + dmz_metadata_label(zmd), + chunk, dzone->id, dmz_weight(dzone), + bzone->id, dmz_weight(bzone)); /* Flush data zone into the buffer zone */ ret = dmz_reclaim_copy(zrc, dzone, bzone); @@ -285,9 +289,9 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) if (!szone) return -ENOSPC; - dmz_dev_debug(zrc->dev, - "Chunk %u, move rnd zone %u (weight %u) to seq zone %u", - chunk, dzone->id, dmz_weight(dzone), szone->id); + DMDEBUG("(%s): Chunk %u, move rnd zone %u (weight %u) to seq zone %u", + dmz_metadata_label(zmd), + chunk, dzone->id, dmz_weight(dzone), szone->id); /* Flush the random data zone into the sequential zone */ ret = dmz_reclaim_copy(zrc, dzone, szone); @@ -352,7 +356,6 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) return PTR_ERR(dzone); start = 
jiffies; - if (dmz_is_rnd(dzone)) { if (!dmz_weight(dzone)) { /* Empty zone */ @@ -400,14 +403,14 @@ out: ret = dmz_flush_metadata(zrc->metadata); if (ret) { - dmz_dev_debug(zrc->dev, - "Metadata flush for zone %u failed, err %d\n", - rzone->id, ret); + DMDEBUG("(%s): Metadata flush for zone %u failed, err %d\n", + dmz_metadata_label(zmd), rzone->id, ret); return ret; } - dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms", - rzone->id, jiffies_to_msecs(jiffies - start)); + DMDEBUG("(%s): Reclaimed zone %u in %u ms", + dmz_metadata_label(zmd), + rzone->id, jiffies_to_msecs(jiffies - start)); return 0; } @@ -500,7 +503,7 @@ static void dmz_reclaim_work(struct work_struct *work) /* * Initialize reclaim. */ -int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd, +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **reclaim) { struct dmz_reclaim *zrc; @@ -510,7 +513,6 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd, if (!zrc) return -ENOMEM; - zrc->dev = dev; zrc->metadata = zmd; zrc->atime = jiffies; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b32e791b8a5c..520e55df627b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -840,7 +840,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); /* Initialize reclaim */ - ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim); + ret = dmz_ctr_reclaim(dmz->metadata, &dmz->reclaim); if (ret) { ti->error = "Zone reclaim initialization failed"; goto err_fwq; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index e0883df8a903..2629bd51fa26 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -180,6 +180,7 @@ const char *dmz_metadata_label(struct dmz_metadata *zmd); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); +struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone); bool dmz_check_dev(struct dmz_metadata *zmd); bool dmz_dev_is_dying(struct dmz_metadata *zmd); @@ -254,8 +255,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, /* * Functions defined in dm-zoned-reclaim.c */ -int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd, - struct dmz_reclaim **zrc); +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc); void dmz_dtr_reclaim(struct dmz_reclaim *zrc); void dmz_suspend_reclaim(struct dmz_reclaim *zrc); void dmz_resume_reclaim(struct dmz_reclaim *zrc); -- cgit v1.2.3-59-g8ed1b From 52d6775888c65be66f3577ccd0f14b51691df7f9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:25 +0200 Subject: dm zoned: replace 'target' pointer in the bio context Replace the 'target' pointer in the bio context with the device pointer as this is what's actually used. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 520e55df627b..a09fb78ffe88 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -17,7 +17,7 @@ * Zone BIO context. 
*/ struct dmz_bioctx { - struct dmz_target *target; + struct dmz_dev *dev; struct dm_zone *zone; struct bio *bio; refcount_t ref; @@ -76,12 +76,13 @@ struct dmz_target { */ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) { - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; if (bio->bi_status != BLK_STS_OK) - bioctx->target->dev->flags |= DMZ_CHECK_BDEV; + bioctx->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { struct dm_zone *zone = bioctx->zone; @@ -118,14 +119,20 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, struct bio *bio, sector_t chunk_block, unsigned int nr_blocks) { - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_dev *dev = dmz_zone_to_dev(dmz->metadata, zone); struct bio *clone; + if (dev->flags & DMZ_BDEV_DYING) + return -EIO; + clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set); if (!clone) return -ENOMEM; - bio_set_dev(clone, dmz->dev->bdev); + bio_set_dev(clone, dev->bdev); + bioctx->dev = dev; clone->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT; @@ -218,8 +225,10 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, if (nr_blocks) { /* Valid blocks found: read them */ - nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block); - ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks); + nr_blocks = min_t(unsigned int, nr_blocks, + end_block - chunk_block); + ret = dmz_submit_bio(dmz, rzone, bio, + chunk_block, nr_blocks); if (ret) return ret; chunk_block += nr_blocks; @@ -330,7 +339,8 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, * and the BIO is aligned to the zone write pointer: * direct write the zone. */ - return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks); + return dmz_handle_direct_write(dmz, zone, bio, + chunk_block, nr_blocks); } /* @@ -383,7 +393,8 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, struct bio *bio) { - struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); + struct dmz_bioctx *bioctx = + dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); struct dmz_metadata *zmd = dmz->metadata; struct dm_zone *zone; int ret; @@ -397,11 +408,6 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dmz_lock_metadata(zmd); - if (dmz->dev->flags & DMZ_BDEV_DYING) { - ret = -EIO; - goto out; - } - /* * Get the data zone mapping the chunk. There may be no * mapping for read and discard. 
If a mapping is obtained, @@ -625,7 +631,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) { struct dmz_target *dmz = ti->private; struct dmz_metadata *zmd = dmz->metadata; - struct dmz_dev *dev = dmz->dev; struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); sector_t sector = bio->bi_iter.bi_sector; unsigned int nr_sectors = bio_sectors(bio); @@ -642,8 +647,6 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)), (unsigned int)dmz_bio_blocks(bio)); - bio_set_dev(bio, dev->bdev); - if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) return DM_MAPIO_REMAPPED; @@ -652,7 +655,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_KILL; /* Initialize the BIO context */ - bioctx->target = dmz; + bioctx->dev = NULL; bioctx->zone = NULL; bioctx->bio = bio; refcount_set(&bioctx->ref, 1); @@ -931,11 +934,12 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; + struct dmz_dev *dev = &dmz->dev[0]; - if (!dmz_check_bdev(dmz->dev)) + if (!dmz_check_bdev(dev)) return -EIO; - *bdev = dmz->dev->bdev; + *bdev = dev->bdev; return 0; } -- cgit v1.2.3-59-g8ed1b From aa821c8dc0d76fa9f827becf1186bfd824f1fcfb Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:26 +0200 Subject: dm zoned: use dmz_zone_to_dev() when handling metadata I/O Use accessors to retrieve the device pointer in preparation for adding an additional block device. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 426af738f1ca..312194be4cb0 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1310,6 +1310,7 @@ static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, */ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); unsigned int noio_flag; int ret; @@ -1320,16 +1321,16 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) * GFP_NOIO was specified. 
*/ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1, + ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1, dmz_update_zone_cb, zone); memalloc_noio_restore(noio_flag); if (ret == 0) ret = -EIO; if (ret < 0) { - dmz_dev_err(zmd->dev, "Get zone %u report failed", + dmz_dev_err(dev, "Get zone %u report failed", zone->id); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ret; } @@ -1343,6 +1344,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, struct dm_zone *zone) { + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); unsigned int wp = 0; int ret; @@ -1351,7 +1353,7 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, if (ret) return ret; - dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)", + dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)", zone->id, zone->wp_block, wp); if (zone->wp_block < wp) { @@ -1384,7 +1386,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) return 0; if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { - struct dmz_dev *dev = zmd->dev; + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), -- cgit v1.2.3-59-g8ed1b From ca1a70450a969c63dd19f0a34504fa1bd227e730 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:27 +0200 Subject: dm zoned: add metadata logging functions Use the metadata label for logging and not the underlying device. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 96 +++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 39 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 312194be4cb0..0e7122867fd8 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -194,6 +194,17 @@ struct dmz_metadata { wait_queue_head_t free_wq; }; +#define dmz_zmd_info(zmd, format, args...) \ + DMINFO("(%s): " format, (zmd)->devname, ## args) + +#define dmz_zmd_err(zmd, format, args...) \ + DMERR("(%s): " format, (zmd)->devname, ## args) + +#define dmz_zmd_warn(zmd, format, args...) \ + DMWARN("(%s): " format, (zmd)->devname, ## args) + +#define dmz_zmd_debug(zmd, format, args...) 
\ + DMDEBUG("(%s): " format, (zmd)->devname, ## args) /* * Various accessors */ @@ -1098,7 +1109,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) int ret; if (!zmd->sb[0].zone) { - dmz_dev_err(zmd->dev, "Primary super block zone not set"); + dmz_zmd_err(zmd, "Primary super block zone not set"); return -ENXIO; } @@ -1135,7 +1146,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Use highest generation sb first */ if (!sb_good[0] && !sb_good[1]) { - dmz_dev_err(zmd->dev, "No valid super block found"); + dmz_zmd_err(zmd, "No valid super block found"); return -EIO; } @@ -1248,7 +1259,7 @@ static void dmz_drop_zones(struct dmz_metadata *zmd) */ static int dmz_init_zones(struct dmz_metadata *zmd) { - struct dmz_dev *dev = zmd->dev; + struct dmz_dev *dev = &zmd->dev[0]; int ret; /* Init */ @@ -1268,8 +1279,8 @@ static int dmz_init_zones(struct dmz_metadata *zmd) if (!zmd->zones) return -ENOMEM; - dmz_dev_info(dev, "Using %zu B for zone information", - sizeof(struct dm_zone) * zmd->nr_zones); + DMINFO("(%s): Using %zu B for zone information", + zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); /* * Get zone information and initialize zone descriptors. At the same @@ -1412,7 +1423,6 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone); */ static int dmz_load_mapping(struct dmz_metadata *zmd) { - struct dmz_dev *dev = zmd->dev; struct dm_zone *dzone, *bzone; struct dmz_mblock *dmap_mblk = NULL; struct dmz_map *dmap; @@ -1445,7 +1455,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) goto next; if (dzone_id >= zmd->nr_zones) { - dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u", + dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u", chunk, dzone_id); return -EIO; } @@ -1466,14 +1476,14 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) goto next; if (bzone_id >= zmd->nr_zones) { - dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u", + dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u", chunk, bzone_id); return -EIO; } bzone = dmz_get(zmd, bzone_id); if (!dmz_is_rnd(bzone)) { - dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u", + dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", chunk, bzone_id); return -EIO; } @@ -1893,7 +1903,7 @@ again: atomic_dec(&zmd->unmap_nr_seq); if (dmz_is_offline(zone)) { - dmz_dev_warn(zmd->dev, "Zone %u is offline", zone->id); + dmz_zmd_warn(zmd, "Zone %u is offline", zone->id); zone = NULL; goto again; } @@ -2104,7 +2114,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, struct dmz_mblock *mblk; unsigned int n = 0; - dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks", + dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks", zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -2134,7 +2144,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, if (likely(zone->weight + n <= zone_nr_blocks)) zone->weight += n; else { - dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u", + dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u", zone->id, zone->weight, zone_nr_blocks - n); zone->weight = zone_nr_blocks; @@ -2184,7 +2194,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, struct dmz_mblock *mblk; unsigned int n = 0; - dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", + dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks", zone->id, (u64)chunk_block, nr_blocks); WARN_ON(chunk_block + nr_blocks > 
zmd->zone_nr_blocks); @@ -2214,7 +2224,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, if (zone->weight >= n) zone->weight -= n; else { - dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u", + dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u", zone->id, zone->weight, n); zone->weight = 0; } @@ -2424,7 +2434,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) while (!list_empty(&zmd->mblk_dirty_list)) { mblk = list_first_entry(&zmd->mblk_dirty_list, struct dmz_mblock, link); - dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)", + dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)", (u64)mblk->no, mblk->ref); list_del_init(&mblk->link); rb_erase(&mblk->node, &zmd->mblk_rbtree); @@ -2442,7 +2452,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) /* Sanity checks: the mblock rbtree should now be empty */ root = &zmd->mblk_rbtree; rbtree_postorder_for_each_entry_safe(mblk, next, root, node) { - dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree", + dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree", (u64)mblk->no, mblk->ref); mblk->ref = 0; dmz_free_mblock(zmd, mblk); @@ -2455,6 +2465,19 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) mutex_destroy(&zmd->map_lock); } +static void dmz_print_dev(struct dmz_metadata *zmd, int num) +{ + struct dmz_dev *dev = &zmd->dev[num]; + + dmz_dev_info(dev, "Host-%s zoned block device", + bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? + "aware" : "managed"); + dmz_dev_info(dev, " %llu 512-byte logical sectors", + (u64)dev->capacity); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", + dev->nr_zones, (u64)zmd->zone_nr_sectors); +} + /* * Initialize the zoned metadata. */ @@ -2531,34 +2554,31 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, /* Metadata cache shrinker */ ret = register_shrinker(&zmd->mblk_shrinker); if (ret) { - dmz_dev_err(dev, "Register metadata cache shrinker failed"); + dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); goto err; } - dmz_dev_info(dev, "Host-%s zoned block device", - bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? 
- "aware" : "managed"); - dmz_dev_info(dev, " %llu 512-byte logical sectors", - (u64)dev->capacity); - dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", DMZ_META_VER); + dmz_print_dev(zmd, 0); + + dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", zmd->nr_zones, (u64)zmd->zone_nr_sectors); - dmz_dev_info(dev, " %u metadata zones", + dmz_zmd_info(zmd, " %u metadata zones", zmd->nr_meta_zones * 2); - dmz_dev_info(dev, " %u data zones for %u chunks", + dmz_zmd_info(zmd, " %u data zones for %u chunks", zmd->nr_data_zones, zmd->nr_chunks); - dmz_dev_info(dev, " %u random zones (%u unmapped)", + dmz_zmd_info(zmd, " %u random zones (%u unmapped)", zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); - dmz_dev_info(dev, " %u sequential zones (%u unmapped)", + dmz_zmd_info(zmd, " %u sequential zones (%u unmapped)", zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); - dmz_dev_info(dev, " %u reserved sequential data zones", + dmz_zmd_info(zmd, " %u reserved sequential data zones", zmd->nr_reserved_seq); - - dmz_dev_debug(dev, "Format:"); - dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)", + dmz_zmd_debug(zmd, "Format:"); + dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)", zmd->nr_meta_blocks, zmd->max_nr_mblks); - dmz_dev_debug(dev, " %u data zone mapping blocks", + dmz_zmd_debug(zmd, " %u data zone mapping blocks", zmd->nr_map_blocks); - dmz_dev_debug(dev, " %u bitmap blocks", + dmz_zmd_debug(zmd, " %u bitmap blocks", zmd->nr_bitmap_blocks); *metadata = zmd; @@ -2587,7 +2607,6 @@ void dmz_dtr_metadata(struct dmz_metadata *zmd) */ int dmz_resume_metadata(struct dmz_metadata *zmd) { - struct dmz_dev *dev = zmd->dev; struct dm_zone *zone; sector_t wp_block; unsigned int i; @@ -2597,20 +2616,19 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) for (i = 0; i < zmd->nr_zones; i++) { zone = dmz_get(zmd, i); if (!zone) { - dmz_dev_err(dev, "Unable to get zone %u", i); + dmz_zmd_err(zmd, "Unable to get zone %u", i); return -EIO; } - wp_block = zone->wp_block; ret = dmz_update_zone(zmd, zone); if (ret) { - dmz_dev_err(dev, "Broken zone %u", i); + dmz_zmd_err(zmd, "Broken zone %u", i); return ret; } if (dmz_is_offline(zone)) { - dmz_dev_warn(dev, "Zone %u is offline", i); + dmz_zmd_warn(zmd, "Zone %u is offline", i); continue; } @@ -2618,7 +2636,7 @@ int dmz_resume_metadata(struct dmz_metadata *zmd) if (!dmz_is_seq(zone)) zone->wp_block = 0; else if (zone->wp_block != wp_block) { - dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)", + dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)", i, (u64)zone->wp_block, (u64)wp_block); zone->wp_block = wp_block; dmz_invalidate_blocks(zmd, zone, zone->wp_block, -- cgit v1.2.3-59-g8ed1b From ae3c1f1171467f83849c7e8c5e0e632c5078ca2f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:28 +0200 Subject: dm zoned: Reduce logging output on startup dm-zoned is becoming quite chatty during startup; reduce the noise by moving some information to 'debug' level. 
Suggested-by: Mike Snitzer Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 0e7122867fd8..1290728c197e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1279,8 +1279,8 @@ static int dmz_init_zones(struct dmz_metadata *zmd) if (!zmd->zones) return -ENOMEM; - DMINFO("(%s): Using %zu B for zone information", - zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); + DMDEBUG("(%s): Using %zu B for zone information", + zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); /* * Get zone information and initialize zone descriptors. At the same @@ -2563,16 +2563,16 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", zmd->nr_zones, (u64)zmd->zone_nr_sectors); - dmz_zmd_info(zmd, " %u metadata zones", - zmd->nr_meta_zones * 2); - dmz_zmd_info(zmd, " %u data zones for %u chunks", - zmd->nr_data_zones, zmd->nr_chunks); - dmz_zmd_info(zmd, " %u random zones (%u unmapped)", - zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); - dmz_zmd_info(zmd, " %u sequential zones (%u unmapped)", - zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); - dmz_zmd_info(zmd, " %u reserved sequential data zones", - zmd->nr_reserved_seq); + dmz_zmd_debug(zmd, " %u metadata zones", + zmd->nr_meta_zones * 2); + dmz_zmd_debug(zmd, " %u data zones for %u chunks", + zmd->nr_data_zones, zmd->nr_chunks); + dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", + zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); + dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", + zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); + dmz_zmd_debug(zmd, " %u reserved sequential data zones", + zmd->nr_reserved_seq); dmz_zmd_debug(zmd, "Format:"); dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)", zmd->nr_meta_blocks, zmd->max_nr_mblks); -- cgit v1.2.3-59-g8ed1b From dc076c838f65723325001c977b39e55fc6ba0fa7 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:29 +0200 Subject: dm zoned: ignore metadata zone in dmz_alloc_zone() When looking up zones in dmz_alloc_zone() we need to ignore metadata zones so as not to accidentally overwrite metadata. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 1290728c197e..939deed1606a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1907,7 +1907,13 @@ again: zone = NULL; goto again; } + if (dmz_is_meta(zone)) { + struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + dmz_dev_warn(dev, "Zone %u has metadata", zone->id); + zone = NULL; + goto again; + } return zone; } -- cgit v1.2.3-59-g8ed1b From bd5c40313a1467e4683d92456fc5219d94823f24 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 11 May 2020 10:24:30 +0200 Subject: dm zoned: metadata version 2 Implement handling for metadata version 2. The new metadata adds a label and UUID for the device mapper device, and additional UUID for the underlying block devices. It also allows for an additional regular drive to be used for emulating random access zones. 
The emulated zones will be placed logically in front of the zones from the zoned block device, causing the superblocks and metadata to be stored on that device. The first zone of the original zoned device will be used to hold another, tertiary copy of the metadata; this copy carries a generation number of 0 and is never updated; it's just used for identification. Signed-off-by: Hannes Reinecke Reviewed-by: Bob Liu Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-zoned.rst | 34 ++- drivers/md/dm-zoned-metadata.c | 310 +++++++++++++++++---- drivers/md/dm-zoned-target.c | 185 ++++++++---- drivers/md/dm-zoned.h | 7 +- 4 files changed, 427 insertions(+), 109 deletions(-) (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst index 7547ce635161..553752ea2521 100644 --- a/Documentation/admin-guide/device-mapper/dm-zoned.rst +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -37,9 +37,13 @@ Algorithm dm-zoned implements an on-disk buffering scheme to handle non-sequential write accesses to the sequential zones of a zoned block device. Conventional zones are used for caching as well as for storing internal -metadata. +metadata. It can also use a regular block device together with the zoned +block device; in that case the regular block device will be split logically +in zones with the same size as the zoned block device. These zones will be +placed in front of the zones from the zoned block device and will be handled +just like conventional zones. -The zones of the device are separated into 2 types: +The zones of the device(s) are separated into 2 types: 1) Metadata zones: these are conventional zones used to store metadata. Metadata zones are not reported as useable capacity to the user. @@ -127,6 +131,13 @@ resumed. Flushing metadata thus only temporarily delays write and discard requests. Read requests can be processed concurrently while metadata flush is being executed. +If a regular device is used in conjunction with the zoned block device, +a third set of metadata (without the zone bitmaps) is written to the +start of the zoned block device. This metadata has a generation counter of +'0' and will never be updated during normal operation; it just serves for +identification purposes. The first and second copy of the metadata +are located at the start of the regular block device. + Usage ===== @@ -138,12 +149,21 @@ Ex:: dmzadm --format /dev/sdxx -For a formatted device, the target can be created normally with the -dmsetup utility. The only parameter that dm-zoned requires is the -underlying zoned block device name. Ex:: - echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ - dmsetup create dmz-`basename ${dev}` +If two drives are to be used, both devices must be specified, with the +regular block device as the first device. + +Ex:: + + dmzadm --format /dev/sdxx /dev/sdyy + + +Fomatted device(s) can be started with the dmzadm utility, too.: + +Ex:: + + dmzadm --start /dev/sdxx /dev/sdyy + Information about the internal layout and current usage of the zones can be obtained with the 'status' callback from dmsetup: diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 939deed1606a..fba690dd37f5 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -16,7 +16,7 @@ /* * Metadata version. */ -#define DMZ_META_VER 1 +#define DMZ_META_VER 2 /* * On-disk super block magic. 
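/*
 * Sketch of the two-device zone numbering this patch introduces (logic taken
 * from the dmz_dev_zone_id()/dmz_start_sect() changes below): zones are
 * numbered globally, with the regular device's emulated zones first and the
 * zoned device's zones starting at dev[1].zone_offset, so per-device sector
 * offsets are computed from a translated, per-device zone id:
 *
 *	zone_id = zone->id;
 *	if (zmd->nr_devs > 1 && zone_id >= zmd->dev[1].zone_offset)
 *		zone_id -= zmd->dev[1].zone_offset;
 *	sector = (sector_t)zone_id << zmd->zone_nr_sectors_shift;
 */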
@@ -69,8 +69,17 @@ struct dmz_super { /* Checksum */ __le32 crc; /* 48 */ + /* DM-Zoned label */ + u8 dmz_label[32]; /* 80 */ + + /* DM-Zoned UUID */ + u8 dmz_uuid[16]; /* 96 */ + + /* Device UUID */ + u8 dev_uuid[16]; /* 112 */ + /* Padding to full 512B sector */ - u8 reserved[464]; /* 512 */ + u8 reserved[400]; /* 512 */ }; /* @@ -133,8 +142,11 @@ struct dmz_sb { */ struct dmz_metadata { struct dmz_dev *dev; + unsigned int nr_devs; char devname[BDEVNAME_SIZE]; + char label[BDEVNAME_SIZE]; + uuid_t uuid; sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; @@ -161,8 +173,9 @@ struct dmz_metadata { /* Zone information array */ struct dm_zone *zones; - struct dmz_sb sb[2]; + struct dmz_sb sb[3]; unsigned int mblk_primary; + unsigned int sb_version; u64 sb_gen; unsigned int min_nr_mblks; unsigned int max_nr_mblks; @@ -195,31 +208,56 @@ struct dmz_metadata { }; #define dmz_zmd_info(zmd, format, args...) \ - DMINFO("(%s): " format, (zmd)->devname, ## args) + DMINFO("(%s): " format, (zmd)->label, ## args) #define dmz_zmd_err(zmd, format, args...) \ - DMERR("(%s): " format, (zmd)->devname, ## args) + DMERR("(%s): " format, (zmd)->label, ## args) #define dmz_zmd_warn(zmd, format, args...) \ - DMWARN("(%s): " format, (zmd)->devname, ## args) + DMWARN("(%s): " format, (zmd)->label, ## args) #define dmz_zmd_debug(zmd, format, args...) \ - DMDEBUG("(%s): " format, (zmd)->devname, ## args) + DMDEBUG("(%s): " format, (zmd)->label, ## args) /* * Various accessors */ +static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + unsigned int zone_id; + + if (WARN_ON(!zone)) + return 0; + + zone_id = zone->id; + if (zmd->nr_devs > 1 && + (zone_id >= zmd->dev[1].zone_offset)) + zone_id -= zmd->dev[1].zone_offset; + return zone_id; +} + sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->zone_nr_sectors_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_sectors_shift; } sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->zone_nr_blocks_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_blocks_shift; } struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) { + if (WARN_ON(!zone)) + return &zmd->dev[0]; + + if (zmd->nr_devs > 1 && + zone->id >= zmd->dev[1].zone_offset) + return &zmd->dev[1]; + return &zmd->dev[0]; } @@ -275,17 +313,29 @@ unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) const char *dmz_metadata_label(struct dmz_metadata *zmd) { - return (const char *)zmd->devname; + return (const char *)zmd->label; } bool dmz_check_dev(struct dmz_metadata *zmd) { - return dmz_check_bdev(&zmd->dev[0]); + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (!dmz_check_bdev(&zmd->dev[i])) + return false; + } + return true; } bool dmz_dev_is_dying(struct dmz_metadata *zmd) { - return dmz_bdev_is_dying(&zmd->dev[0]); + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (dmz_bdev_is_dying(&zmd->dev[i])) + return true; + } + return false; } /* @@ -687,6 +737,9 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op, struct bio *bio; int ret; + if (WARN_ON(!dev)) + return -EIO; + if (dmz_bdev_is_dying(dev)) return -EIO; @@ -711,19 +764,32 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op, */ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) { - sector_t block = 
zmd->sb[set].block; struct dmz_mblock *mblk = zmd->sb[set].mblk; struct dmz_super *sb = zmd->sb[set].sb; struct dmz_dev *dev = zmd->sb[set].dev; + sector_t sb_block; u64 sb_gen = zmd->sb_gen + 1; int ret; sb->magic = cpu_to_le32(DMZ_MAGIC); - sb->version = cpu_to_le32(DMZ_META_VER); + + sb->version = cpu_to_le32(zmd->sb_version); + if (zmd->sb_version > 1) { + BUILD_BUG_ON(UUID_SIZE != 16); + export_uuid(sb->dmz_uuid, &zmd->uuid); + memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE); + export_uuid(sb->dev_uuid, &dev->uuid); + } sb->gen = cpu_to_le64(sb_gen); - sb->sb_block = cpu_to_le64(block); + /* + * The metadata always references the absolute block address, + * ie relative to the entire block range, not the per-device + * block address. + */ + sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift; + sb->sb_block = cpu_to_le64(sb_block); sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks); sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq); sb->nr_chunks = cpu_to_le32(zmd->nr_chunks); @@ -734,7 +800,8 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) sb->crc = 0; sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); - ret = dmz_rdwr_block(dev, REQ_OP_WRITE, block, mblk->page); + ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, + mblk->page); if (ret == 0) ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL); @@ -915,6 +982,23 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) u32 crc, stored_crc; u64 gen; + if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { + dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", + DMZ_MAGIC, le32_to_cpu(sb->magic)); + return -ENXIO; + } + + zmd->sb_version = le32_to_cpu(sb->version); + if (zmd->sb_version > DMZ_META_VER) { + dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", + DMZ_META_VER, zmd->sb_version); + return -EINVAL; + } + if ((zmd->sb_version < 1) && (set == 2)) { + dmz_dev_err(dev, "Tertiary superblocks are not supported"); + return -EINVAL; + } + gen = le64_to_cpu(sb->gen); stored_crc = le32_to_cpu(sb->crc); sb->crc = 0; @@ -925,16 +1009,45 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) return -ENXIO; } - if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { - dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", - DMZ_MAGIC, le32_to_cpu(sb->magic)); - return -ENXIO; - } + if (zmd->sb_version > 1) { + uuid_t sb_uuid; + + import_uuid(&sb_uuid, sb->dmz_uuid); + if (uuid_is_null(&sb_uuid)) { + dmz_dev_err(dev, "NULL DM-Zoned uuid"); + return -ENXIO; + } else if (uuid_is_null(&zmd->uuid)) { + uuid_copy(&zmd->uuid, &sb_uuid); + } else if (!uuid_equal(&zmd->uuid, &sb_uuid)) { + dmz_dev_err(dev, "mismatching DM-Zoned uuid, " + "is %pUl expected %pUl", + &sb_uuid, &zmd->uuid); + return -ENXIO; + } + if (!strlen(zmd->label)) + memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE); + else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) { + dmz_dev_err(dev, "mismatching DM-Zoned label, " + "is %s expected %s", + sb->dmz_label, zmd->label); + return -ENXIO; + } + import_uuid(&dev->uuid, sb->dev_uuid); + if (uuid_is_null(&dev->uuid)) { + dmz_dev_err(dev, "NULL device uuid"); + return -ENXIO; + } - if (le32_to_cpu(sb->version) != DMZ_META_VER) { - dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", - DMZ_META_VER, le32_to_cpu(sb->version)); - return -ENXIO; + if (set == 2) { + /* + * Generation number should be 0, but it doesn't + * really matter if it isn't. 
+ */ + if (gen != 0) + dmz_dev_warn(dev, "Invalid generation %llu", + gen); + return 0; + } } nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) @@ -1185,21 +1298,38 @@ static int dmz_load_sb(struct dmz_metadata *zmd) "Using super block %u (gen %llu)", zmd->mblk_primary, zmd->sb_gen); + if ((zmd->sb_version > 1) && zmd->sb[2].zone) { + zmd->sb[2].block = dmz_start_block(zmd, zmd->sb[2].zone); + zmd->sb[2].dev = dmz_zone_to_dev(zmd, zmd->sb[2].zone); + ret = dmz_get_sb(zmd, 2); + if (ret) { + dmz_dev_err(zmd->sb[2].dev, + "Read tertiary super block failed"); + return ret; + } + ret = dmz_check_sb(zmd, 2); + if (ret == -EINVAL) + return ret; + } return 0; } /* * Initialize a zone descriptor. */ -static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) +static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) { struct dmz_metadata *zmd = data; + struct dmz_dev *dev = zmd->nr_devs > 1 ? &zmd->dev[1] : &zmd->dev[0]; + int idx = num + dev->zone_offset; struct dm_zone *zone = &zmd->zones[idx]; - struct dmz_dev *dev = zmd->dev; - /* Ignore the eventual last runt (smaller) zone */ if (blkz->len != zmd->zone_nr_sectors) { - if (blkz->start + blkz->len == dev->capacity) + if (zmd->sb_version > 1) { + /* Ignore the eventual runt (smaller) zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + return 0; + } else if (blkz->start + blkz->len == dev->capacity) return 0; return -ENXIO; } @@ -1234,16 +1364,45 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) { zmd->nr_rnd_zones++; - if (!zmd->sb[0].zone) { - /* Super block zone */ + if (zmd->nr_devs == 1 && !zmd->sb[0].zone) { + /* Primary super block zone */ zmd->sb[0].zone = zone; } } + if (zmd->nr_devs > 1 && !zmd->sb[2].zone) { + /* Tertiary superblock zone */ + zmd->sb[2].zone = zone; + } } return 0; } +static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) +{ + int idx; + sector_t zone_offset = 0; + + for(idx = 0; idx < dev->nr_zones; idx++) { + struct dm_zone *zone = &zmd->zones[idx]; + + INIT_LIST_HEAD(&zone->link); + atomic_set(&zone->refcount, 0); + zone->id = idx; + zone->chunk = DMZ_MAP_UNMAPPED; + set_bit(DMZ_RND, &zone->flags); + zone->wp_block = 0; + zmd->nr_rnd_zones++; + zmd->nr_useable_zones++; + if (dev->capacity - zone_offset < zmd->zone_nr_sectors) { + /* Disable runt zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + break; + } + zone_offset += zmd->zone_nr_sectors; + } +} + /* * Free zones descriptors. 
*/ @@ -1259,11 +1418,11 @@ static void dmz_drop_zones(struct dmz_metadata *zmd) */ static int dmz_init_zones(struct dmz_metadata *zmd) { - struct dmz_dev *dev = &zmd->dev[0]; - int ret; + int i, ret; + struct dmz_dev *zoned_dev = &zmd->dev[0]; /* Init */ - zmd->zone_nr_sectors = dev->zone_nr_sectors; + zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors; zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors); zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors); zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks); @@ -1274,7 +1433,14 @@ static int dmz_init_zones(struct dmz_metadata *zmd) DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ - zmd->nr_zones = dev->nr_zones; + zmd->nr_zones = 0; + for (i = 0; i < zmd->nr_devs; i++) + zmd->nr_zones += zmd->dev[i].nr_zones; + + if (!zmd->nr_zones) { + DMERR("(%s): No zones found", zmd->devname); + return -ENXIO; + } zmd->zones = kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); if (!zmd->zones) return -ENOMEM; @@ -1282,14 +1448,27 @@ static int dmz_init_zones(struct dmz_metadata *zmd) DMDEBUG("(%s): Using %zu B for zone information", zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); + if (zmd->nr_devs > 1) { + dmz_emulate_zones(zmd, &zmd->dev[0]); + /* + * Primary superblock zone is always at zone 0 when multiple + * drives are present. + */ + zmd->sb[0].zone = &zmd->zones[0]; + + zoned_dev = &zmd->dev[1]; + } + /* * Get zone information and initialize zone descriptors. At the same * time, determine where the super block should be: first block of the * first randomly writable zone. */ - ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone, - zmd); + ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES, + dmz_init_zone, zmd); if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); dmz_drop_zones(zmd); return ret; } @@ -1325,6 +1504,9 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) unsigned int noio_flag; int ret; + if (dev->flags & DMZ_BDEV_REGULAR) + return 0; + /* * Get zone information from disk. Since blkdev_report_zones() uses * GFP_KERNEL by default for memory allocations, set the per-task @@ -2475,19 +2657,34 @@ static void dmz_print_dev(struct dmz_metadata *zmd, int num) { struct dmz_dev *dev = &zmd->dev[num]; - dmz_dev_info(dev, "Host-%s zoned block device", - bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? - "aware" : "managed"); - dmz_dev_info(dev, " %llu 512-byte logical sectors", - (u64)dev->capacity); - dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", - dev->nr_zones, (u64)zmd->zone_nr_sectors); + if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) + dmz_dev_info(dev, "Regular block device"); + else + dmz_dev_info(dev, "Host-%s zoned block device", + bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? + "aware" : "managed"); + if (zmd->sb_version > 1) { + sector_t sector_offset = + dev->zone_offset << zmd->zone_nr_sectors_shift; + + dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)", + (u64)dev->capacity, (u64)sector_offset); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)", + dev->nr_zones, (u64)zmd->zone_nr_sectors, + (u64)dev->zone_offset); + } else { + dmz_dev_info(dev, " %llu 512-byte logical sectors", + (u64)dev->capacity); + dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", + dev->nr_zones, (u64)zmd->zone_nr_sectors); + } } /* * Initialize the zoned metadata. 
*/ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, +int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, + struct dmz_metadata **metadata, const char *devname) { struct dmz_metadata *zmd; @@ -2501,6 +2698,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, strcpy(zmd->devname, devname); zmd->dev = dev; + zmd->nr_devs = num_dev; zmd->mblk_rbtree = RB_ROOT; init_rwsem(&zmd->mblk_sem); mutex_init(&zmd->mblk_flush_lock); @@ -2535,11 +2733,24 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { zone = dmz_get(zmd, zmd->sb[0].zone->id + i); - if (!dmz_is_rnd(zone)) + if (!dmz_is_rnd(zone)) { + dmz_zmd_err(zmd, + "metadata zone %d is not random", i); + ret = -ENXIO; goto err; + } + set_bit(DMZ_META, &zone->flags); + } + if (zmd->sb[2].zone) { + zone = dmz_get(zmd, zmd->sb[2].zone->id); + if (!zone) { + dmz_zmd_err(zmd, + "Tertiary metadata zone not present"); + ret = -ENXIO; + goto err; + } set_bit(DMZ_META, &zone->flags); } - /* Load mapping table */ ret = dmz_load_mapping(zmd); if (ret) @@ -2564,8 +2775,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata, goto err; } - dmz_zmd_info(zmd, "DM-Zoned metadata version %d", DMZ_META_VER); - dmz_print_dev(zmd, 0); + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version); + for (i = 0; i < zmd->nr_devs; i++) + dmz_print_dev(zmd, i); dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", zmd->nr_zones, (u64)zmd->zone_nr_sectors); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index a09fb78ffe88..ea43f6892ced 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -13,6 +13,8 @@ #define DMZ_MIN_BIOS 8192 +#define DMZ_MAX_DEVS 2 + /* * Zone BIO context. */ @@ -38,7 +40,7 @@ struct dm_chunk_work { * Target descriptor. */ struct dmz_target { - struct dm_dev *ddev; + struct dm_dev *ddev[DMZ_MAX_DEVS]; unsigned long flags; @@ -81,7 +83,7 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; - if (bio->bi_status != BLK_STS_OK) + if (bioctx->dev && bio->bi_status != BLK_STS_OK) bioctx->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { @@ -690,60 +692,64 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) /* * Get zoned device information. 
*/ -static int dmz_get_zoned_device(struct dm_target *ti, char *path) +static int dmz_get_zoned_device(struct dm_target *ti, char *path, + int idx, int nr_devs) { struct dmz_target *dmz = ti->private; - struct request_queue *q; + struct dm_dev *ddev; struct dmz_dev *dev; - sector_t aligned_capacity; int ret; + struct block_device *bdev; /* Get the target device */ - ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev); + ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev); if (ret) { ti->error = "Get target device failed"; - dmz->ddev = NULL; return ret; } - dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL); - if (!dev) { - ret = -ENOMEM; - goto err; + bdev = ddev->bdev; + if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) { + if (nr_devs == 1) { + ti->error = "Invalid regular device"; + goto err; + } + if (idx != 0) { + ti->error = "First device must be a regular device"; + goto err; + } + if (dmz->ddev[0]) { + ti->error = "Too many regular devices"; + goto err; + } + dev = &dmz->dev[idx]; + dev->flags = DMZ_BDEV_REGULAR; + } else { + if (dmz->ddev[idx]) { + ti->error = "Too many zoned devices"; + goto err; + } + if (nr_devs > 1 && idx == 0) { + ti->error = "First device must be a regular device"; + goto err; + } + dev = &dmz->dev[idx]; } - - dev->bdev = dmz->ddev->bdev; + dev->bdev = bdev; (void)bdevname(dev->bdev, dev->name); - if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) { - ti->error = "Not a zoned block device"; - ret = -EINVAL; - goto err; - } - - q = bdev_get_queue(dev->bdev); - dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - aligned_capacity = dev->capacity & - ~((sector_t)blk_queue_zone_sectors(q) - 1); - if (ti->begin || - ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { - ti->error = "Partial mapping not supported"; - ret = -EINVAL; + dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + if (ti->begin) { + ti->error = "Partial mapping is not supported"; goto err; } - dev->zone_nr_sectors = blk_queue_zone_sectors(q); - - dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); - - dmz->dev = dev; + dmz->ddev[idx] = ddev; return 0; err: - dm_put_device(ti, dmz->ddev); - kfree(dev); - - return ret; + dm_put_device(ti, ddev); + return -EINVAL; } /* @@ -752,10 +758,56 @@ err: static void dmz_put_zoned_device(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; - dm_put_device(ti, dmz->ddev); - kfree(dmz->dev); - dmz->dev = NULL; + for (i = 0; i < DMZ_MAX_DEVS; i++) { + if (dmz->ddev[i]) { + dm_put_device(ti, dmz->ddev[i]); + dmz->ddev[i] = NULL; + } + } +} + +static int dmz_fixup_devices(struct dm_target *ti) +{ + struct dmz_target *dmz = ti->private; + struct dmz_dev *reg_dev, *zoned_dev; + struct request_queue *q; + + /* + * When we have two devices, the first one must be a regular block + * device and the second a zoned block device. 
+ */ + if (dmz->ddev[0] && dmz->ddev[1]) { + reg_dev = &dmz->dev[0]; + if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) { + ti->error = "Primary disk is not a regular device"; + return -EINVAL; + } + zoned_dev = &dmz->dev[1]; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Secondary disk is not a zoned device"; + return -EINVAL; + } + } else { + reg_dev = NULL; + zoned_dev = &dmz->dev[0]; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Disk is not a zoned device"; + return -EINVAL; + } + } + q = bdev_get_queue(zoned_dev->bdev); + zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); + zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); + + if (reg_dev) { + reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors; + reg_dev->nr_zones = DIV_ROUND_UP(reg_dev->capacity, + reg_dev->zone_nr_sectors); + zoned_dev->zone_offset = reg_dev->nr_zones; + } + return 0; } /* @@ -764,11 +816,10 @@ static void dmz_put_zoned_device(struct dm_target *ti) static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dmz_target *dmz; - struct dmz_dev *dev; int ret; /* Check arguments */ - if (argc != 1) { + if (argc < 1 || argc > 2) { ti->error = "Invalid argument count"; return -EINVAL; } @@ -779,18 +830,34 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Unable to allocate the zoned target descriptor"; return -ENOMEM; } + dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL); + if (!dmz->dev) { + ti->error = "Unable to allocate the zoned device descriptors"; + kfree(dmz); + return -ENOMEM; + } ti->private = dmz; /* Get the target zoned block device */ - ret = dmz_get_zoned_device(ti, argv[0]); + ret = dmz_get_zoned_device(ti, argv[0], 0, argc); + if (ret) + goto err; + + if (argc == 2) { + ret = dmz_get_zoned_device(ti, argv[1], 1, argc); + if (ret) { + dmz_put_zoned_device(ti); + goto err; + } + } + ret = dmz_fixup_devices(ti); if (ret) { - dmz->ddev = NULL; + dmz_put_zoned_device(ti); goto err; } /* Initialize metadata */ - dev = dmz->dev; - ret = dmz_ctr_metadata(dev, &dmz->metadata, + ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata, dm_table_device_name(ti->table)); if (ret) { ti->error = "Metadata initialization failed"; @@ -867,6 +934,7 @@ err_meta: err_dev: dmz_put_zoned_device(ti); err: + kfree(dmz->dev); kfree(dmz); return ret; @@ -897,6 +965,7 @@ static void dmz_dtr(struct dm_target *ti) mutex_destroy(&dmz->chunk_lock); + kfree(dmz->dev); kfree(dmz); } @@ -971,10 +1040,17 @@ static int dmz_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dmz_target *dmz = ti->private; - struct dmz_dev *dev = dmz->dev; - sector_t capacity = dev->capacity & ~(dmz_zone_nr_sectors(dmz->metadata) - 1); - - return fn(ti, dmz->ddev, 0, capacity, data); + unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); + sector_t capacity; + int r; + + capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1); + r = fn(ti, dmz->ddev[0], 0, capacity, data); + if (!r && dmz->ddev[1]) { + capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1); + r = fn(ti, dmz->ddev[1], 0, capacity, data); + } + return r; } static void dmz_status(struct dm_target *ti, status_type_t type, @@ -984,6 +1060,7 @@ static void dmz_status(struct dm_target *ti, status_type_t type, struct dmz_target *dmz = ti->private; ssize_t sz = 0; char buf[BDEVNAME_SIZE]; + struct dmz_dev *dev; switch (type) { case STATUSTYPE_INFO: @@ -995,8 +1072,14 @@ static void dmz_status(struct dm_target *ti, status_type_t type, 
dmz_nr_seq_zones(dmz->metadata)); break; case STATUSTYPE_TABLE: - format_dev_t(buf, dmz->dev->bdev->bd_dev); + dev = &dmz->dev[0]; + format_dev_t(buf, dev->bdev->bd_dev); DMEMIT("%s", buf); + if (dmz->dev[1].bdev) { + dev = &dmz->dev[1]; + format_dev_t(buf, dev->bdev->bd_dev); + DMEMIT(" %s", buf); + } break; } return; @@ -1018,7 +1101,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, static struct target_type dmz_type = { .name = "zoned", - .version = {1, 1, 0}, + .version = {2, 0, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = dmz_ctr, diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 2629bd51fa26..4971a765be55 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -52,10 +52,12 @@ struct dmz_dev { struct block_device *bdev; char name[BDEVNAME_SIZE]; + uuid_t uuid; sector_t capacity; unsigned int nr_zones; + unsigned int zone_offset; unsigned int flags; @@ -69,6 +71,7 @@ struct dmz_dev { /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) #define DMZ_CHECK_BDEV (2 << 0) +#define DMZ_BDEV_REGULAR (4 << 0) /* * Zone descriptor. @@ -163,8 +166,8 @@ struct dmz_reclaim; /* * Functions defined in dm-zoned-metadata.c */ -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd, - const char *devname); +int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, + struct dmz_metadata **zmd, const char *devname); void dmz_dtr_metadata(struct dmz_metadata *zmd); int dmz_resume_metadata(struct dmz_metadata *zmd); -- cgit v1.2.3-59-g8ed1b From b18ae8dd9d7685233d7be472c043c545f18d015a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:51:58 -0500 Subject: dm: replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-log-writes.c | 2 +- drivers/md/dm-raid.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-stats.c | 2 +- drivers/md/dm-stripe.c | 2 +- drivers/md/dm-switch.c | 2 +- drivers/md/persistent-data/dm-btree-internal.h | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 91787cde369b..71c651465bdd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -214,7 +214,7 @@ struct crypt_config { struct mutex bio_alloc_lock; u8 *authenc_key; /* space for keys in authenc() format (if used) */ - u8 key[0]; + u8 key[]; }; #define MIN_IOS 64 diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3726b987151e..f794dca22032 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -92,7 +92,7 @@ struct journal_entry { } s; __u64 sector; } u; - commit_id_t last_bytes[0]; + commit_id_t last_bytes[]; /* __u8 tag[0]; */ }; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8ea20b56b4d6..e3d35c6c9f71 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -127,7 +127,7 @@ struct pending_block { char *data; u32 datalen; struct list_head list; - struct bio_vec vecs[0]; + struct bio_vec vecs[]; }; struct per_bio_data { diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9a18bef0a5ff..10e8b2fe787b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -254,7 +254,7 @@ struct raid_set { int mode; } journal_dev; - struct raid_dev dev[0]; + struct raid_dev dev[]; }; static void rs_config_backup(struct raid_set *rs, struct rs_layout *l) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 089aed57e083..2f655d9f4200 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -83,7 +83,7 @@ struct mirror_set { struct work_struct trigger_event; unsigned nr_mirrors; - struct mirror mirror[0]; + struct mirror mirror[]; }; DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle, diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 71417048256a..35d368c418d0 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -56,7 +56,7 @@ struct dm_stat { size_t percpu_alloc_size; size_t histogram_alloc_size; struct dm_stat_percpu *stat_percpu[NR_CPUS]; - struct dm_stat_shared stat_shared[0]; + struct dm_stat_shared stat_shared[]; }; #define STAT_PRECISE_TIMESTAMPS 1 diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index fa813c0f993d..151d022b032d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -41,7 +41,7 @@ struct stripe_c { /* Work struct used for triggering events*/ struct work_struct trigger_event; - struct stripe stripe[0]; + struct stripe stripe[]; }; /* diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 8a0f057b8122..bff4c7fa1cd2 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -53,7 +53,7 @@ struct switch_ctx { /* * Array of dm devices to switch between. 
*/ - struct switch_path path_list[0]; + struct switch_path path_list[]; }; static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths, diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 55a4096f1334..564896659dd4 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -38,7 +38,7 @@ struct node_header { struct btree_node { struct node_header header; - __le64 keys[0]; + __le64 keys[]; } __packed; -- cgit v1.2.3-59-g8ed1b From 49de3b7d21ef12e03358aa77ad6bff4aaf5ac3f5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 14 May 2020 08:09:29 +0200 Subject: dm zoned: remove spurious newlines from debugging messages DMDEBUG will already add a newline to the logging messages, so we shouldn't be adding it to the message itself. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 4 ++-- drivers/md/dm-zoned-target.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 3c8847d49e5a..7e9b11ee064f 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -403,7 +403,7 @@ out: ret = dmz_flush_metadata(zrc->metadata); if (ret) { - DMDEBUG("(%s): Metadata flush for zone %u failed, err %d\n", + DMDEBUG("(%s): Metadata flush for zone %u failed, err %d", dmz_metadata_label(zmd), rzone->id, ret); return ret; } @@ -491,7 +491,7 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret) { - DMDEBUG("(%s): Reclaim error %d\n", + DMDEBUG("(%s): Reclaim error %d", dmz_metadata_label(zmd), ret); if (!dmz_check_dev(zmd)) return; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index ea43f6892ced..a3d572da70ad 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -515,7 +515,7 @@ static void dmz_flush_work(struct work_struct *work) /* Flush dirty metadata blocks */ ret = dmz_flush_metadata(dmz->metadata); if (ret) - DMDEBUG("(%s): Metadata flush failed, rc=%d\n", + DMDEBUG("(%s): Metadata flush failed, rc=%d", dmz_metadata_label(dmz->metadata), ret); /* Process queued flush requests */ @@ -679,7 +679,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) /* Now ready to handle this BIO */ ret = dmz_queue_chunk_work(dmz, bio); if (ret) { - DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i\n", + DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i", dmz_metadata_label(zmd), bio_op(bio), (u64)dmz_bio_chunk(zmd, bio), ret); -- cgit v1.2.3-59-g8ed1b From ac75b09fc62df441eee90fecfe9b2a6ca24976f2 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 14 May 2020 12:55:39 -0400 Subject: dm: use DMDEBUG macros now that they use pr_debug variants Now that DMDEBUG uses pr_debug and DMDEBUG_LIMIT uses pr_debug_ratelimited cleanup DM's 2 direct pr_debug callers to use them to get the benefit of consistent DM_FMT formatting of debugging messages. While doing so, dm-mpath.c:dm_report_EIO() was switched over to using DMDEBUG_LIMIT due to the potential for error handling floods in the IO completion path. 
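For orientation, the DM logging wrappers this change relies on are thin layers over the pr_debug family. The sketch below paraphrases their shape (DM_FMT prepends "device-mapper" and the per-file DM_MSG_PREFIX); it is an approximation, not a verbatim quote of include/linux/device-mapper.h:

	/* Approximate shape of the DM debug macros (paraphrased): */
	#define DM_FMT(fmt)             DM_NAME ": " DM_MSG_PREFIX ": " fmt "\n"
	#define DMDEBUG(fmt, ...)       pr_debug(DM_FMT(fmt), ##__VA_ARGS__)
	#define DMDEBUG_LIMIT(fmt, ...) pr_debug_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)

Because DMDEBUG_LIMIT maps to pr_debug_ratelimited(), repeated EIO reports from the completion path are rate limited instead of flooding the log.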
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 12 ++++++------ drivers/md/dm.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 74246d7c7d68..95f16d816585 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -439,7 +439,7 @@ failed: } /* - * dm_report_EIO() is a macro instead of a function to make pr_debug() + * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited() * report the function name and line number of the function from which * it has been invoked. */ @@ -447,11 +447,11 @@ failed: do { \ struct mapped_device *md = dm_table_get_md((m)->ti->table); \ \ - pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ - dm_device_name(md), \ - test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ - dm_noflush_suspending((m)->ti)); \ + DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \ + dm_device_name(md), \ + test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ + dm_noflush_suspending((m)->ti)); \ } while (0) /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2fcb932eb4bd..1fae647ef108 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2609,7 +2609,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); else - pr_debug("%s: suspending with flush\n", dm_device_name(md)); + DMDEBUG("%s: suspending with flush", dm_device_name(md)); /* * This gets reverted if there's an error later and the targets -- cgit v1.2.3-59-g8ed1b From 42c689f671233371f5c8c1685ab77bd66c274932 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 13 May 2020 01:45:22 -0700 Subject: dm zoned: Avoid 64-bit division error in dmz_fixup_devices When building arm32 allyesconfig: ld.lld: error: undefined symbol: __aeabi_uldivmod >>> referenced by dm-zoned-target.c >>> md/dm-zoned-target.o:(dmz_ctr) in archive drivers/built-in.a dmz_fixup_devices uses DIV_ROUND_UP with variables of type sector_t. As such, it should be using DIV_ROUND_UP_SECTOR_T, which handles this automatically. Fixes: 70978208ec91 ("dm zoned: metadata version 2") Signed-off-by: Nathan Chancellor Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index a3d572da70ad..b586fc67d931 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -803,8 +803,9 @@ static int dmz_fixup_devices(struct dm_target *ti) if (reg_dev) { reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors; - reg_dev->nr_zones = DIV_ROUND_UP(reg_dev->capacity, - reg_dev->zone_nr_sectors); + reg_dev->nr_zones = + DIV_ROUND_UP_SECTOR_T(reg_dev->capacity, + reg_dev->zone_nr_sectors); zoned_dev->zone_offset = reg_dev->nr_zones; } return 0; -- cgit v1.2.3-59-g8ed1b From 489dc0f06a5837f87482c0ce61d830d24e17082e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:19 +0200 Subject: dm zoned: return NULL if dmz_get_zone_for_reclaim() fails to find a zone The only case where dmz_get_zone_for_reclaim() cannot return a zone is if the respective lists are empty. So we should just return a simple NULL value here as we really don't have an error code which would make sense. 
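This follows the usual kernel convention: ERR_PTR()/IS_ERR() is only worth it when the caller must distinguish between several error codes, while plain NULL suffices when "nothing available" is the only failure and the caller picks its own errno. A minimal sketch of the two caller patterns (the first helper is hypothetical; the second mirrors the dmz_do_reclaim() hunk below):

	/* ERR_PTR style: the callee encodes the errno in the pointer. */
	zone = some_get_zone(zmd);	/* hypothetical helper */
	if (IS_ERR(zone))
		return PTR_ERR(zone);

	/* NULL style (after this patch): the caller supplies the errno. */
	zone = dmz_get_zone_for_reclaim(zmd);
	if (!zone)
		return -EBUSY;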
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++-- drivers/md/dm-zoned-reclaim.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fba690dd37f5..fa7bcb28e952 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1845,7 +1845,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) return dzone; } - return ERR_PTR(-EBUSY); + return NULL; } /* @@ -1865,7 +1865,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) return zone; } - return ERR_PTR(-EBUSY); + return NULL; } /* diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 7e9b11ee064f..201177ad1f17 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -352,8 +352,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd); - if (IS_ERR(dzone)) - return PTR_ERR(dzone); + if (!dzone) + return -EBUSY; start = jiffies; if (dmz_is_rnd(dzone)) { -- cgit v1.2.3-59-g8ed1b From 34f5affd04c4a16d9df19c369bcec6e873e57ffe Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:20 +0200 Subject: dm zoned: separate random and cache zones Instead of lumping emulated zones together with random zones we should be handling them as separate 'cache' zones. This improves code readability and allows an easier implementation of different cache policies. Also add additional allocation flags, to separate the type (cache, random, or sequential) from the purpose (eg reclaim). Also switch the allocation policy to not use random zones as buffer zones if cache zones are present. This avoids a performance drop when all cache zones are used. 
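With zone type and purpose split into separate DMZ_ALLOC_* bits, a caller states which kind of zone it wants and, independently, whether the request comes from reclaim (which may fall back to the reserved sequential zones). A hedged sketch of the resulting caller pattern, modelled on the dmz_get_chunk_buffer() and dmz_reclaim_rnd_data() hunks below (the wrapper function itself is illustrative, not part of the patch; locking via dmz_lock_map() is elided):

	/* Prefer cache zones for buffering; fall back to random zones
	 * only when the target has no cache zones at all.
	 */
	static struct dm_zone *pick_buffer_zone(struct dmz_metadata *zmd)
	{
		int flags = dmz_nr_cache_zones(zmd) ? DMZ_ALLOC_CACHE
						    : DMZ_ALLOC_RND;

		return dmz_alloc_zone(zmd, flags);
	}

	/* Reclaim then ORs the purpose bit on top of whichever type it picks, e.g.: */
	szone = dmz_alloc_zone(zmd, DMZ_ALLOC_SEQ | DMZ_ALLOC_RECLAIM);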
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 123 ++++++++++++++++++++++++++++++----------- drivers/md/dm-zoned-reclaim.c | 76 +++++++++++++++---------- drivers/md/dm-zoned-target.c | 19 +++++-- drivers/md/dm-zoned.h | 8 ++- 4 files changed, 159 insertions(+), 67 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fa7bcb28e952..6c009a8b36a4 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -166,6 +166,7 @@ struct dmz_metadata { unsigned int nr_meta_blocks; unsigned int nr_meta_zones; unsigned int nr_data_zones; + unsigned int nr_cache_zones; unsigned int nr_rnd_zones; unsigned int nr_reserved_seq; unsigned int nr_chunks; @@ -196,6 +197,11 @@ struct dmz_metadata { struct list_head unmap_rnd_list; struct list_head map_rnd_list; + unsigned int nr_cache; + atomic_t unmap_nr_cache; + struct list_head unmap_cache_list; + struct list_head map_cache_list; + unsigned int nr_seq; atomic_t unmap_nr_seq; struct list_head unmap_seq_list; @@ -301,6 +307,16 @@ unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_rnd); } +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_cache; +} + +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd) +{ + return atomic_read(&zmd->unmap_nr_cache); +} + unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd) { return zmd->nr_seq; @@ -1390,9 +1406,9 @@ static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) atomic_set(&zone->refcount, 0); zone->id = idx; zone->chunk = DMZ_MAP_UNMAPPED; - set_bit(DMZ_RND, &zone->flags); + set_bit(DMZ_CACHE, &zone->flags); zone->wp_block = 0; - zmd->nr_rnd_zones++; + zmd->nr_cache_zones++; zmd->nr_useable_zones++; if (dev->capacity - zone_offset < zmd->zone_nr_sectors) { /* Disable runt zone */ @@ -1647,7 +1663,9 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) dzone->chunk = chunk; dmz_get_zone_weight(zmd, dzone); - if (dmz_is_rnd(dzone)) + if (dmz_is_cache(dzone)) + list_add_tail(&dzone->link, &zmd->map_cache_list); + else if (dmz_is_rnd(dzone)) list_add_tail(&dzone->link, &zmd->map_rnd_list); else list_add_tail(&dzone->link, &zmd->map_seq_list); @@ -1664,7 +1682,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) } bzone = dmz_get(zmd, bzone_id); - if (!dmz_is_rnd(bzone)) { + if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) { dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", chunk, bzone_id); return -EIO; @@ -1676,7 +1694,10 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) bzone->bzone = dzone; dzone->bzone = bzone; dmz_get_zone_weight(zmd, bzone); - list_add_tail(&bzone->link, &zmd->map_rnd_list); + if (dmz_is_cache(bzone)) + list_add_tail(&bzone->link, &zmd->map_cache_list); + else + list_add_tail(&bzone->link, &zmd->map_rnd_list); next: chunk++; e++; @@ -1693,8 +1714,12 @@ next: dzone = dmz_get(zmd, i); if (dmz_is_meta(dzone)) continue; + if (dmz_is_offline(dzone)) + continue; - if (dmz_is_rnd(dzone)) + if (dmz_is_cache(dzone)) + zmd->nr_cache++; + else if (dmz_is_rnd(dzone)) zmd->nr_rnd++; else zmd->nr_seq++; @@ -1707,7 +1732,10 @@ next: /* Unmapped data zone */ set_bit(DMZ_DATA, &dzone->flags); dzone->chunk = DMZ_MAP_UNMAPPED; - if (dmz_is_rnd(dzone)) { + if (dmz_is_cache(dzone)) { + list_add_tail(&dzone->link, &zmd->unmap_cache_list); + atomic_inc(&zmd->unmap_nr_cache); + } else if (dmz_is_rnd(dzone)) { 
list_add_tail(&dzone->link, &zmd->unmap_rnd_list); atomic_inc(&zmd->unmap_nr_rnd); } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { @@ -1751,6 +1779,9 @@ static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (dmz_is_seq(zone)) { /* LRU rotate sequential zone */ list_add_tail(&zone->link, &zmd->map_seq_list); + } else if (dmz_is_cache(zone)) { + /* LRU rotate cache zone */ + list_add_tail(&zone->link, &zmd->map_cache_list); } else { /* LRU rotate random zone */ list_add_tail(&zone->link, &zmd->map_rnd_list); @@ -1826,17 +1857,19 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) } /* - * Select a random write zone for reclaim. + * Select a cache or random write zone for reclaim. */ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) { struct dm_zone *dzone = NULL; struct dm_zone *zone; + struct list_head *zone_list = &zmd->map_rnd_list; - if (list_empty(&zmd->map_rnd_list)) - return ERR_PTR(-EBUSY); + /* If we have cache zones select from the cache zone list */ + if (zmd->nr_cache) + zone_list = &zmd->map_cache_list; - list_for_each_entry(zone, &zmd->map_rnd_list, link) { + list_for_each_entry(zone, zone_list, link) { if (dmz_is_buf(zone)) dzone = zone->bzone; else @@ -1855,9 +1888,6 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) { struct dm_zone *zone; - if (list_empty(&zmd->map_seq_list)) - return ERR_PTR(-EBUSY); - list_for_each_entry(zone, &zmd->map_seq_list, link) { if (!zone->bzone) continue; @@ -1907,6 +1937,7 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu unsigned int dzone_id; struct dm_zone *dzone = NULL; int ret = 0; + int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; dmz_lock_map(zmd); again: @@ -1921,7 +1952,7 @@ again: goto out; /* Allocate a random zone */ - dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); + dzone = dmz_alloc_zone(zmd, alloc_flags); if (!dzone) { if (dmz_dev_is_dying(zmd)) { dzone = ERR_PTR(-EIO); @@ -2014,6 +2045,7 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, struct dm_zone *dzone) { struct dm_zone *bzone; + int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; dmz_lock_map(zmd); again: @@ -2022,7 +2054,7 @@ again: goto out; /* Allocate a random zone */ - bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); + bzone = dmz_alloc_zone(zmd, alloc_flags); if (!bzone) { if (dmz_dev_is_dying(zmd)) { bzone = ERR_PTR(-EIO); @@ -2039,7 +2071,10 @@ again: bzone->chunk = dzone->chunk; bzone->bzone = dzone; dzone->bzone = bzone; - list_add_tail(&bzone->link, &zmd->map_rnd_list); + if (dmz_is_cache(bzone)) + list_add_tail(&bzone->link, &zmd->map_cache_list); + else + list_add_tail(&bzone->link, &zmd->map_rnd_list); out: dmz_unlock_map(zmd); @@ -2055,31 +2090,46 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) struct list_head *list; struct dm_zone *zone; - if (flags & DMZ_ALLOC_RND) + if (flags & DMZ_ALLOC_CACHE) + list = &zmd->unmap_cache_list; + else if (flags & DMZ_ALLOC_RND) list = &zmd->unmap_rnd_list; else list = &zmd->unmap_seq_list; + again: if (list_empty(list)) { /* - * No free zone: if this is for reclaim, allow using the - * reserved sequential zones. + * No free zone: return NULL if this is for not reclaim. 
*/ - if (!(flags & DMZ_ALLOC_RECLAIM) || - list_empty(&zmd->reserved_seq_zones_list)) + if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; - - zone = list_first_entry(&zmd->reserved_seq_zones_list, - struct dm_zone, link); - list_del_init(&zone->link); - atomic_dec(&zmd->nr_reserved_seq_zones); + /* + * Use sequential write zones if we started off with random + * zones and the list is empty + */ + if (list == &zmd->unmap_rnd_list) { + list = &zmd->unmap_seq_list; + goto again; + } + /* + * Fallback to the reserved sequential zones + */ + zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list, + struct dm_zone, link); + if (zone) { + list_del_init(&zone->link); + atomic_dec(&zmd->nr_reserved_seq_zones); + } return zone; } zone = list_first_entry(list, struct dm_zone, link); list_del_init(&zone->link); - if (dmz_is_rnd(zone)) + if (dmz_is_cache(zone)) + atomic_dec(&zmd->unmap_nr_cache); + else if (dmz_is_rnd(zone)) atomic_dec(&zmd->unmap_nr_rnd); else atomic_dec(&zmd->unmap_nr_seq); @@ -2110,7 +2160,10 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) dmz_reset_zone(zmd, zone); /* Return the zone to its type unmap list */ - if (dmz_is_rnd(zone)) { + if (dmz_is_cache(zone)) { + list_add_tail(&zone->link, &zmd->unmap_cache_list); + atomic_inc(&zmd->unmap_nr_cache); + } else if (dmz_is_rnd(zone)) { list_add_tail(&zone->link, &zmd->unmap_rnd_list); atomic_inc(&zmd->unmap_nr_rnd); } else if (atomic_read(&zmd->nr_reserved_seq_zones) < @@ -2136,7 +2189,9 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, dmz_set_chunk_mapping(zmd, chunk, dzone->id, DMZ_MAP_UNMAPPED); dzone->chunk = chunk; - if (dmz_is_rnd(dzone)) + if (dmz_is_cache(dzone)) + list_add_tail(&dzone->link, &zmd->map_cache_list); + else if (dmz_is_rnd(dzone)) list_add_tail(&dzone->link, &zmd->map_rnd_list); else list_add_tail(&dzone->link, &zmd->map_seq_list); @@ -2711,6 +2766,10 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, INIT_LIST_HEAD(&zmd->unmap_rnd_list); INIT_LIST_HEAD(&zmd->map_rnd_list); + atomic_set(&zmd->unmap_nr_cache, 0); + INIT_LIST_HEAD(&zmd->unmap_cache_list); + INIT_LIST_HEAD(&zmd->map_cache_list); + atomic_set(&zmd->unmap_nr_seq, 0); INIT_LIST_HEAD(&zmd->unmap_seq_list); INIT_LIST_HEAD(&zmd->map_seq_list); @@ -2733,7 +2792,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { zone = dmz_get(zmd, zmd->sb[0].zone->id + i); - if (!dmz_is_rnd(zone)) { + if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) { dmz_zmd_err(zmd, "metadata zone %d is not random", i); ret = -ENXIO; @@ -2785,6 +2844,8 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->nr_meta_zones * 2); dmz_zmd_debug(zmd, " %u data zones for %u chunks", zmd->nr_data_zones, zmd->nr_chunks); + dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)", + zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache)); dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 201177ad1f17..d566dedcd8b8 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -43,13 +43,13 @@ enum { * Percentage of unmapped (free) random zones below which reclaim starts * even if the target is busy. 
*/ -#define DMZ_RECLAIM_LOW_UNMAP_RND 30 +#define DMZ_RECLAIM_LOW_UNMAP_ZONES 30 /* * Percentage of unmapped (free) random zones above which reclaim will * stop if the target is busy. */ -#define DMZ_RECLAIM_HIGH_UNMAP_RND 50 +#define DMZ_RECLAIM_HIGH_UNMAP_ZONES 50 /* * Align a sequential zone write pointer to chunk_block. @@ -281,17 +281,21 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dm_zone *szone = NULL; struct dmz_metadata *zmd = zrc->metadata; int ret; + int alloc_flags = dmz_nr_cache_zones(zmd) ? + DMZ_ALLOC_RND : DMZ_ALLOC_SEQ; /* Get a free sequential zone */ dmz_lock_map(zmd); - szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM); + szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); dmz_unlock_map(zmd); if (!szone) return -ENOSPC; - DMDEBUG("(%s): Chunk %u, move rnd zone %u (weight %u) to seq zone %u", - dmz_metadata_label(zmd), - chunk, dzone->id, dmz_weight(dzone), szone->id); + DMDEBUG("(%s): Chunk %u, move %s zone %u (weight %u) to %s zone %u", + dmz_metadata_label(zmd), chunk, + dmz_is_cache(dzone) ? "cache" : "rnd", + dzone->id, dmz_weight(dzone), + dmz_is_rnd(szone) ? "rnd" : "seq", szone->id); /* Flush the random data zone into the sequential zone */ ret = dmz_reclaim_copy(zrc, dzone, szone); @@ -356,7 +360,7 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) return -EBUSY; start = jiffies; - if (dmz_is_rnd(dzone)) { + if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) { if (!dmz_weight(dzone)) { /* Empty zone */ dmz_reclaim_empty(zrc, dzone); @@ -422,29 +426,41 @@ static inline int dmz_target_idle(struct dmz_reclaim *zrc) return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); } -/* - * Test if reclaim is necessary. - */ -static bool dmz_should_reclaim(struct dmz_reclaim *zrc) +static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) { struct dmz_metadata *zmd = zrc->metadata; + unsigned int nr_cache = dmz_nr_cache_zones(zmd); unsigned int nr_rnd = dmz_nr_rnd_zones(zmd); - unsigned int nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); - unsigned int p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd; + unsigned int nr_unmap, nr_zones; + if (nr_cache) { + nr_zones = nr_cache; + nr_unmap = dmz_nr_unmap_cache_zones(zmd); + } else { + nr_zones = nr_rnd; + nr_unmap = dmz_nr_unmap_rnd_zones(zmd); + } + return nr_unmap * 100 / nr_zones; +} + +/* + * Test if reclaim is necessary. + */ +static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) +{ /* Reclaim when idle */ - if (dmz_target_idle(zrc) && nr_unmap_rnd < nr_rnd) + if (dmz_target_idle(zrc) && p_unmap < 100) return true; - /* If there are still plenty of random zones, do not reclaim */ - if (p_unmap_rnd >= DMZ_RECLAIM_HIGH_UNMAP_RND) + /* If there are still plenty of cache zones, do not reclaim */ + if (p_unmap >= DMZ_RECLAIM_HIGH_UNMAP_ZONES) return false; /* - * If the percentage of unmapped random zones is low, + * If the percentage of unmapped cache zones is low, * reclaim even if the target is busy. 
*/ - return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND; + return p_unmap <= DMZ_RECLAIM_LOW_UNMAP_ZONES; } /* @@ -454,14 +470,14 @@ static void dmz_reclaim_work(struct work_struct *work) { struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); struct dmz_metadata *zmd = zrc->metadata; - unsigned int nr_rnd, nr_unmap_rnd; - unsigned int p_unmap_rnd; + unsigned int p_unmap; int ret; if (dmz_dev_is_dying(zmd)) return; - if (!dmz_should_reclaim(zrc)) { + p_unmap = dmz_reclaim_percentage(zrc); + if (!dmz_should_reclaim(zrc, p_unmap)) { mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); return; } @@ -472,22 +488,22 @@ static void dmz_reclaim_work(struct work_struct *work) * and slower if there are still some free random zones to avoid * as much as possible to negatively impact the user workload. */ - nr_rnd = dmz_nr_rnd_zones(zmd); - nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); - p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd; - if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) { + if (dmz_target_idle(zrc) || p_unmap < DMZ_RECLAIM_LOW_UNMAP_ZONES / 2) { /* Idle or very low percentage: go fast */ zrc->kc_throttle.throttle = 100; } else { /* Busy but we still have some random zone: throttle */ - zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2); + zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - DMDEBUG("(%s): Reclaim (%u): %s, %u%% free rnd zones (%u/%u)", + DMDEBUG("(%s): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", dmz_metadata_label(zmd), zrc->kc_throttle.throttle, (dmz_target_idle(zrc) ? "Idle" : "Busy"), - p_unmap_rnd, nr_unmap_rnd, nr_rnd); + p_unmap, dmz_nr_unmap_cache_zones(zmd), + dmz_nr_cache_zones(zmd), + dmz_nr_unmap_rnd_zones(zmd), + dmz_nr_rnd_zones(zmd)); ret = dmz_do_reclaim(zrc); if (ret) { @@ -585,7 +601,9 @@ void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc) */ void dmz_schedule_reclaim(struct dmz_reclaim *zrc) { - if (dmz_should_reclaim(zrc)) + unsigned int p_unmap = dmz_reclaim_percentage(zrc); + + if (dmz_should_reclaim(zrc, p_unmap)) mod_delayed_work(zrc->wq, &zrc->work, 0); } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b586fc67d931..2770e293a97b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -190,7 +190,8 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks", dmz_metadata_label(zmd), (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), + (dmz_is_rnd(zone) ? "RND" : + (dmz_is_cache(zone) ? "CACHE" : "SEQ")), zone->id, (unsigned long long)chunk_block, nr_blocks); @@ -198,7 +199,8 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone, bzone = zone->bzone; while (chunk_block < end_block) { nr_blocks = 0; - if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) { + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block < zone->wp_block) { /* Test block validity in the data zone */ ret = dmz_block_valid(zmd, zone, chunk_block); if (ret < 0) @@ -331,11 +333,13 @@ static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone, DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks", dmz_metadata_label(zmd), (unsigned long long)dmz_bio_chunk(zmd, bio), - (dmz_is_rnd(zone) ? "RND" : "SEQ"), + (dmz_is_rnd(zone) ? "RND" : + (dmz_is_cache(zone) ? 
"CACHE" : "SEQ")), zone->id, (unsigned long long)chunk_block, nr_blocks); - if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) { + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block == zone->wp_block) { /* * zone is a random zone or it is a sequential zone * and the BIO is aligned to the zone write pointer: @@ -381,7 +385,8 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone, * Invalidate blocks in the data zone and its * buffer zone if one is mapped. */ - if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) + if (dmz_is_rnd(zone) || dmz_is_cache(zone) || + chunk_block < zone->wp_block) ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks); if (ret == 0 && zone->bzone) ret = dmz_invalidate_blocks(zmd, zone->bzone, @@ -1065,8 +1070,10 @@ static void dmz_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%u zones %u/%u random %u/%u sequential", + DMEMIT("%u zones %u/%u cache %u/%u random %u/%u sequential", dmz_nr_zones(dmz->metadata), + dmz_nr_unmap_cache_zones(dmz->metadata), + dmz_nr_cache_zones(dmz->metadata), dmz_nr_unmap_rnd_zones(dmz->metadata), dmz_nr_rnd_zones(dmz->metadata), dmz_nr_unmap_seq_zones(dmz->metadata), diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 4971a765be55..29e01a853f84 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -111,6 +111,7 @@ struct dm_zone { */ enum { /* Zone write type */ + DMZ_CACHE, DMZ_RND, DMZ_SEQ, @@ -131,6 +132,7 @@ enum { /* * Zone data accessors. */ +#define dmz_is_cache(z) test_bit(DMZ_CACHE, &(z)->flags) #define dmz_is_rnd(z) test_bit(DMZ_RND, &(z)->flags) #define dmz_is_seq(z) test_bit(DMZ_SEQ, &(z)->flags) #define dmz_is_empty(z) ((z)->wp_block == 0) @@ -189,7 +191,9 @@ bool dmz_check_dev(struct dmz_metadata *zmd); bool dmz_dev_is_dying(struct dmz_metadata *zmd); #define DMZ_ALLOC_RND 0x01 -#define DMZ_ALLOC_RECLAIM 0x02 +#define DMZ_ALLOC_CACHE 0x02 +#define DMZ_ALLOC_SEQ 0x04 +#define DMZ_ALLOC_RECLAIM 0x10 struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags); void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone); @@ -198,6 +202,8 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone, unsigned int chunk); void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); -- cgit v1.2.3-59-g8ed1b From 90a9b8693f1b84a695864f2b416cba9bde107268 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:21 +0200 Subject: dm zoned: reclaim random zones when idle When the system is idle we should be starting reclaiming random zones, too. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 13 +++++++++---- drivers/md/dm-zoned-reclaim.c | 30 +++++++++++++++++++----------- drivers/md/dm-zoned.h | 2 +- 3 files changed, 29 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 6c009a8b36a4..b5fd67eff046 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1859,15 +1859,20 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) /* * Select a cache or random write zone for reclaim. */ -static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) +static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, + bool idle) { struct dm_zone *dzone = NULL; struct dm_zone *zone; struct list_head *zone_list = &zmd->map_rnd_list; /* If we have cache zones select from the cache zone list */ - if (zmd->nr_cache) + if (zmd->nr_cache) { zone_list = &zmd->map_cache_list; + /* Try to relaim random zones, too, when idle */ + if (idle && list_empty(zone_list)) + zone_list = &zmd->map_rnd_list; + } list_for_each_entry(zone, zone_list, link) { if (dmz_is_buf(zone)) @@ -1901,7 +1906,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) /* * Select a zone for reclaim. */ -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd) +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle) { struct dm_zone *zone; @@ -1917,7 +1922,7 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd) if (list_empty(&zmd->reserved_seq_zones_list)) zone = dmz_get_seq_zone_for_reclaim(zmd); else - zone = dmz_get_rnd_zone_for_reclaim(zmd); + zone = dmz_get_rnd_zone_for_reclaim(zmd, idle); dmz_unlock_map(zmd); return zone; diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d566dedcd8b8..bd62245d4556 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -284,7 +284,10 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) int alloc_flags = dmz_nr_cache_zones(zmd) ? DMZ_ALLOC_RND : DMZ_ALLOC_SEQ; - /* Get a free sequential zone */ + /* Always use sequential zones to reclaim random zones */ + if (dmz_is_rnd(dzone)) + alloc_flags = DMZ_ALLOC_SEQ; + /* Get a free random or sequential zone */ dmz_lock_map(zmd); szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); dmz_unlock_map(zmd); @@ -343,6 +346,14 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); } +/* + * Test if the target device is idle. + */ +static inline int dmz_target_idle(struct dmz_reclaim *zrc) +{ + return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); +} + /* * Find a candidate zone for reclaim and process it. */ @@ -355,7 +366,7 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) int ret; /* Get a data zone */ - dzone = dmz_get_zone_for_reclaim(zmd); + dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); if (!dzone) return -EBUSY; @@ -418,14 +429,6 @@ out: return 0; } -/* - * Test if the target device is idle. 
- */ -static inline int dmz_target_idle(struct dmz_reclaim *zrc) -{ - return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD); -} - static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) { struct dmz_metadata *zmd = zrc->metadata; @@ -448,8 +451,13 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) */ static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) { + unsigned int nr_reclaim = dmz_nr_rnd_zones(zrc->metadata); + + if (dmz_nr_cache_zones(zrc->metadata)) + nr_reclaim += dmz_nr_cache_zones(zrc->metadata); + /* Reclaim when idle */ - if (dmz_target_idle(zrc) && p_unmap < 100) + if (dmz_target_idle(zrc) && nr_reclaim) return true; /* If there are still plenty of cache zones, do not reclaim */ diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 29e01a853f84..288054dd7cf4 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -240,7 +240,7 @@ static inline bool dmz_is_active(struct dm_zone *zone) int dmz_lock_zone_reclaim(struct dm_zone *zone); void dmz_unlock_zone_reclaim(struct dm_zone *zone); -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd); +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle); struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op); -- cgit v1.2.3-59-g8ed1b From c5c7885952927384837a070793698d996cb0fbf3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:22 +0200 Subject: dm zoned: start reclaim with sequential zones Sequential zones perform better for reclaim, so start off using them and only use random zones as a fallback when cache zones are present. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index bd62245d4556..d62f6890b92c 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -281,15 +281,16 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dm_zone *szone = NULL; struct dmz_metadata *zmd = zrc->metadata; int ret; - int alloc_flags = dmz_nr_cache_zones(zmd) ? - DMZ_ALLOC_RND : DMZ_ALLOC_SEQ; + int alloc_flags = DMZ_ALLOC_SEQ; - /* Always use sequential zones to reclaim random zones */ - if (dmz_is_rnd(dzone)) - alloc_flags = DMZ_ALLOC_SEQ; /* Get a free random or sequential zone */ dmz_lock_map(zmd); +again: szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); + if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) { + alloc_flags = DMZ_ALLOC_RND; + goto again; + } dmz_unlock_map(zmd); if (!szone) return -ENOSPC; -- cgit v1.2.3-59-g8ed1b From a16b7dee302d2040d9e1fedff2161d1aceda0e8c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 19 May 2020 10:14:23 +0200 Subject: dm zoned: terminate reclaim on congestion When dmz_get_chunk_mapping() selects a zone which is under reclaim we should terminate the reclaim copy process. Since we're changing the zone itself, reclaim needs to run afterwards again anyway. 
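The mechanism is a small flag handshake between the mapping path and the reclaim copy loop; because the two sides sit in different files in the diff below, here is a condensed sketch using the names the patch introduces (control flow simplified):

	/* Mapping side (dmz_wait_for_reclaim): ask reclaim to stop, wait briefly. */
	set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
	clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);

	/* Reclaim side (dmz_reclaim_copy loop): bail out so the zone is released. */
	if (dmz_reclaim_should_terminate(src_zone))
		return -EINTR;

The resulting -EINTR is then treated as a benign outcome by the reclaim work function rather than as a device error.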
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 2 ++ drivers/md/dm-zoned-reclaim.c | 6 ++++-- drivers/md/dm-zoned.h | 3 +++ 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b5fd67eff046..db0dc2b5d44d 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1851,7 +1851,9 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) { dmz_unlock_map(zmd); dmz_unlock_metadata(zmd); + set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ); + clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); dmz_lock_metadata(zmd); dmz_lock_map(zmd); } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d62f6890b92c..571bc1d41bab 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -143,6 +143,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, if (dst_dev->flags & DMZ_BDEV_DYING) return -EIO; + if (dmz_reclaim_should_terminate(src_zone)) + return -EINTR; + /* Get a valid region from the source zone */ ret = dmz_first_valid_block(zmd, src_zone, &block); if (ret <= 0) @@ -515,7 +518,7 @@ static void dmz_reclaim_work(struct work_struct *work) dmz_nr_rnd_zones(zmd)); ret = dmz_do_reclaim(zrc); - if (ret) { + if (ret && ret != -EINTR) { DMDEBUG("(%s): Reclaim error %d", dmz_metadata_label(zmd), ret); if (!dmz_check_dev(zmd)) @@ -615,4 +618,3 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc) if (dmz_should_reclaim(zrc, p_unmap)) mod_delayed_work(zrc->wq, &zrc->work, 0); } - diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 288054dd7cf4..8083607b9535 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -127,6 +127,7 @@ enum { /* Zone internal state */ DMZ_RECLAIM, DMZ_SEQ_WRITE_ERR, + DMZ_RECLAIM_TERMINATE, }; /* @@ -140,6 +141,8 @@ enum { #define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags) #define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags) #define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags) +#define dmz_reclaim_should_terminate(z) \ + test_bit(DMZ_RECLAIM_TERMINATE, &(z)->flags) #define dmz_is_meta(z) test_bit(DMZ_META, &(z)->flags) #define dmz_is_buf(z) test_bit(DMZ_BUF, &(z)->flags) -- cgit v1.2.3-59-g8ed1b From b4756d43a1dd2cfb778eb3cef3ba2efd2dcd5263 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 22 May 2020 10:58:53 +0200 Subject: dm zoned: remove leftover hunk for switching to sequential zones Remove a leftover hunk to switch from random zones to sequential zones when selecting a reclaim zone; the logic has moved into the caller and this hunk is now pointless. 
Fixes: 34f5affd04c4 ("dm zoned: separate random and cache zones") Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index db0dc2b5d44d..4a2e351365c5 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2111,14 +2111,6 @@ again: */ if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; - /* - * Use sequential write zones if we started off with random - * zones and the list is empty - */ - if (list == &zmd->unmap_rnd_list) { - list = &zmd->unmap_seq_list; - goto again; - } /* * Fallback to the reserved sequential zones */ -- cgit v1.2.3-59-g8ed1b From a1c979f330cb82cae7a3b19464f9815e43060fe3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 27 May 2020 07:04:46 -0400 Subject: dm bufio: delete unused and inefficient dm_bufio_discard_buffers There is no user for this interface. If in future it is needed it can be reimplemented to walk the rbtree of buffers instead of doing block-by-block lookups. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 26 -------------------------- include/linux/dm-bufio.h | 7 ------- 2 files changed, 33 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index bf289be1ee3a..993e624e506c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1358,32 +1358,6 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -/* - * Free the specified range of buffers. If a buffer is held by other process, it - * is not freed. If a buffer is dirty, it is discarded without writeback. - * Finally, send the discard request to the device. - */ -int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count) -{ - sector_t i; - - for (i = block; i < block + count; i++) { - struct dm_buffer *b; - dm_bufio_lock(c); - b = __find(c, i); - if (b && likely(!b->hold_count)) { - wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); - wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); - __unlink_buffer(b); - __free_buffer_wake(b); - } - dm_bufio_unlock(c); - } - - return dm_bufio_issue_discard(c, block, count); -} -EXPORT_SYMBOL_GPL(dm_bufio_discard_buffers); - /* * We first delete any other buffer that may be at that new location. * diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 07e1f163e299..5ec6bfbde9ae 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -123,13 +123,6 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c); */ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count); -/* - * Free the specified range of buffers. If a buffer is held by other process, it - * is not freed. If a buffer is dirty, it is discarded without writeback. - * Finally, send the discard request to the device. - */ -int dm_bufio_discard_buffers(struct dm_bufio_client *c, sector_t block, sector_t count); - /* * Like dm_bufio_release but also move the buffer to the new * block. dm_bufio_write_dirty_buffers is needed to commit the new block. 
-- cgit v1.2.3-59-g8ed1b From 88f878e58879acfdad03e08776c9802f9cd6f26a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 15:34:39 +0200 Subject: dm bufio: clean up rbtree block ordering dm-bufio uses unnatural ordering in the rb-tree - blocks with smaller numbers were put to the right node and blocks with bigger numbers were put to the left node. Reverse that logic so that it's natural. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 993e624e506c..ff19add97e0b 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -256,7 +256,7 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) if (b->block == block) return b; - n = (b->block < block) ? n->rb_left : n->rb_right; + n = block < b->block ? n->rb_left : n->rb_right; } return NULL; @@ -276,8 +276,8 @@ static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) } parent = *new; - new = (found->block < b->block) ? - &((*new)->rb_left) : &((*new)->rb_right); + new = b->block < found->block ? + &found->node.rb_left : &found->node.rb_right; } rb_link_node(&b->node, parent, new); -- cgit v1.2.3-59-g8ed1b From 33a180623b6c35f2727daecb63763955af3af1df Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 15:34:40 +0200 Subject: dm bufio: introduce forget_buffer_locked Introduce a function forget_buffer_locked that forgets a range of buffers. It is more efficient than calling forget_buffer in a loop. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/dm-bufio.h | 7 ++++++ 2 files changed, 63 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff19add97e0b..95f6c544aa01 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -262,6 +262,29 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) return NULL; } +static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + struct dm_buffer *best = NULL; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + if (block <= b->block) { + n = n->rb_left; + best = b; + } else { + n = n->rb_right; + } + } + + return best; +} + static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) { struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; @@ -1434,6 +1457,14 @@ retry: } EXPORT_SYMBOL_GPL(dm_bufio_release_move); +static void forget_buffer_locked(struct dm_buffer *b) +{ + if (likely(!b->hold_count) && likely(!b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } +} + /* * Free the given buffer. 
* @@ -1447,15 +1478,36 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) dm_bufio_lock(c); b = __find(c, block); - if (b && likely(!b->hold_count) && likely(!b->state)) { - __unlink_buffer(b); - __free_buffer_wake(b); - } + if (b) + forget_buffer_locked(b); dm_bufio_unlock(c); } EXPORT_SYMBOL_GPL(dm_bufio_forget); +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) +{ + struct dm_buffer *b; + sector_t end_block = block + n_blocks; + + while (block < end_block) { + dm_bufio_lock(c); + + b = __find_next(c, block); + if (b) { + block = b->block + 1; + forget_buffer_locked(b); + } + + dm_bufio_unlock(c); + + if (!b) + break; + } + +} +EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); + void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) { c->minimum_buffers = n; diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 5ec6bfbde9ae..29d255fdd5d6 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -136,6 +136,13 @@ void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); */ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); +/* + * Free the given range of buffers. + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks); + /* * Set the minimum number of buffers before cleanup happens. */ -- cgit v1.2.3-59-g8ed1b From 334b4fc17275667b38ebfd719714dab0edb83c6a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jun 2020 15:34:41 +0200 Subject: dm ebs: use dm_bufio_forget_buffers Use dm_bufio_forget_buffers instead of a block-by-block loop that calls dm_bufio_forget. dm_bufio_forget_buffers is faster than the loop because it searches for used buffers using rb-tree. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-ebs-target.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index ae3f5fad3b39..44451276f128 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -167,8 +167,8 @@ static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) sector_t blocks, sector = bio->bi_iter.bi_sector; blocks = __nr_blocks(ec, bio); - for (; blocks--; sector += ec->u_bs) - dm_bufio_forget(ec->bufio, __sector_to_block(ec, sector)); + + dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks); } /* Worker funtion to process incoming bios. 
*/ -- cgit v1.2.3-59-g8ed1b From 35d0c96e422a484bbc5d4921fa20dcc880bfba2c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:42 +0200 Subject: dm zoned: add debugging message for reading superblocks Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 4a2e351365c5..ef1524d5928a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1105,6 +1105,10 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) */ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) { + dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu", + set, zmd->sb[set].dev->name, + zmd->sb[set].block); + return dmz_rdwr_block(zmd->sb[set].dev, REQ_OP_READ, zmd->sb[set].block, zmd->sb[set].mblk->page); } -- cgit v1.2.3-59-g8ed1b From 1565929b870fe166c5a57a85d6cb5a2bfe1e6c84 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:43 +0200 Subject: dm zoned: avoid unnecessary device recalulation for secondary superblock The secondary superblock must reside on the same device as the primary superblock, so there is no need to re-calculate the device. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index ef1524d5928a..043ed882970a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1135,7 +1135,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; zmd->sb[1].zone = zmd->sb[0].zone + 1; - zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = zmd->sb[0].dev; for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { if (dmz_read_sb(zmd, 1) != 0) break; @@ -1144,7 +1144,6 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) return 0; } zmd->sb[1].block += zone_nr_blocks; - zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone + i); } dmz_free_mblock(zmd, mblk); @@ -1263,7 +1262,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) if (!zmd->sb[1].zone) zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); - zmd->sb[1].dev = dmz_zone_to_dev(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = zmd->sb[0].dev; ret = dmz_get_sb(zmd, 1); } else ret = dmz_lookup_secondary_sb(zmd); -- cgit v1.2.3-59-g8ed1b From c3ff479dde9f77d044c164f3ff5443bbe2b6c72d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:44 +0200 Subject: dm zoned: improve logging messages for reclaim Instead of just reporting the errno, add some more verbose debugging message in the reclaim path. 
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 571bc1d41bab..fd4d47dfcea1 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -371,8 +371,11 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); - if (!dzone) + if (!dzone) { + DMDEBUG("(%s): No zone found to reclaim", + dmz_metadata_label(zmd)); return -EBUSY; + } start = jiffies; if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) { @@ -416,6 +419,12 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) } out: if (ret) { + if (ret == -EINTR) + DMDEBUG("(%s): reclaim zone %u interrupted", + dmz_metadata_label(zmd), rzone->id); + else + DMDEBUG("(%s): Failed to reclaim zone %u, err %d", + dmz_metadata_label(zmd), rzone->id, ret); dmz_unlock_zone_reclaim(dzone); return ret; } @@ -519,8 +528,6 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret && ret != -EINTR) { - DMDEBUG("(%s): Reclaim error %d", - dmz_metadata_label(zmd), ret); if (!dmz_check_dev(zmd)) return; } -- cgit v1.2.3-59-g8ed1b From aec67b4ffa4bea4a02063d9a0f379e5795d6f5dc Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:45 +0200 Subject: dm zoned: add a 'reserved' zone flag Instead of counting the number of reserved zones in dmz_free_zone(), mark the zone as 'reserved' during allocation and simplify dmz_free_zone(). Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 4 ++-- drivers/md/dm-zoned.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 043ed882970a..0982ab1758a6 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1743,6 +1743,7 @@ next: atomic_inc(&zmd->unmap_nr_rnd); } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); + set_bit(DMZ_RESERVED, &dzone->flags); atomic_inc(&zmd->nr_reserved_seq_zones); zmd->nr_seq--; } else { @@ -2168,8 +2169,7 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) } else if (dmz_is_rnd(zone)) { list_add_tail(&zone->link, &zmd->unmap_rnd_list); atomic_inc(&zmd->unmap_nr_rnd); - } else if (atomic_read(&zmd->nr_reserved_seq_zones) < - zmd->nr_reserved_seq) { + } else if (dmz_is_reserved(zone)) { list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); atomic_inc(&zmd->nr_reserved_seq_zones); } else { diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 8083607b9535..3451b5a768b4 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -123,6 +123,7 @@ enum { DMZ_META, DMZ_DATA, DMZ_BUF, + DMZ_RESERVED, /* Zone internal state */ DMZ_RECLAIM, @@ -140,6 +141,7 @@ enum { #define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags) #define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags) #define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags) +#define dmz_is_reserved(z) test_bit(DMZ_RESERVED, &(z)->flags) #define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags) #define dmz_reclaim_should_terminate(z) \ test_bit(DMZ_RECLAIM_TERMINATE, &(z)->flags) -- cgit v1.2.3-59-g8ed1b From 
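An xarray only allocates interior nodes for index ranges that are actually populated, so lookups become short radix-tree walks instead of indexing into one large contiguous allocation. A self-contained sketch of the access pattern the conversion switches to (standalone example, not the patch code; the real dmz_insert()/dmz_get() below also allocate the struct dm_zone and handle errors):

	#include <linux/xarray.h>

	static DEFINE_XARRAY(example_zones);

	static int example_store_zone(unsigned int zone_id, struct dm_zone *zone)
	{
		/* xa_insert() returns -EBUSY if zone_id is already populated. */
		return xa_insert(&example_zones, zone_id, zone, GFP_KERNEL);
	}

	static struct dm_zone *example_load_zone(unsigned int zone_id)
	{
		/* RCU-safe lookup; returns NULL if nothing is stored at zone_id. */
		return xa_load(&example_zones, zone_id);
	}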
a92fbc446d1a93950b7e25bec6ad75dd26f01ba8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:46 +0200 Subject: dm zoned: convert to xarray The zones array is getting really large, and large arrays tend to wreak havoc with the CPU caches. So convert it to xarray to become more cache friendly. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Colin Ian King # fix leak in dmz_insert Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 122 ++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 32 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 0982ab1758a6..b235283a8846 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -172,7 +172,7 @@ struct dmz_metadata { unsigned int nr_chunks; /* Zone information array */ - struct dm_zone *zones; + struct xarray zones; struct dmz_sb sb[3]; unsigned int mblk_primary; @@ -327,6 +327,32 @@ unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_seq); } +static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) +{ + return xa_load(&zmd->zones, zone_id); +} + +static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, + unsigned int zone_id) +{ + struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL); + + if (!zone) + return ERR_PTR(-ENOMEM); + + if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) { + kfree(zone); + return ERR_PTR(-EBUSY); + } + + INIT_LIST_HEAD(&zone->link); + atomic_set(&zone->refcount, 0); + zone->id = zone_id; + zone->chunk = DMZ_MAP_UNMAPPED; + + return zone; +} + const char *dmz_metadata_label(struct dmz_metadata *zmd) { return (const char *)zmd->label; @@ -1122,6 +1148,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) { unsigned int zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; + unsigned int zone_id = zmd->sb[0].zone->id; int i; /* Allocate a block */ @@ -1134,16 +1161,15 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; - zmd->sb[1].zone = zmd->sb[0].zone + 1; + zmd->sb[1].zone = dmz_get(zmd, zone_id + 1); zmd->sb[1].dev = zmd->sb[0].dev; - for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { + for (i = 1; i < zmd->nr_rnd_zones; i++) { if (dmz_read_sb(zmd, 1) != 0) break; - if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) { - zmd->sb[1].zone += i; + if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) return 0; - } zmd->sb[1].block += zone_nr_blocks; + zmd->sb[1].zone = dmz_get(zmd, zone_id + i); } dmz_free_mblock(zmd, mblk); @@ -1259,8 +1285,12 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check secondary super block */ if (ret == 0) { sb_good[0] = true; - if (!zmd->sb[1].zone) - zmd->sb[1].zone = zmd->sb[0].zone + zmd->nr_meta_zones; + if (!zmd->sb[1].zone) { + unsigned int zone_id = + zmd->sb[0].zone->id + zmd->nr_meta_zones; + + zmd->sb[1].zone = dmz_get(zmd, zone_id); + } zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); zmd->sb[1].dev = zmd->sb[0].dev; ret = dmz_get_sb(zmd, 1); @@ -1341,7 +1371,11 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) struct dmz_metadata *zmd = data; struct dmz_dev *dev = zmd->nr_devs > 1 ? 
&zmd->dev[1] : &zmd->dev[0]; int idx = num + dev->zone_offset; - struct dm_zone *zone = &zmd->zones[idx]; + struct dm_zone *zone; + + zone = dmz_insert(zmd, idx); + if (IS_ERR(zone)) + return PTR_ERR(zone); if (blkz->len != zmd->zone_nr_sectors) { if (zmd->sb_version > 1) { @@ -1353,11 +1387,6 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) return -ENXIO; } - INIT_LIST_HEAD(&zone->link); - atomic_set(&zone->refcount, 0); - zone->id = idx; - zone->chunk = DMZ_MAP_UNMAPPED; - switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); @@ -1397,18 +1426,17 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) return 0; } -static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) +static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) { int idx; sector_t zone_offset = 0; for(idx = 0; idx < dev->nr_zones; idx++) { - struct dm_zone *zone = &zmd->zones[idx]; + struct dm_zone *zone; - INIT_LIST_HEAD(&zone->link); - atomic_set(&zone->refcount, 0); - zone->id = idx; - zone->chunk = DMZ_MAP_UNMAPPED; + zone = dmz_insert(zmd, idx); + if (IS_ERR(zone)) + return PTR_ERR(zone); set_bit(DMZ_CACHE, &zone->flags); zone->wp_block = 0; zmd->nr_cache_zones++; @@ -1420,6 +1448,7 @@ static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) } zone_offset += zmd->zone_nr_sectors; } + return 0; } /* @@ -1427,8 +1456,15 @@ static void dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) */ static void dmz_drop_zones(struct dmz_metadata *zmd) { - kfree(zmd->zones); - zmd->zones = NULL; + int idx; + + for(idx = 0; idx < zmd->nr_zones; idx++) { + struct dm_zone *zone = xa_load(&zmd->zones, idx); + + kfree(zone); + xa_erase(&zmd->zones, idx); + } + xa_destroy(&zmd->zones); } /* @@ -1460,20 +1496,25 @@ static int dmz_init_zones(struct dmz_metadata *zmd) DMERR("(%s): No zones found", zmd->devname); return -ENXIO; } - zmd->zones = kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); - if (!zmd->zones) - return -ENOMEM; + xa_init(&zmd->zones); DMDEBUG("(%s): Using %zu B for zone information", zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); if (zmd->nr_devs > 1) { - dmz_emulate_zones(zmd, &zmd->dev[0]); + ret = dmz_emulate_zones(zmd, &zmd->dev[0]); + if (ret < 0) { + DMDEBUG("(%s): Failed to emulate zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + /* * Primary superblock zone is always at zone 0 when multiple * drives are present. */ - zmd->sb[0].zone = &zmd->zones[0]; + zmd->sb[0].zone = dmz_get(zmd, 0); zoned_dev = &zmd->dev[1]; } @@ -1576,11 +1617,6 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, return 0; } -static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) -{ - return &zmd->zones[zone_id]; -} - /* * Reset a zone write pointer. 
*/ @@ -1662,6 +1698,11 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) } dzone = dmz_get(zmd, dzone_id); + if (!dzone) { + dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present", + chunk, dzone_id); + return -EIO; + } set_bit(DMZ_DATA, &dzone->flags); dzone->chunk = chunk; dmz_get_zone_weight(zmd, dzone); @@ -1685,6 +1726,11 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) } bzone = dmz_get(zmd, bzone_id); + if (!bzone) { + dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present", + chunk, bzone_id); + return -EIO; + } if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) { dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", chunk, bzone_id); @@ -1715,6 +1761,8 @@ next: */ for (i = 0; i < zmd->nr_zones; i++) { dzone = dmz_get(zmd, i); + if (!dzone) + continue; if (dmz_is_meta(dzone)) continue; if (dmz_is_offline(dzone)) @@ -1978,6 +2026,10 @@ again: } else { /* The chunk is already mapped: get the mapping zone */ dzone = dmz_get(zmd, dzone_id); + if (!dzone) { + dzone = ERR_PTR(-EIO); + goto out; + } if (dzone->chunk != chunk) { dzone = ERR_PTR(-EIO); goto out; @@ -2794,6 +2846,12 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, /* Set metadata zones starting from sb_zone */ for (i = 0; i < zmd->nr_meta_zones << 1; i++) { zone = dmz_get(zmd, zmd->sb[0].zone->id + i); + if (!zone) { + dmz_zmd_err(zmd, + "metadata zone %u not present", i); + ret = -ENXIO; + goto err; + } if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) { dmz_zmd_err(zmd, "metadata zone %d is not random", i); -- cgit v1.2.3-59-g8ed1b From 5d2c74f3ddc010b5812e556715f7605201eff101 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:47 +0200 Subject: dm zoned: allocate temporary superblock for tertiary devices Checking the tertiary superblock just consists of validating UUIDs, crcs, and the generation number; it doesn't have contents which would be required during the actual operation. So allocate a temporary superblock when checking tertiary devices to avoid having to store it together with the 'real' superblocks. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 109 +++++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 48 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b235283a8846..525ac0d80287 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -174,7 +174,7 @@ struct dmz_metadata { /* Zone information array */ struct xarray zones; - struct dmz_sb sb[3]; + struct dmz_sb sb[2]; unsigned int mblk_primary; unsigned int sb_version; u64 sb_gen; @@ -1016,10 +1016,11 @@ err: /* * Check super block. 
*/ -static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, + bool tertiary) { - struct dmz_super *sb = zmd->sb[set].sb; - struct dmz_dev *dev = zmd->sb[set].dev; + struct dmz_super *sb = dsb->sb; + struct dmz_dev *dev = dsb->dev; unsigned int nr_meta_zones, nr_data_zones; u32 crc, stored_crc; u64 gen; @@ -1036,7 +1037,7 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) DMZ_META_VER, zmd->sb_version); return -EINVAL; } - if ((zmd->sb_version < 1) && (set == 2)) { + if (zmd->sb_version < 2 && tertiary) { dmz_dev_err(dev, "Tertiary superblocks are not supported"); return -EINVAL; } @@ -1080,7 +1081,7 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) return -ENXIO; } - if (set == 2) { + if (tertiary) { /* * Generation number should be 0, but it doesn't * really matter if it isn't. @@ -1129,14 +1130,13 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set) /* * Read the first or second super block from disk. */ -static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) { dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu", - set, zmd->sb[set].dev->name, - zmd->sb[set].block); + set, sb->dev->name, sb->block); - return dmz_rdwr_block(zmd->sb[set].dev, REQ_OP_READ, - zmd->sb[set].block, zmd->sb[set].mblk->page); + return dmz_rdwr_block(sb->dev, REQ_OP_READ, + sb->block, sb->mblk->page); } /* @@ -1164,7 +1164,7 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) zmd->sb[1].zone = dmz_get(zmd, zone_id + 1); zmd->sb[1].dev = zmd->sb[0].dev; for (i = 1; i < zmd->nr_rnd_zones; i++) { - if (dmz_read_sb(zmd, 1) != 0) + if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0) break; if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) return 0; @@ -1181,9 +1181,9 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) } /* - * Read the first or second super block from disk. + * Read a super block from disk. 
*/ -static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) { struct dmz_mblock *mblk; int ret; @@ -1193,14 +1193,14 @@ static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) if (!mblk) return -ENOMEM; - zmd->sb[set].mblk = mblk; - zmd->sb[set].sb = mblk->data; + sb->mblk = mblk; + sb->sb = mblk->data; /* Read super block */ - ret = dmz_read_sb(zmd, set); + ret = dmz_read_sb(zmd, sb, set); if (ret) { dmz_free_mblock(zmd, mblk); - zmd->sb[set].mblk = NULL; + sb->mblk = NULL; return ret; } @@ -1274,13 +1274,13 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check the primary super block */ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); zmd->sb[0].dev = dmz_zone_to_dev(zmd, zmd->sb[0].zone); - ret = dmz_get_sb(zmd, 0); + ret = dmz_get_sb(zmd, &zmd->sb[0], 0); if (ret) { dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); return ret; } - ret = dmz_check_sb(zmd, 0); + ret = dmz_check_sb(zmd, &zmd->sb[0], false); /* Read and check secondary super block */ if (ret == 0) { @@ -1293,7 +1293,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) } zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); zmd->sb[1].dev = zmd->sb[0].dev; - ret = dmz_get_sb(zmd, 1); + ret = dmz_get_sb(zmd, &zmd->sb[1], 1); } else ret = dmz_lookup_secondary_sb(zmd); @@ -1302,7 +1302,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) return ret; } - ret = dmz_check_sb(zmd, 1); + ret = dmz_check_sb(zmd, &zmd->sb[1], false); if (ret == 0) sb_good[1] = true; @@ -1347,20 +1347,40 @@ static int dmz_load_sb(struct dmz_metadata *zmd) "Using super block %u (gen %llu)", zmd->mblk_primary, zmd->sb_gen); - if ((zmd->sb_version > 1) && zmd->sb[2].zone) { - zmd->sb[2].block = dmz_start_block(zmd, zmd->sb[2].zone); - zmd->sb[2].dev = dmz_zone_to_dev(zmd, zmd->sb[2].zone); - ret = dmz_get_sb(zmd, 2); - if (ret) { - dmz_dev_err(zmd->sb[2].dev, - "Read tertiary super block failed"); - return ret; + if (zmd->sb_version > 1) { + int i; + struct dmz_sb *sb; + + sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL); + if (!sb) + return -ENOMEM; + for (i = 1; i < zmd->nr_devs; i++) { + sb->block = 0; + sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset); + sb->dev = &zmd->dev[i]; + if (!dmz_is_meta(sb->zone)) { + dmz_dev_err(sb->dev, + "Tertiary super block zone %u not marked as metadata zone", + sb->zone->id); + ret = -EINVAL; + goto out_kfree; + } + ret = dmz_get_sb(zmd, sb, i + 1); + if (ret) { + dmz_dev_err(sb->dev, + "Read tertiary super block failed"); + dmz_free_mblock(zmd, sb->mblk); + goto out_kfree; + } + ret = dmz_check_sb(zmd, sb, true); + dmz_free_mblock(zmd, sb->mblk); + if (ret == -EINVAL) + goto out_kfree; } - ret = dmz_check_sb(zmd, 2); - if (ret == -EINVAL) - return ret; + out_kfree: + kfree(sb); } - return 0; + return ret; } /* @@ -1417,12 +1437,15 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) zmd->sb[0].zone = zone; } } - if (zmd->nr_devs > 1 && !zmd->sb[2].zone) { - /* Tertiary superblock zone */ - zmd->sb[2].zone = zone; + if (zmd->nr_devs > 1 && num == 0) { + /* + * Tertiary superblock zones are always at the + * start of the zoned devices, so mark them + * as metadata zone. 
+ */ + set_bit(DMZ_META, &zone->flags); } } - return 0; } @@ -2860,16 +2883,6 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, } set_bit(DMZ_META, &zone->flags); } - if (zmd->sb[2].zone) { - zone = dmz_get(zmd, zmd->sb[2].zone->id); - if (!zone) { - dmz_zmd_err(zmd, - "Tertiary metadata zone not present"); - ret = -ENXIO; - goto err; - } - set_bit(DMZ_META, &zone->flags); - } /* Load mapping table */ ret = dmz_load_mapping(zmd); if (ret) -- cgit v1.2.3-59-g8ed1b From 8f22272af7a72763fe9f6b710cdcc380fed80f75 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:48 +0200 Subject: dm zoned: add device pointer to struct dm_zone Add a pointer, to the containing device, within struct dm_zone and kill dmz_zone_to_dev(). Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 39 ++++++++++----------------------------- drivers/md/dm-zoned-reclaim.c | 13 +++++-------- drivers/md/dm-zoned-target.c | 2 +- drivers/md/dm-zoned.h | 4 +++- 4 files changed, 19 insertions(+), 39 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 525ac0d80287..68d44506e6f3 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -229,16 +229,10 @@ struct dmz_metadata { */ static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) { - unsigned int zone_id; - if (WARN_ON(!zone)) return 0; - zone_id = zone->id; - if (zmd->nr_devs > 1 && - (zone_id >= zmd->dev[1].zone_offset)) - zone_id -= zmd->dev[1].zone_offset; - return zone_id; + return zone->id - zone->dev->zone_offset; } sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) @@ -255,18 +249,6 @@ sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) return (sector_t)zone_id << zmd->zone_nr_blocks_shift; } -struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone) -{ - if (WARN_ON(!zone)) - return &zmd->dev[0]; - - if (zmd->nr_devs > 1 && - zone->id >= zmd->dev[1].zone_offset) - return &zmd->dev[1]; - - return &zmd->dev[0]; -} - unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd) { return zmd->zone_nr_blocks; @@ -333,7 +315,7 @@ static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) } static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, - unsigned int zone_id) + unsigned int zone_id, struct dmz_dev *dev) { struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL); @@ -349,6 +331,7 @@ static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, atomic_set(&zone->refcount, 0); zone->id = zone_id; zone->chunk = DMZ_MAP_UNMAPPED; + zone->dev = dev; return zone; } @@ -1273,7 +1256,7 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* Read and check the primary super block */ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); - zmd->sb[0].dev = dmz_zone_to_dev(zmd, zmd->sb[0].zone); + zmd->sb[0].dev = zmd->sb[0].zone->dev; ret = dmz_get_sb(zmd, &zmd->sb[0], 0); if (ret) { dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); @@ -1393,7 +1376,7 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) int idx = num + dev->zone_offset; struct dm_zone *zone; - zone = dmz_insert(zmd, idx); + zone = dmz_insert(zmd, idx, dev); if (IS_ERR(zone)) return PTR_ERR(zone); @@ -1457,7 +1440,7 @@ static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) for(idx = 0; idx < dev->nr_zones; idx++) { struct dm_zone *zone; - 
zone = dmz_insert(zmd, idx); + zone = dmz_insert(zmd, idx, dev); if (IS_ERR(zone)) return PTR_ERR(zone); set_bit(DMZ_CACHE, &zone->flags); @@ -1583,7 +1566,7 @@ static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, */ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; unsigned int noio_flag; int ret; @@ -1620,7 +1603,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, struct dm_zone *zone) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; unsigned int wp = 0; int ret; @@ -1657,7 +1640,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) return 0; if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), @@ -2218,9 +2201,7 @@ again: goto again; } if (dmz_is_meta(zone)) { - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); - - dmz_dev_warn(dev, "Zone %u has metadata", zone->id); + dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id); zone = NULL; goto again; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index fd4d47dfcea1..e9e3b730e258 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -58,7 +58,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, sector_t block) { struct dmz_metadata *zmd = zrc->metadata; - struct dmz_dev *dev = dmz_zone_to_dev(zmd, zone); + struct dmz_dev *dev = zone->dev; sector_t wp_block = zone->wp_block; unsigned int nr_blocks; int ret; @@ -116,7 +116,6 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, struct dm_zone *src_zone, struct dm_zone *dst_zone) { struct dmz_metadata *zmd = zrc->metadata; - struct dmz_dev *src_dev, *dst_dev; struct dm_io_region src, dst; sector_t block = 0, end_block; sector_t nr_blocks; @@ -130,17 +129,15 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, else end_block = dmz_zone_nr_blocks(zmd); src_zone_block = dmz_start_block(zmd, src_zone); - src_dev = dmz_zone_to_dev(zmd, src_zone); dst_zone_block = dmz_start_block(zmd, dst_zone); - dst_dev = dmz_zone_to_dev(zmd, dst_zone); if (dmz_is_seq(dst_zone)) set_bit(DM_KCOPYD_WRITE_SEQ, &flags); while (block < end_block) { - if (src_dev->flags & DMZ_BDEV_DYING) + if (src_zone->dev->flags & DMZ_BDEV_DYING) return -EIO; - if (dst_dev->flags & DMZ_BDEV_DYING) + if (dst_zone->dev->flags & DMZ_BDEV_DYING) return -EIO; if (dmz_reclaim_should_terminate(src_zone)) @@ -163,11 +160,11 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, return ret; } - src.bdev = src_dev->bdev; + src.bdev = src_zone->dev->bdev; src.sector = dmz_blk2sect(src_zone_block + block); src.count = dmz_blk2sect(nr_blocks); - dst.bdev = dst_dev->bdev; + dst.bdev = dst_zone->dev->bdev; dst.sector = dmz_blk2sect(dst_zone_block + block); dst.count = src.count; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 2770e293a97b..087dd4801663 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -123,7 +123,7 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, { struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); - struct dmz_dev *dev = dmz_zone_to_dev(dmz->metadata, zone); + struct dmz_dev *dev = 
zone->dev; struct bio *clone; if (dev->flags & DMZ_BDEV_DYING) diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 3451b5a768b4..316344bf07bd 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -80,6 +80,9 @@ struct dm_zone { /* For listing the zone depending on its state */ struct list_head link; + /* Device containing this zone */ + struct dmz_dev *dev; + /* Zone type and state */ unsigned long flags; @@ -190,7 +193,6 @@ const char *dmz_metadata_label(struct dmz_metadata *zmd); sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone); sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_chunks(struct dmz_metadata *zmd); -struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone); bool dmz_check_dev(struct dmz_metadata *zmd); bool dmz_dev_is_dying(struct dmz_metadata *zmd); -- cgit v1.2.3-59-g8ed1b From 18979819b57ecbc598a8e27d925ab4bb9e145cf0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:49 +0200 Subject: dm zoned: add metadata pointer to struct dmz_dev Add a metadata pointer within struct dmz_dev and use it as argument for blkdev_report_zones() instead of the metadata itself. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 14 +++++++++----- drivers/md/dm-zoned.h | 7 ++++--- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 68d44506e6f3..71f263a78515 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1371,8 +1371,8 @@ static int dmz_load_sb(struct dmz_metadata *zmd) */ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) { - struct dmz_metadata *zmd = data; - struct dmz_dev *dev = zmd->nr_devs > 1 ? &zmd->dev[1] : &zmd->dev[0]; + struct dmz_dev *dev = data; + struct dmz_metadata *zmd = dev->metadata; int idx = num + dev->zone_offset; struct dm_zone *zone; @@ -1495,8 +1495,12 @@ static int dmz_init_zones(struct dmz_metadata *zmd) /* Allocate zone array */ zmd->nr_zones = 0; - for (i = 0; i < zmd->nr_devs; i++) - zmd->nr_zones += zmd->dev[i].nr_zones; + for (i = 0; i < zmd->nr_devs; i++) { + struct dmz_dev *dev = &zmd->dev[i]; + + dev->metadata = zmd; + zmd->nr_zones += dev->nr_zones; + } if (!zmd->nr_zones) { DMERR("(%s): No zones found", zmd->devname); @@ -1531,7 +1535,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd) * first randomly writable zone. */ ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES, - dmz_init_zone, zmd); + dmz_init_zone, zoned_dev); if (ret < 0) { DMDEBUG("(%s): Failed to report zones, error %d", zmd->devname, ret); diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 316344bf07bd..983f5b5e9fa0 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -45,11 +45,15 @@ #define dmz_bio_block(bio) dmz_sect2blk((bio)->bi_iter.bi_sector) #define dmz_bio_blocks(bio) dmz_sect2blk(bio_sectors(bio)) +struct dmz_metadata; +struct dmz_reclaim; + /* * Zoned block device information. */ struct dmz_dev { struct block_device *bdev; + struct dmz_metadata *metadata; char name[BDEVNAME_SIZE]; uuid_t uuid; @@ -170,9 +174,6 @@ enum { #define dmz_dev_debug(dev, format, args...) 
\ DMDEBUG("(%s): " format, (dev)->name, ## args) -struct dmz_metadata; -struct dmz_reclaim; - /* * Functions defined in dm-zoned-metadata.c */ -- cgit v1.2.3-59-g8ed1b From f97809aec58995a87a9a30cb45c9a6148377df64 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:50 +0200 Subject: dm zoned: per-device reclaim Instead of having one reclaim workqueue for the entire set we should be allocating a reclaim workqueue per device; doing so will reduce contention and should boost performance for a multi-device setup. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 66 +++++++++++++++++++++++++++---------------- drivers/md/dm-zoned-target.c | 41 +++++++++++++++++---------- drivers/md/dm-zoned.h | 38 +++++++++++++------------ 3 files changed, 88 insertions(+), 57 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index e9e3b730e258..09843645248a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -21,6 +21,8 @@ struct dmz_reclaim { struct dm_kcopyd_throttle kc_throttle; int kc_err; + int dev_idx; + unsigned long flags; /* Last target access time */ @@ -198,8 +200,8 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret; - DMDEBUG("(%s): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", - dmz_metadata_label(zmd), + DMDEBUG("(%s/%u): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)", + dmz_metadata_label(zmd), zrc->dev_idx, dzone->chunk, bzone->id, dmz_weight(bzone), dzone->id, dmz_weight(dzone)); @@ -237,8 +239,8 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) struct dmz_metadata *zmd = zrc->metadata; int ret = 0; - DMDEBUG("(%s): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", - dmz_metadata_label(zmd), + DMDEBUG("(%s/%u): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)", + dmz_metadata_label(zmd), zrc->dev_idx, chunk, dzone->id, dmz_weight(dzone), bzone->id, dmz_weight(bzone)); @@ -295,8 +297,8 @@ again: if (!szone) return -ENOSPC; - DMDEBUG("(%s): Chunk %u, move %s zone %u (weight %u) to %s zone %u", - dmz_metadata_label(zmd), chunk, + DMDEBUG("(%s/%u): Chunk %u, move %s zone %u (weight %u) to %s zone %u", + dmz_metadata_label(zmd), zrc->dev_idx, chunk, dmz_is_cache(dzone) ? "cache" : "rnd", dzone->id, dmz_weight(dzone), dmz_is_rnd(szone) ? 
"rnd" : "seq", szone->id); @@ -369,8 +371,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); if (!dzone) { - DMDEBUG("(%s): No zone found to reclaim", - dmz_metadata_label(zmd)); + DMDEBUG("(%s/%u): No zone found to reclaim", + dmz_metadata_label(zmd), zrc->dev_idx); return -EBUSY; } @@ -417,24 +419,26 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) out: if (ret) { if (ret == -EINTR) - DMDEBUG("(%s): reclaim zone %u interrupted", - dmz_metadata_label(zmd), rzone->id); + DMDEBUG("(%s/%u): reclaim zone %u interrupted", + dmz_metadata_label(zmd), zrc->dev_idx, + rzone->id); else - DMDEBUG("(%s): Failed to reclaim zone %u, err %d", - dmz_metadata_label(zmd), rzone->id, ret); + DMDEBUG("(%s/%u): Failed to reclaim zone %u, err %d", + dmz_metadata_label(zmd), zrc->dev_idx, + rzone->id, ret); dmz_unlock_zone_reclaim(dzone); return ret; } ret = dmz_flush_metadata(zrc->metadata); if (ret) { - DMDEBUG("(%s): Metadata flush for zone %u failed, err %d", - dmz_metadata_label(zmd), rzone->id, ret); + DMDEBUG("(%s/%u): Metadata flush for zone %u failed, err %d", + dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, ret); return ret; } - DMDEBUG("(%s): Reclaimed zone %u in %u ms", - dmz_metadata_label(zmd), + DMDEBUG("(%s/%u): Reclaimed zone %u in %u ms", + dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, jiffies_to_msecs(jiffies - start)); return 0; } @@ -461,10 +465,20 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) */ static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) { - unsigned int nr_reclaim = dmz_nr_rnd_zones(zrc->metadata); + unsigned int nr_reclaim; + + nr_reclaim = dmz_nr_rnd_zones(zrc->metadata); - if (dmz_nr_cache_zones(zrc->metadata)) + if (dmz_nr_cache_zones(zrc->metadata)) { + /* + * The first device in a multi-device + * setup only contains cache zones, so + * never start reclaim there. + */ + if (zrc->dev_idx == 0) + return false; nr_reclaim += dmz_nr_cache_zones(zrc->metadata); + } /* Reclaim when idle */ if (dmz_target_idle(zrc) && nr_reclaim) @@ -488,7 +502,7 @@ static void dmz_reclaim_work(struct work_struct *work) { struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); struct dmz_metadata *zmd = zrc->metadata; - unsigned int p_unmap; + unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0; int ret; if (dmz_dev_is_dying(zmd)) @@ -514,8 +528,11 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - DMDEBUG("(%s): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", - dmz_metadata_label(zmd), + nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); + nr_rnd = dmz_nr_rnd_zones(zmd); + + DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", + dmz_metadata_label(zmd), zrc->dev_idx, zrc->kc_throttle.throttle, (dmz_target_idle(zrc) ? "Idle" : "Busy"), p_unmap, dmz_nr_unmap_cache_zones(zmd), @@ -536,7 +553,7 @@ static void dmz_reclaim_work(struct work_struct *work) * Initialize reclaim. 
*/ int dmz_ctr_reclaim(struct dmz_metadata *zmd, - struct dmz_reclaim **reclaim) + struct dmz_reclaim **reclaim, int idx) { struct dmz_reclaim *zrc; int ret; @@ -547,6 +564,7 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd, zrc->metadata = zmd; zrc->atime = jiffies; + zrc->dev_idx = idx; /* Reclaim kcopyd client */ zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle); @@ -558,8 +576,8 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd, /* Reclaim work */ INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work); - zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM, - dmz_metadata_label(zmd)); + zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s_%d", WQ_MEM_RECLAIM, + dmz_metadata_label(zmd), idx); if (!zrc->wq) { ret = -ENOMEM; goto err; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 087dd4801663..97d63d8e6c19 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -41,6 +41,7 @@ struct dm_chunk_work { */ struct dmz_target { struct dm_dev *ddev[DMZ_MAX_DEVS]; + unsigned int nr_ddevs; unsigned long flags; @@ -50,9 +51,6 @@ struct dmz_target { /* For metadata handling */ struct dmz_metadata *metadata; - /* For reclaim */ - struct dmz_reclaim *reclaim; - /* For chunk work */ struct radix_tree_root chunk_rxtree; struct workqueue_struct *chunk_wq; @@ -404,14 +402,15 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); struct dmz_metadata *zmd = dmz->metadata; struct dm_zone *zone; - int ret; + int i, ret; /* * Write may trigger a zone allocation. So make sure the * allocation can succeed. */ if (bio_op(bio) == REQ_OP_WRITE) - dmz_schedule_reclaim(dmz->reclaim); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_schedule_reclaim(dmz->dev[i].reclaim); dmz_lock_metadata(zmd); @@ -431,6 +430,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, if (zone) { dmz_activate_zone(zone); bioctx->zone = zone; + dmz_reclaim_bio_acc(zone->dev->reclaim); } switch (bio_op(bio)) { @@ -577,7 +577,6 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) bio_list_add(&cw->bio_list, bio); - dmz_reclaim_bio_acc(dmz->reclaim); if (queue_work(dmz->chunk_wq, &cw->work)) dmz_get_chunk_work(cw); out: @@ -822,7 +821,7 @@ static int dmz_fixup_devices(struct dm_target *ti) static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dmz_target *dmz; - int ret; + int ret, i; /* Check arguments */ if (argc < 1 || argc > 2) { @@ -842,6 +841,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(dmz); return -ENOMEM; } + dmz->nr_ddevs = argc; ti->private = dmz; /* Get the target zoned block device */ @@ -916,10 +916,12 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); /* Initialize reclaim */ - ret = dmz_ctr_reclaim(dmz->metadata, &dmz->reclaim); - if (ret) { - ti->error = "Zone reclaim initialization failed"; - goto err_fwq; + for (i = 0; i < dmz->nr_ddevs; i++) { + ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i); + if (ret) { + ti->error = "Zone reclaim initialization failed"; + goto err_fwq; + } } DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)", @@ -952,11 +954,13 @@ err: static void dmz_dtr(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; flush_workqueue(dmz->chunk_wq); destroy_workqueue(dmz->chunk_wq); - dmz_dtr_reclaim(dmz->reclaim); + for (i = 0; i < 
dmz->nr_ddevs; i++) + dmz_dtr_reclaim(dmz->dev[i].reclaim); cancel_delayed_work_sync(&dmz->flush_work); destroy_workqueue(dmz->flush_wq); @@ -1025,9 +1029,11 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) static void dmz_suspend(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; flush_workqueue(dmz->chunk_wq); - dmz_suspend_reclaim(dmz->reclaim); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_suspend_reclaim(dmz->dev[i].reclaim); cancel_delayed_work_sync(&dmz->flush_work); } @@ -1037,9 +1043,11 @@ static void dmz_suspend(struct dm_target *ti) static void dmz_resume(struct dm_target *ti) { struct dmz_target *dmz = ti->private; + int i; queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); - dmz_resume_reclaim(dmz->reclaim); + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_resume_reclaim(dmz->dev[i].reclaim); } static int dmz_iterate_devices(struct dm_target *ti, @@ -1100,7 +1108,10 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, int r = -EINVAL; if (!strcasecmp(argv[0], "reclaim")) { - dmz_schedule_reclaim(dmz->reclaim); + int i; + + for (i = 0; i < dmz->nr_ddevs; i++) + dmz_schedule_reclaim(dmz->dev[i].reclaim); r = 0; } else DMERR("unrecognized message %s", argv[0]); diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 983f5b5e9fa0..0cc3459f78ce 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -54,6 +54,7 @@ struct dmz_reclaim; struct dmz_dev { struct block_device *bdev; struct dmz_metadata *metadata; + struct dmz_reclaim *reclaim; char name[BDEVNAME_SIZE]; uuid_t uuid; @@ -229,23 +230,6 @@ static inline void dmz_activate_zone(struct dm_zone *zone) atomic_inc(&zone->refcount); } -/* - * Deactivate a zone. This decrement the zone reference counter - * indicating that all BIOs to the zone have completed when the count is 0. - */ -static inline void dmz_deactivate_zone(struct dm_zone *zone) -{ - atomic_dec(&zone->refcount); -} - -/* - * Test if a zone is active, that is, has a refcount > 0. - */ -static inline bool dmz_is_active(struct dm_zone *zone) -{ - return atomic_read(&zone->refcount); -} - int dmz_lock_zone_reclaim(struct dm_zone *zone); void dmz_unlock_zone_reclaim(struct dm_zone *zone); struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle); @@ -272,7 +256,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, /* * Functions defined in dm-zoned-reclaim.c */ -int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc); +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc, int idx); void dmz_dtr_reclaim(struct dmz_reclaim *zrc); void dmz_suspend_reclaim(struct dmz_reclaim *zrc); void dmz_resume_reclaim(struct dmz_reclaim *zrc); @@ -285,4 +269,22 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc); bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); bool dmz_check_bdev(struct dmz_dev *dmz_dev); +/* + * Deactivate a zone. This decrement the zone reference counter + * indicating that all BIOs to the zone have completed when the count is 0. + */ +static inline void dmz_deactivate_zone(struct dm_zone *zone) +{ + dmz_reclaim_bio_acc(zone->dev->reclaim); + atomic_dec(&zone->refcount); +} + +/* + * Test if a zone is active, that is, has a refcount > 0. 
+ */ +static inline bool dmz_is_active(struct dm_zone *zone) +{ + return atomic_read(&zone->refcount); +} + #endif /* DM_ZONED_H */ -- cgit v1.2.3-59-g8ed1b From bd82fdabf162fec1404c4e22988b178c4f3dd23b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:51 +0200 Subject: dm zoned: move random and sequential zones into struct dmz_dev Random and sequential zones should be part of the respective device structure to make arbitration between devices possible. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 139 +++++++++++++++++++++++------------------ drivers/md/dm-zoned-reclaim.c | 15 +++-- drivers/md/dm-zoned-target.c | 25 ++++++-- drivers/md/dm-zoned.h | 18 ++++-- 4 files changed, 119 insertions(+), 78 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 71f263a78515..ce17bf3628c6 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -192,21 +192,12 @@ struct dmz_metadata { /* Zone allocation management */ struct mutex map_lock; struct dmz_mblock **map_mblk; - unsigned int nr_rnd; - atomic_t unmap_nr_rnd; - struct list_head unmap_rnd_list; - struct list_head map_rnd_list; unsigned int nr_cache; atomic_t unmap_nr_cache; struct list_head unmap_cache_list; struct list_head map_cache_list; - unsigned int nr_seq; - atomic_t unmap_nr_seq; - struct list_head unmap_seq_list; - struct list_head map_seq_list; - atomic_t nr_reserved_seq_zones; struct list_head reserved_seq_zones_list; @@ -279,14 +270,14 @@ unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) return zmd->nr_chunks; } -unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx) { - return zmd->nr_rnd; + return zmd->dev[idx].nr_rnd; } -unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx) { - return atomic_read(&zmd->unmap_nr_rnd); + return atomic_read(&zmd->dev[idx].unmap_nr_rnd); } unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd) @@ -299,14 +290,14 @@ unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd) return atomic_read(&zmd->unmap_nr_cache); } -unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx) { - return zmd->nr_seq; + return zmd->dev[idx].nr_seq; } -unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx) { - return atomic_read(&zmd->unmap_nr_seq); + return atomic_read(&zmd->dev[idx].unmap_nr_seq); } static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) @@ -1500,6 +1491,14 @@ static int dmz_init_zones(struct dmz_metadata *zmd) dev->metadata = zmd; zmd->nr_zones += dev->nr_zones; + + atomic_set(&dev->unmap_nr_rnd, 0); + INIT_LIST_HEAD(&dev->unmap_rnd_list); + INIT_LIST_HEAD(&dev->map_rnd_list); + + atomic_set(&dev->unmap_nr_seq, 0); + INIT_LIST_HEAD(&dev->unmap_seq_list); + INIT_LIST_HEAD(&dev->map_seq_list); } if (!zmd->nr_zones) { @@ -1720,9 +1719,9 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (dmz_is_cache(dzone)) list_add_tail(&dzone->link, &zmd->map_cache_list); else if (dmz_is_rnd(dzone)) - list_add_tail(&dzone->link, &zmd->map_rnd_list); + list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); else - list_add_tail(&dzone->link, &zmd->map_seq_list); + list_add_tail(&dzone->link, 
&dzone->dev->map_seq_list); /* Check buffer zone */ bzone_id = le32_to_cpu(dmap[e].bzone_id); @@ -1756,7 +1755,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (dmz_is_cache(bzone)) list_add_tail(&bzone->link, &zmd->map_cache_list); else - list_add_tail(&bzone->link, &zmd->map_rnd_list); + list_add_tail(&bzone->link, &bzone->dev->map_rnd_list); next: chunk++; e++; @@ -1781,9 +1780,9 @@ next: if (dmz_is_cache(dzone)) zmd->nr_cache++; else if (dmz_is_rnd(dzone)) - zmd->nr_rnd++; + dzone->dev->nr_rnd++; else - zmd->nr_seq++; + dzone->dev->nr_seq++; if (dmz_is_data(dzone)) { /* Already initialized */ @@ -1797,16 +1796,18 @@ next: list_add_tail(&dzone->link, &zmd->unmap_cache_list); atomic_inc(&zmd->unmap_nr_cache); } else if (dmz_is_rnd(dzone)) { - list_add_tail(&dzone->link, &zmd->unmap_rnd_list); - atomic_inc(&zmd->unmap_nr_rnd); + list_add_tail(&dzone->link, + &dzone->dev->unmap_rnd_list); + atomic_inc(&dzone->dev->unmap_nr_rnd); } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); set_bit(DMZ_RESERVED, &dzone->flags); atomic_inc(&zmd->nr_reserved_seq_zones); - zmd->nr_seq--; + dzone->dev->nr_seq--; } else { - list_add_tail(&dzone->link, &zmd->unmap_seq_list); - atomic_inc(&zmd->unmap_nr_seq); + list_add_tail(&dzone->link, + &dzone->dev->unmap_seq_list); + atomic_inc(&dzone->dev->unmap_nr_seq); } } @@ -1840,13 +1841,13 @@ static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) list_del_init(&zone->link); if (dmz_is_seq(zone)) { /* LRU rotate sequential zone */ - list_add_tail(&zone->link, &zmd->map_seq_list); + list_add_tail(&zone->link, &zone->dev->map_seq_list); } else if (dmz_is_cache(zone)) { /* LRU rotate cache zone */ list_add_tail(&zone->link, &zmd->map_cache_list); } else { /* LRU rotate random zone */ - list_add_tail(&zone->link, &zmd->map_rnd_list); + list_add_tail(&zone->link, &zone->dev->map_rnd_list); } } @@ -1928,14 +1929,24 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, { struct dm_zone *dzone = NULL; struct dm_zone *zone; - struct list_head *zone_list = &zmd->map_rnd_list; + struct list_head *zone_list; /* If we have cache zones select from the cache zone list */ if (zmd->nr_cache) { zone_list = &zmd->map_cache_list; /* Try to relaim random zones, too, when idle */ - if (idle && list_empty(zone_list)) - zone_list = &zmd->map_rnd_list; + if (idle && list_empty(zone_list)) { + int i; + + for (i = 1; i < zmd->nr_devs; i++) { + zone_list = &zmd->dev[i].map_rnd_list; + if (!list_empty(zone_list)) + break; + } + } + } else { + /* Otherwise the random zones are on the first disk */ + zone_list = &zmd->dev[0].map_rnd_list; } list_for_each_entry(zone, zone_list, link) { @@ -1956,12 +1967,17 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) { struct dm_zone *zone; + int i; - list_for_each_entry(zone, &zmd->map_seq_list, link) { - if (!zone->bzone) - continue; - if (dmz_lock_zone_reclaim(zone)) - return zone; + for (i = 0; i < zmd->nr_devs; i++) { + struct dmz_dev *dev = &zmd->dev[i]; + + list_for_each_entry(zone, &dev->map_seq_list, link) { + if (!zone->bzone) + continue; + if (dmz_lock_zone_reclaim(zone)) + return zone; + } } return NULL; @@ -2147,7 +2163,7 @@ again: if (dmz_is_cache(bzone)) list_add_tail(&bzone->link, &zmd->map_cache_list); else - list_add_tail(&bzone->link, &zmd->map_rnd_list); + list_add_tail(&bzone->link, 
&bzone->dev->map_rnd_list); out: dmz_unlock_map(zmd); @@ -2162,21 +2178,27 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) { struct list_head *list; struct dm_zone *zone; + unsigned int dev_idx = 0; +again: if (flags & DMZ_ALLOC_CACHE) list = &zmd->unmap_cache_list; else if (flags & DMZ_ALLOC_RND) - list = &zmd->unmap_rnd_list; + list = &zmd->dev[dev_idx].unmap_rnd_list; else - list = &zmd->unmap_seq_list; + list = &zmd->dev[dev_idx].unmap_seq_list; -again: if (list_empty(list)) { /* * No free zone: return NULL if this is for not reclaim. */ if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; + if (dev_idx < zmd->nr_devs) { + dev_idx++; + goto again; + } + /* * Fallback to the reserved sequential zones */ @@ -2195,9 +2217,9 @@ again: if (dmz_is_cache(zone)) atomic_dec(&zmd->unmap_nr_cache); else if (dmz_is_rnd(zone)) - atomic_dec(&zmd->unmap_nr_rnd); + atomic_dec(&zone->dev->unmap_nr_rnd); else - atomic_dec(&zmd->unmap_nr_seq); + atomic_dec(&zone->dev->unmap_nr_seq); if (dmz_is_offline(zone)) { dmz_zmd_warn(zmd, "Zone %u is offline", zone->id); @@ -2227,14 +2249,14 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) list_add_tail(&zone->link, &zmd->unmap_cache_list); atomic_inc(&zmd->unmap_nr_cache); } else if (dmz_is_rnd(zone)) { - list_add_tail(&zone->link, &zmd->unmap_rnd_list); - atomic_inc(&zmd->unmap_nr_rnd); + list_add_tail(&zone->link, &zone->dev->unmap_rnd_list); + atomic_inc(&zone->dev->unmap_nr_rnd); } else if (dmz_is_reserved(zone)) { list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); atomic_inc(&zmd->nr_reserved_seq_zones); } else { - list_add_tail(&zone->link, &zmd->unmap_seq_list); - atomic_inc(&zmd->unmap_nr_seq); + list_add_tail(&zone->link, &zone->dev->unmap_seq_list); + atomic_inc(&zone->dev->unmap_nr_seq); } wake_up_all(&zmd->free_wq); @@ -2254,9 +2276,9 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, if (dmz_is_cache(dzone)) list_add_tail(&dzone->link, &zmd->map_cache_list); else if (dmz_is_rnd(dzone)) - list_add_tail(&dzone->link, &zmd->map_rnd_list); + list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); else - list_add_tail(&dzone->link, &zmd->map_seq_list); + list_add_tail(&dzone->link, &dzone->dev->map_seq_list); } /* @@ -2824,18 +2846,11 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, INIT_LIST_HEAD(&zmd->mblk_dirty_list); mutex_init(&zmd->map_lock); - atomic_set(&zmd->unmap_nr_rnd, 0); - INIT_LIST_HEAD(&zmd->unmap_rnd_list); - INIT_LIST_HEAD(&zmd->map_rnd_list); atomic_set(&zmd->unmap_nr_cache, 0); INIT_LIST_HEAD(&zmd->unmap_cache_list); INIT_LIST_HEAD(&zmd->map_cache_list); - atomic_set(&zmd->unmap_nr_seq, 0); - INIT_LIST_HEAD(&zmd->unmap_seq_list); - INIT_LIST_HEAD(&zmd->map_seq_list); - atomic_set(&zmd->nr_reserved_seq_zones, 0); INIT_LIST_HEAD(&zmd->reserved_seq_zones_list); @@ -2904,10 +2919,14 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->nr_data_zones, zmd->nr_chunks); dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)", zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache)); - dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", - zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); - dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", - zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); + for (i = 0; i < zmd->nr_devs; i++) { + dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", + dmz_nr_rnd_zones(zmd, i), + dmz_nr_unmap_rnd_zones(zmd, i)); + dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", + dmz_nr_seq_zones(zmd, i), + dmz_nr_unmap_seq_zones(zmd, i)); 
+ } dmz_zmd_debug(zmd, " %u reserved sequential data zones", zmd->nr_reserved_seq); dmz_zmd_debug(zmd, "Format:"); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 09843645248a..18edf1b9bf52 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -447,15 +447,14 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) { struct dmz_metadata *zmd = zrc->metadata; unsigned int nr_cache = dmz_nr_cache_zones(zmd); - unsigned int nr_rnd = dmz_nr_rnd_zones(zmd); unsigned int nr_unmap, nr_zones; if (nr_cache) { nr_zones = nr_cache; nr_unmap = dmz_nr_unmap_cache_zones(zmd); } else { - nr_zones = nr_rnd; - nr_unmap = dmz_nr_unmap_rnd_zones(zmd); + nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx); + nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); } return nr_unmap * 100 / nr_zones; } @@ -467,7 +466,7 @@ static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap) { unsigned int nr_reclaim; - nr_reclaim = dmz_nr_rnd_zones(zrc->metadata); + nr_reclaim = dmz_nr_rnd_zones(zrc->metadata, zrc->dev_idx); if (dmz_nr_cache_zones(zrc->metadata)) { /* @@ -528,8 +527,8 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd); - nr_rnd = dmz_nr_rnd_zones(zmd); + nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); + nr_rnd = dmz_nr_rnd_zones(zmd, zrc->dev_idx); DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", dmz_metadata_label(zmd), zrc->dev_idx, @@ -537,8 +536,8 @@ static void dmz_reclaim_work(struct work_struct *work) (dmz_target_idle(zrc) ? "Idle" : "Busy"), p_unmap, dmz_nr_unmap_cache_zones(zmd), dmz_nr_cache_zones(zmd), - dmz_nr_unmap_rnd_zones(zmd), - dmz_nr_rnd_zones(zmd)); + dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx), + dmz_nr_rnd_zones(zmd, zrc->dev_idx)); ret = dmz_do_reclaim(zrc); if (ret && ret != -EINTR) { diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 97d63d8e6c19..aa3d26d16441 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1075,17 +1075,30 @@ static void dmz_status(struct dm_target *ti, status_type_t type, ssize_t sz = 0; char buf[BDEVNAME_SIZE]; struct dmz_dev *dev; + int i; switch (type) { case STATUSTYPE_INFO: - DMEMIT("%u zones %u/%u cache %u/%u random %u/%u sequential", + DMEMIT("%u zones %u/%u cache", dmz_nr_zones(dmz->metadata), dmz_nr_unmap_cache_zones(dmz->metadata), - dmz_nr_cache_zones(dmz->metadata), - dmz_nr_unmap_rnd_zones(dmz->metadata), - dmz_nr_rnd_zones(dmz->metadata), - dmz_nr_unmap_seq_zones(dmz->metadata), - dmz_nr_seq_zones(dmz->metadata)); + dmz_nr_cache_zones(dmz->metadata)); + for (i = 0; i < DMZ_MAX_DEVS; i++) { + if (!dmz->ddev[i]) + continue; + /* + * For a multi-device setup the first device + * contains only cache zones. 
+ */ + if ((i == 0) && + (dmz_nr_cache_zones(dmz->metadata) > 0)) + continue; + DMEMIT(" %u/%u random %u/%u sequential", + dmz_nr_unmap_rnd_zones(dmz->metadata, i), + dmz_nr_rnd_zones(dmz->metadata, i), + dmz_nr_unmap_seq_zones(dmz->metadata, i), + dmz_nr_seq_zones(dmz->metadata, i)); + } break; case STATUSTYPE_TABLE: dev = &dmz->dev[0]; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 0cc3459f78ce..f2a760f62db5 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -67,6 +67,16 @@ struct dmz_dev { unsigned int flags; sector_t zone_nr_sectors; + + unsigned int nr_rnd; + atomic_t unmap_nr_rnd; + struct list_head unmap_rnd_list; + struct list_head map_rnd_list; + + unsigned int nr_seq; + atomic_t unmap_nr_seq; + struct list_head unmap_seq_list; + struct list_head map_seq_list; }; #define dmz_bio_chunk(zmd, bio) ((bio)->bi_iter.bi_sector >> \ @@ -213,10 +223,10 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone); unsigned int dmz_nr_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd); unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd); -unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd); +unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx); +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx); unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd); unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd); unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd); -- cgit v1.2.3-59-g8ed1b From 4dba12881f882b629774796bb8655f5b1415d803 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:52 +0200 Subject: dm zoned: support arbitrary number of devices Remove the hard-coded limit of two devices and support an unlimited number of additional zoned devices. Signed-off-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 15 +++++- drivers/md/dm-zoned-target.c | 104 ++++++++++++++++++++++++----------------- 2 files changed, 74 insertions(+), 45 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index ce17bf3628c6..49bafc86aa9a 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1525,7 +1525,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd) */ zmd->sb[0].zone = dmz_get(zmd, 0); - zoned_dev = &zmd->dev[1]; + for (i = 1; i < zmd->nr_devs; i++) { + zoned_dev = &zmd->dev[i]; + + ret = blkdev_report_zones(zoned_dev->bdev, 0, + BLK_ALL_ZONES, + dmz_init_zone, zoned_dev); + if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + } + return 0; } /* diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index aa3d26d16441..fc30c78dffdb 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -13,8 +13,6 @@ #define DMZ_MIN_BIOS 8192 -#define DMZ_MAX_DEVS 2 - /* * Zone BIO context. */ @@ -40,10 +38,10 @@ struct dm_chunk_work { * Target descriptor. 
*/ struct dmz_target { - struct dm_dev *ddev[DMZ_MAX_DEVS]; + struct dm_dev **ddev; unsigned int nr_ddevs; - unsigned long flags; + unsigned int flags; /* Zoned block device information */ struct dmz_dev *dev; @@ -764,7 +762,7 @@ static void dmz_put_zoned_device(struct dm_target *ti) struct dmz_target *dmz = ti->private; int i; - for (i = 0; i < DMZ_MAX_DEVS; i++) { + for (i = 0; i < dmz->nr_ddevs; i++) { if (dmz->ddev[i]) { dm_put_device(ti, dmz->ddev[i]); dmz->ddev[i] = NULL; @@ -777,21 +775,35 @@ static int dmz_fixup_devices(struct dm_target *ti) struct dmz_target *dmz = ti->private; struct dmz_dev *reg_dev, *zoned_dev; struct request_queue *q; + sector_t zone_nr_sectors = 0; + int i; /* - * When we have two devices, the first one must be a regular block - * device and the second a zoned block device. + * When we have more than on devices, the first one must be a + * regular block device and the others zoned block devices. */ - if (dmz->ddev[0] && dmz->ddev[1]) { + if (dmz->nr_ddevs > 1) { reg_dev = &dmz->dev[0]; if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) { ti->error = "Primary disk is not a regular device"; return -EINVAL; } - zoned_dev = &dmz->dev[1]; - if (zoned_dev->flags & DMZ_BDEV_REGULAR) { - ti->error = "Secondary disk is not a zoned device"; - return -EINVAL; + for (i = 1; i < dmz->nr_ddevs; i++) { + zoned_dev = &dmz->dev[i]; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { + ti->error = "Secondary disk is not a zoned device"; + return -EINVAL; + } + q = bdev_get_queue(zoned_dev->bdev); + if (zone_nr_sectors && + zone_nr_sectors != blk_queue_zone_sectors(q)) { + ti->error = "Zone nr sectors mismatch"; + return -EINVAL; + } + zone_nr_sectors = blk_queue_zone_sectors(q); + zoned_dev->zone_nr_sectors = zone_nr_sectors; + zoned_dev->nr_zones = + blkdev_nr_zones(zoned_dev->bdev->bd_disk); } } else { reg_dev = NULL; @@ -800,17 +812,24 @@ static int dmz_fixup_devices(struct dm_target *ti) ti->error = "Disk is not a zoned device"; return -EINVAL; } + q = bdev_get_queue(zoned_dev->bdev); + zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); + zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); } - q = bdev_get_queue(zoned_dev->bdev); - zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); - zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); if (reg_dev) { - reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors; + sector_t zone_offset; + + reg_dev->zone_nr_sectors = zone_nr_sectors; reg_dev->nr_zones = DIV_ROUND_UP_SECTOR_T(reg_dev->capacity, reg_dev->zone_nr_sectors); - zoned_dev->zone_offset = reg_dev->nr_zones; + reg_dev->zone_offset = 0; + zone_offset = reg_dev->nr_zones; + for (i = 1; i < dmz->nr_ddevs; i++) { + dmz->dev[i].zone_offset = zone_offset; + zone_offset += dmz->dev[i].nr_zones; + } } return 0; } @@ -824,7 +843,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) int ret, i; /* Check arguments */ - if (argc < 1 || argc > 2) { + if (argc < 1) { ti->error = "Invalid argument count"; return -EINVAL; } @@ -835,32 +854,31 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Unable to allocate the zoned target descriptor"; return -ENOMEM; } - dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL); + dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL); if (!dmz->dev) { ti->error = "Unable to allocate the zoned device descriptors"; kfree(dmz); return -ENOMEM; } + dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL); + if (!dmz->ddev) { + ti->error = "Unable to allocate 
the dm device descriptors"; + ret = -ENOMEM; + goto err; + } dmz->nr_ddevs = argc; + ti->private = dmz; /* Get the target zoned block device */ - ret = dmz_get_zoned_device(ti, argv[0], 0, argc); - if (ret) - goto err; - - if (argc == 2) { - ret = dmz_get_zoned_device(ti, argv[1], 1, argc); - if (ret) { - dmz_put_zoned_device(ti); - goto err; - } + for (i = 0; i < argc; i++) { + ret = dmz_get_zoned_device(ti, argv[i], i, argc); + if (ret) + goto err_dev; } ret = dmz_fixup_devices(ti); - if (ret) { - dmz_put_zoned_device(ti); - goto err; - } + if (ret) + goto err_dev; /* Initialize metadata */ ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata, @@ -1056,13 +1074,13 @@ static int dmz_iterate_devices(struct dm_target *ti, struct dmz_target *dmz = ti->private; unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); sector_t capacity; - int r; + int i, r; - capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1); - r = fn(ti, dmz->ddev[0], 0, capacity, data); - if (!r && dmz->ddev[1]) { - capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1); - r = fn(ti, dmz->ddev[1], 0, capacity, data); + for (i = 0; i < dmz->nr_ddevs; i++) { + capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1); + r = fn(ti, dmz->ddev[i], 0, capacity, data); + if (r) + break; } return r; } @@ -1083,9 +1101,7 @@ static void dmz_status(struct dm_target *ti, status_type_t type, dmz_nr_zones(dmz->metadata), dmz_nr_unmap_cache_zones(dmz->metadata), dmz_nr_cache_zones(dmz->metadata)); - for (i = 0; i < DMZ_MAX_DEVS; i++) { - if (!dmz->ddev[i]) - continue; + for (i = 0; i < dmz->nr_ddevs; i++) { /* * For a multi-device setup the first device * contains only cache zones. @@ -1104,8 +1120,8 @@ static void dmz_status(struct dm_target *ti, status_type_t type, dev = &dmz->dev[0]; format_dev_t(buf, dev->bdev->bd_dev); DMEMIT("%s", buf); - if (dmz->dev[1].bdev) { - dev = &dmz->dev[1]; + for (i = 1; i < dmz->nr_ddevs; i++) { + dev = &dmz->dev[i]; format_dev_t(buf, dev->bdev->bd_dev); DMEMIT(" %s", buf); } -- cgit v1.2.3-59-g8ed1b From 22c1ef66c4cbb82baf81a28abedfe8ad20ad9126 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:53 +0200 Subject: dm zoned: allocate zone by device index When allocating a zone, pass in an indicator on which device the zone should be allocated; this increases performance for a multi-device setup because reclaim will now allocate zones on the device for which reclaim is running. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 17 +++++++++++------ drivers/md/dm-zoned-reclaim.c | 3 ++- drivers/md/dm-zoned.h | 3 ++- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 49bafc86aa9a..1a6cdab3e4ef 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2050,7 +2050,7 @@ again: goto out; /* Allocate a random zone */ - dzone = dmz_alloc_zone(zmd, alloc_flags); + dzone = dmz_alloc_zone(zmd, 0, alloc_flags); if (!dzone) { if (dmz_dev_is_dying(zmd)) { dzone = ERR_PTR(-EIO); @@ -2156,7 +2156,7 @@ again: goto out; /* Allocate a random zone */ - bzone = dmz_alloc_zone(zmd, alloc_flags); + bzone = dmz_alloc_zone(zmd, 0, alloc_flags); if (!bzone) { if (dmz_dev_is_dying(zmd)) { bzone = ERR_PTR(-EIO); @@ -2187,11 +2187,12 @@ out: * Get an unmapped (free) zone. * This must be called with the mapping lock held. 
*/ -struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) +struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx, + unsigned long flags) { struct list_head *list; struct dm_zone *zone; - unsigned int dev_idx = 0; + int i = 0; again: if (flags & DMZ_ALLOC_CACHE) @@ -2207,8 +2208,12 @@ again: */ if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; - if (dev_idx < zmd->nr_devs) { - dev_idx++; + /* + * Try to allocate from other devices + */ + if (i < zmd->nr_devs) { + dev_idx = (dev_idx + 1) % zmd->nr_devs; + i++; goto again; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 18edf1b9bf52..5a04e34d17a9 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -288,7 +288,8 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) /* Get a free random or sequential zone */ dmz_lock_map(zmd); again: - szone = dmz_alloc_zone(zmd, alloc_flags | DMZ_ALLOC_RECLAIM); + szone = dmz_alloc_zone(zmd, zrc->dev_idx, + alloc_flags | DMZ_ALLOC_RECLAIM); if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) { alloc_flags = DMZ_ALLOC_RND; goto again; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index f2a760f62db5..ec020bb1caf7 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -214,7 +214,8 @@ bool dmz_dev_is_dying(struct dmz_metadata *zmd); #define DMZ_ALLOC_SEQ 0x04 #define DMZ_ALLOC_RECLAIM 0x10 -struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags); +struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, + unsigned int dev_idx, unsigned long flags); void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone); void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone, -- cgit v1.2.3-59-g8ed1b From 69875d443bc3bb1b2e1f77fe3da5ad5c8c729aa2 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:54 +0200 Subject: dm zoned: select reclaim zone based on device index per-device reclaim should select zones on that device only. Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 50 +++++++++++++++++------------------------- drivers/md/dm-zoned-reclaim.c | 3 ++- drivers/md/dm-zoned-target.c | 1 + drivers/md/dm-zoned.h | 5 ++++- 4 files changed, 27 insertions(+), 32 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 1a6cdab3e4ef..0cb90799b8ce 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1938,7 +1938,7 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) * Select a cache or random write zone for reclaim. 
*/ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, - bool idle) + unsigned int idx, bool idle) { struct dm_zone *dzone = NULL; struct dm_zone *zone; @@ -1948,24 +1948,17 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, if (zmd->nr_cache) { zone_list = &zmd->map_cache_list; /* Try to relaim random zones, too, when idle */ - if (idle && list_empty(zone_list)) { - int i; - - for (i = 1; i < zmd->nr_devs; i++) { - zone_list = &zmd->dev[i].map_rnd_list; - if (!list_empty(zone_list)) - break; - } - } - } else { - /* Otherwise the random zones are on the first disk */ - zone_list = &zmd->dev[0].map_rnd_list; - } + if (idle && list_empty(zone_list)) + zone_list = &zmd->dev[idx].map_rnd_list; + } else + zone_list = &zmd->dev[idx].map_rnd_list; list_for_each_entry(zone, zone_list, link) { - if (dmz_is_buf(zone)) + if (dmz_is_buf(zone)) { dzone = zone->bzone; - else + if (dzone->dev->dev_idx != idx) + continue; + } else dzone = zone; if (dmz_lock_zone_reclaim(dzone)) return dzone; @@ -1977,20 +1970,16 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, /* * Select a buffered sequential zone for reclaim. */ -static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) +static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int idx) { struct dm_zone *zone; - int i; - - for (i = 0; i < zmd->nr_devs; i++) { - struct dmz_dev *dev = &zmd->dev[i]; - list_for_each_entry(zone, &dev->map_seq_list, link) { - if (!zone->bzone) - continue; - if (dmz_lock_zone_reclaim(zone)) - return zone; - } + list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) { + if (!zone->bzone) + continue; + if (dmz_lock_zone_reclaim(zone)) + return zone; } return NULL; @@ -1999,7 +1988,8 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) /* * Select a zone for reclaim. 
*/ -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle) +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int dev_idx, bool idle) { struct dm_zone *zone; @@ -2013,9 +2003,9 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle) */ dmz_lock_map(zmd); if (list_empty(&zmd->reserved_seq_zones_list)) - zone = dmz_get_seq_zone_for_reclaim(zmd); + zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx); else - zone = dmz_get_rnd_zone_for_reclaim(zmd, idle); + zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle); dmz_unlock_map(zmd); return zone; diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 5a04e34d17a9..2261b4dd60b7 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -370,7 +370,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) int ret; /* Get a data zone */ - dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc)); + dzone = dmz_get_zone_for_reclaim(zmd, zrc->dev_idx, + dmz_target_idle(zrc)); if (!dzone) { DMDEBUG("(%s/%u): No zone found to reclaim", dmz_metadata_label(zmd), zrc->dev_idx); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index fc30c78dffdb..a907a9446c0b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -738,6 +738,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path, dev = &dmz->dev[idx]; } dev->bdev = bdev; + dev->dev_idx = idx; (void)bdevname(dev->bdev, dev->name); dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index ec020bb1caf7..22f11440b423 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -61,6 +61,8 @@ struct dmz_dev { sector_t capacity; + unsigned int dev_idx; + unsigned int nr_zones; unsigned int zone_offset; @@ -243,7 +245,8 @@ static inline void dmz_activate_zone(struct dm_zone *zone) int dmz_lock_zone_reclaim(struct dm_zone *zone); void dmz_unlock_zone_reclaim(struct dm_zone *zone); -struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle); +struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, + unsigned int dev_idx, bool idle); struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op); -- cgit v1.2.3-59-g8ed1b From 2094045fe5b5dda98c4ec6cb1ac7b12ba4382856 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:55 +0200 Subject: dm zoned: prefer full zones for reclaim Prefer full zones when selecting the next zone for reclaim. 
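A rough sketch of the selection rule (illustration only, not the exact loop added below; pick_fullest() is a hypothetical helper, and it reuses the dm_zone fields already used by the reclaim patches above, where a zone's weight is its number of valid blocks):

    /*
     * Among the buffered data zones on the reclaim device, prefer the
     * candidate with the largest weight, i.e. the fullest zone.
     */
    static struct dm_zone *pick_fullest(struct list_head *zone_list,
                                        unsigned int dev_idx)
    {
        struct dm_zone *zone, *best = NULL;

        list_for_each_entry(zone, zone_list, link) {
            struct dm_zone *dzone = dmz_is_buf(zone) ? zone->bzone : zone;

            if (dzone->dev->dev_idx != dev_idx)
                continue;
            if (!best || dzone->weight > best->weight)
                best = dzone;
        }
        return best;
    }
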
Signed-off-by: Hannes Reinecke Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 0cb90799b8ce..59a34895f5a8 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1941,7 +1941,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, unsigned int idx, bool idle) { struct dm_zone *dzone = NULL; - struct dm_zone *zone; + struct dm_zone *zone, *last = NULL; struct list_head *zone_list; /* If we have cache zones select from the cache zone list */ @@ -1958,6 +1958,13 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, dzone = zone->bzone; if (dzone->dev->dev_idx != idx) continue; + if (!last) { + last = dzone; + continue; + } + if (last->weight < dzone->weight) + continue; + dzone = last; } else dzone = zone; if (dmz_lock_zone_reclaim(dzone)) -- cgit v1.2.3-59-g8ed1b From 27d49ac1dd751897506ba51df7226fc0ce7ef681 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Jun 2020 13:09:56 +0200 Subject: dm zoned: check superblock location When specifying several devices the superblock location must be checked to ensure the devices are specified in the correct order. Signed-off-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 59a34895f5a8..314ce31a2c43 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -997,7 +997,7 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, struct dmz_dev *dev = dsb->dev; unsigned int nr_meta_zones, nr_data_zones; u32 crc, stored_crc; - u64 gen; + u64 gen, sb_block; if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", @@ -1026,6 +1026,14 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, return -ENXIO; } + sb_block = le64_to_cpu(sb->sb_block); + if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift ) { + dmz_dev_err(dev, "Invalid superblock position " + "(is %llu expected %llu)", + sb_block, + (u64)dsb->zone->id << zmd->zone_nr_blocks_shift); + return -EINVAL; + } if (zmd->sb_version > 1) { uuid_t sb_uuid; -- cgit v1.2.3-59-g8ed1b From a862e4e2154289fc12aa9e70f33614d9c70f3be4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 26 May 2020 16:06:56 -0400 Subject: dm mpath: simplify __must_push_back Remove micro-optimization that infers device is between presuspend and resume (was done purely to avoid call to dm_noflush_suspending, which isn't expensive anyway). Remove flags argument since they are no longer checked. And remove must_push_back_bio() since it was simply a call to __must_push_back(). Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 95f16d816585..4c34d037aa35 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -457,33 +457,15 @@ do { \ /* * Check whether bios must be queued in the device-mapper core rather * than here in the target. 
- * - * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold - * the same value then we are not between multipath_presuspend() - * and multipath_resume() calls and we have no need to check - * for the DMF_NOFLUSH_SUSPENDING flag. */ -static bool __must_push_back(struct multipath *m, unsigned long flags) +static bool __must_push_back(struct multipath *m) { - return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) != - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) && - dm_noflush_suspending(m->ti)); + return dm_noflush_suspending(m->ti); } -/* - * Following functions use READ_ONCE to get atomic access to - * all m->flags to avoid taking spinlock - */ static bool must_push_back_rq(struct multipath *m) { - unsigned long flags = READ_ONCE(m->flags); - return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags); -} - -static bool must_push_back_bio(struct multipath *m) -{ - unsigned long flags = READ_ONCE(m->flags); - return __must_push_back(m, flags); + return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m); } /* @@ -620,7 +602,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, return DM_MAPIO_SUBMITTED; if (!pgpath) { - if (must_push_back_bio(m)) + if (__must_push_back(m)) return DM_MAPIO_REQUEUE; dm_report_EIO(m); return DM_MAPIO_KILL; @@ -1642,7 +1624,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (must_push_back_bio(m)) { + if (__must_push_back(m)) { r = DM_ENDIO_REQUEUE; } else { dm_report_EIO(m); -- cgit v1.2.3-59-g8ed1b From 553ec94cb4b4937b48f81e27de33f71325d1a227 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 27 May 2020 16:32:51 -0400 Subject: dm mpath: restrict queue_if_no_path state machine Do not allow saving disabled queue_if_no_path if already saved as enabled; implies multiple suspends (which shouldn't ever happen). Log if this unlikely scenario is ever triggered. Also, only write MPATHF_SAVED_QUEUE_IF_NO_PATH during presuspend or if "fail_if_no_path" message. MPATHF_SAVED_QUEUE_IF_NO_PATH is no longer always modified, e.g.: even if queue_if_no_path()'s save_old_value argument wasn't set. This just implies a bit tighter control over the management of MPATHF_SAVED_QUEUE_IF_NO_PATH. Side-effect is multipath_resume() doesn't reset MPATHF_QUEUE_IF_NO_PATH unless MPATHF_SAVED_QUEUE_IF_NO_PATH was set (during presuspend); and at that time the MPATHF_SAVED_QUEUE_IF_NO_PATH bit gets cleared. So MPATHF_SAVED_QUEUE_IF_NO_PATH's use is much more narrow in scope. Last, but not least, do _not_ disable queue_if_no_path during noflush suspend. There is no need/benefit to saving off queue_if_no_path via MPATHF_SAVED_QUEUE_IF_NO_PATH and clearing MPATHF_QUEUE_IF_NO_PATH for noflush suspend -- by avoiding this needless queue_if_no_path flag churn there is less potential for MPATHF_QUEUE_IF_NO_PATH to get lost. Which avoids potential for IOs to be errored back up to userspace during DM multipath's handling of path failures. That said, this last change papers over a reported issue concerning request-based dm-multipath's interaction with blk-mq, relative to suspend and resume: multipath_endio is being called _before_ multipath_resume. This should never happen if DM suspend's blk_mq_quiesce_queue() + dm_wait_for_completion() is genuinely waiting for all inflight blk-mq requests to complete. 
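In outline, the suspend-side sequence referred to above looks like this (a simplified sketch with arguments elided; the authoritative code is in drivers/md/dm.c):

    /* request-based dm suspend, in outline */
    blk_mq_quiesce_queue(md->queue);  /* blk-mq stops dispatching new requests */
    dm_wait_for_completion(md, ...);  /* returns only once no requests are in flight */
    /* until both steps finish, the suspend must not complete */
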
Similarly: drivers/md/dm.c:__dm_resume() clearly calls dm_table_resume_targets() _before_ dm_start_queue()'s blk_mq_unquiesce_queue() is called. If the queue isn't even restarted until after multipath_resume(); the BIG question that still needs answering is: how can multipath_end_io beat multipath_resume in a race!? Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 4c34d037aa35..bc846cf7b0d8 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -695,12 +695,25 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, bool save_old_value) { unsigned long flags; + bool queue_if_no_path_bit, saved_queue_if_no_path_bit; spin_lock_irqsave(&m->lock, flags); - assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, - (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || - (!save_old_value && queue_if_no_path)); + + queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + + if (save_old_value) { + if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) { + DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!", + dm_device_name(dm_table_get_md(m->ti->table))); + } else + assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit); + } else if (!queue_if_no_path && saved_queue_if_no_path_bit) { + /* due to "fail_if_no_path" message, need to honor it. */ + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); + spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -1653,16 +1666,19 @@ done: } /* - * Suspend can't complete until all the I/O is processed so if - * the last path fails we must error any remaining I/O. - * Note that if the freeze_bdev fails while suspending, the - * queue_if_no_path state is lost - userspace should reset it. + * Suspend with flush can't complete until all the I/O is processed + * so if the last path fails we must error any remaining I/O. + * - Note that if the freeze_bdev fails while suspending, the + * queue_if_no_path state is lost - userspace should reset it. + * Otherwise, during noflush suspend, queue_if_no_path will not change. 
*/ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; - queue_if_no_path(m, false, true); + /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ + if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) + queue_if_no_path(m, false, true); } static void multipath_postsuspend(struct dm_target *ti) @@ -1683,8 +1699,10 @@ static void multipath_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); + if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } spin_unlock_irqrestore(&m->lock, flags); } -- cgit v1.2.3-59-g8ed1b From 4c3f48380fedbd714fc95958f503c1b5adf3ee6b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 29 May 2020 15:59:13 -0400 Subject: dm mpath: enhance queue_if_no_path debugging Add more DMDEBUG that shows arguments passed and caller, and another that shows state of related flags at end of queue_if_no_path(). Also add queue_if_no_path DMDEBUG to multipath_resume(). Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bc846cf7b0d8..b17da3046611 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -692,10 +692,14 @@ static void process_queued_bios(struct work_struct *work) * If we run out of usable paths, should we queue I/O or error it? */ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, - bool save_old_value) + bool save_old_value, const char *caller) { unsigned long flags; bool queue_if_no_path_bit, saved_queue_if_no_path_bit; + const char *dm_dev_name = dm_device_name(dm_table_get_md(m->ti->table)); + + DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d", + dm_dev_name, __func__, caller, queue_if_no_path, save_old_value); spin_lock_irqsave(&m->lock, flags); @@ -705,7 +709,7 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, if (save_old_value) { if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) { DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!", - dm_device_name(dm_table_get_md(m->ti->table))); + dm_dev_name); } else assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit); } else if (!queue_if_no_path && saved_queue_if_no_path_bit) { @@ -714,6 +718,12 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, } assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); + DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d", + dm_dev_name, __func__, + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), + dm_noflush_suspending(m->ti)); + spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -734,7 +744,7 @@ static void queue_if_no_path_timeout_work(struct timer_list *t) struct mapped_device *md = dm_table_get_md(m->ti->table); DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md)); - queue_if_no_path(m, false, false); + queue_if_no_path(m, false, false, __func__); } /* @@ -1074,7 +1084,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) argc--; if (!strcasecmp(arg_name, "queue_if_no_path")) { - r = 
queue_if_no_path(m, true, false); + r = queue_if_no_path(m, true, false, __func__); continue; } @@ -1678,7 +1688,7 @@ static void multipath_presuspend(struct dm_target *ti) /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) - queue_if_no_path(m, false, true); + queue_if_no_path(m, false, true, __func__); } static void multipath_postsuspend(struct dm_target *ti) @@ -1703,6 +1713,12 @@ static void multipath_resume(struct dm_target *ti) set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); } + + DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d", + dm_device_name(dm_table_get_md(m->ti->table)), __func__, + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); + spin_unlock_irqrestore(&m->lock, flags); } @@ -1862,13 +1878,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { - r = queue_if_no_path(m, true, false); + r = queue_if_no_path(m, true, false, __func__); spin_lock_irqsave(&m->lock, flags); enable_nopath_timeout(m); spin_unlock_irqrestore(&m->lock, flags); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { - r = queue_if_no_path(m, false, false); + r = queue_if_no_path(m, false, false, __func__); disable_nopath_timeout(m); goto out; } -- cgit v1.2.3-59-g8ed1b From 04867370ec40b708bd335df641182ddab0c59685 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 4 Jun 2020 10:44:34 -0400 Subject: dm mpath: add DM device name to Failing/Reinstating path log messages When there are many DM multipath devices it really helps to have additional context for which DM device a failed or reinstated path is part of. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index b17da3046611..78cff42d987e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1285,7 +1285,9 @@ static int fail_path(struct pgpath *pgpath) if (!pgpath->is_active) goto out; - DMWARN("Failing path %s.", pgpath->path.dev->name); + DMWARN("%s: Failing path %s.", + dm_device_name(dm_table_get_md(m->ti->table)), + pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); pgpath->is_active = false; @@ -1324,7 +1326,9 @@ static int reinstate_path(struct pgpath *pgpath) if (pgpath->is_active) goto out; - DMWARN("Reinstating path %s.", pgpath->path.dev->name); + DMWARN("%s: Reinstating path %s.", + dm_device_name(dm_table_get_md(m->ti->table)), + pgpath->path.dev->name); r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); if (r) -- cgit v1.2.3-59-g8ed1b From 64611a15ca9da91ff532982429c44686f4593b5f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Jun 2020 12:01:26 -0700 Subject: dm crypt: avoid truncating the logical block size queue_limits::logical_block_size got changed from unsigned short to unsigned int, but it was forgotten to update crypt_io_hints() to use the new type. Fix it. 
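The truncation is easy to reproduce in isolation (a stand-alone user-space sketch with hypothetical values; max_t() below is a simplified stand-in for the kernel macro):

    #include <stdio.h>

    /* simplified max_t(): cast both operands to 'type' and return the larger */
    #define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

    int main(void)
    {
        unsigned int logical_block_size = 131072; /* hypothetical value > USHRT_MAX */
        unsigned int sector_size = 4096;          /* stands in for cc->sector_size */

        /* old code: 131072 truncates to 0 as unsigned short, so 4096 wins */
        unsigned int before = max_t(unsigned short, logical_block_size, sector_size);
        /* fixed code: the comparison is done in unsigned int, 131072 is kept */
        unsigned int after = max_t(unsigned, logical_block_size, sector_size);

        printf("before fix: %u, after fix: %u\n", before, after);
        return 0;
    }

    /* prints: before fix: 4096, after fix: 131072 */
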
Fixes: ad6bf88a6c19 ("block: fix an integer overflow in logical block size") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 71c651465bdd..000ddfab5ba0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3312,7 +3312,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_segment_size = PAGE_SIZE; limits->logical_block_size = - max_t(unsigned short, limits->logical_block_size, cc->sector_size); + max_t(unsigned, limits->logical_block_size, cc->sector_size); limits->physical_block_size = max_t(unsigned, limits->physical_block_size, cc->sector_size); limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size); -- cgit v1.2.3-59-g8ed1b