From e29158b4fff7b582643dc020343d52773aaef0c1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:46 -0700 Subject: null_blk: Improve nullb_device_##NAME##_store() readability Introduce a local variable to make the code easier to read. This patch does not change any functionality but makes the next patch in this series easier to read. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 0e7da5015ccd..f5747cfd806f 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -274,10 +274,11 @@ static ssize_t \ nullb_device_##NAME##_store(struct config_item *item, const char *page, \ size_t count) \ { \ - if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags)) \ + struct nullb_device *dev = to_nullb_device(item); \ + \ + if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ return -EBUSY; \ - return nullb_device_##TYPE##_attr_store( \ - &to_nullb_device(item)->NAME, page, count); \ + return nullb_device_##TYPE##_attr_store(&dev->NAME, page, count); \ } \ CONFIGFS_ATTR(nullb_device_, NAME); -- cgit v1.2.3-59-g8ed1b From 45919fbfe1c487c17ea1d198534339a5e8abeae3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:47 -0700 Subject: null_blk: Enable modifying 'submit_queues' after an instance has been configured This patch makes it possible to test blk_mq_update_nr_hw_queues() from inside a VM. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 75 ++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index f5747cfd806f..f5e0dffb4624 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -227,7 +227,7 @@ static ssize_t nullb_device_uint_attr_store(unsigned int *val, int result; result = kstrtouint(page, 0, &tmp); - if (result) + if (result < 0) return result; *val = tmp; @@ -241,7 +241,7 @@ static ssize_t nullb_device_ulong_attr_store(unsigned long *val, unsigned long tmp; result = kstrtoul(page, 0, &tmp); - if (result) + if (result < 0) return result; *val = tmp; @@ -255,7 +255,7 @@ static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, int result; result = kstrtobool(page, &tmp); - if (result) + if (result < 0) return result; *val = tmp; @@ -263,7 +263,7 @@ static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, } /* The following macro should only be used with TYPE = {uint, ulong, bool}. */ -#define NULLB_DEVICE_ATTR(NAME, TYPE) \ +#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ static ssize_t \ nullb_device_##NAME##_show(struct config_item *item, char *page) \ { \ @@ -274,32 +274,57 @@ static ssize_t \ nullb_device_##NAME##_store(struct config_item *item, const char *page, \ size_t count) \ { \ + int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY; \ struct nullb_device *dev = to_nullb_device(item); \ + TYPE new_value; \ + int ret; \ \ - if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ - return -EBUSY; \ - return nullb_device_##TYPE##_attr_store(&dev->NAME, page, count); \ + ret = nullb_device_##TYPE##_attr_store(&new_value, page, count); \ + if (ret < 0) \ + return ret; \ + if (apply_fn) \ + ret = apply_fn(dev, new_value); \ + else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ + ret = -EBUSY; \ + if (ret < 0) \ + return ret; \ + dev->NAME = new_value; \ + return count; \ } \ CONFIGFS_ATTR(nullb_device_, NAME); -NULLB_DEVICE_ATTR(size, ulong); -NULLB_DEVICE_ATTR(completion_nsec, ulong); -NULLB_DEVICE_ATTR(submit_queues, uint); -NULLB_DEVICE_ATTR(home_node, uint); -NULLB_DEVICE_ATTR(queue_mode, uint); -NULLB_DEVICE_ATTR(blocksize, uint); -NULLB_DEVICE_ATTR(irqmode, uint); -NULLB_DEVICE_ATTR(hw_queue_depth, uint); -NULLB_DEVICE_ATTR(index, uint); -NULLB_DEVICE_ATTR(blocking, bool); -NULLB_DEVICE_ATTR(use_per_node_hctx, bool); -NULLB_DEVICE_ATTR(memory_backed, bool); -NULLB_DEVICE_ATTR(discard, bool); -NULLB_DEVICE_ATTR(mbps, uint); -NULLB_DEVICE_ATTR(cache_size, ulong); -NULLB_DEVICE_ATTR(zoned, bool); -NULLB_DEVICE_ATTR(zone_size, ulong); -NULLB_DEVICE_ATTR(zone_nr_conv, uint); +static int nullb_apply_submit_queues(struct nullb_device *dev, + unsigned int submit_queues) +{ + struct nullb *nullb = dev->nullb; + struct blk_mq_tag_set *set; + + if (!nullb) + return 0; + + set = nullb->tag_set; + blk_mq_update_nr_hw_queues(set, submit_queues); + return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM; +} + +NULLB_DEVICE_ATTR(size, ulong, NULL); +NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); +NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); +NULLB_DEVICE_ATTR(home_node, uint, NULL); +NULLB_DEVICE_ATTR(queue_mode, uint, NULL); +NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(irqmode, uint, NULL); +NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); +NULLB_DEVICE_ATTR(index, uint, NULL); +NULLB_DEVICE_ATTR(blocking, bool, NULL); +NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); +NULLB_DEVICE_ATTR(memory_backed, bool, NULL); +NULLB_DEVICE_ATTR(discard, bool, NULL); +NULLB_DEVICE_ATTR(mbps, uint, NULL); +NULLB_DEVICE_ATTR(cache_size, ulong, NULL); +NULLB_DEVICE_ATTR(zoned, bool, NULL); +NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { -- cgit v1.2.3-59-g8ed1b From dd85b4922de1b70f0729d2a7856db619e210a8ec Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Thu, 17 Oct 2019 14:19:43 -0700 Subject: null_blk: return fixed zoned reads > write pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A zoned block device maintains a write pointer within a zone, and reads beyond the write pointer are undefined. Fill data buffer returned above the write pointer with 0xFF. Signed-off-by: Ajay Joshi Reviewed-by: Damien Le Moal Reviewed-by: Matias Bjørling Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 8 ++++++++ drivers/block/null_blk_main.c | 26 +++++++++++++++++++++++++- drivers/block/null_blk_zoned.c | 21 +++++++++++++++++++-- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index a235c45e22a7..93c2a3d403da 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -96,6 +96,8 @@ int null_zone_report(struct gendisk *disk, sector_t sector, blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, sector_t nr_sectors); +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len); #else static inline int null_zone_init(struct nullb_device *dev) { @@ -115,5 +117,11 @@ static inline blk_status_t null_handle_zoned(struct nullb_cmd *cmd, { return BLK_STS_NOTSUPP; } +static inline size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, + unsigned int len) +{ + return len; +} #endif /* CONFIG_BLK_DEV_ZONED */ #endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index f5e0dffb4624..ea7a4d6b7848 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1022,6 +1022,16 @@ next: return 0; } +static void nullb_fill_pattern(struct nullb *nullb, struct page *page, + unsigned int len, unsigned int off) +{ + void *dst; + + dst = kmap_atomic(page); + memset(dst + off, 0xFF, len); + kunmap_atomic(dst); +} + static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) { size_t temp; @@ -1062,10 +1072,24 @@ static int null_transfer(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector, bool is_fua) { + struct nullb_device *dev = nullb->dev; + unsigned int valid_len = len; int err = 0; if (!is_write) { - err = copy_from_nullb(nullb, page, off, sector, len); + if (dev->zoned) + valid_len = null_zone_valid_read_len(nullb, + sector, len); + + if (valid_len) { + err = copy_from_nullb(nullb, page, off, + sector, valid_len); + off += valid_len; + len -= valid_len; + } + + if (len) + nullb_fill_pattern(nullb, page, len, off); flush_dcache_page(page); } else { flush_dcache_page(page); diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index eabc116832a7..e020f17dac9f 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -84,6 +84,24 @@ int null_zone_report(struct gendisk *disk, sector_t sector, return 0; } +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len) +{ + struct nullb_device *dev = nullb->dev; + struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + unsigned int nr_sectors = len >> SECTOR_SHIFT; + + /* Read must be below the write pointer position */ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || + sector + nr_sectors <= zone->wp) + return len; + + if (sector > zone->wp) + return 0; + + return (zone->wp - sector) << SECTOR_SHIFT; +} + static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, unsigned int nr_sectors) { @@ -121,8 +139,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector) { struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); - struct blk_zone *zone = &dev->zones[zno]; + struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; size_t i; switch (req_op(cmd->rq)) { -- cgit v1.2.3-59-g8ed1b From e3fc3f3d0943b126f76b8533960e4168412d9e5a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 21 Sep 2019 09:00:31 +0300 Subject: md/raid0: Fix an error message in raid0_make_request() The first argument to WARN() is supposed to be a condition. The original code will just print the mdname() instead of the full warning message. Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.") Signed-off-by: Dan Carpenter Signed-off-by: Song Liu --- drivers/md/raid0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f61693e59684..3956ea502f97 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -615,7 +615,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev = map_sector(mddev, zone, sector, §or); break; default: - WARN("md/raid0:%s: Invalid layout\n", mdname(mddev)); + WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); bio_io_error(bio); return true; } -- cgit v1.2.3-59-g8ed1b From fadcbd2901a0f7c8721f3bdb69eac95c272dc8ed Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 26 Sep 2019 13:53:50 +0200 Subject: md/bitmap: avoid race window between md_bitmap_resize and bitmap_file_clear_bit We need to move "spin_lock_irq(&bitmap->counts.lock)" before unmap previous storage, otherwise panic like belows could happen as follows. [ 902.353802] sdl: detected capacity change from 1077936128 to 3221225472 [ 902.616948] general protection fault: 0000 [#1] SMP [snip] [ 902.618588] CPU: 12 PID: 33698 Comm: md0_raid1 Tainted: G O 4.14.144-1-pserver #4.14.144-1.1~deb10 [ 902.618870] Hardware name: Supermicro SBA-7142G-T4/BHQGE, BIOS 3.00 10/24/2012 [ 902.619120] task: ffff9ae1860fc600 task.stack: ffffb52e4c704000 [ 902.619301] RIP: 0010:bitmap_file_clear_bit+0x90/0xd0 [md_mod] [ 902.619464] RSP: 0018:ffffb52e4c707d28 EFLAGS: 00010087 [ 902.619626] RAX: ffe8008b0d061000 RBX: ffff9ad078c87300 RCX: 0000000000000000 [ 902.619792] RDX: ffff9ad986341868 RSI: 0000000000000803 RDI: ffff9ad078c87300 [ 902.619986] RBP: ffff9ad0ed7a8000 R08: 0000000000000000 R09: 0000000000000000 [ 902.620154] R10: ffffb52e4c707ec0 R11: ffff9ad987d1ed44 R12: ffff9ad0ed7a8360 [ 902.620320] R13: 0000000000000003 R14: 0000000000060000 R15: 0000000000000800 [ 902.620487] FS: 0000000000000000(0000) GS:ffff9ad987d00000(0000) knlGS:0000000000000000 [ 902.620738] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 902.620901] CR2: 000055ff12aecec0 CR3: 0000001005207000 CR4: 00000000000406e0 [ 902.621068] Call Trace: [ 902.621256] bitmap_daemon_work+0x2dd/0x360 [md_mod] [ 902.621429] ? find_pers+0x70/0x70 [md_mod] [ 902.621597] md_check_recovery+0x51/0x540 [md_mod] [ 902.621762] raid1d+0x5c/0xeb0 [raid1] [ 902.621939] ? try_to_del_timer_sync+0x4d/0x80 [ 902.622102] ? del_timer_sync+0x35/0x40 [ 902.622265] ? schedule_timeout+0x177/0x360 [ 902.622453] ? call_timer_fn+0x130/0x130 [ 902.622623] ? find_pers+0x70/0x70 [md_mod] [ 902.622794] ? md_thread+0x94/0x150 [md_mod] [ 902.622959] md_thread+0x94/0x150 [md_mod] [ 902.623121] ? wait_woken+0x80/0x80 [ 902.623280] kthread+0x119/0x130 [ 902.623437] ? kthread_create_on_node+0x60/0x60 [ 902.623600] ret_from_fork+0x22/0x40 [ 902.624225] RIP: bitmap_file_clear_bit+0x90/0xd0 [md_mod] RSP: ffffb52e4c707d28 Because mdadm was running on another cpu to do resize, so bitmap_resize was called to replace bitmap as below shows. PID: 38801 TASK: ffff9ad074a90e00 CPU: 0 COMMAND: "mdadm" [exception RIP: queued_spin_lock_slowpath+56] [snip] -- -- #5 [ffffb52e60f17c58] queued_spin_lock_slowpath at ffffffff9c0b27b8 #6 [ffffb52e60f17c58] bitmap_resize at ffffffffc0399877 [md_mod] #7 [ffffb52e60f17d30] raid1_resize at ffffffffc0285bf9 [raid1] #8 [ffffb52e60f17d50] update_size at ffffffffc038a31a [md_mod] #9 [ffffb52e60f17d70] md_ioctl at ffffffffc0395ca4 [md_mod] And the procedure to keep resize bitmap safe is allocate new storage space, then quiesce, copy bits, replace bitmap, and re-start. However the daemon (bitmap_daemon_work) could happen even the array is quiesced, which means when bitmap_file_clear_bit is triggered by raid1d, then it thinks it should be fine to access store->filemap since counts->lock is held, but resize could change the storage without the protection of the lock. Cc: Jack Wang Cc: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b092c7b5282f..3ad18246fcb3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2139,6 +2139,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); + spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2154,7 +2155,6 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, blocks = min(old_counts.chunks << old_counts.chunkshift, chunks << chunkshift); - spin_lock_irq(&bitmap->counts.lock); /* For cluster raid, need to pre-allocate bitmap */ if (mddev_is_clustered(bitmap->mddev)) { unsigned long page; -- cgit v1.2.3-59-g8ed1b From 775d78319f1ceb32be8eb3b1202ccdc60e9cb7f1 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 16 Sep 2019 13:15:14 -0400 Subject: md: improve handling of bio with REQ_PREFLUSH in md_flush_request() If pers->make_request fails in md_flush_request(), the bio is lost. To fix this, pass back a bool to indicate if the original make_request call should continue to handle the I/O and instead of assuming the flush logic will push it to completion. Convert md_flush_request to return a bool and no longer calls the raid driver's make_request function. If the return is true, then the md flush logic has or will complete the bio and the md make_request call is done. If false, then the md make_request function needs to keep processing like it is a normal bio. Let the original call to md_handle_request handle any need to retry sending the bio to the raid driver's make_request function should it be needed. Also mark md_flush_request and the make_request function pointer as __must_check to issue warnings should these critical return values be ignored. Fixes: 2bc13b83e629 ("md: batch flush requests.") Cc: stable@vger.kernel.org # # v4.19+ Cc: NeilBrown Signed-off-by: David Jeffery Reviewed-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md-linear.c | 5 ++--- drivers/md/md-multipath.c | 5 ++--- drivers/md/md.c | 11 +++++++++-- drivers/md/md.h | 4 ++-- drivers/md/raid0.c | 5 ++--- drivers/md/raid1.c | 5 ++--- drivers/md/raid10.c | 5 ++--- drivers/md/raid5.c | 4 ++-- 8 files changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index c766c559d36d..26c75c0199fa 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -244,10 +244,9 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) sector_t start_sector, end_sector, data_offset; sector_t bio_sector = bio->bi_iter.bi_sector; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } tmp_dev = which_dev(mddev, bio_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 6780938d2991..152f9e65a226 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -104,10 +104,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); diff --git a/drivers/md/md.c b/drivers/md/md.c index 1be7abeb24fd..b8dd56b746da 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -550,7 +550,13 @@ static void md_submit_flush_data(struct work_struct *ws) } } -void md_flush_request(struct mddev *mddev, struct bio *bio) +/* + * Manages consolidation of flushes and submitting any flushes needed for + * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is + * being finished in another context. Returns false if the flushing is + * complete but still needs the I/O portion of the bio to be processed. + */ +bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); @@ -575,9 +581,10 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + return false; } } + return true; } EXPORT_SYMBOL(md_flush_request); diff --git a/drivers/md/md.h b/drivers/md/md.h index c5e3ff398b59..5f86f8adb0a4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -550,7 +550,7 @@ struct md_personality int level; struct list_head list; struct module *owner; - bool (*make_request)(struct mddev *mddev, struct bio *bio); + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that * requires md_thread should go into start() @@ -703,7 +703,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); extern int mddev_congested(struct mddev *mddev, int bits); -extern void md_flush_request(struct mddev *mddev, struct bio *bio); +extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 3956ea502f97..f2b83bd2fee6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -575,10 +575,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) unsigned chunk_sects; unsigned sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { raid0_handle_discard(mddev, bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0466ee2453b4..bb29aeefcbd0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1567,10 +1567,9 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) { sector_t sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } /* * There is a limit to the maximum size, but diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 299c7b1c9718..2eca0a81a8c9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1525,10 +1525,9 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) int chunk_sects = chunk_mask + 1; int sectors = bio_sectors(bio); - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (!md_write_start(mddev, bio)) return false; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 223e97ab27e6..12a8ce83786e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5592,8 +5592,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (ret == 0) return true; if (ret == -ENODEV) { - md_flush_request(mddev, bi); - return true; + if (md_flush_request(mddev, bi)) + return true; } /* ret == -EAGAIN, fallback */ /* -- cgit v1.2.3-59-g8ed1b From 6a5cb53aaa4ef515ddeffa04ce18b771121127b4 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 16 Oct 2019 16:00:03 +0800 Subject: md: no longer compare spare disk superblock events in super_load We have a test case as follow: mdadm -CR /dev/md1 -l 1 -n 4 /dev/sd[a-d] \ --assume-clean --bitmap=internal mdadm -S /dev/md1 mdadm -A /dev/md1 /dev/sd[b-c] --run --force mdadm --zero /dev/sda mdadm /dev/md1 -a /dev/sda echo offline > /sys/block/sdc/device/state echo offline > /sys/block/sdb/device/state sleep 5 mdadm -S /dev/md1 echo running > /sys/block/sdb/device/state echo running > /sys/block/sdc/device/state mdadm -A /dev/md1 /dev/sd[a-c] --run --force When we readd /dev/sda to the array, it started to do recovery. After offline the other two disks in md1, the recovery have been interrupted and superblock update info cannot be written to the offline disks. While the spare disk (/dev/sda) can continue to update superblock info. After stopping the array and assemble it, we found the array run fail, with the follow kernel message: [ 172.986064] md: kicking non-fresh sdb from array! [ 173.004210] md: kicking non-fresh sdc from array! [ 173.022383] md/raid1:md1: active with 0 out of 4 mirrors [ 173.022406] md1: failed to create bitmap (-5) [ 173.023466] md: md1 stopped. Since both sdb and sdc have the value of 'sb->events' smaller than that in sda, they have been kicked from the array. However, the only remained disk sda is in 'spare' state before stop and it cannot be added to conf->mirrors[] array. In the end, raid array assemble and run fail. In fact, we can use the older disk sdb or sdc to assemble the array. That means we should not choose the 'spare' disk as the fresh disk in analyze_sbs(). To fix the problem, we do not compare superblock events when it is a spare disk, as same as validate_super. Signed-off-by: Yufen Yu Signed-off-by: Song Liu --- drivers/md/md.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b8dd56b746da..6f0ecfe8eab2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1156,7 +1156,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->desc_nr = sb->this_disk.number; if (!refdev) { - ret = 1; + /* + * Insist on good event counter while assembling, except + * for spares (which don't need an event count) + */ + if (sb->disks[rdev->desc_nr].state & ( + (1<sb_page); @@ -1172,7 +1180,14 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor } ev1 = md_event(sb); ev2 = md_event(refsb); - if (ev1 > ev2) + + /* + * Insist on good event counter while assembling, except + * for spares (which don't need an event count) + */ + if (sb->disks[rdev->desc_nr].state & ( + (1< ev2)) ret = 1; else ret = 0; @@ -1532,6 +1547,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; + __u64 role; /* * Calculate the position of the superblock in 512byte sectors. @@ -1665,8 +1681,20 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != 0) return -EINVAL; + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + if (!refdev) { - ret = 1; + /* + * Insist of good event counter while assembling, except for + * spares (which don't need an event count) + */ + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (role < MD_DISK_ROLE_MAX || + role == MD_DISK_ROLE_JOURNAL)) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); @@ -1683,7 +1711,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); - if (ev1 > ev2) + /* + * Insist of good event counter while assembling, except for + * spares (which don't need an event count) + */ + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (role < MD_DISK_ROLE_MAX || + role == MD_DISK_ROLE_JOURNAL) && ev1 > ev2) ret = 1; else ret = 0; @@ -3604,7 +3639,7 @@ abort_free: * Check a full RAID array for plausibility */ -static void analyze_sbs(struct mddev *mddev) +static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; @@ -3625,6 +3660,12 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); } + /* Cannot find a valid fresh disk */ + if (!freshest) { + pr_warn("md: cannot find a valid disk\n"); + return -EINVAL; + } + super_types[mddev->major_version]. validate_super(mddev, freshest); @@ -3659,6 +3700,8 @@ static void analyze_sbs(struct mddev *mddev) clear_bit(In_sync, &rdev->flags); } } + + return 0; } /* Read a fixed-point number. @@ -5577,7 +5620,9 @@ int md_run(struct mddev *mddev) if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; - analyze_sbs(mddev); + err = analyze_sbs(mddev); + if (err) + return -EINVAL; } if (mddev->level != LEVEL_NONE) -- cgit v1.2.3-59-g8ed1b From a4414aedf4bc923c1faa24d72d14f14c2b93e48f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:16:41 +0200 Subject: block: mtip32xx: Spelling s/configration/configuration/ Fix misspelling of "configuration". Signed-off-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 964f78cfffa0..f6bafa9a68b9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -129,7 +129,7 @@ struct mtip_compat_ide_task_request_s { /* * This function check_for_surprise_removal is called * while card is removed from the system and it will - * read the vendor id from the configration space + * read the vendor id from the configuration space * * @pdev Pointer to the pci_dev structure. * -- cgit v1.2.3-59-g8ed1b From efcfec579f6139528c9e6925eca2bc4a36da65c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Oct 2019 20:29:48 -0700 Subject: loop: fix no-unmap write-zeroes request behavior Currently, if the loop device receives a WRITE_ZEROES request, it asks the underlying filesystem to punch out the range. This behavior is correct if unmapping is allowed. However, a NOUNMAP request means that the caller doesn't want us to free the storage backing the range, so punching out the range is incorrect behavior. To satisfy a NOUNMAP | WRITE_ZEROES request, loop should ask the underlying filesystem to FALLOC_FL_ZERO_RANGE, which is (according to the fallocate documentation) required to ensure that the entire range is backed by real storage, which suffices for our purposes. Fixes: 19372e2769179dd ("loop: implement REQ_OP_WRITE_ZEROES") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f6f77eaa7217..ef6e251857c8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -417,18 +417,20 @@ out_free_page: return ret; } -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) +static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, + int mode) { /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. + * We use fallocate to manipulate the space mappings used by the image + * a.k.a. discard/zerorange. However we do not support this if + * encryption is enabled, because it may give an attacker useful + * information. */ struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; int ret; + mode |= FALLOC_FL_KEEP_SIZE; + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { ret = -EOPNOTSUPP; goto out; @@ -596,9 +598,17 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); - case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: - return lo_discard(lo, rq, pos); + /* + * If the caller doesn't want deallocation, call zeroout to + * write zeroes the range. Otherwise, punch them out. + */ + return lo_fallocate(lo, rq, pos, + (rq->cmd_flags & REQ_NOUNMAP) ? + FALLOC_FL_ZERO_RANGE : + FALLOC_FL_PUNCH_HOLE); + case REQ_OP_DISCARD: + return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: if (lo->transfer) return lo_write_transfer(lo, rq, pos); -- cgit v1.2.3-59-g8ed1b From f16583614222d015968541f2e50447c67c277f74 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Sep 2019 14:51:34 -0700 Subject: nvme-fc: Sync nvme-fc header to FC-NVME-2 Sync the header to FC-NVME-2 r1.06 (T11-2019-00210-v001). Includes some minor mods where pre-release field names changed by the time the spec was released. Signed-off-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme-fc.h | 182 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 137 insertions(+), 45 deletions(-) diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 067c9fea64fe..e8c30b39bb27 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -4,33 +4,60 @@ */ /* - * This file contains definitions relative to FC-NVME r1.14 (16-020vB). - * The fcnvme_lsdesc_cr_assoc_cmd struct reflects expected r1.16 content. + * This file contains definitions relative to FC-NVME-2 r1.06 + * (T11-2019-00210-v001). */ #ifndef _NVME_FC_H #define _NVME_FC_H 1 +#include -#define NVME_CMD_SCSI_ID 0xFD +#define NVME_CMD_FORMAT_ID 0xFD #define NVME_CMD_FC_ID FC_TYPE_NVME /* FC-NVME Cmd IU Flags */ -#define FCNVME_CMD_FLAGS_DIRMASK 0x03 -#define FCNVME_CMD_FLAGS_WRITE 0x01 -#define FCNVME_CMD_FLAGS_READ 0x02 +enum { + FCNVME_CMD_FLAGS_DIRMASK = 0x03, + FCNVME_CMD_FLAGS_WRITE = (1 << 0), + FCNVME_CMD_FLAGS_READ = (1 << 1), + + FCNVME_CMD_FLAGS_PICWP = (1 << 2), +}; + +enum { + FCNVME_CMD_CAT_MASK = 0x0F, + FCNVME_CMD_CAT_ADMINQ = 0x01, + FCNVME_CMD_CAT_CSSMASK = 0x07, + FCNVME_CMD_CAT_CSSFLAG = 0x08, +}; + +static inline __u8 fccmnd_set_cat_admin(__u8 rsv_cat) +{ + return (rsv_cat & ~FCNVME_CMD_CAT_MASK) | FCNVME_CMD_CAT_ADMINQ; +} + +static inline __u8 fccmnd_set_cat_css(__u8 rsv_cat, __u8 css) +{ + return (rsv_cat & ~FCNVME_CMD_CAT_MASK) | FCNVME_CMD_CAT_CSSFLAG | + (css & FCNVME_CMD_CAT_CSSMASK); +} struct nvme_fc_cmd_iu { - __u8 scsi_id; + __u8 format_id; __u8 fc_id; __be16 iu_len; - __u8 rsvd4[3]; + __u8 rsvd4[2]; + __u8 rsv_cat; __u8 flags; __be64 connection_id; __be32 csn; __be32 data_len; struct nvme_command sqe; - __be32 rsvd88[2]; + __u8 dps; + __u8 lbads; + __be16 ms; + __be32 rsvd92; }; #define NVME_FC_SIZEOF_ZEROS_RSP 12 @@ -38,11 +65,12 @@ struct nvme_fc_cmd_iu { enum { FCNVME_SC_SUCCESS = 0, FCNVME_SC_INVALID_FIELD = 1, - FCNVME_SC_INVALID_CONNID = 2, + /* reserved 2 */ + FCNVME_SC_ILL_CONN_PARAMS = 3, }; struct nvme_fc_ersp_iu { - __u8 status_code; + __u8 ersp_result; __u8 rsvd1; __be16 iu_len; __be32 rsn; @@ -53,14 +81,44 @@ struct nvme_fc_ersp_iu { }; -/* FC-NVME Link Services */ +#define FCNVME_NVME_SR_OPCODE 0x01 + +struct nvme_fc_nvme_sr_iu { + __u8 fc_id; + __u8 opcode; + __u8 rsvd2; + __u8 retry_rctl; + __be32 rsvd4; +}; + + +enum { + FCNVME_SRSTAT_ACC = 0x0, + FCNVME_SRSTAT_INV_FCID = 0x1, + /* reserved 0x2 */ + FCNVME_SRSTAT_LOGICAL_ERR = 0x3, + FCNVME_SRSTAT_INV_QUALIF = 0x4, + FCNVME_SRSTAT_UNABL2PERFORM = 0x9, +}; + +struct nvme_fc_nvme_sr_rsp_iu { + __u8 fc_id; + __u8 opcode; + __u8 rsvd2; + __u8 status; + __be32 rsvd4; +}; + + +/* FC-NVME Link Services - LS cmd values (w0 bits 31:24) */ enum { FCNVME_LS_RSVD = 0, FCNVME_LS_RJT = 1, FCNVME_LS_ACC = 2, - FCNVME_LS_CREATE_ASSOCIATION = 3, - FCNVME_LS_CREATE_CONNECTION = 4, - FCNVME_LS_DISCONNECT = 5, + FCNVME_LS_CREATE_ASSOCIATION = 3, /* Create Association */ + FCNVME_LS_CREATE_CONNECTION = 4, /* Create I/O Connection */ + FCNVME_LS_DISCONNECT_ASSOC = 5, /* Disconnect Association */ + FCNVME_LS_DISCONNECT_CONN = 6, /* Disconnect Connection */ }; /* FC-NVME Link Service Descriptors */ @@ -117,14 +175,17 @@ enum fcnvme_ls_rjt_reason { FCNVME_RJT_RC_UNSUP = 0x0b, /* command not supported */ - FCNVME_RJT_RC_INPROG = 0x0e, - /* command already in progress */ - FCNVME_RJT_RC_INV_ASSOC = 0x40, - /* Invalid Association ID*/ + /* Invalid Association ID */ FCNVME_RJT_RC_INV_CONN = 0x41, - /* Invalid Connection ID*/ + /* Invalid Connection ID */ + + FCNVME_RJT_RC_INV_PARAM = 0x42, + /* Invalid Parameters */ + + FCNVME_RJT_RC_INSUF_RES = 0x43, + /* Insufficient Resources */ FCNVME_RJT_RC_VENDOR = 0xff, /* vendor specific error */ @@ -138,14 +199,32 @@ enum fcnvme_ls_rjt_explan { FCNVME_RJT_EXP_OXID_RXID = 0x17, /* invalid OX_ID-RX_ID combination */ - FCNVME_RJT_EXP_INSUF_RES = 0x29, - /* insufficient resources */ - FCNVME_RJT_EXP_UNAB_DATA = 0x2a, /* unable to supply requested data */ FCNVME_RJT_EXP_INV_LEN = 0x2d, /* Invalid payload length */ + + FCNVME_RJT_EXP_INV_ERSP_RAT = 0x40, + /* Invalid NVMe_ERSP Ratio */ + + FCNVME_RJT_EXP_INV_CTLR_ID = 0x41, + /* Invalid Controller ID */ + + FCNVME_RJT_EXP_INV_QUEUE_ID = 0x42, + /* Invalid Queue ID */ + + FCNVME_RJT_EXP_INV_SQSIZE = 0x43, + /* Invalid Submission Queue Size */ + + FCNVME_RJT_EXP_INV_HOSTID = 0x44, + /* Invalid HOST ID */ + + FCNVME_RJT_EXP_INV_HOSTNQN = 0x45, + /* Invalid HOSTNQN */ + + FCNVME_RJT_EXP_INV_SUBNQN = 0x46, + /* Invalid SUBNQN */ }; /* FCNVME_LSDESC_RJT */ @@ -209,21 +288,11 @@ struct fcnvme_lsdesc_cr_conn_cmd { __be32 rsvd52; }; -/* Disconnect Scope Values */ -enum { - FCNVME_DISCONN_ASSOCIATION = 0, - FCNVME_DISCONN_CONNECTION = 1, -}; - /* FCNVME_LSDESC_DISCONN_CMD */ struct fcnvme_lsdesc_disconn_cmd { __be32 desc_tag; /* FCNVME_LSDESC_xxx */ __be32 desc_len; - u8 rsvd8[3]; - /* note: scope is really a 1 bit field */ - u8 scope; /* FCNVME_DISCONN_xxx */ - __be32 rsvd12; - __be64 id; + __be32 rsvd8[4]; }; /* FCNVME_LSDESC_CONN_ID */ @@ -242,9 +311,14 @@ struct fcnvme_lsdesc_assoc_id { /* r_ctl values */ enum { - FCNVME_RS_RCTL_DATA = 1, - FCNVME_RS_RCTL_XFER_RDY = 5, - FCNVME_RS_RCTL_RSP = 8, + FCNVME_RS_RCTL_CMND = 0x6, + FCNVME_RS_RCTL_DATA = 0x1, + FCNVME_RS_RCTL_CONF = 0x3, + FCNVME_RS_RCTL_SR = 0x9, + FCNVME_RS_RCTL_XFER_RDY = 0x5, + FCNVME_RS_RCTL_RSP = 0x7, + FCNVME_RS_RCTL_ERSP = 0x8, + FCNVME_RS_RCTL_SR_RSP = 0xA, }; @@ -264,7 +338,10 @@ struct fcnvme_ls_acc_hdr { struct fcnvme_ls_rqst_w0 w0; __be32 desc_list_len; struct fcnvme_lsdesc_rqst rqst; - /* Followed by cmd-specific ACC descriptors, see next definitions */ + /* + * Followed by cmd-specific ACCEPT descriptors, see xxx_acc + * definitions below + */ }; /* FCNVME_LS_CREATE_ASSOCIATION */ @@ -302,25 +379,39 @@ struct fcnvme_ls_cr_conn_acc { struct fcnvme_lsdesc_conn_id connectid; }; -/* FCNVME_LS_DISCONNECT */ -struct fcnvme_ls_disconnect_rqst { +/* FCNVME_LS_DISCONNECT_ASSOC */ +struct fcnvme_ls_disconnect_assoc_rqst { struct fcnvme_ls_rqst_w0 w0; __be32 desc_list_len; struct fcnvme_lsdesc_assoc_id associd; struct fcnvme_lsdesc_disconn_cmd discon_cmd; }; -struct fcnvme_ls_disconnect_acc { +struct fcnvme_ls_disconnect_assoc_acc { + struct fcnvme_ls_acc_hdr hdr; +}; + + +/* FCNVME_LS_DISCONNECT_CONN */ +struct fcnvme_ls_disconnect_conn_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_disconn_cmd connectid; +}; + +struct fcnvme_ls_disconnect_conn_acc { struct fcnvme_ls_acc_hdr hdr; }; /* - * Yet to be defined in FC-NVME: + * Default R_A_TOV is pulled in from fc_fs.h but needs conversion + * from ms to seconds for our use. */ -#define NVME_FC_CONNECT_TIMEOUT_SEC 2 /* 2 seconds */ -#define NVME_FC_LS_TIMEOUT_SEC 2 /* 2 seconds */ -#define NVME_FC_TGTOP_TIMEOUT_SEC 2 /* 2 seconds */ +#define FC_TWO_TIMES_R_A_TOV (2 * (FC_DEF_R_A_TOV / 1000)) +#define NVME_FC_LS_TIMEOUT_SEC FC_TWO_TIMES_R_A_TOV +#define NVME_FC_TGTOP_TIMEOUT_SEC FC_TWO_TIMES_R_A_TOV /* * TRADDR string must be of form "nn-<16hexdigits>:pn-<16hexdigits>" @@ -328,6 +419,7 @@ struct fcnvme_ls_disconnect_acc { * infront of the <16hexdigits>. Without is considered the "min" string * and with is considered the "max" string. The hexdigits may be upper * or lower case. + * Note: FC-NVME-2 standard requires a "0x" prefix. */ #define NVME_FC_TRADDR_NNLEN 3 /* "?n-" */ #define NVME_FC_TRADDR_OXNNLEN 5 /* "?n-0x" */ -- cgit v1.2.3-59-g8ed1b From 53b2b2f59967c0b7eb4df265136b1cc25b9fb287 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Sep 2019 14:51:35 -0700 Subject: nvme-fc and nvmet-fc: sync with FC-NVME-2 header changes Sync sources with revised structure and field names to correspond with FC-NVME-2 header sync-up. Tested interoperability with success: - prior initiator with new target - prior target with new initiator - new on new Signed-off-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 24 +++++++++++------------- drivers/nvme/target/fc.c | 27 +++++++++++++++------------ 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 265f89e11d8b..e09c61ec0b96 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1224,7 +1224,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, lsreq->rqstlen = sizeof(*assoc_rqst); lsreq->rspaddr = assoc_acc; lsreq->rsplen = sizeof(*assoc_acc); - lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC; + lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; ret = nvme_fc_send_ls_req(ctrl->rport, lsop); if (ret) @@ -1332,7 +1332,7 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, lsreq->rqstlen = sizeof(*conn_rqst); lsreq->rspaddr = conn_acc; lsreq->rsplen = sizeof(*conn_acc); - lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC; + lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; ret = nvme_fc_send_ls_req(ctrl->rport, lsop); if (ret) @@ -1413,8 +1413,8 @@ nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status) static void nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) { - struct fcnvme_ls_disconnect_rqst *discon_rqst; - struct fcnvme_ls_disconnect_acc *discon_acc; + struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst; + struct fcnvme_ls_disconnect_assoc_acc *discon_acc; struct nvmefc_ls_req_op *lsop; struct nvmefc_ls_req *lsreq; int ret; @@ -1430,11 +1430,11 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) lsreq = &lsop->ls_req; lsreq->private = (void *)&lsop[1]; - discon_rqst = (struct fcnvme_ls_disconnect_rqst *) + discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *) (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz); - discon_acc = (struct fcnvme_ls_disconnect_acc *)&discon_rqst[1]; + discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1]; - discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT; + discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT_ASSOC; discon_rqst->desc_list_len = cpu_to_be32( sizeof(struct fcnvme_lsdesc_assoc_id) + sizeof(struct fcnvme_lsdesc_disconn_cmd)); @@ -1451,14 +1451,12 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) discon_rqst->discon_cmd.desc_len = fcnvme_lsdesc_len( sizeof(struct fcnvme_lsdesc_disconn_cmd)); - discon_rqst->discon_cmd.scope = FCNVME_DISCONN_ASSOCIATION; - discon_rqst->discon_cmd.id = cpu_to_be64(ctrl->association_id); lsreq->rqstaddr = discon_rqst; lsreq->rqstlen = sizeof(*discon_rqst); lsreq->rspaddr = discon_acc; lsreq->rsplen = sizeof(*discon_acc); - lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC; + lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop, nvme_fc_disconnect_assoc_done); @@ -1662,7 +1660,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) (freq->rcv_rsplen / 4) || be32_to_cpu(op->rsp_iu.xfrd_len) != freq->transferred_length || - op->rsp_iu.status_code || + op->rsp_iu.ersp_result || sqe->common.command_id != cqe->command_id)) { status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); dev_info(ctrl->ctrl.device, @@ -1672,7 +1670,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len), be32_to_cpu(op->rsp_iu.xfrd_len), freq->transferred_length, - op->rsp_iu.status_code, + op->rsp_iu.ersp_result, sqe->common.command_id, cqe->command_id); goto done; @@ -1731,7 +1729,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, op->rq = rq; op->rqno = rqno; - cmdiu->scsi_id = NVME_CMD_SCSI_ID; + cmdiu->format_id = NVME_CMD_FORMAT_ID; cmdiu->fc_id = NVME_CMD_FC_ID; cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index ce8d819f86cc..61b617698d3f 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1495,20 +1495,20 @@ static void nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_ls_iod *iod) { - struct fcnvme_ls_disconnect_rqst *rqst = - (struct fcnvme_ls_disconnect_rqst *)iod->rqstbuf; - struct fcnvme_ls_disconnect_acc *acc = - (struct fcnvme_ls_disconnect_acc *)iod->rspbuf; + struct fcnvme_ls_disconnect_assoc_rqst *rqst = + (struct fcnvme_ls_disconnect_assoc_rqst *)iod->rqstbuf; + struct fcnvme_ls_disconnect_assoc_acc *acc = + (struct fcnvme_ls_disconnect_assoc_acc *)iod->rspbuf; struct nvmet_fc_tgt_assoc *assoc; int ret = 0; memset(acc, 0, sizeof(*acc)); - if (iod->rqstdatalen < sizeof(struct fcnvme_ls_disconnect_rqst)) + if (iod->rqstdatalen < sizeof(struct fcnvme_ls_disconnect_assoc_rqst)) ret = VERR_DISCONN_LEN; else if (rqst->desc_list_len != fcnvme_lsdesc_len( - sizeof(struct fcnvme_ls_disconnect_rqst))) + sizeof(struct fcnvme_ls_disconnect_assoc_rqst))) ret = VERR_DISCONN_RQST_LEN; else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) ret = VERR_ASSOC_ID; @@ -1523,8 +1523,11 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, fcnvme_lsdesc_len( sizeof(struct fcnvme_lsdesc_disconn_cmd))) ret = VERR_DISCONN_CMD_LEN; - else if ((rqst->discon_cmd.scope != FCNVME_DISCONN_ASSOCIATION) && - (rqst->discon_cmd.scope != FCNVME_DISCONN_CONNECTION)) + /* + * As the standard changed on the LS, check if old format and scope + * something other than Association (e.g. 0). + */ + else if (rqst->discon_cmd.rsvd8[0]) ret = VERR_DISCONN_SCOPE; else { /* match an active association */ @@ -1556,8 +1559,8 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, fcnvme_lsdesc_len( - sizeof(struct fcnvme_ls_disconnect_acc)), - FCNVME_LS_DISCONNECT); + sizeof(struct fcnvme_ls_disconnect_assoc_acc)), + FCNVME_LS_DISCONNECT_ASSOC); /* release get taken in nvmet_fc_find_target_assoc */ nvmet_fc_tgt_a_put(iod->assoc); @@ -1632,7 +1635,7 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport, /* Creates an IO Queue/Connection */ nvmet_fc_ls_create_connection(tgtport, iod); break; - case FCNVME_LS_DISCONNECT: + case FCNVME_LS_DISCONNECT_ASSOC: /* Terminate a Queue/Connection or the Association */ nvmet_fc_ls_disconnect(tgtport, iod); break; @@ -2299,7 +2302,7 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, /* validate iu, so the connection id can be used to find the queue */ if ((cmdiubuf_len != sizeof(*cmdiu)) || - (cmdiu->scsi_id != NVME_CMD_SCSI_ID) || + (cmdiu->format_id != NVME_CMD_FORMAT_ID) || (cmdiu->fc_id != NVME_CMD_FC_ID) || (be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4))) return -EIO; -- cgit v1.2.3-59-g8ed1b From 44fbf3bb1ac3dbebc6bd07eb68abf2d0badfae65 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Sep 2019 14:51:36 -0700 Subject: nvme-fc: Set new cmd set indicator in nvme-fc cmnd iu Set the new category field in the FC-NVME CMND_IU based on queue number. Signed-off-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e09c61ec0b96..e099a74f666e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1732,6 +1732,11 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, cmdiu->format_id = NVME_CMD_FORMAT_ID; cmdiu->fc_id = NVME_CMD_FC_ID; cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); + if (queue->qnum) + cmdiu->rsv_cat = fccmnd_set_cat_css(0, + (NVME_CC_CSS_NVM >> NVME_CC_CSS_SHIFT)); + else + cmdiu->rsv_cat = fccmnd_set_cat_admin(0); op->fcp_req.cmddma = fc_dma_map_single(ctrl->lport->dev, &op->cmd_iu, sizeof(op->cmd_iu), DMA_TO_DEVICE); -- cgit v1.2.3-59-g8ed1b From 7db394848ece0e0706dfe8e4940b24e949f3b88f Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Sep 2019 15:27:11 -0700 Subject: nvme-fc: clarify error messages Change wording on a couple of messages to clarify what happened. Signed-off-by: Ewan D. Milne Signed-off-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e099a74f666e..2284b7f9e8b9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1264,7 +1264,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, if (fcret) { ret = -EBADF; dev_err(ctrl->dev, - "q %d connect failed: %s\n", + "q %d Create Association LS failed: %s\n", queue->qnum, validation_errors[fcret]); } else { ctrl->association_id = @@ -1363,7 +1363,7 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, if (fcret) { ret = -EBADF; dev_err(ctrl->dev, - "q %d connect failed: %s\n", + "q %d Create I/O Connection LS failed: %s\n", queue->qnum, validation_errors[fcret]); } else { queue->connection_id = @@ -1376,7 +1376,7 @@ out_free_buffer: out_no_memory: if (ret) dev_err(ctrl->dev, - "queue %d connect command failed (%d).\n", + "queue %d connect I/O queue failed (%d).\n", queue->qnum, ret); return ret; } @@ -2698,7 +2698,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) /* warn if maxcmd is lower than queue_size */ dev_warn(ctrl->ctrl.device, "queue_size %zu > ctrl maxcmd %u, reducing " - "to queue_size\n", + "to maxcmd\n", opts->queue_size, ctrl->ctrl.maxcmd); opts->queue_size = ctrl->ctrl.maxcmd; } @@ -2706,7 +2706,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (opts->queue_size > ctrl->ctrl.sqsize + 1) { /* warn if sqsize is lower than queue_size */ dev_warn(ctrl->ctrl.device, - "queue_size %zu > ctrl sqsize %u, clamping down\n", + "queue_size %zu > ctrl sqsize %u, reducing " + "to sqsize\n", opts->queue_size, ctrl->ctrl.sqsize + 1); opts->queue_size = ctrl->ctrl.sqsize + 1; } -- cgit v1.2.3-59-g8ed1b From bcde5f0fc7d318c98d4234b52bcc1a87fc2162d9 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Sep 2019 14:27:22 -0700 Subject: nvme-fc: ensure association_id is cleared regardless of a Disconnect LS Code today only clears the association_id if a Disconnect LS is transmit. Remove ambiguity and unconditionally clear the association_id if the association has been terminated. Signed-off-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2284b7f9e8b9..714a1c3aa0c5 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1462,9 +1462,6 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) nvme_fc_disconnect_assoc_done); if (ret) kfree(lsop); - - /* only meaningful part to terminating the association */ - ctrl->association_id = 0; } @@ -2743,6 +2740,7 @@ out_term_aen_ops: out_disconnect_admin_queue: /* send a Disconnect(association) LS to fc-nvme target */ nvme_fc_xmt_disconnect_assoc(ctrl); + ctrl->association_id = 0; out_delete_hw_queue: __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); out_free_queue: @@ -2834,6 +2832,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) if (ctrl->association_id) nvme_fc_xmt_disconnect_assoc(ctrl); + ctrl->association_id = 0; + if (ctrl->ctrl.tagset) { nvme_fc_delete_hw_io_queues(ctrl); nvme_fc_free_io_queues(ctrl); -- cgit v1.2.3-59-g8ed1b From 58a8df67e057e979e76f8dc881766da3f7137f99 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 13 Oct 2019 19:57:31 +0300 Subject: nvme: introduce nvme_is_aen_req function This function improves code readability and reduces code duplication. Signed-off-by: Israel Rukshin Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 3 +-- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/host/tcp.c | 4 ++-- drivers/nvme/target/loop.c | 4 ++-- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 38a83ef5bcd3..912f9500ed11 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -445,6 +445,11 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) put_device(ctrl->device); } +static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) +{ + return !qid && command_id >= NVME_AQ_BLK_MQ_DEPTH; +} + void nvme_complete_rq(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bb88681f4dc3..7082116e9206 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -967,8 +967,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ - if (unlikely(nvmeq->qid == 0 && - cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { + if (unlikely(nvme_is_aen_req(nvmeq->qid, cqe->command_id))) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status, &cqe->result); return; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 4d280160dd3f..154fa4e32ad8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1501,8 +1501,8 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ - if (unlikely(nvme_rdma_queue_idx(queue) == 0 && - cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) + if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue), + cqe->command_id))) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 385a5212c10f..124fda67613a 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -491,8 +491,8 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ - if (unlikely(nvme_tcp_queue_id(queue) == 0 && - cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) + if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue), + cqe->command_id))) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 748a39fca771..bd1f81f97ab7 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -102,8 +102,8 @@ static void nvme_loop_queue_response(struct nvmet_req *req) * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ - if (unlikely(nvme_loop_queue_idx(queue) == 0 && - cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { + if (unlikely(nvme_is_aen_req(nvme_loop_queue_idx(queue), + cqe->command_id))) { nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); } else { -- cgit v1.2.3-59-g8ed1b From 4d764bb9a92bc63afc3befe36a0bedfddff1398a Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 13 Oct 2019 19:57:32 +0300 Subject: nvmet: use bio_io_error instead of duplicating it This commit doesn't change any logic. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-bdev.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 32008d85172b..f2618dc2ef3a 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -261,12 +261,10 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req) if (bio) { bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; - if (status) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - } else { + if (status) + bio_io_error(bio); + else submit_bio(bio); - } } else { nvmet_req_complete(req, status); } -- cgit v1.2.3-59-g8ed1b From e522f446027845e3c8b563d021f37e8f3d30c9d9 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 13 Oct 2019 19:57:33 +0300 Subject: nvmet: add unlikely check at nvmet_req_alloc_sgl The call to sgl_alloc shouldn't fail so add this simple optimization to the fast path. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 3a67e244e568..6b39cfc6ade1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -966,7 +966,7 @@ int nvmet_req_alloc_sgl(struct nvmet_req *req) } req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt); - if (!req->sg) + if (unlikely(!req->sg)) return -ENOMEM; return 0; -- cgit v1.2.3-59-g8ed1b From 59534b9d606e7f566970a6f8621c6022f7c19ff1 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 13 Oct 2019 19:57:34 +0300 Subject: nvmet-rdma: add unlikely check at nvmet_rdma_map_sgl_keyed The calls to nvmet_req_alloc_sgl and rdma_rw_ctx_init should usually succeed, so add this simple optimization to the fast path. Signed-off-by: Israel Rukshin Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 36d906a7f70d..ccf982164136 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -672,13 +672,13 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, return 0; ret = nvmet_req_alloc_sgl(&rsp->req); - if (ret < 0) + if (unlikely(ret < 0)) goto error_out; ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, nvmet_data_dir(&rsp->req)); - if (ret < 0) + if (unlikely(ret < 0)) goto error_out; rsp->n_rdma += ret; -- cgit v1.2.3-59-g8ed1b From 2dc3947b53f573e8a75ea9cbec5588df88ca502e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Oct 2019 19:57:35 +0300 Subject: nvme: introduce "Command Aborted By host" status code Fix the status code of canceled requests initiated by the host according to TP4028 (Status Code 0x371): "Command Aborted By host: The command was aborted as a result of host action (e.g., the host disconnected the Fabric connection)." Also in a multipath environment, unless otherwise specified, errors of this type (path related) should be retried using a different path, if one is available. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 1 + include/linux/nvme.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fd7dea36c3b6..dfa122beb4cf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -298,7 +298,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) if (blk_mq_request_completed(req)) return true; - nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; + nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; blk_mq_complete_request(req); return true; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 30de7efef003..103c04fa7746 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -95,6 +95,7 @@ void nvme_failover_req(struct request *req) } break; case NVME_SC_HOST_PATH_ERROR: + case NVME_SC_HOST_ABORTED_CMD: /* * Temporary transport disruption in talking to the controller. * Try to send on a new path. diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f61d6906e59d..a260cd754f28 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1368,6 +1368,7 @@ enum { NVME_SC_ANA_INACCESSIBLE = 0x302, NVME_SC_ANA_TRANSITION = 0x303, NVME_SC_HOST_PATH_ERROR = 0x370, + NVME_SC_HOST_ABORTED_CMD = 0x371, NVME_SC_CRD = 0x1800, NVME_SC_DNR = 0x4000, -- cgit v1.2.3-59-g8ed1b From 16686f3a6c3cd6316dbc5cba886242c73f713237 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Oct 2019 19:57:36 +0300 Subject: nvme: move common call to nvme_cleanup_cmd to core layer nvme_cleanup_cmd should be called for each call to nvme_setup_cmd (symmetrical functions). Move the call for nvme_cleanup_cmd to the common core layer and call it during nvme_complete_rq for the good flow. For error flow, each transport will call nvme_cleanup_cmd independently. Also take care of a special case of path failure, where we call nvme_complete_rq without doing nvme_setup_cmd. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/fc.c | 3 +-- drivers/nvme/host/pci.c | 1 - drivers/nvme/host/rdma.c | 12 +++++------- drivers/nvme/target/loop.c | 1 - 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dfa122beb4cf..9c743610e6ea 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -268,6 +268,8 @@ void nvme_complete_rq(struct request *req) trace_nvme_complete_rq(req); + nvme_cleanup_cmd(req); + if (nvme_req(req)->ctrl->kas) nvme_req(req)->ctrl->comp_seen = true; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 714a1c3aa0c5..679a721ae229 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2173,8 +2173,6 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, rq_dma_dir(rq)); - nvme_cleanup_cmd(rq); - sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE); freq->sg_cnt = 0; @@ -2305,6 +2303,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, if (!(op->flags & FCOP_FLAGS_AEN)) nvme_fc_unmap_data(ctrl, op->rq, op); + nvme_cleanup_cmd(op->rq); nvme_fc_ctrl_put(ctrl); if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE && diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7082116e9206..612f92255f9d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -924,7 +924,6 @@ static void nvme_pci_complete_rq(struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_dev *dev = iod->nvmeq->dev; - nvme_cleanup_cmd(req); if (blk_integrity_rq(req)) dma_unmap_page(dev->dev, iod->meta_dma, rq_integrity_vec(req)->bv_len, rq_data_dir(req)); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 154fa4e32ad8..05f2dfa3d218 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1160,8 +1160,6 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, } ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); - - nvme_cleanup_cmd(rq); sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); } @@ -1760,7 +1758,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(err < 0)) { dev_err(queue->ctrl->ctrl.device, "Failed to map data (%d)\n", err); - nvme_cleanup_cmd(rq); goto err; } @@ -1771,18 +1768,19 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr ? &req->reg_wr.wr : NULL); - if (unlikely(err)) { - nvme_rdma_unmap_data(queue, rq); - goto err; - } + if (unlikely(err)) + goto err_unmap; return BLK_STS_OK; +err_unmap: + nvme_rdma_unmap_data(queue, rq); err: if (err == -ENOMEM || err == -EAGAIN) ret = BLK_STS_RESOURCE; else ret = BLK_STS_IOERR; + nvme_cleanup_cmd(rq); unmap_qe: ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command), DMA_TO_DEVICE); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index bd1f81f97ab7..5b7b19774bb0 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -76,7 +76,6 @@ static void nvme_loop_complete_rq(struct request *req) { struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); - nvme_cleanup_cmd(req); sg_free_table_chained(&iod->sg_table, SG_CHUNK_SIZE); nvme_complete_rq(req); } -- cgit v1.2.3-59-g8ed1b From 48c9e85b23464a7d1e3ebd70b79cc3a2d97d3222 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Mon, 14 Oct 2019 11:16:07 -0600 Subject: nvme: resync include/linux/nvme.h with nvmecli Update enumerations and structures in include/linux/nvme.h to resync with the nvmecli. All the updates are mentioned in the ratified NVMe 1.4 spec https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf Reviewed-by: Christoph Hellwig Signed-off-by: Revanth Rajashekar Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a260cd754f28..3eca4f7d8510 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -107,8 +107,22 @@ enum { NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ - NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_BPINFO = 0x0040, /* Boot Partition Information */ + NVME_REG_BPRSEL = 0x0044, /* Boot Partition Read Select */ + NVME_REG_BPMBL = 0x0048, /* Boot Partition Memory Buffer + * Location + */ + NVME_REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ + NVME_REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ + NVME_REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ + NVME_REG_PMREBS = 0x0e0c, /* Persistent Memory Region Elasticity + * Buffer Size + */ + NVME_REG_PMRSWTP = 0x0e10, /* Persistent Memory Region Sustained + * Write Throughput + */ NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ }; @@ -295,6 +309,14 @@ enum { NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, + NVME_CTRL_CTRATT_128_ID = 1 << 0, + NVME_CTRL_CTRATT_NON_OP_PSP = 1 << 1, + NVME_CTRL_CTRATT_NVM_SETS = 1 << 2, + NVME_CTRL_CTRATT_READ_RECV_LVLS = 1 << 3, + NVME_CTRL_CTRATT_ENDURANCE_GROUPS = 1 << 4, + NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, + NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, + NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, }; struct nvme_lbaf { @@ -352,6 +374,9 @@ enum { NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, NVME_ID_CNS_CTRL_LIST = 0x13, + NVME_ID_CNS_SCNDRY_CTRL_LIST = 0x15, + NVME_ID_CNS_NS_GRANULARITY = 0x16, + NVME_ID_CNS_UUID_LIST = 0x17, }; enum { @@ -409,7 +434,8 @@ struct nvme_smart_log { __u8 avail_spare; __u8 spare_thresh; __u8 percent_used; - __u8 rsvd6[26]; + __u8 endu_grp_crit_warn_sumry; + __u8 rsvd7[25]; __u8 data_units_read[16]; __u8 data_units_written[16]; __u8 host_reads[16]; @@ -423,7 +449,11 @@ struct nvme_smart_log { __le32 warning_temp_time; __le32 critical_comp_time; __le16 temp_sensor[8]; - __u8 rsvd216[296]; + __le32 thm_temp1_trans_count; + __le32 thm_temp2_trans_count; + __le32 thm_temp1_total_time; + __le32 thm_temp2_total_time; + __u8 rsvd232[280]; }; struct nvme_fw_slot_info_log { @@ -440,6 +470,7 @@ enum { NVME_CMD_EFFECTS_NIC = 1 << 3, NVME_CMD_EFFECTS_CCC = 1 << 4, NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, + NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, }; struct nvme_effects_log { @@ -563,6 +594,7 @@ enum nvme_opcode { nvme_cmd_compare = 0x05, nvme_cmd_write_zeroes = 0x08, nvme_cmd_dsm = 0x09, + nvme_cmd_verify = 0x0c, nvme_cmd_resv_register = 0x0d, nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, @@ -806,10 +838,14 @@ enum nvme_admin_opcode { nvme_admin_ns_mgmt = 0x0d, nvme_admin_activate_fw = 0x10, nvme_admin_download_fw = 0x11, + nvme_admin_dev_self_test = 0x14, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, nvme_admin_directive_send = 0x19, nvme_admin_directive_recv = 0x1a, + nvme_admin_virtual_mgmt = 0x1c, + nvme_admin_nvme_mi_send = 0x1d, + nvme_admin_nvme_mi_recv = 0x1e, nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, @@ -873,6 +909,7 @@ enum { NVME_FEAT_PLM_CONFIG = 0x13, NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_HOST_BEHAVIOR = 0x16, + NVME_FEAT_SANITIZE = 0x17, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -883,6 +920,10 @@ enum { NVME_LOG_FW_SLOT = 0x03, NVME_LOG_CHANGED_NS = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_DEVICE_SELF_TEST = 0x06, + NVME_LOG_TELEMETRY_HOST = 0x07, + NVME_LOG_TELEMETRY_CTRL = 0x08, + NVME_LOG_ENDURANCE_GROUP = 0x09, NVME_LOG_ANA = 0x0c, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, @@ -1290,7 +1331,11 @@ enum { NVME_SC_SGL_INVALID_OFFSET = 0x16, NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_SANITIZE_FAILED = 0x1C, + NVME_SC_SANITIZE_IN_PROGRESS = 0x1D, + NVME_SC_NS_WRITE_PROTECTED = 0x20, + NVME_SC_CMD_INTERRUPTED = 0x21, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, @@ -1328,6 +1373,8 @@ enum { NVME_SC_NS_NOT_ATTACHED = 0x11a, NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, NVME_SC_CTRL_LIST_INVALID = 0x11c, + NVME_SC_BP_WRITE_PROHIBITED = 0x11e, + NVME_SC_PMR_SAN_PROHIBITED = 0x123, /* * I/O Command Set Specific - NVM commands: -- cgit v1.2.3-59-g8ed1b From 314d48dd224897e35ddcaf5a1d7d133b5adddeb7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 21 Oct 2019 12:40:03 +0900 Subject: nvme: Cleanup and rename nvme_block_nr() Rename nvme_block_nr() to nvme_sect_to_lba() and use SECTOR_SHIFT instead of its hard coded value 9. Also add a comment to decribe this helper. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 6 +++--- drivers/nvme/host/nvme.h | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9c743610e6ea..64815fa1a254 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -589,7 +589,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, } __rq_for_each_bio(bio, req) { - u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); + u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; if (n < segments) { @@ -630,7 +630,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); cmnd->write_zeroes.slba = - cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->write_zeroes.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); cmnd->write_zeroes.control = 0; @@ -654,7 +654,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 912f9500ed11..37eb94fb797d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -418,9 +418,12 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); } -static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) +/* + * Convert a 512B sector number to a device logical block number. + */ +static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector) { - return (sector >> (ns->lba_shift - 9)); + return sector >> (ns->lba_shift - SECTOR_SHIFT); } static inline void nvme_end_request(struct request *req, __le16 status, -- cgit v1.2.3-59-g8ed1b From e08f2ae850929d40e66268ee47e443e7ea56eeb7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 21 Oct 2019 12:40:04 +0900 Subject: nvme: Introduce nvme_lba_to_sect() Introduce the new helper function nvme_lba_to_sect() to convert a device logical block number to a 512B sector number. Use this new helper in obvious places, cleaning up the code. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 +++++++------- drivers/nvme/host/nvme.h | 8 ++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 64815fa1a254..b4214e54f2d2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1611,7 +1611,7 @@ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) static void nvme_set_chunk_size(struct nvme_ns *ns) { - u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9)); + u32 chunk_size = nvme_lba_to_sect(ns, ns->noiob); blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } @@ -1648,8 +1648,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) { - u32 max_sectors; - unsigned short bs = 1 << ns->lba_shift; + u64 max_blocks; if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) @@ -1665,11 +1664,12 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) * nvme_init_identify() if available. */ if (ns->ctrl->max_hw_sectors == UINT_MAX) - max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9; + max_blocks = (u64)USHRT_MAX + 1; else - max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9; + max_blocks = ns->ctrl->max_hw_sectors + 1; - blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); + blk_queue_max_write_zeroes_sectors(disk->queue, + nvme_lba_to_sect(ns, max_blocks)); } static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, @@ -1712,7 +1712,7 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { - sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9); + sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); unsigned short bs = 1 << ns->lba_shift; u32 atomic_bs, phys_bs, io_opt; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 37eb94fb797d..2637d9dd278f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -426,6 +426,14 @@ static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector) return sector >> (ns->lba_shift - SECTOR_SHIFT); } +/* + * Convert a device logical block number to a 512B sector number. + */ +static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba) +{ + return lba << (ns->lba_shift - SECTOR_SHIFT); +} + static inline void nvme_end_request(struct request *req, __le16 status, union nvme_result result) { -- cgit v1.2.3-59-g8ed1b From e0bace71779d602e419a2f3b35f3bff2cfc638a3 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 23 Oct 2019 10:35:39 -0600 Subject: nvmet-tcp: Don't check data_len in nvmet_tcp_map_data() None of the other transports check data_len which is verified in core code. The function should instead check that the sgl length is non-zero. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index d535080b781f..1e2d92f818ad 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -320,7 +320,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; u32 len = le32_to_cpu(sgl->length); - if (!cmd->req.data_len) + if (!len) return 0; if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | -- cgit v1.2.3-59-g8ed1b From c73eebc07a0fc8f4ea0f69c65803595d6471bb47 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 23 Oct 2019 10:35:40 -0600 Subject: nvmet-tcp: Don't set the request's data_len It's not apprporiate for the transports to set the data_len field of the request which is only used by the core. In this case, just use a variable on the stack to store the length of the sgl for comparison. Reviewed-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/tcp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 1e2d92f818ad..3378480c49f6 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -813,13 +813,11 @@ free_crypto: static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) { + size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); int ret; - /* recover the expected data transfer length */ - req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); - if (!nvme_is_write(cmd->req.cmd) || - req->data_len > cmd->req.port->inline_data_size) { + data_len > cmd->req.port->inline_data_size) { nvmet_prepare_receive_pdu(queue); return; } -- cgit v1.2.3-59-g8ed1b From 2cb6963a16e9e114486decf591af7cb2d69cb154 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Oct 2019 10:35:41 -0600 Subject: nvmet: Introduce common execute function for get_log_page and identify Instead of picking the sub-command handler to execute in a nested switch statement introduce a landing functions that calls out to the appropriate sub-command handler. This will allow us to have a common place in the handler to check the transfer length in a future patch. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig [split patch, update change log] Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 93 ++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 831a062d27cb..3665b45d6515 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -282,6 +282,33 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_get_log_page(struct nvmet_req *req) +{ + switch (req->cmd->get_log_page.lid) { + case NVME_LOG_ERROR: + return nvmet_execute_get_log_page_error(req); + case NVME_LOG_SMART: + return nvmet_execute_get_log_page_smart(req); + case NVME_LOG_FW_SLOT: + /* + * We only support a single firmware slot which always is + * active, so we can zero out the whole firmware slot log and + * still claim to fully implement this mandatory log page. + */ + return nvmet_execute_get_log_page_noop(req); + case NVME_LOG_CHANGED_NS: + return nvmet_execute_get_log_changed_ns(req); + case NVME_LOG_CMD_EFFECTS: + return nvmet_execute_get_log_cmd_effects_ns(req); + case NVME_LOG_ANA: + return nvmet_execute_get_log_page_ana(req); + } + pr_err("unhandled lid %d on qid %d\n", + req->cmd->get_log_page.lid, req->sq->qid); + req->error_loc = offsetof(struct nvme_get_log_page_command, lid); + nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); +} + static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -565,6 +592,25 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_identify(struct nvmet_req *req) +{ + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_NS: + return nvmet_execute_identify_ns(req); + case NVME_ID_CNS_CTRL: + return nvmet_execute_identify_ctrl(req); + case NVME_ID_CNS_NS_ACTIVE_LIST: + return nvmet_execute_identify_nslist(req); + case NVME_ID_CNS_NS_DESC_LIST: + return nvmet_execute_identify_desclist(req); + } + + pr_err("unhandled identify cns %d on qid %d\n", + req->cmd->identify.cns, req->sq->qid); + req->error_loc = offsetof(struct nvme_identify, cns); + nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); +} + /* * A "minimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. We couldn't really @@ -819,52 +865,13 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_admin_get_log_page: + req->execute = nvmet_execute_get_log_page; req->data_len = nvmet_get_log_page_len(cmd); - - switch (cmd->get_log_page.lid) { - case NVME_LOG_ERROR: - req->execute = nvmet_execute_get_log_page_error; - return 0; - case NVME_LOG_SMART: - req->execute = nvmet_execute_get_log_page_smart; - return 0; - case NVME_LOG_FW_SLOT: - /* - * We only support a single firmware slot which always - * is active, so we can zero out the whole firmware slot - * log and still claim to fully implement this mandatory - * log page. - */ - req->execute = nvmet_execute_get_log_page_noop; - return 0; - case NVME_LOG_CHANGED_NS: - req->execute = nvmet_execute_get_log_changed_ns; - return 0; - case NVME_LOG_CMD_EFFECTS: - req->execute = nvmet_execute_get_log_cmd_effects_ns; - return 0; - case NVME_LOG_ANA: - req->execute = nvmet_execute_get_log_page_ana; - return 0; - } - break; + return 0; case nvme_admin_identify: + req->execute = nvmet_execute_identify; req->data_len = NVME_IDENTIFY_DATA_SIZE; - switch (cmd->identify.cns) { - case NVME_ID_CNS_NS: - req->execute = nvmet_execute_identify_ns; - return 0; - case NVME_ID_CNS_CTRL: - req->execute = nvmet_execute_identify_ctrl; - return 0; - case NVME_ID_CNS_NS_ACTIVE_LIST: - req->execute = nvmet_execute_identify_nslist; - return 0; - case NVME_ID_CNS_NS_DESC_LIST: - req->execute = nvmet_execute_identify_desclist; - return 0; - } - break; + return 0; case nvme_admin_abort_cmd: req->execute = nvmet_execute_abort; req->data_len = 0; -- cgit v1.2.3-59-g8ed1b From 6f86f2c9d94d55c4d3a6f1ffbc2e1115b5cb38a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Oct 2019 10:35:42 -0600 Subject: nvmet: Cleanup discovery execute handlers Push the lid and cns check into their respective handlers and, while we're at it, rename the functions to be consistent with other discovery handlers. Signed-off-by: Christoph Hellwig [split patch, update changelog] Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/discovery.c | 44 ++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 3764a8900850..825e61e61b0c 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -157,7 +157,7 @@ static size_t discovery_log_entries(struct nvmet_req *req) return entries; } -static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) +static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) { const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -171,6 +171,13 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) u16 status = 0; void *buffer; + if (req->cmd->get_log_page.lid != NVME_LOG_DISC) { + req->error_loc = + offsetof(struct nvme_get_log_page_command, lid); + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + goto out; + } + /* Spec requires dword aligned offsets */ if (offset & 0x3) { status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; @@ -227,12 +234,18 @@ out: nvmet_req_complete(req, status); } -static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) +static void nvmet_execute_disc_identify(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ctrl *id; u16 status = 0; + if (req->cmd->identify.cns != NVME_ID_CNS_CTRL) { + req->error_loc = offsetof(struct nvme_identify, cns); + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + goto out; + } + id = kzalloc(sizeof(*id), GFP_KERNEL); if (!id) { status = NVME_SC_INTERNAL; @@ -344,31 +357,12 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) return 0; case nvme_admin_get_log_page: req->data_len = nvmet_get_log_page_len(cmd); - - switch (cmd->get_log_page.lid) { - case NVME_LOG_DISC: - req->execute = nvmet_execute_get_disc_log_page; - return 0; - default: - pr_err("unsupported get_log_page lid %d\n", - cmd->get_log_page.lid); - req->error_loc = - offsetof(struct nvme_get_log_page_command, lid); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; - } + req->execute = nvmet_execute_disc_get_log_page; + return 0; case nvme_admin_identify: req->data_len = NVME_IDENTIFY_DATA_SIZE; - switch (cmd->identify.cns) { - case NVME_ID_CNS_CTRL: - req->execute = - nvmet_execute_identify_disc_ctrl; - return 0; - default: - pr_err("unsupported identify cns %d\n", - cmd->identify.cns); - req->error_loc = offsetof(struct nvme_identify, cns); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; - } + req->execute = nvmet_execute_disc_identify; + return 0; default: pr_err("unhandled cmd %d\n", cmd->common.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); -- cgit v1.2.3-59-g8ed1b From 59ef0eaa7741c3543f98220cc132c61bf0230bce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Oct 2019 10:35:43 -0600 Subject: nvmet: Introduce nvmet_dsm_len() helper Similar to the nvmet_rw_len helper. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig [split patch, update changelog] Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-file.c | 3 +-- drivers/nvme/target/nvmet.h | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 05453f5d1448..7481556da6e6 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -379,8 +379,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) return 0; case nvme_cmd_dsm: req->execute = nvmet_file_execute_dsm; - req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) * - sizeof(struct nvme_dsm_range); + req->data_len = nvmet_dsm_len(req); return 0; case nvme_cmd_write_zeroes: req->execute = nvmet_file_execute_write_zeroes; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c51f8dd01dc4..6ccf2d098d9f 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -495,6 +495,12 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) req->ns->blksize_shift; } +static inline u32 nvmet_dsm_len(struct nvmet_req *req) +{ + return (le32_to_cpu(req->cmd->dsm.nr) + 1) * + sizeof(struct nvme_dsm_range); +} + u16 errno_to_nvme_status(struct nvmet_req *req, int errno); /* Convert a 32-bit number to a 16-bit 0's based number */ -- cgit v1.2.3-59-g8ed1b From e9061c397839eea34207668bfedce0a6c18c5015 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Oct 2019 10:35:44 -0600 Subject: nvmet: Remove the data_len field from the nvmet_req struct Instead of storing the expected length and checking it when it's executed, just check the length inside the command themselves. A new helper, nvmet_check_data_len() is created to help with this check. Signed-off-by: Christoph Hellwig [split patch, udpate changelog] Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 35 ++++++++++++++++++++++++----------- drivers/nvme/target/core.c | 16 ++++++++++++---- drivers/nvme/target/discovery.c | 18 ++++++++++++------ drivers/nvme/target/fabrics-cmd.c | 15 ++++++++++++--- drivers/nvme/target/io-cmd-bdev.c | 19 +++++++++++++------ drivers/nvme/target/io-cmd-file.c | 19 ++++++++++++------- drivers/nvme/target/nvmet.h | 3 +-- 7 files changed, 86 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 3665b45d6515..cd2c3a79f3b5 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -31,7 +31,7 @@ u64 nvmet_get_log_page_offset(struct nvme_command *cmd) static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) { - nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len)); + nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->transfer_len)); } static void nvmet_execute_get_log_page_error(struct nvmet_req *req) @@ -134,7 +134,7 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) u16 status = NVME_SC_INTERNAL; unsigned long flags; - if (req->data_len != sizeof(*log)) + if (req->transfer_len != sizeof(*log)) goto out; log = kzalloc(sizeof(*log), GFP_KERNEL); @@ -196,7 +196,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) u16 status = NVME_SC_INTERNAL; size_t len; - if (req->data_len != NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32)) + if (req->transfer_len != NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32)) goto out; mutex_lock(&ctrl->lock); @@ -206,7 +206,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) len = ctrl->nr_changed_ns * sizeof(__le32); status = nvmet_copy_to_sgl(req, 0, ctrl->changed_ns_list, len); if (!status) - status = nvmet_zero_sgl(req, len, req->data_len - len); + status = nvmet_zero_sgl(req, len, req->transfer_len - len); ctrl->nr_changed_ns = 0; nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR); mutex_unlock(&ctrl->lock); @@ -284,6 +284,9 @@ out: static void nvmet_execute_get_log_page(struct nvmet_req *req) { + if (!nvmet_check_data_len(req, nvmet_get_log_page_len(req->cmd))) + return; + switch (req->cmd->get_log_page.lid) { case NVME_LOG_ERROR: return nvmet_execute_get_log_page_error(req); @@ -594,6 +597,9 @@ out: static void nvmet_execute_identify(struct nvmet_req *req) { + if (!nvmet_check_data_len(req, NVME_IDENTIFY_DATA_SIZE)) + return; + switch (req->cmd->identify.cns) { case NVME_ID_CNS_NS: return nvmet_execute_identify_ns(req); @@ -620,6 +626,8 @@ static void nvmet_execute_identify(struct nvmet_req *req) */ static void nvmet_execute_abort(struct nvmet_req *req) { + if (!nvmet_check_data_len(req, 0)) + return; nvmet_set_result(req, 1); nvmet_req_complete(req, 0); } @@ -704,6 +712,9 @@ static void nvmet_execute_set_features(struct nvmet_req *req) u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; + if (!nvmet_check_data_len(req, 0)) + return; + switch (cdw10 & 0xff) { case NVME_FEAT_NUM_QUEUES: nvmet_set_result(req, @@ -767,6 +778,9 @@ static void nvmet_execute_get_features(struct nvmet_req *req) u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; + if (!nvmet_check_data_len(req, 0)) + return; + switch (cdw10 & 0xff) { /* * These features are mandatory in the spec, but we don't @@ -831,6 +845,9 @@ void nvmet_execute_async_event(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; + if (!nvmet_check_data_len(req, 0)) + return; + mutex_lock(&ctrl->lock); if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { mutex_unlock(&ctrl->lock); @@ -847,6 +864,9 @@ void nvmet_execute_keep_alive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; + if (!nvmet_check_data_len(req, 0)) + return; + pr_debug("ctrl %d update keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); @@ -866,31 +886,24 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_admin_get_log_page: req->execute = nvmet_execute_get_log_page; - req->data_len = nvmet_get_log_page_len(cmd); return 0; case nvme_admin_identify: req->execute = nvmet_execute_identify; - req->data_len = NVME_IDENTIFY_DATA_SIZE; return 0; case nvme_admin_abort_cmd: req->execute = nvmet_execute_abort; - req->data_len = 0; return 0; case nvme_admin_set_features: req->execute = nvmet_execute_set_features; - req->data_len = 0; return 0; case nvme_admin_get_features: req->execute = nvmet_execute_get_features; - req->data_len = 0; return 0; case nvme_admin_async_event: req->execute = nvmet_execute_async_event; - req->data_len = 0; return 0; case nvme_admin_keep_alive: req->execute = nvmet_execute_keep_alive; - req->data_len = 0; return 0; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6b39cfc6ade1..565def19d593 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -930,13 +930,21 @@ void nvmet_req_uninit(struct nvmet_req *req) } EXPORT_SYMBOL_GPL(nvmet_req_uninit); -void nvmet_req_execute(struct nvmet_req *req) +bool nvmet_check_data_len(struct nvmet_req *req, size_t data_len) { - if (unlikely(req->data_len != req->transfer_len)) { + if (unlikely(data_len != req->transfer_len)) { req->error_loc = offsetof(struct nvme_common_command, dptr); nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); - } else - req->execute(req); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(nvmet_check_data_len); + +void nvmet_req_execute(struct nvmet_req *req) +{ + req->execute(req); } EXPORT_SYMBOL_GPL(nvmet_req_execute); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 825e61e61b0c..7a868c3e8e95 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -171,6 +171,9 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) u16 status = 0; void *buffer; + if (!nvmet_check_data_len(req, data_len)) + return; + if (req->cmd->get_log_page.lid != NVME_LOG_DISC) { req->error_loc = offsetof(struct nvme_get_log_page_command, lid); @@ -240,6 +243,9 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) struct nvme_id_ctrl *id; u16 status = 0; + if (!nvmet_check_data_len(req, NVME_IDENTIFY_DATA_SIZE)) + return; + if (req->cmd->identify.cns != NVME_ID_CNS_CTRL) { req->error_loc = offsetof(struct nvme_identify, cns); status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; @@ -286,6 +292,9 @@ static void nvmet_execute_disc_set_features(struct nvmet_req *req) u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 stat; + if (!nvmet_check_data_len(req, 0)) + return; + switch (cdw10 & 0xff) { case NVME_FEAT_KATO: stat = nvmet_set_feat_kato(req); @@ -309,6 +318,9 @@ static void nvmet_execute_disc_get_features(struct nvmet_req *req) u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 stat = 0; + if (!nvmet_check_data_len(req, 0)) + return; + switch (cdw10 & 0xff) { case NVME_FEAT_KATO: nvmet_get_feat_kato(req); @@ -341,26 +353,20 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_admin_set_features: req->execute = nvmet_execute_disc_set_features; - req->data_len = 0; return 0; case nvme_admin_get_features: req->execute = nvmet_execute_disc_get_features; - req->data_len = 0; return 0; case nvme_admin_async_event: req->execute = nvmet_execute_async_event; - req->data_len = 0; return 0; case nvme_admin_keep_alive: req->execute = nvmet_execute_keep_alive; - req->data_len = 0; return 0; case nvme_admin_get_log_page: - req->data_len = nvmet_get_log_page_len(cmd); req->execute = nvmet_execute_disc_get_log_page; return 0; case nvme_admin_identify: - req->data_len = NVME_IDENTIFY_DATA_SIZE; req->execute = nvmet_execute_disc_identify; return 0; default: diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index d16b55ffe79f..f7297473d9eb 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -12,6 +12,9 @@ static void nvmet_execute_prop_set(struct nvmet_req *req) u64 val = le64_to_cpu(req->cmd->prop_set.value); u16 status = 0; + if (!nvmet_check_data_len(req, 0)) + return; + if (req->cmd->prop_set.attrib & 1) { req->error_loc = offsetof(struct nvmf_property_set_command, attrib); @@ -38,6 +41,9 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) u16 status = 0; u64 val = 0; + if (!nvmet_check_data_len(req, 0)) + return; + if (req->cmd->prop_get.attrib & 1) { switch (le32_to_cpu(req->cmd->prop_get.offset)) { case NVME_REG_CAP: @@ -82,11 +88,9 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) switch (cmd->fabrics.fctype) { case nvme_fabrics_type_property_set: - req->data_len = 0; req->execute = nvmet_execute_prop_set; break; case nvme_fabrics_type_property_get: - req->data_len = 0; req->execute = nvmet_execute_prop_get; break; default: @@ -147,6 +151,9 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) struct nvmet_ctrl *ctrl = NULL; u16 status = 0; + if (!nvmet_check_data_len(req, sizeof(struct nvmf_connect_data))) + return; + d = kmalloc(sizeof(*d), GFP_KERNEL); if (!d) { status = NVME_SC_INTERNAL; @@ -211,6 +218,9 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) u16 qid = le16_to_cpu(c->qid); u16 status = 0; + if (!nvmet_check_data_len(req, sizeof(struct nvmf_connect_data))) + return; + d = kmalloc(sizeof(*d), GFP_KERNEL); if (!d) { status = NVME_SC_INTERNAL; @@ -281,7 +291,6 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } - req->data_len = sizeof(struct nvmf_connect_data); if (cmd->connect.qid == 0) req->execute = nvmet_execute_admin_connect; else diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index f2618dc2ef3a..04a9cd2a2604 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -150,6 +150,9 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) sector_t sector; int op, op_flags = 0, i; + if (!nvmet_check_data_len(req, nvmet_rw_len(req))) + return; + if (!req->sg_cnt) { nvmet_req_complete(req, 0); return; @@ -170,7 +173,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) sector = le64_to_cpu(req->cmd->rw.slba); sector <<= (req->ns->blksize_shift - 9); - if (req->data_len <= NVMET_MAX_INLINE_DATA_LEN) { + if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { bio = &req->b.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); } else { @@ -207,6 +210,9 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) { struct bio *bio = &req->b.inline_bio; + if (!nvmet_check_data_len(req, 0)) + return; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); bio->bi_private = req; @@ -272,6 +278,9 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req) static void nvmet_bdev_execute_dsm(struct nvmet_req *req) { + if (!nvmet_check_data_len(req, nvmet_dsm_len(req))) + return; + switch (le32_to_cpu(req->cmd->dsm.attributes)) { case NVME_DSMGMT_AD: nvmet_bdev_execute_discard(req); @@ -293,6 +302,9 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) sector_t nr_sector; int ret; + if (!nvmet_check_data_len(req, 0)) + return; + sector = le64_to_cpu(write_zeroes->slba) << (req->ns->blksize_shift - 9); nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << @@ -317,20 +329,15 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) case nvme_cmd_read: case nvme_cmd_write: req->execute = nvmet_bdev_execute_rw; - req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: req->execute = nvmet_bdev_execute_flush; - req->data_len = 0; return 0; case nvme_cmd_dsm: req->execute = nvmet_bdev_execute_dsm; - req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) * - sizeof(struct nvme_dsm_range); return 0; case nvme_cmd_write_zeroes: req->execute = nvmet_bdev_execute_write_zeroes; - req->data_len = 0; return 0; default: pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 7481556da6e6..caebfce06605 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -126,7 +126,7 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) mempool_free(req->f.bvec, req->ns->bvec_pool); } - if (unlikely(ret != req->data_len)) + if (unlikely(ret != req->transfer_len)) status = errno_to_nvme_status(req, ret); nvmet_req_complete(req, status); } @@ -146,7 +146,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) is_sync = true; pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; - if (unlikely(pos + req->data_len > req->ns->size)) { + if (unlikely(pos + req->transfer_len > req->ns->size)) { nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); return true; } @@ -173,7 +173,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) nr_bvec--; } - if (WARN_ON_ONCE(total_len != req->data_len)) { + if (WARN_ON_ONCE(total_len != req->transfer_len)) { ret = -EIO; goto complete; } @@ -232,6 +232,9 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) { ssize_t nr_bvec = req->sg_cnt; + if (!nvmet_check_data_len(req, nvmet_rw_len(req))) + return; + if (!req->sg_cnt || !nr_bvec) { nvmet_req_complete(req, 0); return; @@ -273,6 +276,8 @@ static void nvmet_file_flush_work(struct work_struct *w) static void nvmet_file_execute_flush(struct nvmet_req *req) { + if (!nvmet_check_data_len(req, 0)) + return; INIT_WORK(&req->f.work, nvmet_file_flush_work); schedule_work(&req->f.work); } @@ -331,6 +336,8 @@ static void nvmet_file_dsm_work(struct work_struct *w) static void nvmet_file_execute_dsm(struct nvmet_req *req) { + if (!nvmet_check_data_len(req, nvmet_dsm_len(req))) + return; INIT_WORK(&req->f.work, nvmet_file_dsm_work); schedule_work(&req->f.work); } @@ -359,6 +366,8 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w) static void nvmet_file_execute_write_zeroes(struct nvmet_req *req) { + if (!nvmet_check_data_len(req, 0)) + return; INIT_WORK(&req->f.work, nvmet_file_write_zeroes_work); schedule_work(&req->f.work); } @@ -371,19 +380,15 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) case nvme_cmd_read: case nvme_cmd_write: req->execute = nvmet_file_execute_rw; - req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: req->execute = nvmet_file_execute_flush; - req->data_len = 0; return 0; case nvme_cmd_dsm: req->execute = nvmet_file_execute_dsm; - req->data_len = nvmet_dsm_len(req); return 0; case nvme_cmd_write_zeroes: req->execute = nvmet_file_execute_write_zeroes; - req->data_len = 0; return 0; default: pr_err("unhandled cmd for file ns %d on qid %d\n", diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6ccf2d098d9f..ff55f1005b35 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -304,8 +304,6 @@ struct nvmet_req { } f; }; int sg_cnt; - /* data length as parsed from the command: */ - size_t data_len; /* data length as parsed from the SGL descriptor: */ size_t transfer_len; @@ -375,6 +373,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); +bool nvmet_check_data_len(struct nvmet_req *req, size_t data_len); void nvmet_req_execute(struct nvmet_req *req); void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgl(struct nvmet_req *req); -- cgit v1.2.3-59-g8ed1b From be3f3114ddd58d12f64b872247bb1bc46df56b36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Oct 2019 10:35:45 -0600 Subject: nvmet: Open code nvmet_req_execute() Now that nvmet_req_execute does nothing, open code it. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig [split patch, update changelog] Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 6 ------ drivers/nvme/target/fc.c | 4 ++-- drivers/nvme/target/loop.c | 2 +- drivers/nvme/target/nvmet.h | 1 - drivers/nvme/target/rdma.c | 4 ++-- drivers/nvme/target/tcp.c | 6 +++--- 6 files changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 565def19d593..cde58c001b23 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -942,12 +942,6 @@ bool nvmet_check_data_len(struct nvmet_req *req, size_t data_len) } EXPORT_SYMBOL_GPL(nvmet_check_data_len); -void nvmet_req_execute(struct nvmet_req *req) -{ - req->execute(req); -} -EXPORT_SYMBOL_GPL(nvmet_req_execute); - int nvmet_req_alloc_sgl(struct nvmet_req *req) { struct pci_dev *p2p_dev = NULL; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 61b617698d3f..a0db6371b43e 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2018,7 +2018,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } /* data transfer complete, resume with nvmet layer */ - nvmet_req_execute(&fod->req); + fod->req.execute(&fod->req); break; case NVMET_FCOP_READDATA: @@ -2234,7 +2234,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, * can invoke the nvmet_layer now. If read data, cmd completion will * push the data */ - nvmet_req_execute(&fod->req); + fod->req.execute(&fod->req); return; transport_error: diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 5b7b19774bb0..856eb0652f89 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -125,7 +125,7 @@ static void nvme_loop_execute_work(struct work_struct *work) struct nvme_loop_iod *iod = container_of(work, struct nvme_loop_iod, work); - nvmet_req_execute(&iod->req); + iod->req.execute(&iod->req); } static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index ff55f1005b35..46df45e837c9 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -374,7 +374,6 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); bool nvmet_check_data_len(struct nvmet_req *req, size_t data_len); -void nvmet_req_execute(struct nvmet_req *req); void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgl(struct nvmet_req *req); void nvmet_req_free_sgl(struct nvmet_req *req); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index ccf982164136..37d262a65877 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -603,7 +603,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) return; } - nvmet_req_execute(&rsp->req); + rsp->req.execute(&rsp->req); } static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, @@ -746,7 +746,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { - nvmet_req_execute(&rsp->req); + rsp->req.execute(&rsp->req); } return true; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 3378480c49f6..af674fc0bb1e 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -930,7 +930,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) goto out; } - nvmet_req_execute(&queue->cmd->req); + queue->cmd->req.execute(&queue->cmd->req); out: nvmet_prepare_receive_pdu(queue); return ret; @@ -1050,7 +1050,7 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) nvmet_tcp_prep_recv_ddgst(cmd); return 0; } - nvmet_req_execute(&cmd->req); + cmd->req.execute(&cmd->req); } nvmet_prepare_receive_pdu(queue); @@ -1090,7 +1090,7 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && cmd->rbytes_done == cmd->req.transfer_len) - nvmet_req_execute(&cmd->req); + cmd->req.execute(&cmd->req); ret = 0; out: nvmet_prepare_receive_pdu(queue); -- cgit v1.2.3-59-g8ed1b From d4b3a1741130dfc812b0825db4cb1c61032da183 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 24 Oct 2019 09:55:58 -0700 Subject: nvmet: fill discovery controller sn, fr and mn correctly Discovery controllers need this information as well. Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/discovery.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 7a868c3e8e95..0c2274b21e15 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -241,6 +241,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ctrl *id; + const char model[] = "Linux"; u16 status = 0; if (!nvmet_check_data_len(req, NVME_IDENTIFY_DATA_SIZE)) @@ -258,8 +259,13 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) goto out; } + memset(id->sn, ' ', sizeof(id->sn)); + bin2hex(id->sn, &ctrl->subsys->serial, + min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2)); memset(id->fr, ' ', sizeof(id->fr)); - strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' '); + memcpy_and_pad(id->fr, sizeof(id->fr), + UTS_RELEASE, strlen(UTS_RELEASE), ' '); /* no limit on data transfer sizes for now */ id->mdts = 0; -- cgit v1.2.3-59-g8ed1b From 05d3046ff755474557e885f38f85e9b2a032cec0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:24:00 +0200 Subject: nvme-pci: Spelling s/resdicovered/rediscovered/ Fix misspelling of "rediscovered". Signed-off-by: Geert Uytterhoeven Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 612f92255f9d..1b1b0db45567 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2961,7 +2961,7 @@ static int nvme_suspend(struct device *dev) /* * Clearing npss forces a controller reset on resume. The - * correct value will be resdicovered then. + * correct value will be rediscovered then. */ nvme_dev_disable(ndev, true); ctrl->npss = 0; -- cgit v1.2.3-59-g8ed1b From d84dd8cde6742054a2c802df841fa5aab5b99122 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Oct 2019 15:38:58 +0200 Subject: nvmet: clean up command parsing a bit Move the special cases for fabrics commands and the discovery controller to nvmet_parse_admin_cmd in preparation for adding passthrough support. Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 5 +++++ drivers/nvme/target/core.c | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index cd2c3a79f3b5..56c21b501185 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -879,6 +879,11 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) struct nvme_command *cmd = req->cmd; u16 ret; + if (nvme_is_fabrics(cmd)) + return nvmet_parse_fabrics_cmd(req); + if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) + return nvmet_parse_discovery_cmd(req); + ret = nvmet_check_ctrl_status(req, cmd); if (unlikely(ret)) return ret; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cde58c001b23..28438b833c1b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -892,14 +892,10 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, } if (unlikely(!req->sq->ctrl)) - /* will return an error for any Non-connect command: */ + /* will return an error for any non-connect command: */ status = nvmet_parse_connect_cmd(req); else if (likely(req->sq->qid != 0)) status = nvmet_parse_io_cmd(req); - else if (nvme_is_fabrics(req->cmd)) - status = nvmet_parse_fabrics_cmd(req); - else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) - status = nvmet_parse_discovery_cmd(req); else status = nvmet_parse_admin_cmd(req); -- cgit v1.2.3-59-g8ed1b From 9dea0c81ee4a7b5d8e5bc0d4cfa2ee4f0e7b13f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Oct 2019 11:23:26 -0700 Subject: nvmet: add plugging for read/write when ns is bdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With reference to the following issue reported on the mailing list :- http://lists.infradead.org/pipermail/linux-nvme/2019-October/027604.html this patch adds plugging for the bdev-ns under nvmet_bdev_execute_rw(). We can see the following performance improvement in random write workload I/Os with the setup described in the link when device_path configured as /dev/md0. Without this patch :-   write: IOPS=40.8k, BW=159MiB/s (167MB/s)(4777MiB/30002msec)   write: IOPS=41.2k, BW=161MiB/s (169MB/s)(4831MiB/30011msec)     slat (usec): min=8,  max=10823, avg=15.64,  stdev=16.85     slat (usec): min=8,  max=401,   avg=15.40,  stdev= 9.56     clat (usec): min=54, max=2492,  avg=759.07, stdev=172.62     clat (usec): min=56, max=1997,  avg=768.06, stdev=178.72 With this patch :-   write: IOPS=123k, BW=480MiB/s (504MB/s)(14.1GiB/30011msec)   write: IOPS=123k, BW=481MiB/s (504MB/s)(14.1GiB/30002msec)     slat (usec): min=8,  max=9941,  avg=13.31,  stdev= 8.04     slat (usec): min=8,  max=289,   avg=13.31,  stdev= 3.37     clat (usec): min=43, max=17635, avg=245.46, stdev=171.23     clat (usec): min=44, max=17751, avg=245.25, stdev=183.14 Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-bdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 04a9cd2a2604..07e4f8c579d3 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -147,6 +147,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) int sg_cnt = req->sg_cnt; struct bio *bio; struct scatterlist *sg; + struct blk_plug plug; sector_t sector; int op, op_flags = 0, i; @@ -185,6 +186,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) bio->bi_end_io = nvmet_bio_done; bio_set_op_attrs(bio, op, op_flags); + blk_start_plug(&plug); for_each_sg(req->sg, sg, req->sg_cnt, i) { while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) != sg->length) { @@ -204,6 +206,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) } submit_bio(bio); + blk_finish_plug(&plug); } static void nvmet_bdev_execute_flush(struct nvmet_req *req) -- cgit v1.2.3-59-g8ed1b From 716fd9c119a982c34af196fe9205d7a617f38e3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 08:12:23 +0100 Subject: nvmet: stop using bio_set_op_attrs bio_set_op_attrs has been long deprecated, replace it with a direct assignment of the flags to bio->bi_opf. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-bdev.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 07e4f8c579d3..b6fca0e421ef 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -149,7 +149,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) struct scatterlist *sg; struct blk_plug plug; sector_t sector; - int op, op_flags = 0, i; + int op, i; if (!nvmet_check_data_len(req, nvmet_rw_len(req))) return; @@ -160,16 +160,15 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) } if (req->cmd->rw.opcode == nvme_cmd_write) { - op = REQ_OP_WRITE; - op_flags = REQ_SYNC | REQ_IDLE; + op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) - op_flags |= REQ_FUA; + op |= REQ_FUA; } else { op = REQ_OP_READ; } if (is_pci_p2pdma_page(sg_page(req->sg))) - op_flags |= REQ_NOMERGE; + op |= REQ_NOMERGE; sector = le64_to_cpu(req->cmd->rw.slba); sector <<= (req->ns->blksize_shift - 9); @@ -184,7 +183,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) bio->bi_iter.bi_sector = sector; bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = op; blk_start_plug(&plug); for_each_sg(req->sg, sg, req->sg_cnt, i) { @@ -195,7 +194,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = op; bio_chain(bio, prev); submit_bio(prev); -- cgit v1.2.3-59-g8ed1b From 64fab7290dc3561729bbc1e35895a517eb2e549e Mon Sep 17 00:00:00 2001 From: Prabhath Sajeepa Date: Mon, 28 Oct 2019 16:56:48 -0600 Subject: nvme: Fix parsing of ANA log page Check validity of offset into ANA log buffer before accessing nvme_ana_group_desc. This check ensures the size of ANA log buffer >= offset + sizeof(nvme_ana_group_desc) Reviewed-by: Sagi Grimberg Signed-off-by: Prabhath Sajeepa Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 103c04fa7746..682be6195a95 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -445,8 +445,14 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { struct nvme_ana_group_desc *desc = base + offset; - u32 nr_nsids = le32_to_cpu(desc->nnsids); - size_t nsid_buf_size = nr_nsids * sizeof(__le32); + u32 nr_nsids; + size_t nsid_buf_size; + + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) + return -EINVAL; + + nr_nsids = le32_to_cpu(desc->nnsids); + nsid_buf_size = nr_nsids * sizeof(__le32); if (WARN_ON_ONCE(desc->grpid == 0)) return -EINVAL; @@ -466,8 +472,6 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, return error; offset += nsid_buf_size; - if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) - return -EINVAL; } return 0; -- cgit v1.2.3-59-g8ed1b From 2e2d6f7e44a2b9f96ca8af445ae0150a6cefde41 Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:48 +0900 Subject: dm: add zone open, close and finish support Implement REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH support to allow explicit control of zone states. Contains contributions from Matias Bjorling, Hans Holmberg and Damien Le Moal. Acked-by: Mike Snitzer Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/md/dm-flakey.c | 7 +++---- drivers/md/dm-linear.c | 2 +- drivers/md/dm.c | 5 +++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 2900fbde89b3..76587e9af0ef 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -280,7 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) struct flakey_c *fc = ti->private; bio_set_dev(bio, fc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } @@ -322,8 +322,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); pb->bio_submitted = false; - /* Do not fail reset zone */ - if (bio_op(bio) == REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; /* Are we alive ? */ @@ -384,7 +383,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - if (bio_op(bio) == REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ecefe6703736..97acafd48c85 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -90,7 +90,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1a5e328c443a..bc143c1b2333 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1174,7 +1174,8 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. + * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET, + * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH. * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1627,7 +1628,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ - } else if (bio_op(bio) == REQ_OP_ZONE_RESET) { + } else if (op_is_zone_mgmt(bio_op(bio))) { ci.bio = bio; ci.sector_count = 0; error = __split_and_process_non_flush(&ci); -- cgit v1.2.3-59-g8ed1b From da644b2cc1a4664ff7f75d3ae50e3fcf638580d9 Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:49 +0900 Subject: null_blk: add zone open, close, and finish support Implement REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH support to allow explicit control of zone states. Contains contributions from Matias Bjorling, Hans Holmberg, Keith Busch and Damien Le Moal. Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index e020f17dac9f..be7646205b8c 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -136,13 +136,14 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return BLK_STS_OK; } -static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector) +static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector) { struct nullb_device *dev = cmd->nq->dev; struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; size_t i; - switch (req_op(cmd->rq)) { + switch (op) { case REQ_OP_ZONE_RESET_ALL: for (i = 0; i < dev->nr_zones; i++) { if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL) @@ -158,6 +159,29 @@ static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector) zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; break; + case REQ_OP_ZONE_OPEN: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + if (zone->cond == BLK_ZONE_COND_FULL) + return BLK_STS_IOERR; + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + if (zone->cond == BLK_ZONE_COND_FULL) + return BLK_STS_IOERR; + + zone->cond = BLK_ZONE_COND_CLOSED; + break; + case REQ_OP_ZONE_FINISH: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->len; + break; default: cmd->error = BLK_STS_NOTSUPP; break; @@ -173,7 +197,10 @@ blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op, return null_zone_write(cmd, sector, nr_sectors); case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: - return null_zone_reset(cmd, sector); + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return null_zone_mgmt(cmd, op, sector); default: return BLK_STS_OK; } -- cgit v1.2.3-59-g8ed1b From 5fa4f8bac9516b988d2ccd3f05f4267f8da24269 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 25 Oct 2019 09:08:56 +0200 Subject: md/raid1: avoid soft lockup under high load As all I/O is being pushed through a kernel thread the softlockup watchdog might be triggered under high load. Signed-off-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/raid1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bb29aeefcbd0..a409ab6f30bc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -819,6 +819,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) else generic_make_request(bio); bio = next; + cond_resched(); } } -- cgit v1.2.3-59-g8ed1b From 228fc7d76db68732677230a3c64337908fd298e3 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 30 Oct 2019 18:47:02 +0800 Subject: md: avoid invalid memory access for array sb->dev_roles we need to gurantee 'desc_nr' valid before access array of sb->dev_roles. In addition, we should avoid .load_super always return '0' when level is LEVEL_MULTIPATH, which is not expected. Reported-by: coverity-bot Addresses-Coverity-ID: 1487373 ("Memory - illegal accesses") Fixes: 6a5cb53aaa4e ("md: no longer compare spare disk superblock events in super_load") Signed-off-by: Yufen Yu Signed-off-by: Song Liu --- drivers/md/md.c | 51 ++++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6f0ecfe8eab2..805b33e27496 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1105,6 +1105,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; + bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), @@ -1155,13 +1156,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor else rdev->desc_nr = sb->this_disk.number; + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == LEVEL_MULTIPATH || + (rdev->desc_nr >= 0 && + sb->disks[rdev->desc_nr].state & + ((1<disks[rdev->desc_nr].state & ( - (1<disks[rdev->desc_nr].state & ( - (1< ev2)) + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -1547,7 +1544,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; - __u64 role; + bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. @@ -1681,17 +1678,16 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != 0) return -EINVAL; - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || + (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + spare_disk = false; if (!refdev) { - /* - * Insist of good event counter while assembling, except for - * spares (which don't need an event count) - */ - if (rdev->desc_nr >= 0 && - rdev->desc_nr < le32_to_cpu(sb->max_dev) && - (role < MD_DISK_ROLE_MAX || - role == MD_DISK_ROLE_JOURNAL)) + if (!spare_disk) ret = 1; else ret = 0; @@ -1711,14 +1707,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); - /* - * Insist of good event counter while assembling, except for - * spares (which don't need an event count) - */ - if (rdev->desc_nr >= 0 && - rdev->desc_nr < le32_to_cpu(sb->max_dev) && - (role < MD_DISK_ROLE_MAX || - role == MD_DISK_ROLE_JOURNAL) && ev1 > ev2) + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; -- cgit v1.2.3-59-g8ed1b From 45422b704db392a6d79d07ee3e3670b11048bd53 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Mon, 11 Nov 2019 16:43:20 -0800 Subject: md/raid10: prevent access of uninitialized resync_pages offset Due to unneeded multiplication in the out_free_pages portion of r10buf_pool_alloc(), when using a 3-copy raid10 layout, it is possible to access a resync_pages offset that has not been initialized. This access translates into a crash of the system within resync_free_pages() while passing a bad pointer to put_page(). Remove the multiplication, preventing access to the uninitialized area. Fixes: f0250618361db ("md: raid10: don't use bio's vec table to manage resync pages") Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: John Pittman Suggested-by: David Jeffery Reviewed-by: Laurence Oberman Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2eca0a81a8c9..ec136e44aef7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -191,7 +191,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) out_free_pages: while (--j >= 0) - resync_free_pages(&rps[j * 2]); + resync_free_pages(&rps[j]); j = 0; out_free_bio: -- cgit v1.2.3-59-g8ed1b From c0e0954e909c17b43d176ab219fc598964616ae6 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:15 +0800 Subject: bcache: fix fifo index swapping condition in journal_pin_cmp() Fifo structure journal.pin is implemented by a cycle buffer, if the back index reaches highest location of the cycle buffer, it will be swapped to 0. Once the swapping happens, it means a smaller fifo index might be associated to a newer journal entry. So the btree node with oldest journal entry won't be selected in bch_btree_leaf_dirty() to reference the dirty B+tree leaf node. This problem may cause bcache journal won't protect unflushed oldest B+tree dirty leaf node in power failure, and this B+tree leaf node is possible to beinconsistent after reboot from power failure. This patch fixes the fifo index comparing logic in journal_pin_cmp(), to avoid potential corrupted B+tree leaf node when the back index of journal pin is swapped. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 26 ++++++++++++++++++++++++++ drivers/md/bcache/journal.h | 4 ---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ba434d9ac720..00523cd1db80 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -528,6 +528,32 @@ static void btree_node_write_work(struct work_struct *w) mutex_unlock(&b->write_lock); } +/* return true if journal pin 'l' is newer than 'r' */ +static bool journal_pin_cmp(struct cache_set *c, + atomic_t *l, + atomic_t *r) +{ + int l_idx, r_idx, f_idx, b_idx; + bool ret = false; + + l_idx = fifo_idx(&(c)->journal.pin, (l)); + r_idx = fifo_idx(&(c)->journal.pin, (r)); + f_idx = (c)->journal.pin.front; + b_idx = (c)->journal.pin.back; + + if (l_idx > r_idx) + ret = true; + /* in case fifo back pointer is swapped */ + if (b_idx < f_idx) { + if (l_idx <= b_idx && r_idx >= f_idx) + ret = true; + else if (l_idx >= f_idx && r_idx <= b_idx) + ret = false; + } + + return ret; +} + static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { struct bset *i = btree_bset_last(b); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index f2ea34d5f431..06b3eaab7d16 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -157,10 +157,6 @@ struct journal_device { }; #define BTREE_FLUSH_NR 8 - -#define journal_pin_cmp(c, l, r) \ - (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) - #define JOURNAL_PIN 20000 #define journal_full(j) \ -- cgit v1.2.3-59-g8ed1b From 34cf78bf34d48dddddfeeadb44f9841d7864997a Mon Sep 17 00:00:00 2001 From: Guoju Fang Date: Wed, 13 Nov 2019 16:03:16 +0800 Subject: bcache: fix a lost wake-up problem caused by mca_cannibalize_lock This patch fix a lost wake-up problem caused by the race between mca_cannibalize_lock and bch_cannibalize_unlock. Consider two processes, A and B. Process A is executing mca_cannibalize_lock, while process B takes c->btree_cache_alloc_lock and is executing bch_cannibalize_unlock. The problem happens that after process A executes cmpxchg and will execute prepare_to_wait. In this timeslice process B executes wake_up, but after that process A executes prepare_to_wait and set the state to TASK_INTERRUPTIBLE. Then process A goes to sleep but no one will wake up it. This problem may cause bcache device to dead. Signed-off-by: Guoju Fang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 12 ++++++++---- drivers/md/bcache/super.c | 1 + 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 013e35a9e317..3653faf3bf48 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -582,6 +582,7 @@ struct cache_set { */ wait_queue_head_t btree_cache_wait; struct task_struct *btree_cache_alloc_lock; + spinlock_t btree_cannibalize_lock; /* * When we free a btree node, we increment the gen of the bucket the diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 00523cd1db80..39d7fc1ef1ee 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -910,15 +910,17 @@ out: static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { - struct task_struct *old; - - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); - if (old && old != current) { + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); return -EINTR; } + spin_unlock(&c->btree_cannibalize_lock); return 0; } @@ -953,10 +955,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, */ static void bch_cannibalize_unlock(struct cache_set *c) { + spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { c->btree_cache_alloc_lock = NULL; wake_up(&c->btree_cache_wait); } + spin_unlock(&c->btree_cannibalize_lock); } static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 20ed838e9413..ebb854ed05a4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1769,6 +1769,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); -- cgit v1.2.3-59-g8ed1b From 2d8869518a525c9bce5f5268419df9dfbe3dfdeb Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:17 +0800 Subject: bcache: fix static checker warning in bcache_device_free() Commit cafe56359144 ("bcache: A block layer cache") leads to the following static checker warning: ./drivers/md/bcache/super.c:770 bcache_device_free() warn: variable dereferenced before check 'd->disk' (see line 766) drivers/md/bcache/super.c 762 static void bcache_device_free(struct bcache_device *d) 763 { 764 lockdep_assert_held(&bch_register_lock); 765 766 pr_info("%s stopped", d->disk->disk_name); ^^^^^^^^^ Unchecked dereference. 767 768 if (d->c) 769 bcache_device_detach(d); 770 if (d->disk && d->disk->flags & GENHD_FL_UP) ^^^^^^^ Check too late. 771 del_gendisk(d->disk); 772 if (d->disk && d->disk->queue) 773 blk_cleanup_queue(d->disk->queue); 774 if (d->disk) { 775 ida_simple_remove(&bcache_device_idx, 776 first_minor_to_idx(d->disk->first_minor)); 777 put_disk(d->disk); 778 } 779 It is not 100% sure that the gendisk struct of bcache device will always be there, the warning makes sense when there is problem in block core. This patch tries to remove the static checking warning by checking d->disk to avoid NULL pointer deferences. Reported-by: Dan Carpenter Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ebb854ed05a4..7beccede5360 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -761,20 +761,28 @@ static inline int idx_to_first_minor(int idx) static void bcache_device_free(struct bcache_device *d) { + struct gendisk *disk = d->disk; + lockdep_assert_held(&bch_register_lock); - pr_info("%s stopped", d->disk->disk_name); + if (disk) + pr_info("%s stopped", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped"); if (d->c) bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { + + if (disk) { + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + + if (disk->queue) + blk_cleanup_queue(disk->queue); + ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(d->disk->first_minor)); - put_disk(d->disk); + first_minor_to_idx(disk->first_minor)); + put_disk(disk); } bioset_exit(&d->bio_split); -- cgit v1.2.3-59-g8ed1b From aaf8dbeab5865720c66db60ae8329309e81a0c9c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:18 +0800 Subject: bcache: add more accurate error messages in read_super() Previous code only returns "Not a bcache superblock" for both bcache super block offset and magic error. This patch addss more accurate error messages, - for super block unmatched offset: "Not a bcache superblock (bad offset)" - for super block unmatched magic number: "Not a bcache superblock (bad magic)" Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7beccede5360..623fdaf10c4c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", sb->version, sb->flags, sb->seq, sb->keys); - err = "Not a bcache superblock"; + err = "Not a bcache superblock (bad offset)"; if (sb->offset != SB_SECTOR) goto err; + err = "Not a bcache superblock (bad magic)"; if (memcmp(sb->magic, bcache_magic, 16)) goto err; -- cgit v1.2.3-59-g8ed1b From 41fa4deef90ba1cd048b740317f50b9decae9fc8 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:19 +0800 Subject: bcache: deleted code comments for dead code in bch_data_insert_keys() In request.c:bch_data_insert_keys(), there is code comment for a piece of dead code. This patch deletes the dead code and its code comment since they are useless in practice. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 41adcd1546f1..73478a91a342 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl) struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; - /* - * If we're looping, might already be waiting on - * another journal write - can't wait on more than one journal write at - * a time - * - * XXX: this looks wrong - */ -#if 0 - while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) - closure_sync(&s->cl); -#endif - if (!op->replace) journal_ref = bch_journal(op->c, &op->insert_keys, op->flush_journal ? cl : NULL); -- cgit v1.2.3-59-g8ed1b From 06c1526da97dd0022973de3fc41b79b2d431b435 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:20 +0800 Subject: bcache: add code comment bch_keylist_pop() and bch_keylist_pop_front() This patch adds simple code comments for bch_keylist_pop() and bch_keylist_pop_front() in bset.c, to make the code more easier to be understand. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 08768796b543..f37a429f093d 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) return 0; } +/* Pop the top key of keylist by pointing l->top to its previous key */ struct bkey *bch_keylist_pop(struct keylist *l) { struct bkey *k = l->keys; @@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) return l->top = k; } +/* Pop the bottom key of keylist and update l->top_p */ void bch_keylist_pop_front(struct keylist *l) { l->top_p -= bkey_u64s(l->keys); -- cgit v1.2.3-59-g8ed1b From 84c529aea182939e68f618ed9813740c9165c7eb Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 13 Nov 2019 16:03:21 +0800 Subject: bcache: fix deadlock in bcache_allocator bcache_allocator can call the following: bch_allocator_thread() -> bch_prio_write() -> bch_bucket_alloc() -> wait on &ca->set->bucket_wait But the wake up event on bucket_wait is supposed to come from bch_allocator_thread() itself => deadlock: [ 1158.490744] INFO: task bcache_allocato:15861 blocked for more than 10 seconds. [ 1158.495929] Not tainted 5.3.0-050300rc3-generic #201908042232 [ 1158.500653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1158.504413] bcache_allocato D 0 15861 2 0x80004000 [ 1158.504419] Call Trace: [ 1158.504429] __schedule+0x2a8/0x670 [ 1158.504432] schedule+0x2d/0x90 [ 1158.504448] bch_bucket_alloc+0xe5/0x370 [bcache] [ 1158.504453] ? wait_woken+0x80/0x80 [ 1158.504466] bch_prio_write+0x1dc/0x390 [bcache] [ 1158.504476] bch_allocator_thread+0x233/0x490 [bcache] [ 1158.504491] kthread+0x121/0x140 [ 1158.504503] ? invalidate_buckets+0x890/0x890 [bcache] [ 1158.504506] ? kthread_park+0xb0/0xb0 [ 1158.504510] ret_from_fork+0x35/0x40 Fix by making the call to bch_prio_write() non-blocking, so that bch_allocator_thread() never waits on itself. Moreover, make sure to wake up the garbage collector thread when bch_prio_write() is failing to allocate buckets. BugLink: https://bugs.launchpad.net/bugs/1784665 BugLink: https://bugs.launchpad.net/bugs/1796292 Signed-off-by: Andrea Righi Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 5 ++++- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 27 +++++++++++++++++++++------ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6f776823b9ba..a1df0d95151c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -377,7 +377,10 @@ retry_invalidate: if (!fifo_full(&ca->free_inc)) goto retry_invalidate; - bch_prio_write(ca); + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } } } out: diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3653faf3bf48..50241e045c70 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -978,7 +978,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *ca); +int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 623fdaf10c4c..d1352fcc6ff2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -530,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_sync(cl); } -void bch_prio_write(struct cache *ca) +int bch_prio_write(struct cache *ca, bool wait) { int i; struct bucket *b; struct closure cl; + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. + */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + closure_init_stack(&cl); lockdep_assert_held(&ca->set->bucket_lock); @@ -545,9 +562,6 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; struct prio_set *p = ca->disk_buckets; @@ -565,7 +579,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -594,6 +608,7 @@ void bch_prio_write(struct cache *ca) ca->prio_last_buckets[i] = ca->prio_buckets[i]; } + return 0; } static void prio_read(struct cache *ca, uint64_t bucket) @@ -1964,7 +1979,7 @@ static int run_cache_set(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) - bch_prio_write(ca); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; -- cgit v1.2.3-59-g8ed1b From 5dccefd3ea0b33cf3e5a45cbccc7e0bf22791655 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:22 +0800 Subject: bcache: add code comments in bch_btree_leaf_dirty() This patch adds code comments in bch_btree_leaf_dirty() to explain why w->journal should always reference the eldest journal pin of all the writing bkeys in the btree node. To make the bcache journal code to be easier to be understood. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 39d7fc1ef1ee..48e33ee0d876 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -569,6 +569,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) set_btree_node_dirty(b); + /* + * w->journal is always the oldest journal pin of all bkeys + * in the leaf node, to make sure the oldest jset seq won't + * be increased before this btree node is flushed. + */ if (journal_ref) { if (w->journal && journal_pin_cmp(b->c, w->journal, journal_ref)) { -- cgit v1.2.3-59-g8ed1b From c5fcdedcee4e6ae15c0eb5e0fbe25467e57d2963 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:23 +0800 Subject: bcache: add idle_max_writeback_rate sysfs interface For writeback mode, if there is no regular I/O request for a while, the writeback rate will be set to the maximum value (1TB/s for now). This is good for most of the storage workload, but there are still people don't what the maximum writeback rate in I/O idle time. This patch adds a sysfs interface file idle_max_writeback_rate to permit people to disable maximum writeback rate. Then the minimum writeback rate can be advised by writeback_rate_minimum in the bcache device's sysfs interface. Reported-by: Christian Balzer Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 1 + drivers/md/bcache/sysfs.c | 7 +++++++ drivers/md/bcache/writeback.c | 4 ++++ 4 files changed, 13 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 50241e045c70..9198c1b480d9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -724,6 +724,7 @@ struct cache_set { unsigned int gc_always_rewrite:1; unsigned int shrinker_disabled:1; unsigned int copy_gc_enabled:1; + unsigned int idle_max_writeback_rate_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d1352fcc6ff2..77e9869345e7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1834,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->idle_max_writeback_rate_enabled = 1; WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); return c; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 627dcea0f5b6..733e2ddf3c78 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); @@ -747,6 +748,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(idle_max_writeback_rate, "%i", + c->idle_max_writeback_rate_enabled); sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -864,6 +867,9 @@ STORE(__bch_cache_set) sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(idle_max_writeback_rate, + c->idle_max_writeback_rate_enabled); + /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be @@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_idle_max_writeback_rate, &sysfs_gc_after_writeback, &sysfs_io_disable, &sysfs_cutoff_writeback, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d60268fe49e1..4a40f9eadeaf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't sst max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; -- cgit v1.2.3-59-g8ed1b From 9fcc34b1a6dd4b8e5337e2b6ef45e428897eca6b Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 13 Nov 2019 16:03:24 +0800 Subject: bcache: at least try to shrink 1 node in bch_mca_scan() In bch_mca_scan(), the number of shrinking btree node is calculated by code like this, unsigned long nr = sc->nr_to_scan; nr /= c->btree_pages; nr = min_t(unsigned long, nr, mca_can_free(c)); variable sc->nr_to_scan is number of objects (here is bcache B+tree nodes' number) to shrink, and pointer variable sc is sent from memory management code as parametr of a callback. If sc->nr_to_scan is smaller than c->btree_pages, after the above calculation, variable 'nr' will be 0 and nothing will be shrunk. It is frequeently observed that only 1 or 2 is set to sc->nr_to_scan and make nr to be zero. Then bch_mca_scan() will do nothing more then acquiring and releasing mutex c->bucket_lock. This patch checkes whether nr is 0 after the above calculation, if 0 is the result then set 1 to variable 'n'. Then at least bch_mca_scan() will try to shrink a single B+tree node. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 48e33ee0d876..3df5fa4a501c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -754,6 +754,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, * IO can always make forward progress: */ nr /= c->btree_pages; + if (nr == 0) + nr = 1; nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; -- cgit v1.2.3-59-g8ed1b From 651bbba57ada682a8651768df6979598e28e3b8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2019 16:03:25 +0800 Subject: bcache: remove the extra cflags for request.o There is no block directory this file needs includes from. Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index d26b35195825..fd714628da6a 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -5,5 +5,3 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o - -CFLAGS_request.o += -Iblock -- cgit v1.2.3-59-g8ed1b From 15fbb2312f32cf99bd8e0247ac0240c9bce0ba47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2019 16:03:26 +0800 Subject: bcache: don't export symbols None of the exported bcache symbols are actually used anywhere. Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 15 --------------- drivers/md/bcache/closure.c | 7 ------- 2 files changed, 22 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f37a429f093d..cffcdc9feefb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -311,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b) t->tree = NULL; t->data = NULL; } -EXPORT_SYMBOL(bch_btree_keys_free); int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, @@ -344,7 +343,6 @@ err: bch_btree_keys_free(b); return -ENOMEM; } -EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) @@ -363,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, * any more. */ } -EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -680,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) bch_bset_build_unwritten_tree(b); } -EXPORT_SYMBOL(bch_bset_init_next); /* * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to @@ -734,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } -EXPORT_SYMBOL(bch_bset_build_written_tree); /* Insert */ @@ -782,7 +777,6 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } -EXPORT_SYMBOL(bch_bset_fix_invalidated_key); static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, @@ -857,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) return b->ops->key_merge(b, l, r); } -EXPORT_SYMBOL(bch_bkey_try_merge); void bch_bset_insert(struct btree_keys *b, struct bkey *where, struct bkey *insert) @@ -877,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, bkey_copy(where, insert); bch_bset_fix_lookup_table(b, t, where); } -EXPORT_SYMBOL(bch_bset_insert); unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bkey *replace_key) @@ -933,7 +925,6 @@ copy: bkey_copy(m, k); merged: return status; } -EXPORT_SYMBOL(bch_btree_insert_key); /* Lookup */ @@ -1079,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, return i.l; } -EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ @@ -1134,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b, { return __bch_btree_iter_init(b, iter, search, b->set); } -EXPORT_SYMBOL(bch_btree_iter_init); static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, btree_iter_cmp_fn *cmp) @@ -1167,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) return __bch_btree_iter_next(iter, btree_iter_cmp); } -EXPORT_SYMBOL(bch_btree_iter_next); struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, struct btree_keys *b, ptr_filter_fn fn) @@ -1198,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, return mempool_init_page_pool(&state->pool, 1, page_order); } -EXPORT_SYMBOL(bch_bset_sort_state_init); static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, @@ -1315,7 +1302,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } -EXPORT_SYMBOL(bch_btree_sort_partial); void bch_btree_sort_and_fix_extents(struct btree_keys *b, struct btree_iter *iter, @@ -1368,7 +1354,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) out: bch_bset_build_written_tree(b); } -EXPORT_SYMBOL(bch_btree_sort_lazy); void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index c12cd809ab19..0164a1fe94a9 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL(closure_sub); /* * closure_put - decrement a closure's refcount @@ -54,7 +53,6 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL(closure_put); /* * closure_wake_up - wake up all closures on a wait list, without memory barrier @@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL(__closure_wake_up); /** * closure_wait - add a closure to a waitlist @@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) return true; } -EXPORT_SYMBOL(closure_wait); struct closure_syncer { struct task_struct *task; @@ -131,7 +127,6 @@ void __sched __closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL(__closure_sync); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -149,7 +144,6 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -162,7 +156,6 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *closure_debug; -- cgit v1.2.3-59-g8ed1b From f1934892bd76ec127af4dd1ed17b73bc29b944a8 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Fri, 20 Sep 2019 17:58:28 +0200 Subject: drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET As it is consistent with prefixes of other write life time hints. Signed-off-by: Eugene Syromiatnikov Signed-off-by: Song Liu --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 12a8ce83786e..f0fc538bfe59 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1134,7 +1134,7 @@ again: bi->bi_iter.bi_size = STRIPE_SIZE; bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -1187,7 +1187,7 @@ again: rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_iter.bi_size = STRIPE_SIZE; rbi->bi_write_hint = sh->dev[i].write_hint; - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload -- cgit v1.2.3-59-g8ed1b From 0815ef3c019d280eb1b38e63ca7280f0f7db2bf8 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Fri, 20 Sep 2019 17:58:34 +0200 Subject: drivers/md/raid5-ppl.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET As it is consistent with prefixes of other write life time hints. Signed-off-by: Eugene Syromiatnikov Signed-off-by: Song Liu --- drivers/md/raid5-ppl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 18a4064a61a8..cab5b1352892 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1404,7 +1404,7 @@ int ppl_init_log(struct r5conf *conf) atomic64_set(&ppl_conf->seq, 0); INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); spin_lock_init(&ppl_conf->no_mem_stripes_lock); - ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET; + ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET; if (!mddev->external) { ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); -- cgit v1.2.3-59-g8ed1b From 00b89892c869f34528deca957b10d1468c4e8b38 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Nov 2019 08:35:47 -0700 Subject: Revert "bcache: fix fifo index swapping condition in journal_pin_cmp()" Coly says: "Guoju Fang talked to me today, he told me this change was unnecessary and I was over-thought. Then I realize fifo_idx() uses a mask to handle the array index overflow condition, so the index swap in journal_pin_cmp() won't happen. And yes, Guoju and Kent are correct. Since you already applied this patch, can you please to remove this patch from your for-next branch? This single patch does not break thing, but it is unecessary at this moment." This reverts commit c0e0954e909c17b43d176ab219fc598964616ae6. Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 26 -------------------------- drivers/md/bcache/journal.h | 4 ++++ 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3df5fa4a501c..14d6c33b0957 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -528,32 +528,6 @@ static void btree_node_write_work(struct work_struct *w) mutex_unlock(&b->write_lock); } -/* return true if journal pin 'l' is newer than 'r' */ -static bool journal_pin_cmp(struct cache_set *c, - atomic_t *l, - atomic_t *r) -{ - int l_idx, r_idx, f_idx, b_idx; - bool ret = false; - - l_idx = fifo_idx(&(c)->journal.pin, (l)); - r_idx = fifo_idx(&(c)->journal.pin, (r)); - f_idx = (c)->journal.pin.front; - b_idx = (c)->journal.pin.back; - - if (l_idx > r_idx) - ret = true; - /* in case fifo back pointer is swapped */ - if (b_idx < f_idx) { - if (l_idx <= b_idx && r_idx >= f_idx) - ret = true; - else if (l_idx >= f_idx && r_idx <= b_idx) - ret = false; - } - - return ret; -} - static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { struct bset *i = btree_bset_last(b); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 06b3eaab7d16..f2ea34d5f431 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -157,6 +157,10 @@ struct journal_device { }; #define BTREE_FLUSH_NR 8 + +#define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + #define JOURNAL_PIN 20000 #define journal_full(j) \ -- cgit v1.2.3-59-g8ed1b