From 5b8e432dbb0e20bd8e99033f4a0bfa0d38c0e08e Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 15 Jan 2019 00:41:43 +0800 Subject: xen/blkback: add stack variable 'blkif' in connect_ring() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As 'be->blkif' is used for many times in connect_ring(), the stack variable 'blkif' is added to substitute 'be-blkif'. Suggested-by: Paul Durrant Signed-off-by: Dongli Zhang Reviewed-by: Paul Durrant Reviewed-by: Roger Pau Monné Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index a4bc74e72c39..a4aadac772e5 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -1023,6 +1023,7 @@ fail: static int connect_ring(struct backend_info *be) { struct xenbus_device *dev = be->dev; + struct xen_blkif *blkif = be->blkif; unsigned int pers_grants; char protocol[64] = ""; int err, i; @@ -1033,25 +1034,25 @@ static int connect_ring(struct backend_info *be) pr_debug("%s %s\n", __func__, dev->otherend); - be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol", "%63s", protocol); if (err <= 0) strcpy(protocol, "unspecified, assuming default"); else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; else { xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); return -ENOSYS; } pers_grants = xenbus_read_unsigned(dev->otherend, "feature-persistent", 0); - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; + blkif->vbd.feature_gnt_persistent = pers_grants; + blkif->vbd.overflow_max_grants = 0; /* * Read the number of hardware queues from frontend. @@ -1067,16 +1068,16 @@ static int connect_ring(struct backend_info *be) requested_num_queues, xenblk_max_queues); return -ENOSYS; } - be->blkif->nr_rings = requested_num_queues; - if (xen_blkif_alloc_rings(be->blkif)) + blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(blkif)) return -ENOMEM; pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, - be->blkif->nr_rings, be->blkif->blk_protocol, protocol, + blkif->nr_rings, blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); - if (be->blkif->nr_rings == 1) - return read_per_ring_refs(&be->blkif->rings[0], dev->otherend); + if (blkif->nr_rings == 1) + return read_per_ring_refs(&blkif->rings[0], dev->otherend); else { xspathsize = strlen(dev->otherend) + xenstore_path_ext_size; xspath = kmalloc(xspathsize, GFP_KERNEL); @@ -1085,10 +1086,10 @@ static int connect_ring(struct backend_info *be) return -ENOMEM; } - for (i = 0; i < be->blkif->nr_rings; i++) { + for (i = 0; i < blkif->nr_rings; i++) { memset(xspath, 0, xspathsize); snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i); - err = read_per_ring_refs(&be->blkif->rings[i], xspath); + err = read_per_ring_refs(&blkif->rings[i], xspath); if (err) { kfree(xspath); return err; -- cgit v1.2.3-59-g8ed1b From 4a8c31a1c6f526ec96a35e613f2a71e26ffbd7dd Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Sun, 24 Feb 2019 10:17:27 -0500 Subject: xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order' set by malicious blkfront The xenstore 'ring-page-order' is used globally for each blkback queue and therefore should be read from xenstore only once. However, it is obtained in read_per_ring_refs() which might be called multiple times during the initialization of each blkback queue. If the blkfront is malicious and the 'ring-page-order' is set in different value by blkfront every time before blkback reads it, this may end up at the "WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));" in xen_blkif_disconnect() when frontend is destroyed. This patch reworks connect_ring() to read xenstore 'ring-page-order' only once. Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 72 +++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index a4aadac772e5..24896ffb04ed 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -926,7 +926,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) int err, i, j; struct xen_blkif *blkif = ring->blkif; struct xenbus_device *dev = blkif->be->dev; - unsigned int ring_page_order, nr_grefs, evtchn; + unsigned int nr_grefs, evtchn; err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn); @@ -936,43 +936,42 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) return err; } - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", - &ring_page_order); - if (err != 1) { - err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]); + nr_grefs = blkif->nr_ring_pages; + + if (unlikely(!nr_grefs)) { + WARN_ON(true); + return -EINVAL; + } + + for (i = 0; i < nr_grefs; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, + "%u", &ring_ref[i]); + if (err != 1) { + if (nr_grefs == 1) + break; + err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); + xenbus_dev_fatal(dev, err, "reading %s/%s", + dir, ring_ref_name); return err; } - nr_grefs = 1; - } else { - unsigned int i; + } + + if (err != 1) { + WARN_ON(nr_grefs != 1); - if (ring_page_order > xen_blkif_max_ring_order) { + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", + &ring_ref[0]); + if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", - dir, ring_page_order, - xen_blkif_max_ring_order); + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); return err; } - - nr_grefs = 1 << ring_page_order; - for (i = 0; i < nr_grefs; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; - - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, - "%u", &ring_ref[i]); - if (err != 1) { - err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/%s", - dir, ring_ref_name); - return err; - } - } } - blkif->nr_ring_pages = nr_grefs; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); @@ -1031,6 +1030,7 @@ static int connect_ring(struct backend_info *be) size_t xspathsize; const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ unsigned int requested_num_queues = 0; + unsigned int ring_page_order; pr_debug("%s %s\n", __func__, dev->otherend); @@ -1076,6 +1076,20 @@ static int connect_ring(struct backend_info *be) blkif->nr_rings, blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); + ring_page_order = xenbus_read_unsigned(dev->otherend, + "ring-page-order", 0); + + if (ring_page_order > xen_blkif_max_ring_order) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, + "requested ring page order %d exceed max:%d", + ring_page_order, + xen_blkif_max_ring_order); + return err; + } + + blkif->nr_ring_pages = 1 << ring_page_order; + if (blkif->nr_rings == 1) return read_per_ring_refs(&blkif->rings[0], dev->otherend); else { -- cgit v1.2.3-59-g8ed1b From 05b700ba6003fd98c41314f390df36e2b893e167 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 3 Mar 2019 21:17:48 +0800 Subject: block: fix segment calculation for passthrough IO blk_recount_segments() can be called in bio_add_pc_page() for calculating how many segments this bio will has after one page is added to this bio. If the resulted segment number is beyond the queue limit, the added page will be removed. The try-and-fix policy requires blk_recount_segments(__blk_recalc_rq_segments) to not consider the segment number limit. Unfortunately bvec_split_segs() does check this limit, and causes small segment number returned to bio_add_pc_page(), then page still may be added to the bio even though segment number limit becomes broken. Fixes this issue by not considering segment number limit when calcualting bio's segment number. Fixes: dcebd755926b ("block: use bio_for_each_bvec() to compute multi-page bvec count") Cc: Christoph Hellwig Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 22467f475ab4..1c9d4f0f96ea 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -180,7 +180,7 @@ static unsigned get_max_segment_size(struct request_queue *q, */ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, unsigned *nsegs, unsigned *last_seg_size, - unsigned *front_seg_size, unsigned *sectors) + unsigned *front_seg_size, unsigned *sectors, unsigned max_segs) { unsigned len = bv->bv_len; unsigned total_len = 0; @@ -190,7 +190,7 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, * Multi-page bvec may be too big to hold in one segment, so the * current bvec has to be splitted as multiple segments. */ - while (len && new_nsegs + *nsegs < queue_max_segments(q)) { + while (len && new_nsegs + *nsegs < max_segs) { seg_size = get_max_segment_size(q, bv->bv_offset + total_len); seg_size = min(seg_size, len); @@ -240,6 +240,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); + const unsigned max_segs = queue_max_segments(q); bio_for_each_bvec(bv, bio, iter) { /* @@ -254,14 +255,14 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * Consider this a new segment if we're splitting in * the middle of this vector. */ - if (nsegs < queue_max_segments(q) && + if (nsegs < max_segs && sectors < max_sectors) { /* split in the middle of bvec */ bv.bv_len = (max_sectors - sectors) << 9; bvec_split_segs(q, &bv, &nsegs, &seg_size, &front_seg_size, - §ors); + §ors, max_segs); } goto split; } @@ -283,7 +284,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, continue; } new_segment: - if (nsegs == queue_max_segments(q)) + if (nsegs == max_segs) goto split; bvprv = bv; @@ -296,7 +297,7 @@ new_segment: if (nsegs == 1 && seg_size > front_seg_size) front_seg_size = seg_size; } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size, - &front_seg_size, §ors)) { + &front_seg_size, §ors, max_segs)) { goto split; } } @@ -415,7 +416,7 @@ new_segment: bvprv = bv; prev = 1; bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, - &front_seg_size, NULL); + &front_seg_size, NULL, UINT_MAX); } bbio = bio; } -- cgit v1.2.3-59-g8ed1b From 9205e44916b2ca2e959be4210133292a19e79b0d Mon Sep 17 00:00:00 2001 From: Javier González Date: Thu, 7 Mar 2019 13:18:53 +0100 Subject: pblk: fix max_io calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calculating the maximun I/O size allowed into the buffer, consider the write size (ws_opt) used by the write thread in order to cover the case in which, due to flushes, the mem and subm pointers are disaligned by (ws_opt - 1). This case currently translates into a stall when an I/O of the largest possible size is submitted. Fixes: f9f9d1ae2c66 ("lightnvm: pblk: prevent stall due to wb threshold") Signed-off-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index b014957dde0b..a5f8bc2defbc 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -233,10 +233,15 @@ void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold) /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; - rl->rb_max_io = threshold ? (budget - threshold) : (budget - 1); rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; + /* Maximize I/O size and ansure that back threshold is respected */ + if (threshold) + rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold; + else + rl->rb_max_io = budget - pblk->min_write_pgs_data - 1; + atomic_set(&rl->rb_user_cnt, 0); atomic_set(&rl->rb_gc_cnt, 0); atomic_set(&rl->rb_space, -1); -- cgit v1.2.3-59-g8ed1b From a596d08677320925b69e70c0fdc4c0f59384a65e Mon Sep 17 00:00:00 2001 From: Mariusz Dabrowski Date: Mon, 18 Feb 2019 15:04:09 +0100 Subject: raid5: set write hint for PPL When the Partial Parity Log is enabled, circular buffer is used to store PPL data. Each write to RAID device causes overwrite of data in this buffer so some write_hint can be set to those request to help drives handle garbage collection. This patch adds new sysfs attribute which can be used to specify which write_hint should be assigned to PPL. Acked-by: Guoqing Jiang Signed-off-by: Mariusz Dabrowski Signed-off-by: Song Liu --- Documentation/admin-guide/md.rst | 3 ++ drivers/md/raid5-log.h | 1 + drivers/md/raid5-ppl.c | 63 ++++++++++++++++++++++++++++++++++++++++ drivers/md/raid5.c | 1 + 4 files changed, 68 insertions(+) diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 84de718f24a4..3c51084ffd37 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -756,3 +756,6 @@ These currently include: The cache mode for raid5. raid5 could include an extra disk for caching. The mode can be "write-throuth" and "write-back". The default is "write-through". + + ppl_write_hint + NVMe stream ID to be set for each PPL write request. diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index bfb811407061..43c714a8798c 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -45,6 +45,7 @@ extern void ppl_stripe_write_finished(struct stripe_head *sh); extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); extern void ppl_quiesce(struct r5conf *conf, int quiesce); extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); +extern struct md_sysfs_entry ppl_write_hint; static inline bool raid5_has_log(struct r5conf *conf) { diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 3a7c36326589..f2b3020c2ac8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -21,6 +21,7 @@ #include #include "md.h" #include "raid5.h" +#include "raid5-log.h" /* * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for @@ -116,6 +117,8 @@ struct ppl_conf { /* stripes to retry if failed to allocate io_unit */ struct list_head no_mem_stripes; spinlock_t no_mem_stripes_lock; + + unsigned short write_hint; }; struct ppl_log { @@ -476,6 +479,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->next_io_sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + bio->bi_write_hint = ppl_conf->write_hint; pr_debug("%s: log->current_io_sector: %llu\n", __func__, (unsigned long long)log->next_io_sector); @@ -505,6 +509,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &ppl_conf->bs); bio->bi_opf = prev->bi_opf; + bio->bi_write_hint = prev->bi_write_hint; bio_copy_dev(bio, prev); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); @@ -1409,6 +1414,7 @@ int ppl_init_log(struct r5conf *conf) atomic64_set(&ppl_conf->seq, 0); INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); spin_lock_init(&ppl_conf->no_mem_stripes_lock); + ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET; if (!mddev->external) { ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); @@ -1503,3 +1509,60 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) return ret; } + +static ssize_t +ppl_write_hint_show(struct mddev *mddev, char *buf) +{ + size_t ret = 0; + struct r5conf *conf; + struct ppl_conf *ppl_conf = NULL; + + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf && raid5_has_ppl(conf)) + ppl_conf = conf->log_private; + ret = sprintf(buf, "%d\n", ppl_conf ? ppl_conf->write_hint : 0); + spin_unlock(&mddev->lock); + + return ret; +} + +static ssize_t +ppl_write_hint_store(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + struct ppl_conf *ppl_conf; + int err = 0; + unsigned short new; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtou16(page, 10, &new)) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) { + err = -ENODEV; + } else if (raid5_has_ppl(conf)) { + ppl_conf = conf->log_private; + if (!ppl_conf) + err = -EINVAL; + else + ppl_conf->write_hint = new; + } else { + err = -EINVAL; + } + + mddev_unlock(mddev); + + return err ?: len; +} + +struct md_sysfs_entry +ppl_write_hint = __ATTR(ppl_write_hint, S_IRUGO | S_IWUSR, + ppl_write_hint_show, + ppl_write_hint_store); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cecea901ab8c..09562d7cc080 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6660,6 +6660,7 @@ static struct attribute *raid5_attrs[] = { &raid5_skip_copy.attr, &raid5_rmw_level.attr, &r5c_journal_mode.attr, + &ppl_write_hint.attr, NULL, }; static struct attribute_group raid5_attrs_group = { -- cgit v1.2.3-59-g8ed1b From b761dcf1217760a42f7897c31dcb649f59b2333e Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 8 Mar 2019 23:52:05 +0800 Subject: It's wrong to add len to sector_nr in raid10 reshape twice In reshape_request it already adds len to sector_nr already. It's wrong to add len to sector_nr again after adding pages to bio. If there is bad block it can't copy one chunk at a time, it needs to goto read_more. Now the sector_nr is wrong. It can cause data corruption. Cc: stable@vger.kernel.org # v3.16+ Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/raid10.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index abb5d382f64d..ecef42bfe19d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4670,7 +4670,6 @@ read_more: atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; generic_make_request(read_bio); - sector_nr += nr_sectors; sectors_done += nr_sectors; if (sector_nr <= last) goto read_more; -- cgit v1.2.3-59-g8ed1b From e406f12dde1a8375d77ea02d91f313fb1a9c6aec Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 4 Mar 2019 16:48:54 -0600 Subject: md: Fix failed allocation of md_register_thread mddev->sync_thread can be set to NULL on kzalloc failure downstream. The patch checks for such a scenario and frees allocated resources. Committer node: Added similar fix to raid5.c, as suggested by Guoqing. Cc: stable@vger.kernel.org # v3.16+ Acked-by: Guoqing Jiang Signed-off-by: Aditya Pakki Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 ++ drivers/md/raid5.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ecef42bfe19d..3b6880dd648d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3939,6 +3939,8 @@ static int raid10_run(struct mddev *mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, "reshape"); + if (!mddev->sync_thread) + goto out_free_conf; } return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 09562d7cc080..992fd08437d8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7403,6 +7403,8 @@ static int raid5_run(struct mddev *mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, "reshape"); + if (!mddev->sync_thread) + goto abort; } /* Ok, everything is just fine now */ -- cgit v1.2.3-59-g8ed1b From d9d53ed3f77ff4057ce714c0d169c28a653504e7 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 13 Mar 2019 18:54:54 +0100 Subject: nvme: add get-feature to admin cmds tracer This will print get-feature cmd in more informative way. For example, run "nvme get-feature /dev/nvme0 -n 1 -f 0x9 -c 10" will trace: nvme-3907 [008] .... 1763.635054: nvme_setup_cmd: nvme0: qid=0, cmdid=6, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_admin_get_features fid=0x9 sel=0x0 cdw11=0xa) -0 [001] d.h. 1763.635112: nvme_sq: nvme0: qid=0, head=27, tail=27 -0 [008] ..s. 1763.635121: nvme_complete_rq: nvme0: qid=0, cmdid=6, res=10, retries=0, flags=0x2, status=0 Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/trace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 58456de78bb2..5f24ea7a28eb 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -50,7 +50,19 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_admin_get_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sel = cdw10[1] & 0x7; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_putc(p, 0); + return ret; +} static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) { @@ -101,6 +113,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_get_features: + return nvme_trace_admin_get_features(p, cdw10); default: return nvme_trace_common(p, cdw10); } -- cgit v1.2.3-59-g8ed1b From 415df90b437f2b026ed37af2f812e41fc06c7f90 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Mar 2019 18:54:55 +0100 Subject: nvme: don't warn on block content change effects A write or flush IO passthrough command is expected to change the logical block content, so don't warn on these as no additional handling is necessary. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07bf2bff3a76..dc1641247b17 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1250,7 +1250,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (ns) { if (ctrl->effects) effects = le32_to_cpu(ctrl->effects->iocs[opcode]); - if (effects & ~NVME_CMD_EFFECTS_CSUPP) + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn(ctrl->device, "IO command:%02x has unhandled effects:%08x\n", opcode, effects); -- cgit v1.2.3-59-g8ed1b From 81fe92849928d65159d707b7b28febffbef94559 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Mar 2019 18:54:56 +0100 Subject: nvme-trace: fix cdw10 buffer overrun The field is defined to be a 24 byte array, we don't need to multiply the sizeof() that field by the number of dwords it covers. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 244d7c177e5a..97d3c77365b8 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -108,7 +108,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->metadata = le64_to_cpu(cmd->common.metadata); __assign_disk_name(__entry->disk, req->rq_disk); memcpy(__entry->cdw10, &cmd->common.cdw10, - 6 * sizeof(__entry->cdw10)); + sizeof(__entry->cdw10)); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", __entry->ctrl_id, __print_disk_name(__entry->disk), -- cgit v1.2.3-59-g8ed1b From a63b83700ba89c300f705167d06bf122f3666287 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Mar 2019 18:54:57 +0100 Subject: nvme: put ns_head ref if namespace fails allocation In case nvme_alloc_ns fails after we initialize ns_head but before we add the ns to the controller namespaces list we need to explicitly put the ns_head reference because when we teardown the controller we won't find it, causing us to leak a dangling subsystem eventually. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dc1641247b17..d57a84f45ed0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3304,6 +3304,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); mutex_unlock(&ctrl->subsys->lock); + nvme_put_ns_head(ns->head); out_free_id: kfree(id); out_free_queue: -- cgit v1.2.3-59-g8ed1b From 01fc08ff1f2f3f17d5947f18e62ed93c391aa3ce Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 13 Mar 2019 18:54:58 +0100 Subject: nvme: update comment to make the code easier to read After commit a686ed75c0fb ("nvme: introduce a helper function for controller deletion), nvme_delete_ctrl_sync no longer use flush_work. Update comment, accordingly. Signed-off-by: Yufen Yu Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d57a84f45ed0..b92fab434066 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -179,8 +179,8 @@ static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) int ret = 0; /* - * Keep a reference until the work is flushed since ->delete_ctrl - * can free the controller. + * Keep a reference until nvme_do_delete_ctrl() complete, + * since ->delete_ctrl can free the controller. */ nvme_get_ctrl(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) -- cgit v1.2.3-59-g8ed1b From d11de63f2b519f0a162b834013b6d3a46dbf3886 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 13 Mar 2019 18:54:59 +0100 Subject: nvme-loop: init nvmet_ctrl fatal_err_work when allocate After commit 4d43d395fe (workqueue: Try to catch flush_work() without INIT_WORK()), it can cause warning when delete nvme-loop device, trace like: [ 76.601272] Call Trace: [ 76.601646] ? del_timer+0x72/0xa0 [ 76.602156] __cancel_work_timer+0x1ae/0x270 [ 76.602791] cancel_work_sync+0x14/0x20 [ 76.603407] nvmet_ctrl_free+0x1b7/0x2f0 [nvmet] [ 76.604091] ? free_percpu+0x168/0x300 [ 76.604652] nvmet_sq_destroy+0x106/0x240 [nvmet] [ 76.605346] nvme_loop_destroy_admin_queue+0x30/0x60 [nvme_loop] [ 76.606220] nvme_loop_shutdown_ctrl+0xc3/0xf0 [nvme_loop] [ 76.607026] nvme_loop_delete_ctrl_host+0x19/0x30 [nvme_loop] [ 76.607871] nvme_do_delete_ctrl+0x75/0xb0 [ 76.608477] nvme_sysfs_delete+0x7d/0xc0 [ 76.609057] dev_attr_store+0x24/0x40 [ 76.609603] sysfs_kf_write+0x4c/0x60 [ 76.610144] kernfs_fop_write+0x19a/0x260 [ 76.610742] __vfs_write+0x1c/0x60 [ 76.611246] vfs_write+0xfa/0x280 [ 76.611739] ksys_write+0x6e/0x120 [ 76.612238] __x64_sys_write+0x1e/0x30 [ 76.612787] do_syscall_64+0xbf/0x3a0 [ 76.613329] entry_SYSCALL_64_after_hwframe+0x44/0xa9 We fix it by moving fatal_err_work init to nvmet_alloc_ctrl(), which may more reasonable. Signed-off-by: Yufen Yu Reviewed-by: Sagi Grimberg Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index d44ede147263..2d73b66e3686 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1163,6 +1163,15 @@ static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl) put_device(ctrl->p2p_client); } +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) { @@ -1205,6 +1214,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); INIT_LIST_HEAD(&ctrl->async_events); INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL); + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); @@ -1308,21 +1318,11 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) kref_put(&ctrl->ref, nvmet_ctrl_free); } -static void nvmet_fatal_error_handler(struct work_struct *work) -{ - struct nvmet_ctrl *ctrl = - container_of(work, struct nvmet_ctrl, fatal_err_work); - - pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); - ctrl->ops->delete_ctrl(ctrl); -} - void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) { mutex_lock(&ctrl->lock); if (!(ctrl->csts & NVME_CSTS_CFS)) { ctrl->csts |= NVME_CSTS_CFS; - INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); schedule_work(&ctrl->fatal_err_work); } mutex_unlock(&ctrl->lock); -- cgit v1.2.3-59-g8ed1b From 9f7d8ae2f79479ce13d987c8f3b1500b8937fc5d Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:00 +0100 Subject: nvme-fc: use nr_phys_segments to determine existence of sgl For some nvme command, when issued by the nvme core layer, there is an internal buffer which can cause blk_rq_payload_bytes() to return a non-zero value yet there is no actual/real command payload and sg list. An example is the WRITE ZEROES command. To address this, when making choices on whether to dma map an sgl, use blk_rq_nr_phys_segments() instead of blk_rq_payload_bytes(). When there is a sgl, blk_rq_payload_bytes() will return the amount of data to be transferred by the sgl. Signed-off-by: Chaitanya Kulkarni Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b29b12498a1a..ba8f2a9cbdaf 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2107,7 +2107,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, freq->sg_cnt = 0; - if (!blk_rq_payload_bytes(rq)) + if (!blk_rq_nr_phys_segments(rq)) return 0; freq->sg_table.sgl = freq->first_sgl; @@ -2304,12 +2304,23 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - data_len = blk_rq_payload_bytes(rq); - if (data_len) + /* + * nvme core doesn't quite treat the rq opaquely. Commands such + * as WRITE ZEROES will return a non-zero rq payload_bytes yet + * there is no actual payload to be transferred. + * To get it right, key data transmission on there being 1 or + * more physical segments in the sg list. If there is no + * physical segments, there is no payload. + */ + if (blk_rq_nr_phys_segments(rq)) { + data_len = blk_rq_payload_bytes(rq); io_dir = ((rq_data_dir(rq) == WRITE) ? NVMEFC_FCP_WRITE : NVMEFC_FCP_READ); - else + } else { + data_len = 0; io_dir = NVMEFC_FCP_NODATA; + } + return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); } -- cgit v1.2.3-59-g8ed1b From 06f3d71ea071b70e62bcc146cd9ff7ed1f9d4e43 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:01 +0100 Subject: nvme-fc: fix numa_node when dev is null A recent change added a numa_node field to the nvme controller and has the transport assign the node using dev_to_node(). However, fcloop registers with a NULL device struct, so the dev_to_node() call oops. Revise the assignment to assign no node when device struct is null. Fixes: 103e515efa89b ("nvme: add a numa_node field to struct nvme_ctrl") Reported-by: Mike Snitzer Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Mike Snitzer [hch: small coding style fixup] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index ba8f2a9cbdaf..23f6bad19274 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3017,7 +3017,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; - ctrl->ctrl.numa_node = dev_to_node(lport->dev); + if (lport->dev) + ctrl->ctrl.numa_node = dev_to_node(lport->dev); + else + ctrl->ctrl.numa_node = NUMA_NO_NODE; INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; -- cgit v1.2.3-59-g8ed1b From 834d3710a093aa18c8aa88e6e1892180abadebaf Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:02 +0100 Subject: nvme-fc: reject reconnect if io queue count is reduced to zero If: - A successful connect has occurred with an io queue count greater than zero and namespaces detected and running. - An error or something occurs which causes a termination of the prior association and then starts a reconnect, - The reconnect then creates a new controller, but for whatever reason, nvme_set_queue_count() results in io queue count set to zero. This will skip io queue and tag set changes. - But... the controller will transition to live, calling nvme_start_ctrl, which calls nvme_start_queues(), which then releases I/Os into the transport which then sends them to the driver. As there are no queues, things eventually hit the driver looking for a handle, which was cleared when the original controller was reset, and it can't proceed. Worst case, things progress, but everything fails. In the failing scenario, the nvme_set_features(NVME_FEAT_NUM_QUEUES) command actually failed with a NVME_SC_INTERNAL error. For some reason, although nvme_set_queue_count() saw the error and set io queue count to zero, it doesn't return a failure status to the transport, which allows the transport to continue using the controller. Fix the problem by simply rejecting the new association if at least 1 I/O queue can't be created. The association reject will fail the reconnect attempt and fall into the reconnect retry policy. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 23f6bad19274..f3b9d91ba0df 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2475,6 +2475,7 @@ static int nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + u32 prior_ioq_cnt = ctrl->ctrl.queue_count - 1; unsigned int nr_io_queues; int ret; @@ -2487,6 +2488,13 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) return ret; } + if (!nr_io_queues && prior_ioq_cnt) { + dev_info(ctrl->ctrl.device, + "Fail Reconnect: At least 1 io queue " + "required (was %d)\n", prior_ioq_cnt); + return -ENOSPC; + } + ctrl->ctrl.queue_count = nr_io_queues + 1; /* check for io queues existing */ if (ctrl->ctrl.queue_count == 1) @@ -2500,6 +2508,10 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + if (prior_ioq_cnt != nr_io_queues) + dev_info(ctrl->ctrl.device, + "reconnect: revising io queue count from %d to %d\n", + prior_ioq_cnt, nr_io_queues); blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); return 0; -- cgit v1.2.3-59-g8ed1b From 0191e7405b687339a5540c1562acdecefd70eb3f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:03 +0100 Subject: nvmet-fc: fix issues with targetport assoc_list list walking There are two changes: 1) The logic in the __nvmet_fc_free_assoc() routine is bad. It uses "safe" routines assuming pointers will come back valid. However, the intervening next structure being linked can be removed from the list and the resulting safe pointers are bad, resulting in NULL ptrs being hit. Correct by scheduling a work element to perform the association delete, which can be done while under lock. 2) Prior patch that added the work element scheduling left a possible reference on the object if the work element couldn't be scheduled. Correct by doing the put on a failing schedule_work() call. Signed-off-by: Nigel Kirkland Signed-off-by: James Smart Reviewed-by: Ewan D. Milne Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 1e9654f04c60..6b7bbf39fa06 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1143,10 +1143,8 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) &tgtport->assoc_list, a_list) { if (!nvmet_fc_tgt_a_get(assoc)) continue; - spin_unlock_irqrestore(&tgtport->lock, flags); - nvmet_fc_delete_target_assoc(assoc); - nvmet_fc_tgt_a_put(assoc); - spin_lock_irqsave(&tgtport->lock, flags); + if (!schedule_work(&assoc->del_work)) + nvmet_fc_tgt_a_put(assoc); } spin_unlock_irqrestore(&tgtport->lock, flags); } @@ -1185,7 +1183,8 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - schedule_work(&assoc->del_work); + if (!schedule_work(&assoc->del_work)) + nvmet_fc_tgt_a_put(assoc); return; } -- cgit v1.2.3-59-g8ed1b From 404ec31df434fdae515202952b5e230c1b983ee1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:04 +0100 Subject: nvmet-fc: bring Disconnect into compliance with FC-NVME spec The FC-NVME spec, when finally approved, modified the disconnect LS such that the only scope available is the association. Rework the Disconnect LS processing to be in accordance with the change. Signed-off-by: Nigel Kirkland Signed-off-by: James Smart Reviewed-by: Ewan D. Milne Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 6b7bbf39fa06..98b7b1f4ee96 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1502,10 +1502,8 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, (struct fcnvme_ls_disconnect_rqst *)iod->rqstbuf; struct fcnvme_ls_disconnect_acc *acc = (struct fcnvme_ls_disconnect_acc *)iod->rspbuf; - struct nvmet_fc_tgt_queue *queue = NULL; struct nvmet_fc_tgt_assoc *assoc; int ret = 0; - bool del_assoc = false; memset(acc, 0, sizeof(*acc)); @@ -1536,18 +1534,7 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, assoc = nvmet_fc_find_target_assoc(tgtport, be64_to_cpu(rqst->associd.association_id)); iod->assoc = assoc; - if (assoc) { - if (rqst->discon_cmd.scope == - FCNVME_DISCONN_CONNECTION) { - queue = nvmet_fc_find_target_queue(tgtport, - be64_to_cpu( - rqst->discon_cmd.id)); - if (!queue) { - nvmet_fc_tgt_a_put(assoc); - ret = VERR_NO_CONN; - } - } - } else + if (!assoc) ret = VERR_NO_ASSOC; } @@ -1575,26 +1562,10 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, sizeof(struct fcnvme_ls_disconnect_acc)), FCNVME_LS_DISCONNECT); - - /* are we to delete a Connection ID (queue) */ - if (queue) { - int qid = queue->qid; - - nvmet_fc_delete_target_queue(queue); - - /* release the get taken by find_target_queue */ - nvmet_fc_tgt_q_put(queue); - - /* tear association down if io queue terminated */ - if (!qid) - del_assoc = true; - } - /* release get taken in nvmet_fc_find_target_assoc */ nvmet_fc_tgt_a_put(iod->assoc); - if (del_assoc) - nvmet_fc_delete_target_assoc(iod->assoc); + nvmet_fc_delete_target_assoc(iod->assoc); } -- cgit v1.2.3-59-g8ed1b From 7b210e4ed5e281728243799c5e2b84d3f70d4dd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:05 +0100 Subject: nvme: disable Write Zeroes for qemu controllers Qemu started out with a broken implementation of Write Zeroes written by yours truly. Disable Write Zeroes on qemu for now, eventually we need to go back and make all the qemu quirks version specific, but that is left for another time. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Tested-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b92fab434066..951e9f31b57c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1531,7 +1531,8 @@ static inline void nvme_config_write_zeroes(struct nvme_ns *ns) u32 max_sectors; unsigned short bs = 1 << ns->lba_shift; - if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES)) + if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || + (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) return; /* * Even though NVMe spec explicitly states that MDTS is not diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b91f1838bbd5..527d64545023 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -87,6 +87,11 @@ enum nvme_quirks { * Ignore device provided subnqn. */ NVME_QUIRK_IGNORE_DEV_SUBNQN = (1 << 8), + + /* + * Broken Write Zeroes. + */ + NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f54718b63637..3a2377888a46 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2975,7 +2975,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ - .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + .driver_data = NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ -- cgit v1.2.3-59-g8ed1b From b1aafb35b45b1d734c670059c125a4ff111a47bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:06 +0100 Subject: nvme: remove nvme_ns_config_oncs Just opencode the two function calls in the caller. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy Reviewed-by: Chaitanya Kulkarni Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 951e9f31b57c..26ae805fc958 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1552,12 +1552,6 @@ static inline void nvme_config_write_zeroes(struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors); } -static inline void nvme_ns_config_oncs(struct nvme_ns *ns) -{ - nvme_config_discard(ns); - nvme_config_write_zeroes(ns); -} - static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { @@ -1611,7 +1605,9 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; set_capacity(disk, capacity); - nvme_ns_config_oncs(ns); + + nvme_config_discard(ns); + nvme_config_write_zeroes(ns); if (id->nsattr & (1 << 0)) set_disk_ro(disk, true); -- cgit v1.2.3-59-g8ed1b From 2631857160ecbea04e54423f5053133fe2b6ea45 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:07 +0100 Subject: nvme: add proper discard setup for the multipath device Add a gendisk argument to nvme_config_discard so that the call to nvme_update_disk_info for the multipath device node updates the proper request_queue. Signed-off-by: Christoph Hellwig Reported-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 26ae805fc958..6a57ece7d76b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1495,10 +1495,10 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } -static void nvme_config_discard(struct nvme_ns *ns) +static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) { struct nvme_ctrl *ctrl = ns->ctrl; - struct request_queue *queue = ns->queue; + struct request_queue *queue = disk->queue; u32 size = queue_logical_block_size(queue); if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { @@ -1606,7 +1606,7 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity(disk, capacity); - nvme_config_discard(ns); + nvme_config_discard(disk, ns); nvme_config_write_zeroes(ns); if (id->nsattr & (1 << 0)) -- cgit v1.2.3-59-g8ed1b From 9f0916ab932f676c042d4592a235a895847484f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:08 +0100 Subject: nvme: add proper write zeroes setup for the multipath device Add a gendisk argument to nvme_config_write_zeroes so that the call to nvme_update_disk_info for the multipath device node updates the proper request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6a57ece7d76b..470601980794 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1526,7 +1526,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } -static inline void nvme_config_write_zeroes(struct nvme_ns *ns) +static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) { u32 max_sectors; unsigned short bs = 1 << ns->lba_shift; @@ -1549,7 +1549,7 @@ static inline void nvme_config_write_zeroes(struct nvme_ns *ns) else max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9; - blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors); + blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, @@ -1607,7 +1607,7 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity(disk, capacity); nvme_config_discard(disk, ns); - nvme_config_write_zeroes(ns); + nvme_config_write_zeroes(disk, ns); if (id->nsattr & (1 << 0)) set_disk_ro(disk, true); -- cgit v1.2.3-59-g8ed1b From 005c674f705ee308e23b8e4e7047419d12122fde Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:09 +0100 Subject: nvmet: ignore EOPNOTSUPP for discard NVMe DSM is a pure hint, so if the underlying device / file system does not support discard-like operations we should not fail the operation but rather return success. Fixes: 3b031d15995f ("nvmet: add error log support for bdev backend") Signed-off-by: Christoph Hellwig Reviewed by: Chaitanya Kulkarni Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-bdev.c | 8 ++++---- drivers/nvme/target/io-cmd-file.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 71dfedbadc26..a065dbfc43b1 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -194,11 +194,11 @@ static u16 nvmet_bdev_discard_range(struct nvmet_req *req, le64_to_cpu(range->slba) << (ns->blksize_shift - 9), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), GFP_KERNEL, 0, bio); - - if (ret) + if (ret && ret != -EOPNOTSUPP) { req->error_slba = le64_to_cpu(range->slba); - - return blk_to_nvme_status(req, errno_to_blk_status(ret)); + return blk_to_nvme_status(req, errno_to_blk_status(ret)); + } + return NVME_SC_SUCCESS; } static void nvmet_bdev_execute_discard(struct nvmet_req *req) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 517522305e5c..3e43212d3c1c 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -297,7 +297,7 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) } ret = vfs_fallocate(req->ns->file, mode, offset, len); - if (ret) { + if (ret && ret != -EOPNOTSUPP) { req->error_slba = le64_to_cpu(range.slba); status = errno_to_nvme_status(req, ret); break; -- cgit v1.2.3-59-g8ed1b From 602d674ce90f64ac135452fb9b2b058acb53b226 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Mar 2019 18:55:10 +0100 Subject: nvme-tcp: support C2HData with SUCCESS flag A C2HData PDU with the SUCCESS flag set indicates that the I/O was completed by the controller successfully and means that a subsequent completion response capsule PDU will be ommitted. If we see this flag, fisrt we check that LAST_PDU flag is set as well, and then we complete the request when the data transfer (and data digest verification if its on) is done. While we're at it, reuse a bit of code with nvme_fail_request. Reported-by: Steve Blightman Suggested-by: Oliver Smith-Denny Signed-off-by: Sagi Grimberg Reviewed-by: Oliver Smith-Denny Tested-by: Oliver Smith-Denny Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/tcp.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 208ee518af65..e7e08889865e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -463,6 +463,15 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, queue->data_remaining = le32_to_cpu(pdu->data_length); + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS && + unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x SUCCESS set but not last PDU\n", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EPROTO; + } + return 0; } @@ -618,6 +627,14 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return ret; } +static inline void nvme_tcp_end_request(struct request *rq, __le16 status) +{ + union nvme_result res = {}; + + nvme_end_request(rq, cpu_to_le16(status << 1), res); +} + + static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -685,6 +702,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) + nvme_tcp_end_request(rq, NVME_SC_SUCCESS); nvme_tcp_init_recv_ctx(queue); } } @@ -695,6 +714,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; char *ddgst = (char *)&queue->recv_ddgst; size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; @@ -718,6 +738,13 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, return -EIO; } + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { + struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + + nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + } + nvme_tcp_init_recv_ctx(queue); return 0; } @@ -815,10 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - union nvme_result res = {}; - - nvme_end_request(blk_mq_rq_from_pdu(req), - cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res); + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR); } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) -- cgit v1.2.3-59-g8ed1b From f6d85f04e29859dd3ea65395c05925da352dae89 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Mon, 14 Jan 2019 21:31:13 +0100 Subject: blkcg: annotate implicit fall through There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit remove the following warning: kernel/trace/blktrace.c:725:9: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Mathieu Malaterre Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..e1c6d79fb4cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -723,6 +723,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; + /* fall through */ case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; -- cgit v1.2.3-59-g8ed1b