From b6dccf7fae4331b0ea41cf087e3f02d5db9161dc Mon Sep 17 00:00:00 2001 From: Arnav Dawn Date: Wed, 12 Jul 2017 16:10:40 +0530 Subject: nvme: add support for FW activation without reset This patch adds support for handling Fw activation without reset On completion of FW-activation-starting AER, all queues are paused till CSTS.PP is cleared or timed out (exceeds max time for fw activtion MTFA). If device fails to clear CSTS.PP within MTFA, driver issues reset controller. Signed-off-by: Arnav Dawn Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ 2 files changed, 77 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c596dd3c58b1..1a1dedbac39b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -76,6 +76,11 @@ static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; +static __le32 nvme_get_log_dw10(u8 lid, size_t size) +{ + return cpu_to_le32((((size / 4) - 1) << 16) | lid); +} + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -2534,6 +2539,71 @@ static void nvme_async_event_work(struct work_struct *work) spin_unlock_irq(&ctrl->lock); } +static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) +{ + + u32 csts; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) + return false; + + if (csts == ~0) + return false; + + return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); +} + +static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_fw_slot_info_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return; + + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xffffffff); + c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); + + if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) + dev_warn(ctrl->device, + "Get FW SLOT INFO log error\n"); + kfree(log); +} + +static void nvme_fw_act_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, + struct nvme_ctrl, fw_act_work); + unsigned long fw_act_timeout; + + if (ctrl->mtfa) + fw_act_timeout = jiffies + + msecs_to_jiffies(ctrl->mtfa * 100); + else + fw_act_timeout = jiffies + + msecs_to_jiffies(admin_timeout * 1000); + + nvme_stop_queues(ctrl); + while (nvme_ctrl_pp_status(ctrl)) { + if (time_after(jiffies, fw_act_timeout)) { + dev_warn(ctrl->device, + "Fw activation timeout, reset controller\n"); + nvme_reset_ctrl(ctrl); + break; + } + msleep(100); + } + + if (ctrl->state != NVME_CTRL_LIVE) + return; + + nvme_start_queues(ctrl); + /* read FW slot informationi to clear the AER*/ + nvme_get_fw_slot_info(ctrl); +} + void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res) { @@ -2560,6 +2630,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, dev_info(ctrl->device, "rescanning\n"); nvme_queue_scan(ctrl); break; + case NVME_AER_NOTICE_FW_ACT_STARTING: + schedule_work(&ctrl->fw_act_work); + break; default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -2607,6 +2680,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); + cancel_work_sync(&ctrl->fw_act_work); } EXPORT_SYMBOL_GPL(nvme_stop_ctrl); @@ -2670,6 +2744,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->quirks = quirks; INIT_WORK(&ctrl->scan_work, nvme_scan_work); INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); + INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); ret = nvme_set_instance(ctrl); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8f2a168ddc01..b40b9af4564f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -142,6 +142,7 @@ struct nvme_ctrl { u16 cntlid; u32 ctrl_config; + u16 mtfa; u32 queue_count; u64 cap; @@ -167,6 +168,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct work_struct fw_act_work; /* Power saving configuration */ u64 ps_max_latency_us; -- cgit v1.2.3-59-g8ed1b From 62346eaeb2f1a0524b35eaa2f479596f40491165 Mon Sep 17 00:00:00 2001 From: Arnav Dawn Date: Wed, 12 Jul 2017 16:11:53 +0530 Subject: nvme: define NVME_NSID_ALL Define the constant "0xffffffff" (used as nsid for all namespaces) as NVME_NSID_ALL. Signed-off-by: Arnav Dawn Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 +++--- include/linux/nvme.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1a1dedbac39b..a6afeedc009a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -312,7 +312,7 @@ static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) memset(&c, 0, sizeof(c)); c.directive.opcode = nvme_admin_directive_send; - c.directive.nsid = cpu_to_le32(0xffffffff); + c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; c.directive.dtype = NVME_DIR_IDENTIFY; c.directive.tdtype = NVME_DIR_STREAMS; @@ -362,7 +362,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) if (ret) return ret; - ret = nvme_get_stream_params(ctrl, &s, 0xffffffff); + ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); if (ret) return ret; @@ -2563,7 +2563,7 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) return; c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(0xffffffff); + c.common.nsid = cpu_to_le32(NVME_NSID_ALL); c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5d0f2f4fc3b8..1d79046bf9d4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -32,6 +32,8 @@ #define NVME_RDMA_IP_PORT 4420 +#define NVME_NSID_ALL 0xffffffff + enum nvme_subsys_type { NVME_NQN_DISC = 1, /* Discovery type target subsystem */ NVME_NQN_NVME = 2, /* NVME type target subsystem */ -- cgit v1.2.3-59-g8ed1b From dbf86b39005d26b21c52a23720e15fb850d71cdc Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 16 Aug 2017 09:51:29 +0200 Subject: nvme: add support for NVMe 1.3 Timestamp Feature NVME's Timestamp feature allows controllers to be aware of the epoch time in milliseconds. This patch adds the set features hook for various transports through the identify path, so that resets and resumes can update the controller as necessary. Signed-off-by: Jon Derrick [hch: rebased on top of nvme-4.13 error handling changes, changed nvme_configure_timestamp to return the status] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 21 +++++++++++++++++++++ include/linux/nvme.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a6afeedc009a..0b979e16655e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1507,6 +1507,23 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_write_cache(q, vwc, vwc); } +static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) +{ + __le64 ts; + int ret; + + if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) + return 0; + + ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); + ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), + NULL); + if (ret) + dev_warn_once(ctrl->device, + "could not set timestamp (%d)\n", ret); + return ret; +} + static int nvme_configure_apst(struct nvme_ctrl *ctrl) { /* @@ -1861,6 +1878,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; + + ret = nvme_configure_timestamp(ctrl); + if (ret < 0) + return ret; ret = nvme_configure_directives(ctrl); if (ret < 0) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1d79046bf9d4..a12b47073273 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -254,6 +254,7 @@ enum { NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, @@ -688,6 +689,7 @@ enum { NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, NVME_FEAT_HOST_MEM_BUF = 0x0d, + NVME_FEAT_TIMESTAMP = 0x0e, NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, -- cgit v1.2.3-59-g8ed1b From 1645d5036f9201bd0925aff099fa92fbe7afaff4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jul 2017 19:46:36 +0200 Subject: nvmet: use NVME_NSID_ALL Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/configfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index a53bb6635b83..dffd8064163b 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -100,7 +100,7 @@ static u16 nvmet_get_smart_log(struct nvmet_req *req, u16 status; WARN_ON(req == NULL || slog == NULL); - if (req->cmd->get_log_page.nsid == cpu_to_le32(0xFFFFFFFF)) + if (req->cmd->get_log_page.nsid == cpu_to_le32(NVME_NSID_ALL)) status = nvmet_get_smart_log_all(req, slog); else status = nvmet_get_smart_log_nsid(req, slog); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 0a0067e771f5..b6aeb1d70951 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -444,7 +444,7 @@ static struct config_group *nvmet_ns_make(struct config_group *group, goto out; ret = -EINVAL; - if (nsid == 0 || nsid == 0xffffffff) + if (nsid == 0 || nsid == NVME_NSID_ALL) goto out; ret = -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 130c24b5be80512be973e3a614abda75f5896b42 Mon Sep 17 00:00:00 2001 From: Guan Junxiong Date: Fri, 4 Aug 2017 17:27:47 +0800 Subject: nvmet: fix the return error code of target if host is not allowed nvmf target shall return NVME_SC_CONNECT_INVALID_HOST instead of the gereal code INVALID_PARAM when the given host nqn is not allowed to connect. Refer to the 2.2.1 section of the NVMe over Fabrics Spec. Signed-off-by: Guan Junxiong Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index f4b02bb4a1a8..ea7eb00dcb87 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -749,6 +749,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, hostnqn, subsysnqn); req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); up_read(&nvmet_config_sem); + status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; goto out_put_subsystem; } up_read(&nvmet_config_sem); -- cgit v1.2.3-59-g8ed1b From 1c35be8c8a13fa714c1c783f14653ca9d3e3721f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 3 Aug 2017 11:28:48 +0200 Subject: nvmet-fcloop: remove ALL_OPTS define ALL_OPTS isn't used anywhere, remove it. Signed-off-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1bb9d5b311b1..1cb9847ec261 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -193,9 +193,6 @@ out_free_options: #define TGTPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN) -#define ALL_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN | NVMF_OPT_ROLES | \ - NVMF_OPT_FCADDR | NVMF_OPT_LPWWNN | NVMF_OPT_LPWWPN) - static DEFINE_SPINLOCK(fcloop_lock); static LIST_HEAD(fcloop_lports); -- cgit v1.2.3-59-g8ed1b From 4897ad4e0812c358c8bdd7aa25fdc2201ae2de0b Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 3 Aug 2017 11:28:47 +0200 Subject: nvme-rdma: remove NVME_RDMA_MAX_SEGMENT_SIZE NVME_RDMA_MAX_SEGMENT_SIZE is not used anywhere, zap it. Signed-off-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9ff0eb3a625e..ffd73890d077 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -36,8 +36,6 @@ #define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */ -#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */ - #define NVME_RDMA_MAX_SEGMENTS 256 #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -- cgit v1.2.3-59-g8ed1b From 90af35123d3be8b011a8a3f69ce46fd431c55b25 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:28 +0300 Subject: nvme-rdma: move nvme_rdma_configure_admin_queue code location We will call it from other places so avoid having to forward declare it. Also move it next to nvme_rdma_destroy_admin_queue. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 185 ++++++++++++++++++++++++----------------------- 1 file changed, 94 insertions(+), 91 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ffd73890d077..3cecb087ee3a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -149,6 +149,9 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static const struct blk_mq_ops nvme_rdma_mq_ops; +static const struct blk_mq_ops nvme_rdma_admin_mq_ops; + /* XXX: really should move to a generic header sooner or later.. */ static inline void put_unaligned_le24(u32 val, u8 *p) { @@ -653,6 +656,97 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) nvme_rdma_dev_put(ctrl->device); } +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + int error; + + error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); + if (error) + return error; + + ctrl->device = ctrl->queues[0].device; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + error = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_queue; + + ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ctrl->device->dev->attrs.max_fast_reg_page_list_len); + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_put_dev; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, + &ctrl->ctrl.cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, + &ctrl->async_event_sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (error) + goto out_cleanup_queue; + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + /* disconnect and drain the queue before freeing the tagset */ + nvme_rdma_stop_queue(&ctrl->queues[0]); + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_queue: + nvme_rdma_free_queue(&ctrl->queues[0]); + return error; +} + static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -1517,97 +1611,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .timeout = nvme_rdma_timeout, }; -static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) -{ - int error; - - error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); - if (error) - return error; - - ctrl->device = ctrl->queues[0].device; - - /* - * We need a reference on the device as long as the tag_set is alive, - * as the MRs in the request structures need a valid ib_device. - */ - error = -EINVAL; - if (!nvme_rdma_dev_get(ctrl->device)) - goto out_free_queue; - - ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, - ctrl->device->dev->attrs.max_fast_reg_page_list_len); - - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); - ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; - ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; - ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + - SG_CHUNK_SIZE * sizeof(struct scatterlist); - ctrl->admin_tag_set.driver_data = ctrl; - ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; - - error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); - if (error) - goto out_put_dev; - - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.admin_q)) { - error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; - } - - error = nvmf_connect_admin_queue(&ctrl->ctrl); - if (error) - goto out_cleanup_queue; - - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, - &ctrl->ctrl.cap); - if (error) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_cleanup_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); - if (error) - goto out_cleanup_queue; - - ctrl->ctrl.max_hw_sectors = - (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); - - error = nvme_init_identify(&ctrl->ctrl); - if (error) - goto out_cleanup_queue; - - error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, - &ctrl->async_event_sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - if (error) - goto out_cleanup_queue; - - return 0; - -out_cleanup_queue: - blk_cleanup_queue(ctrl->ctrl.admin_q); -out_free_tagset: - /* disconnect and drain the queue before freeing the tagset */ - nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_free_tag_set(&ctrl->admin_tag_set); -out_put_dev: - nvme_rdma_dev_put(ctrl->device); -out_free_queue: - nvme_rdma_free_queue(&ctrl->queues[0]); - return error; -} - static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) { cancel_work_sync(&ctrl->err_work); -- cgit v1.2.3-59-g8ed1b From 34b6c2315eb66e6411261aa440f6e3c4cded3506 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:29 +0300 Subject: nvme: Add admin_tagset pointer to nvme_ctrl Will be used when we centralize control flows. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + drivers/nvme/host/rdma.c | 1 + drivers/nvme/target/loop.c | 1 + 5 files changed, 5 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1438be649866..1912df412692 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2731,6 +2731,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (ret) goto out_free_queues; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b40b9af4564f..2c8a02be46fd 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -125,6 +125,7 @@ struct nvme_ctrl { struct kref kref; int instance; struct blk_mq_tag_set *tagset; + struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; struct mutex namespaces_mutex; struct device *device; /* char device */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 925467b31a33..e6283745ecd2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1376,6 +1376,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->admin_tagset)) return -ENOMEM; + dev->ctrl.admin_tagset = &dev->admin_tagset; dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); if (IS_ERR(dev->ctrl.admin_q)) { diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3cecb087ee3a..6ef56500fc9c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -691,6 +691,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (error) goto out_put_dev; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 717ed7ddb2f6..92628c432926 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -375,6 +375,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (error) goto out_free_sq; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { -- cgit v1.2.3-59-g8ed1b From b28a308ee7774341312de28405e53a4a30cb7d31 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:30 +0300 Subject: nvme-rdma: move tagset allocation to a dedicated routine We always pair tagset allocation with rdma device reference and it shares some code, centralize it with an argument if its an admin or IO tagset. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 131 +++++++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 56 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6ef56500fc9c..3f580c198100 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -646,14 +646,79 @@ out_free_queues: return ret; } +static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + struct blk_mq_tag_set *set = admin ? + &ctrl->admin_tag_set : &ctrl->tag_set; + + blk_mq_free_tag_set(set); + nvme_rdma_dev_put(ctrl->device); +} + +static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_admin_mq_ops; + set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + set->reserved_tags = 2; /* connect + keep-alive */ + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = ADMIN_TIMEOUT; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_mq_ops; + set->queue_depth = nctrl->opts->queue_size; + set->reserved_tags = 1; /* fabric connect */ + set->numa_node = NUMA_NO_NODE; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + goto out; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + ret = nvme_rdma_dev_get(ctrl->device); + if (!ret) { + ret = -EINVAL; + goto out_free_tagset; + } + + return set; + +out_free_tagset: + blk_mq_free_tag_set(set); +out: + return ERR_PTR(ret); +} + static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) { nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); blk_cleanup_queue(ctrl->ctrl.admin_q); - blk_mq_free_tag_set(&ctrl->admin_tag_set); - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, true); } static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) @@ -666,32 +731,12 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) ctrl->device = ctrl->queues[0].device; - /* - * We need a reference on the device as long as the tag_set is alive, - * as the MRs in the request structures need a valid ib_device. - */ - error = -EINVAL; - if (!nvme_rdma_dev_get(ctrl->device)) - goto out_free_queue; - ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, ctrl->device->dev->attrs.max_fast_reg_page_list_len); - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); - ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; - ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; - ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + - SG_CHUNK_SIZE * sizeof(struct scatterlist); - ctrl->admin_tag_set.driver_data = ctrl; - ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; - - error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); - if (error) - goto out_put_dev; - ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.admin_tagset)) + goto out_free_queue; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { @@ -740,9 +785,7 @@ out_cleanup_queue: out_free_tagset: /* disconnect and drain the queue before freeing the tagset */ nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_free_tag_set(&ctrl->admin_tag_set); -out_put_dev: - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, true); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -1644,8 +1687,7 @@ static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_uninit_ctrl(&ctrl->ctrl); if (ctrl->ctrl.tagset) { blk_cleanup_queue(ctrl->ctrl.connect_q); - blk_mq_free_tag_set(&ctrl->tag_set); - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, false); } nvme_put_ctrl(&ctrl->ctrl); @@ -1765,31 +1807,10 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - /* - * We need a reference on the device as long as the tag_set is alive, - * as the MRs in the request structures need a valid ib_device. - */ - ret = -EINVAL; - if (!nvme_rdma_dev_get(ctrl->device)) + ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.tagset)) goto out_free_io_queues; - memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); - ctrl->tag_set.ops = &nvme_rdma_mq_ops; - ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; - ctrl->tag_set.reserved_tags = 1; /* fabric connect */ - ctrl->tag_set.numa_node = NUMA_NO_NODE; - ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + - SG_CHUNK_SIZE * sizeof(struct scatterlist); - ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; - ctrl->tag_set.timeout = NVME_IO_TIMEOUT; - - ret = blk_mq_alloc_tag_set(&ctrl->tag_set); - if (ret) - goto out_put_dev; - ctrl->ctrl.tagset = &ctrl->tag_set; - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); if (IS_ERR(ctrl->ctrl.connect_q)) { ret = PTR_ERR(ctrl->ctrl.connect_q); @@ -1805,9 +1826,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) out_cleanup_connect_q: blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: - blk_mq_free_tag_set(&ctrl->tag_set); -out_put_dev: - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, false); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; -- cgit v1.2.3-59-g8ed1b From 18398af2dcca3044c74fd2697f1d5b624b7848fe Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:31 +0300 Subject: nvme-rdma: disable the controller on resets Mimic the pci driver as a controller disable might be more lightweight than a shutdown. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3f580c198100..b1e67cd41c81 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1655,7 +1655,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .timeout = nvme_rdma_timeout, }; -static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) +static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); @@ -1667,8 +1667,10 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_rdma_free_io_queues(ctrl); } - if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) + if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); + else + nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, @@ -1682,7 +1684,7 @@ static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_stop_ctrl(&ctrl->ctrl); nvme_remove_namespaces(&ctrl->ctrl); if (shutdown) - nvme_rdma_shutdown_ctrl(ctrl); + nvme_rdma_shutdown_ctrl(ctrl, shutdown); nvme_uninit_ctrl(&ctrl->ctrl); if (ctrl->ctrl.tagset) { @@ -1746,7 +1748,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) bool changed; nvme_stop_ctrl(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl); + nvme_rdma_shutdown_ctrl(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl); if (ret) { -- cgit v1.2.3-59-g8ed1b From 3f02fffb74ae7486232fef6ca4341c5e9719c759 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:32 +0300 Subject: nvme-rdma: don't free tagset on resets We're not supposed to do that. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 49 +++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b1e67cd41c81..b8c07f2f2204 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -712,16 +712,20 @@ out: return ERR_PTR(ret); } -static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) { nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); - blk_cleanup_queue(ctrl->ctrl.admin_q); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + if (remove) { + blk_cleanup_queue(ctrl->ctrl.admin_q); + nvme_rdma_free_tagset(&ctrl->ctrl, true); + } } -static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool new) { int error; @@ -734,14 +738,21 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, ctrl->device->dev->attrs.max_fast_reg_page_list_len); - ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); - if (IS_ERR(ctrl->ctrl.admin_tagset)) - goto out_free_queue; + if (new) { + ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.admin_tagset)) + goto out_free_queue; - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.admin_q)) { - error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + } else { + error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, + nvme_rdma_reinit_request); + if (error) + goto out_free_queue; } error = nvmf_connect_admin_queue(&ctrl->ctrl); @@ -781,11 +792,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) return 0; out_cleanup_queue: - blk_cleanup_queue(ctrl->ctrl.admin_q); + if (new) + blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: - /* disconnect and drain the queue before freeing the tagset */ - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + if (new) + nvme_rdma_free_tagset(&ctrl->ctrl, true); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -1676,7 +1687,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_destroy_admin_queue(ctrl); + nvme_rdma_destroy_admin_queue(ctrl, shutdown); } static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) @@ -1750,7 +1761,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); - ret = nvme_rdma_configure_admin_queue(ctrl); + ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) { /* ctrl is already shutdown, just remove the ctrl */ INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work); @@ -1891,7 +1902,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, if (!ctrl->queues) goto out_uninit_ctrl; - ret = nvme_rdma_configure_admin_queue(ctrl); + ret = nvme_rdma_configure_admin_queue(ctrl, true); if (ret) goto out_kfree_queues; @@ -1948,7 +1959,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return &ctrl->ctrl; out_remove_admin_queue: - nvme_rdma_destroy_admin_queue(ctrl); + nvme_rdma_destroy_admin_queue(ctrl, true); out_kfree_queues: kfree(ctrl->queues); out_uninit_ctrl: -- cgit v1.2.3-59-g8ed1b From 31fdf18401703df246e20fc0382ed3594e6cf8d8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 28 Aug 2017 21:40:06 +0200 Subject: nvme-rdma: reuse configure/destroy_admin_queue No need to open-code it. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b8c07f2f2204..cdf14226bf07 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -857,24 +857,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) goto requeue; } - nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); - - ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set, - nvme_rdma_reinit_request); - if (ret) - goto requeue; - - ret = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); - if (ret) - goto requeue; - - ret = nvmf_connect_admin_queue(&ctrl->ctrl); - if (ret) - goto requeue; - - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + nvme_rdma_destroy_admin_queue(ctrl, false); + ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto requeue; -- cgit v1.2.3-59-g8ed1b From a57bd54122232b32414748fc8b14634bfd74a7ff Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 28 Aug 2017 21:41:10 +0200 Subject: nvme-rdma: introduce configure/destroy io queues Make a symmetrical handling with admin queue. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 162 +++++++++++++++++++++++------------------------ 1 file changed, 81 insertions(+), 81 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cdf14226bf07..b5fdd2b009c0 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -562,30 +562,36 @@ out_destroy_cm_id: static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { + if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) + return; + rdma_disconnect(queue->cm_id); ib_drain_qp(queue->qp); } static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) { + if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) + return; + nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); } -static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue) +static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) { - if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) - return; - nvme_rdma_stop_queue(queue); - nvme_rdma_free_queue(queue); + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_rdma_free_queue(&ctrl->queues[i]); } -static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) +static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) { int i; for (i = 1; i < ctrl->ctrl.queue_count; i++) - nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); + nvme_rdma_stop_queue(&ctrl->queues[i]); } static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) @@ -597,15 +603,15 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) { dev_info(ctrl->ctrl.device, "failed to connect i/o queue: %d\n", ret); - goto out_free_queues; + goto out_stop_queues; } set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); } return 0; -out_free_queues: - nvme_rdma_free_io_queues(ctrl); +out_stop_queues: + nvme_rdma_stop_io_queues(ctrl); return ret; } @@ -641,7 +647,7 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) out_free_queues: for (i--; i >= 1; i--) - nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); + nvme_rdma_free_queue(&ctrl->queues[i]); return ret; } @@ -717,11 +723,12 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, { nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); - nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); nvme_rdma_free_tagset(&ctrl->ctrl, true); } + nvme_rdma_free_queue(&ctrl->queues[0]); } static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, @@ -802,6 +809,62 @@ out_free_queue: return error; } +static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + nvme_rdma_stop_io_queues(ctrl); + if (remove) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + nvme_rdma_free_tagset(&ctrl->ctrl, false); + } + nvme_rdma_free_io_queues(ctrl); +} + +static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) +{ + int ret; + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false); + if (IS_ERR(ctrl->ctrl.tagset)) + goto out_free_io_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + } else { + ret = blk_mq_reinit_tagset(&ctrl->tag_set, + nvme_rdma_reinit_request); + if (ret) + goto out_free_io_queues; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); + } + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + if (new) + nvme_rdma_free_tagset(&ctrl->ctrl, false); +out_free_io_queues: + nvme_rdma_free_io_queues(ctrl); + return ret; +} + static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -848,14 +911,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->ctrl.queue_count > 1) { - nvme_rdma_free_io_queues(ctrl); - - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); - if (ret) - goto requeue; - } + if (ctrl->ctrl.queue_count > 1) + nvme_rdma_destroy_io_queues(ctrl, false); nvme_rdma_destroy_admin_queue(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); @@ -863,16 +920,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) goto requeue; if (ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_init_io_queues(ctrl); + ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) goto requeue; - - ret = nvme_rdma_connect_io_queues(ctrl); - if (ret) - goto requeue; - - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -1659,7 +1709,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); - nvme_rdma_free_io_queues(ctrl); + nvme_rdma_destroy_io_queues(ctrl, shutdown); } if (shutdown) @@ -1682,11 +1732,6 @@ static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_shutdown_ctrl(ctrl, shutdown); nvme_uninit_ctrl(&ctrl->ctrl); - if (ctrl->ctrl.tagset) { - blk_cleanup_queue(ctrl->ctrl.connect_q); - nvme_rdma_free_tagset(&ctrl->ctrl, false); - } - nvme_put_ctrl(&ctrl->ctrl); } @@ -1753,21 +1798,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } if (ctrl->ctrl.queue_count > 1) { - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); - if (ret) - goto del_dead_ctrl; - - ret = nvme_rdma_init_io_queues(ctrl); - if (ret) - goto del_dead_ctrl; - - ret = nvme_rdma_connect_io_queues(ctrl); + ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) goto del_dead_ctrl; - - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -1796,39 +1829,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .get_address = nvmf_get_address, }; -static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) -{ - int ret; - - ret = nvme_rdma_init_io_queues(ctrl); - if (ret) - return ret; - - ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); - if (IS_ERR(ctrl->ctrl.tagset)) - goto out_free_io_queues; - - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); - goto out_free_tag_set; - } - - ret = nvme_rdma_connect_io_queues(ctrl); - if (ret) - goto out_cleanup_connect_q; - - return 0; - -out_cleanup_connect_q: - blk_cleanup_queue(ctrl->ctrl.connect_q); -out_free_tag_set: - nvme_rdma_free_tagset(&ctrl->ctrl, false); -out_free_io_queues: - nvme_rdma_free_io_queues(ctrl); - return ret; -} - static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -1921,7 +1921,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, } if (opts->nr_io_queues) { - ret = nvme_rdma_create_io_queues(ctrl); + ret = nvme_rdma_configure_io_queues(ctrl, true); if (ret) goto out_remove_admin_queue; } -- cgit v1.2.3-59-g8ed1b From 148b4e7ff31e4bb90cf7851ad1bcd305c292be2c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:35 +0300 Subject: nvme-rdma: stop queues instead of simply flipping their state If we move the queues from LIVE state, we might as well stop them (drain for rdma). Do it after we stop the request queues to prevent a stray request sneaking in .queue_rq after we stop the queue. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b5fdd2b009c0..34d3ed8b249e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -945,16 +945,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - int i; nvme_stop_ctrl(&ctrl->ctrl); - for (i = 0; i < ctrl->ctrl.queue_count; i++) - clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); - - if (ctrl->ctrl.queue_count > 1) + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); + } blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); /* We must take care of fastfail/requeue all our inflight requests */ if (ctrl->ctrl.queue_count > 1) -- cgit v1.2.3-59-g8ed1b From 41e8cfa117cee46a732436dd303de4772c60965c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:36 +0300 Subject: nvme-rdma: rename nvme_rdma_init_queue to nvme_rdma_alloc_queue Give it a name symmetric to nvme_rdma_free_queue. Also pass in the ctrl sqsize+1 and not the opts queue_size. And suppress a superflous failure message. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 34d3ed8b249e..d87874e4902d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -504,7 +504,7 @@ out_put_dev: return ret; } -static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, +static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, int idx, size_t queue_size) { struct nvme_rdma_queue *queue; @@ -615,7 +615,7 @@ out_stop_queues: return ret; } -static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; unsigned int nr_io_queues; @@ -634,13 +634,10 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) "creating %d I/O queues.\n", nr_io_queues); for (i = 1; i < ctrl->ctrl.queue_count; i++) { - ret = nvme_rdma_init_queue(ctrl, i, - ctrl->ctrl.opts->queue_size); - if (ret) { - dev_info(ctrl->ctrl.device, - "failed to initialize i/o queue: %d\n", ret); + ret = nvme_rdma_alloc_queue(ctrl, i, + ctrl->ctrl.sqsize + 1); + if (ret) goto out_free_queues; - } } return 0; @@ -736,7 +733,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, { int error; - error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); + error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); if (error) return error; @@ -824,7 +821,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) { int ret; - ret = nvme_rdma_init_io_queues(ctrl); + ret = nvme_rdma_alloc_io_queues(ctrl); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From 68e16fcfaf9bbde573e89f783cf1ca60acb49cf5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:37 +0300 Subject: nvme-rdma: introduce nvme_rdma_start_queue This should pair with nvme_rdma_stop_queue. While this is not a complete inverse, it still pairs up pretty well because in fabrics we don't have a disconnect capsule (yet) but we simply teardown the transport association. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d87874e4902d..09c4ea8332d7 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -594,24 +594,38 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) nvme_rdma_stop_queue(&ctrl->queues[i]); } -static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) +{ + int ret; + + if (idx) + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); + else + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + + if (!ret) + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); + else + dev_info(ctrl->ctrl.device, + "failed to connect queue: %d ret=%d\n", idx, ret); + return ret; +} + +static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) { int i, ret = 0; for (i = 1; i < ctrl->ctrl.queue_count; i++) { - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); - if (ret) { - dev_info(ctrl->ctrl.device, - "failed to connect i/o queue: %d\n", ret); + ret = nvme_rdma_start_queue(ctrl, i); + if (ret) goto out_stop_queues; - } - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); } return 0; out_stop_queues: - nvme_rdma_stop_io_queues(ctrl); + for (i--; i >= 1; i--) + nvme_rdma_stop_queue(&ctrl->queues[i]); return ret; } @@ -759,12 +773,10 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_queue; } - error = nvmf_connect_admin_queue(&ctrl->ctrl); + error = nvme_rdma_start_queue(ctrl, 0); if (error) goto out_cleanup_queue; - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (error) { @@ -845,7 +857,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ctrl->ctrl.queue_count - 1); } - ret = nvme_rdma_connect_io_queues(ctrl); + ret = nvme_rdma_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; -- cgit v1.2.3-59-g8ed1b From 370ae6e450260c8bd3e1ea75c1390f746d3c71ee Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:38 +0300 Subject: nvme-rdma: cleanup error path in controller reset No need to queue an extra work to indirect controller removal, just call the ctrl remove routine. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 09c4ea8332d7..1bf7c3a55c96 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1732,13 +1732,10 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_destroy_admin_queue(ctrl, shutdown); } -static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) +static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) { - nvme_stop_ctrl(&ctrl->ctrl); nvme_remove_namespaces(&ctrl->ctrl); - if (shutdown) - nvme_rdma_shutdown_ctrl(ctrl, shutdown); - + nvme_rdma_shutdown_ctrl(ctrl, true); nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); } @@ -1748,7 +1745,8 @@ static void nvme_rdma_del_ctrl_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, delete_work); - __nvme_rdma_remove_ctrl(ctrl, true); + nvme_stop_ctrl(&ctrl->ctrl); + nvme_rdma_remove_ctrl(ctrl); } static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) @@ -1780,14 +1778,6 @@ static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) return ret; } -static void nvme_rdma_remove_ctrl_work(struct work_struct *work) -{ - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, delete_work); - - __nvme_rdma_remove_ctrl(ctrl, false); -} - static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = @@ -1799,16 +1789,13 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_rdma_shutdown_ctrl(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); - if (ret) { - /* ctrl is already shutdown, just remove the ctrl */ - INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work); - goto del_dead_ctrl; - } + if (ret) + goto out_fail; if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) - goto del_dead_ctrl; + goto out_fail; } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -1818,10 +1805,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) return; -del_dead_ctrl: - /* Deleting this dead controller... */ +out_fail: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - WARN_ON(!queue_work(nvme_wq, &ctrl->delete_work)); + nvme_rdma_remove_ctrl(ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { -- cgit v1.2.3-59-g8ed1b From 09fdc23b29618127709711606d4fa439a6839852 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:39 +0300 Subject: nvme-rdma: call ops->reg_read64 instead of nvmf_reg_read64 To make the nvme_rdma_configure_admin_queue generic in preparation of moving it to common code. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 1bf7c3a55c96..b51e7df63df5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -777,7 +777,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_cleanup_queue; - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, + error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device, -- cgit v1.2.3-59-g8ed1b From 9b483da15dd0c94b62ae4e789b37dfff4410f45e Mon Sep 17 00:00:00 2001 From: Guan Junxiong Date: Thu, 3 Aug 2017 21:40:26 +0800 Subject: nvme-fabrics: log a warning if hostid is invalid This helps users to quickly spot the reason of why connection fails if the hostid is not compliant with the uuid format. Signed-off-by: Guan Junxiong Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 5f5cd306f76d..fc3b6552f467 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -735,6 +735,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } if (uuid_parse(p, &hostid)) { + pr_err("Invalid hostid %s\n", p); ret = -EINVAL; goto out; } -- cgit v1.2.3-59-g8ed1b From caaa15c5097d58545075a8bbdf208078b87d5f28 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 15 Aug 2017 12:24:05 +0300 Subject: nvme: fix identify namespace logging Use ctrl->device and lose the func name. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0b979e16655e..b2daeafbeadd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1167,7 +1167,7 @@ static void nvme_config_discard(struct nvme_ns *ns) static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) { if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { - dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__); + dev_warn(ns->ctrl->device, "Identify namespace failed\n"); return -ENODEV; } -- cgit v1.2.3-59-g8ed1b From ad4e05b24c428d6125f6f10bd300600cae5079d4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Aug 2017 19:21:06 +0300 Subject: nvme: add symbolic constants for CC identifiers Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 14 +++++++------- include/linux/nvme.h | 24 +++++++++++++++--------- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ea7eb00dcb87..7c23eaf8e563 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -538,37 +538,37 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit); static inline bool nvmet_cc_en(u32 cc) { - return cc & 0x1; + return (cc >> NVME_CC_EN_SHIFT) & 0x1; } static inline u8 nvmet_cc_css(u32 cc) { - return (cc >> 4) & 0x7; + return (cc >> NVME_CC_CSS_SHIFT) & 0x7; } static inline u8 nvmet_cc_mps(u32 cc) { - return (cc >> 7) & 0xf; + return (cc >> NVME_CC_MPS_SHIFT) & 0xf; } static inline u8 nvmet_cc_ams(u32 cc) { - return (cc >> 11) & 0x7; + return (cc >> NVME_CC_AMS_SHIFT) & 0x7; } static inline u8 nvmet_cc_shn(u32 cc) { - return (cc >> 14) & 0x3; + return (cc >> NVME_CC_SHN_SHIFT) & 0x3; } static inline u8 nvmet_cc_iosqes(u32 cc) { - return (cc >> 16) & 0xf; + return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; } static inline u8 nvmet_cc_iocqes(u32 cc) { - return (cc >> 20) & 0xf; + return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; } static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a12b47073273..7b4322dc2975 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -135,16 +135,22 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_EN_SHIFT = 0, + NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, - NVME_CC_ARB_RR = 0 << 11, - NVME_CC_ARB_WRRU = 1 << 11, - NVME_CC_ARB_VS = 7 << 11, - NVME_CC_SHN_NONE = 0 << 14, - NVME_CC_SHN_NORMAL = 1 << 14, - NVME_CC_SHN_ABRUPT = 2 << 14, - NVME_CC_SHN_MASK = 3 << 14, - NVME_CC_IOSQES = NVME_NVM_IOSQES << 16, - NVME_CC_IOCQES = NVME_NVM_IOCQES << 20, + NVME_CC_AMS_SHIFT = 11, + NVME_CC_SHN_SHIFT = 14, + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_ARB_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_ARB_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_ARB_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, -- cgit v1.2.3-59-g8ed1b From 60b43f627a71aaf233ef5af90f72e207c29781b4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Aug 2017 19:21:07 +0300 Subject: nvme: rename AMS symbolic constants to fit specification Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- include/linux/nvme.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b2daeafbeadd..1db8de0bee87 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1445,7 +1445,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ctrl->ctrl_config = NVME_CC_CSS_NVM; ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; ctrl->ctrl_config |= NVME_CC_ENABLE; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7b4322dc2975..05560c2ecc83 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -142,9 +142,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, - NVME_CC_ARB_RR = 0 << NVME_CC_AMS_SHIFT, - NVME_CC_ARB_WRRU = 1 << NVME_CC_AMS_SHIFT, - NVME_CC_ARB_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, -- cgit v1.2.3-59-g8ed1b From 5533d42480d6ced6765401c55a3622b4c437d7eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 31 Jul 2017 13:20:30 -0700 Subject: nvme-fc: Reattach to localports on re-registration If the LLDD resets or detaches from an fc port, the LLDD will deregister all remoteports seen by the fc port and deregister the localport associated with the fc port. The teardown of the localport structure will be held off due to reference counting until all the remoteports are removed (and they are held off until all controllers/associations to terminated). Currently, if the fc port is reinit/reattached and registered again as a localport it is treated as an independent entity from the prior localport and all prior remoteports and controllers cannot be revived. They are created as new and separate entities. This patch changes the localport registration to look at the known localports that are waiting to be torndown. If they are the same port based on wwn's, the local port is transitioned out of the teardown state. This allows the remote ports and controller connections to be reestablished and resumed as long as the localport can also be reregistered within the timeout windows. The patch adds a new routine nvme_fc_attach_to_unreg_lport() with the functionality and moves the lport get/put routines to avoid forward references. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 144 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 106 insertions(+), 38 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1912df412692..d2e882c0f496 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -220,6 +220,90 @@ static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *); static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, struct nvme_fc_queue *, unsigned int); +static void +nvme_fc_free_lport(struct kref *ref) +{ + struct nvme_fc_lport *lport = + container_of(ref, struct nvme_fc_lport, ref); + unsigned long flags; + + WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&lport->endp_list)); + + /* remove from transport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&lport->port_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + /* let the LLDD know we've finished tearing it down */ + lport->ops->localport_delete(&lport->localport); + + ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); + ida_destroy(&lport->endp_cnt); + + put_device(lport->dev); + + kfree(lport); +} + +static void +nvme_fc_lport_put(struct nvme_fc_lport *lport) +{ + kref_put(&lport->ref, nvme_fc_free_lport); +} + +static int +nvme_fc_lport_get(struct nvme_fc_lport *lport) +{ + return kref_get_unless_zero(&lport->ref); +} + + +static struct nvme_fc_lport * +nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) +{ + struct nvme_fc_lport *lport; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + if (lport->localport.node_name != pinfo->node_name || + lport->localport.port_name != pinfo->port_name) + continue; + + if (lport->localport.port_state != FC_OBJSTATE_DELETED) { + lport = ERR_PTR(-EEXIST); + goto out_done; + } + + if (!nvme_fc_lport_get(lport)) { + /* + * fails if ref cnt already 0. If so, + * act as if lport already deleted + */ + lport = NULL; + goto out_done; + } + + /* resume the lport */ + + lport->localport.port_role = pinfo->port_role; + lport->localport.port_id = pinfo->port_id; + lport->localport.port_state = FC_OBJSTATE_ONLINE; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; + } + + lport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; +} /** * nvme_fc_register_localport - transport entry point called by an @@ -257,6 +341,28 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, goto out_reghost_failed; } + /* + * look to see if there is already a localport that had been + * deregistered and in the process of waiting for all the + * references to fully be removed. If the references haven't + * expired, we can simply re-enable the localport. Remoteports + * and controller reconnections should resume naturally. + */ + newrec = nvme_fc_attach_to_unreg_lport(pinfo); + + /* found an lport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_reghost_failed; + + /* found existing lport, which was resumed */ + } else if (newrec) { + *portptr = &newrec->localport; + return 0; + } + + /* nothing found - allocate a new localport struct */ + newrec = kmalloc((sizeof(*newrec) + template->local_priv_sz), GFP_KERNEL); if (!newrec) { @@ -310,44 +416,6 @@ out_reghost_failed: } EXPORT_SYMBOL_GPL(nvme_fc_register_localport); -static void -nvme_fc_free_lport(struct kref *ref) -{ - struct nvme_fc_lport *lport = - container_of(ref, struct nvme_fc_lport, ref); - unsigned long flags; - - WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED); - WARN_ON(!list_empty(&lport->endp_list)); - - /* remove from transport list */ - spin_lock_irqsave(&nvme_fc_lock, flags); - list_del(&lport->port_list); - spin_unlock_irqrestore(&nvme_fc_lock, flags); - - /* let the LLDD know we've finished tearing it down */ - lport->ops->localport_delete(&lport->localport); - - ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); - ida_destroy(&lport->endp_cnt); - - put_device(lport->dev); - - kfree(lport); -} - -static void -nvme_fc_lport_put(struct nvme_fc_lport *lport) -{ - kref_put(&lport->ref, nvme_fc_free_lport); -} - -static int -nvme_fc_lport_get(struct nvme_fc_lport *lport) -{ - return kref_get_unless_zero(&lport->ref); -} - /** * nvme_fc_unregister_localport - transport entry point called by an * LLDD to deregister/remove a previously -- cgit v1.2.3-59-g8ed1b From 48fa362b6c3f4d69bdb6310b46626049092475e0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 31 Jul 2017 13:21:14 -0700 Subject: nvmet-fc: simplify sg list handling The existing nvmet_fc sg list handling has 2 faults: a) the request between LLDD and transport has too large of an sg list (256 elements), which is normally 256k (64 elements). b) sglist handling doesn't optimize on the fact that each element is a page. This patch removes the static sg list in the request and uses the dynamic list already present in the nvmet_fc transport. It also simplies the handling of the sg list on multiple sequences to take advantage of the per-page divisions. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 48 +++++++++--------------------------------- include/linux/nvme-fc-driver.h | 2 +- 2 files changed, 11 insertions(+), 39 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 309c84aa7595..421e43bf1dd7 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -58,7 +58,8 @@ struct nvmet_fc_ls_iod { struct work_struct work; } __aligned(sizeof(unsigned long long)); -#define NVMET_FC_MAX_KB_PER_XFR 256 +#define NVMET_FC_MAX_SEQ_LENGTH (256 * 1024) +#define NVMET_FC_MAX_XFR_SGENTS (NVMET_FC_MAX_SEQ_LENGTH / PAGE_SIZE) enum nvmet_fcp_datadir { NVMET_FCP_NODATA, @@ -74,9 +75,7 @@ struct nvmet_fc_fcp_iod { struct nvme_fc_ersp_iu rspiubuf; dma_addr_t rspdma; struct scatterlist *data_sg; - struct scatterlist *next_sg; int data_sg_cnt; - u32 next_sg_offset; u32 total_length; u32 offset; enum nvmet_fcp_datadir io_dir; @@ -112,6 +111,7 @@ struct nvmet_fc_tgtport { struct ida assoc_cnt; struct nvmet_port *port; struct kref ref; + u32 max_sg_cnt; }; struct nvmet_fc_defer_fcp_req { @@ -994,6 +994,8 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, INIT_LIST_HEAD(&newrec->assoc_list); kref_init(&newrec->ref); ida_init(&newrec->assoc_cnt); + newrec->max_sg_cnt = min_t(u32, NVMET_FC_MAX_XFR_SGENTS, + template->max_sgl_segments); ret = nvmet_fc_alloc_ls_iodlist(newrec); if (ret) { @@ -1866,51 +1868,23 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod, u8 op) { struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; - struct scatterlist *sg, *datasg; unsigned long flags; - u32 tlen, sg_off; + u32 tlen; int ret; fcpreq->op = op; fcpreq->offset = fod->offset; fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; - tlen = min_t(u32, (NVMET_FC_MAX_KB_PER_XFR * 1024), + + tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, (fod->total_length - fod->offset)); - tlen = min_t(u32, tlen, NVME_FC_MAX_SEGMENTS * PAGE_SIZE); - tlen = min_t(u32, tlen, fod->tgtport->ops->max_sgl_segments - * PAGE_SIZE); fcpreq->transfer_length = tlen; fcpreq->transferred_length = 0; fcpreq->fcp_error = 0; fcpreq->rsplen = 0; - fcpreq->sg_cnt = 0; - - datasg = fod->next_sg; - sg_off = fod->next_sg_offset; - - for (sg = fcpreq->sg ; tlen; sg++) { - *sg = *datasg; - if (sg_off) { - sg->offset += sg_off; - sg->length -= sg_off; - sg->dma_address += sg_off; - sg_off = 0; - } - if (tlen < sg->length) { - sg->length = tlen; - fod->next_sg = datasg; - fod->next_sg_offset += tlen; - } else if (tlen == sg->length) { - fod->next_sg_offset = 0; - fod->next_sg = sg_next(datasg); - } else { - fod->next_sg_offset = 0; - datasg = sg_next(datasg); - } - tlen -= sg->length; - fcpreq->sg_cnt++; - } + fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE]; + fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE); /* * If the last READDATA request: check if LLDD supports @@ -2225,8 +2199,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->req.sg = fod->data_sg; fod->req.sg_cnt = fod->data_sg_cnt; fod->offset = 0; - fod->next_sg = fod->data_sg; - fod->next_sg_offset = 0; if (fod->io_dir == NVMET_FCP_WRITE) { /* pull the data over before invoking nvmet layer */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2591878c1d48..9c5cb4480806 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -624,7 +624,7 @@ struct nvmefc_tgt_fcp_req { u32 timeout; u32 transfer_length; struct fc_ba_rjt ba_rjt; - struct scatterlist sg[NVME_FC_MAX_SEGMENTS]; + struct scatterlist *sg; int sg_cnt; void *rspaddr; dma_addr_t rspdma; -- cgit v1.2.3-59-g8ed1b From 17c39d053a46b300fee786857458857086a4844e Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 14 Aug 2017 22:12:39 +0200 Subject: nvmet: use memcpy_and_pad for identify sn/fr This changes the earlier patch "nvmet: don't report 0-bytes in serial number" to use the memcpy_and_pad() helper introduced in a previous patch. Signed-off-by: Martin Wilck Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dffd8064163b..9496c71d2257 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -168,15 +168,6 @@ out: nvmet_req_complete(req, status); } -static void copy_and_pad(char *dst, int dst_len, const char *src, int src_len) -{ - int len = min(src_len, dst_len); - - memcpy(dst, src, len); - if (dst_len > len) - memset(dst + len, ' ', dst_len - len); -} - static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -196,8 +187,9 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) bin2hex(id->sn, &ctrl->subsys->serial, min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2)); - copy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1); - copy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE)); + memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' '); + memcpy_and_pad(id->fr, sizeof(id->fr), + UTS_RELEASE, strlen(UTS_RELEASE), ' '); id->rab = 6; -- cgit v1.2.3-59-g8ed1b From a7b7c7a105a528e6c2a0a2581b814a5acacb4c38 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 14 Aug 2017 15:29:26 +0300 Subject: nvme-rdma: Use unlikely macro in the fast path This patch slightly improves performance (mainly for small block sizes). Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b51e7df63df5..6a7682620d87 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1047,7 +1047,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (req->mr->need_inval) { res = nvme_rdma_inv_rkey(queue, req); - if (res < 0) { + if (unlikely(res < 0)) { dev_err(ctrl->ctrl.device, "Queueing INV WR for rkey %#x failed (%d)\n", req->mr->rkey, res); @@ -1112,7 +1112,7 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, int nr; nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE); - if (nr < count) { + if (unlikely(nr < count)) { if (nr < 0) return nr; return -EINVAL; @@ -1248,7 +1248,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, first = ≀ ret = ib_post_send(queue->qp, first, &bad_wr); - if (ret) { + if (unlikely(ret)) { dev_err(queue->ctrl->ctrl.device, "%s failed with error code %d\n", __func__, ret); } @@ -1274,7 +1274,7 @@ static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, wr.num_sge = 1; ret = ib_post_recv(queue->qp, &wr, &bad_wr); - if (ret) { + if (unlikely(ret)) { dev_err(queue->ctrl->ctrl.device, "%s failed with error code %d\n", __func__, ret); } @@ -1634,7 +1634,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); err = nvme_rdma_map_data(queue, rq, c); - if (err < 0) { + if (unlikely(err < 0)) { dev_err(queue->ctrl->ctrl.device, "Failed to map data (%d)\n", err); nvme_cleanup_cmd(rq); @@ -1648,7 +1648,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, flush = true; err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); - if (err) { + if (unlikely(err)) { nvme_rdma_unmap_data(queue, rq); goto err; } -- cgit v1.2.3-59-g8ed1b From 5228b3280b9bb8fa6aef59f891cca64a028e9b36 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Sun, 27 Aug 2017 15:56:37 +0200 Subject: nvme: fix uninitialized prp2 value on small transfers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of iod->first_dma ends up as prp2 in NVMe commands. In case there is not enough data to cross a page boundary, iod->first_dma is never initialized and contains random data. Comply with the NVMe specification and fill in 0 in that case. Signed-off-by: Jan H. Schönherr Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6283745ecd2..544805a2421b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -555,8 +555,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) int nprps, i; length -= (page_size - offset); - if (length <= 0) + if (length <= 0) { + iod->first_dma = 0; return BLK_STS_OK; + } dma_len -= (page_size - offset); if (dma_len) { -- cgit v1.2.3-59-g8ed1b From 07fbd32a6b215d8b2fc01ccc89622207b9b782fd Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 25 Aug 2017 19:14:50 -0400 Subject: nvme: honor RTD3 Entry Latency for shutdowns If an NVMe controller reports RTD3 Entry Latency larger than shutdown_timeout, up to a maximum of 60 seconds, use that value to set the shutdown timer. Otherwise fall back to the module parameter which defaults to 5 seconds. Signed-off-by: Martin K. Petersen Reviewed-by: Sagi Grimberg [hch: removed do_div, made transition time local scope] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 16 +++++++++++++++- drivers/nvme/host/nvme.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1db8de0bee87..745058905da6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { - unsigned long timeout = jiffies + (shutdown_timeout * HZ); + unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); u32 csts; int ret; @@ -1826,6 +1826,20 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + if (id->rtd3e) { + /* us -> s */ + u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + + ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, + shutdown_timeout, 60); + + if (ctrl->shutdown_timeout != shutdown_timeout) + dev_warn(ctrl->device, + "Shutdown timeout set to %u seconds\n", + ctrl->shutdown_timeout); + } else + ctrl->shutdown_timeout = shutdown_timeout; + ctrl->npss = id->npss; ctrl->apsta = id->apsta; prev_apst_enabled = ctrl->apst_enabled; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2c8a02be46fd..9b959ee18cb6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,6 +162,7 @@ struct nvme_ctrl { u16 kas; u8 npss; u8 apsta; + unsigned int shutdown_timeout; unsigned int kato; bool subsystem; unsigned long quirks; -- cgit v1.2.3-59-g8ed1b From a751da33945a2c94a0449d1005c3c973e5acfefb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Aug 2017 10:17:03 +0200 Subject: nvme: report more detailed status codes to the block layer Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 745058905da6..4c49ec4349bc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -113,7 +113,16 @@ static blk_status_t nvme_error_status(struct request *req) case NVME_SC_WRITE_FAULT: case NVME_SC_READ_ERROR: case NVME_SC_UNWRITTEN_BLOCK: + case NVME_SC_ACCESS_DENIED: + case NVME_SC_READ_ONLY: return BLK_STS_MEDIUM; + case NVME_SC_GUARD_CHECK: + case NVME_SC_APPTAG_CHECK: + case NVME_SC_REFTAG_CHECK: + case NVME_SC_INVALID_PI: + return BLK_STS_PROTECTION; + case NVME_SC_RESERVATION_CONFLICT: + return BLK_STS_NEXUS; default: return BLK_STS_IOERR; } -- cgit v1.2.3-59-g8ed1b From 0a72bbba493bebd53fa78e6741b86a3003f070d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Aug 2017 11:42:24 +0200 Subject: nvme: allow calling nvme_change_ctrl_state from irq context Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4c49ec4349bc..d63e1fcf4437 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -176,9 +176,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state) { enum nvme_ctrl_state old_state; + unsigned long flags; bool changed = false; - spin_lock_irq(&ctrl->lock); + spin_lock_irqsave(&ctrl->lock, flags); old_state = ctrl->state; switch (new_state) { @@ -239,7 +240,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, if (changed) ctrl->state = new_state; - spin_unlock_irq(&ctrl->lock); + spin_unlock_irqrestore(&ctrl->lock, flags); return changed; } -- cgit v1.2.3-59-g8ed1b From 57eeaf8ec67d5c5dee0ee9bb5b19866b8eef780d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Aug 2017 15:47:37 +0200 Subject: nvme: remove unused struct nvme_ns fields And move the flags for the flags field near that field while touching this area. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9b959ee18cb6..936c4056d98e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -211,13 +211,9 @@ struct nvme_ns { bool ext; u8 pi_type; unsigned long flags; - u16 noiob; - #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 - - u64 mode_select_num_blocks; - u32 mode_select_block_len; + u16 noiob; }; struct nvme_ctrl_ops { -- cgit v1.2.3-59-g8ed1b From cdbff4f26bd9fab11bfac22097f836892a5c3612 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Aug 2017 16:14:47 +0200 Subject: nvme: remove nvme_revalidate_ns The function is used in two places, and the shared code for those will diverge later in this series. Instead factor out a new helper to get the ids for a namespace, simplify the calling conventions for nvme_identify_ns and just open code the sequence. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 100 +++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 47 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d63e1fcf4437..b87cf3a6e9ac 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -783,7 +783,8 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } -static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) +static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, + u8 *eui64, u8 *nguid, uuid_t *uuid) { struct nvme_command c = { }; int status; @@ -799,7 +800,7 @@ static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) if (!data) return -ENOMEM; - status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, data, + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, NVME_IDENTIFY_DATA_SIZE); if (status) goto free_data; @@ -813,33 +814,33 @@ static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) switch (cur->nidt) { case NVME_NIDT_EUI64: if (cur->nidl != NVME_NIDT_EUI64_LEN) { - dev_warn(ns->ctrl->device, + dev_warn(ctrl->device, "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", cur->nidl); goto free_data; } len = NVME_NIDT_EUI64_LEN; - memcpy(ns->eui, data + pos + sizeof(*cur), len); + memcpy(eui64, data + pos + sizeof(*cur), len); break; case NVME_NIDT_NGUID: if (cur->nidl != NVME_NIDT_NGUID_LEN) { - dev_warn(ns->ctrl->device, + dev_warn(ctrl->device, "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", cur->nidl); goto free_data; } len = NVME_NIDT_NGUID_LEN; - memcpy(ns->nguid, data + pos + sizeof(*cur), len); + memcpy(nguid, data + pos + sizeof(*cur), len); break; case NVME_NIDT_UUID: if (cur->nidl != NVME_NIDT_UUID_LEN) { - dev_warn(ns->ctrl->device, + dev_warn(ctrl->device, "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", cur->nidl); goto free_data; } len = NVME_NIDT_UUID_LEN; - uuid_copy(&ns->uuid, data + pos + sizeof(*cur)); + uuid_copy(uuid, data + pos + sizeof(*cur)); break; default: /* Skip unnkown types */ @@ -864,9 +865,10 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); } -static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, - struct nvme_id_ns **id) +static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid) { + struct nvme_id_ns *id; struct nvme_command c = { }; int error; @@ -875,15 +877,18 @@ static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS; - *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); - if (!*id) - return -ENOMEM; + id = kmalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NULL; - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ns)); - if (error) - kfree(*id); - return error; + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (error) { + dev_warn(ctrl->device, "Identify namespace failed\n"); + kfree(id); + return NULL; + } + + return id; } static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, @@ -1174,32 +1179,21 @@ static void nvme_config_discard(struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX); } -static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) +static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, + struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid) { - if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { - dev_warn(ns->ctrl->device, "Identify namespace failed\n"); - return -ENODEV; - } - - if ((*id)->ncap == 0) { - kfree(*id); - return -ENODEV; - } - - if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) - memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); - if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(ns->nguid, (*id)->nguid, sizeof(ns->nguid)); - if (ns->ctrl->vs >= NVME_VS(1, 3, 0)) { + if (ctrl->vs >= NVME_VS(1, 1, 0)) + memcpy(eui64, id->eui64, sizeof(id->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0)) + memcpy(nguid, id->nguid, sizeof(id->nguid)); + if (ctrl->vs >= NVME_VS(1, 3, 0)) { /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ns, ns->ns_id)) - dev_warn(ns->ctrl->device, + if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid)) + dev_warn(ctrl->device, "%s: Identify Descriptors failed\n", __func__); } - - return 0; } static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) @@ -1240,22 +1234,28 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; - struct nvme_id_ns *id = NULL; - int ret; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ns *id; + int ret = 0; if (test_bit(NVME_NS_DEAD, &ns->flags)) { set_capacity(disk, 0); return -ENODEV; } - ret = nvme_revalidate_ns(ns, &id); - if (ret) - return ret; + id = nvme_identify_ns(ctrl, ns->ns_id); + if (!id) + return -ENODEV; - __nvme_revalidate_disk(disk, id); - kfree(id); + if (id->ncap == 0) { + ret = -ENODEV; + goto out; + } - return 0; + nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); +out: + kfree(id); + return ret; } static char nvme_pr_type(enum pr_type type) @@ -2361,9 +2361,15 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); - if (nvme_revalidate_ns(ns, &id)) + id = nvme_identify_ns(ctrl, nsid); + if (!id) goto out_free_queue; + if (id->ncap == 0) + goto out_free_id; + + nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); + if (nvme_nvm_ns_supported(ns, id) && nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__); -- cgit v1.2.3-59-g8ed1b From 1d5df6af8c7469f9ae3e66e7bed0782cfe4f95db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Aug 2017 14:10:00 +0200 Subject: nvme: don't blindly overwrite identifiers on disk revalidate Instead validate that these identifiers do not change, as that is prohibited by the specification. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b87cf3a6e9ac..b0dd58db110e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1236,6 +1236,8 @@ static int nvme_revalidate_disk(struct gendisk *disk) struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id; + u8 eui64[8] = { 0 }, nguid[16] = { 0 }; + uuid_t uuid = uuid_null; int ret = 0; if (test_bit(NVME_NS_DEAD, &ns->flags)) { @@ -1252,7 +1254,15 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); + nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid); + if (!uuid_equal(&ns->uuid, &uuid) || + memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) || + memcmp(&ns->eui, &eui64, sizeof(ns->eui))) { + dev_err(ctrl->device, + "identifiers changed for nsid %d\n", ns->ns_id); + ret = -ENODEV; + } + out: kfree(id); return ret; -- cgit v1.2.3-59-g8ed1b From 489beb91e66a237254e23ac1a0fe1beac23d87c5 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 29 Aug 2017 10:33:44 -0700 Subject: nvme-fabrics: Convert nvmf_transports_mutex to an rwsem The mutex protects against the list of transports changing while a controller is being created, but using a plain old mutex means that it also serializes controller creation. This unnecessarily slows down creating multiple controllers - for example for the RDMA transport, creating a controller involves establishing one connection for every IO queue, which involves even more network/software round trips, so the delay can become significant. The simplest way to fix this is to change the mutex to an rwsem and only hold it for writing when the list is being mutated. Since we can take the rwsem for reading while creating a controller, we can create multiple controllers in parallel. Signed-off-by: Roland Dreier Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index fc3b6552f467..53a9598e3aee 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -22,7 +22,7 @@ #include "fabrics.h" static LIST_HEAD(nvmf_transports); -static DEFINE_MUTEX(nvmf_transports_mutex); +static DECLARE_RWSEM(nvmf_transports_rwsem); static LIST_HEAD(nvmf_hosts); static DEFINE_MUTEX(nvmf_hosts_mutex); @@ -495,9 +495,9 @@ int nvmf_register_transport(struct nvmf_transport_ops *ops) if (!ops->create_ctrl) return -EINVAL; - mutex_lock(&nvmf_transports_mutex); + down_write(&nvmf_transports_rwsem); list_add_tail(&ops->entry, &nvmf_transports); - mutex_unlock(&nvmf_transports_mutex); + up_write(&nvmf_transports_rwsem); return 0; } @@ -514,9 +514,9 @@ EXPORT_SYMBOL_GPL(nvmf_register_transport); */ void nvmf_unregister_transport(struct nvmf_transport_ops *ops) { - mutex_lock(&nvmf_transports_mutex); + down_write(&nvmf_transports_rwsem); list_del(&ops->entry); - mutex_unlock(&nvmf_transports_mutex); + up_write(&nvmf_transports_rwsem); } EXPORT_SYMBOL_GPL(nvmf_unregister_transport); @@ -525,7 +525,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( { struct nvmf_transport_ops *ops; - lockdep_assert_held(&nvmf_transports_mutex); + lockdep_assert_held(&nvmf_transports_rwsem); list_for_each_entry(ops, &nvmf_transports, entry) { if (strcmp(ops->name, opts->transport) == 0) @@ -851,7 +851,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_free_opts; opts->mask &= ~NVMF_REQUIRED_OPTS; - mutex_lock(&nvmf_transports_mutex); + down_read(&nvmf_transports_rwsem); ops = nvmf_lookup_transport(opts); if (!ops) { pr_info("no handler found for transport %s.\n", @@ -878,16 +878,16 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) dev_warn(ctrl->device, "controller returned incorrect NQN: \"%s\".\n", ctrl->subnqn); - mutex_unlock(&nvmf_transports_mutex); + up_read(&nvmf_transports_rwsem); ctrl->ops->delete_ctrl(ctrl); return ERR_PTR(-EINVAL); } - mutex_unlock(&nvmf_transports_mutex); + up_read(&nvmf_transports_rwsem); return ctrl; out_unlock: - mutex_unlock(&nvmf_transports_mutex); + up_read(&nvmf_transports_rwsem); out_free_opts: nvmf_free_options(opts); return ERR_PTR(ret); -- cgit v1.2.3-59-g8ed1b From 1cad65620fecfb24cdeefa5533628a4f293783e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 17:46:01 -0400 Subject: nvme: factor metadata handling out of __nvme_submit_user_cmd Keep the metadata code in a separate helper instead of making the main function more complicated. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 76 +++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b0dd58db110e..a520c841c63c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -600,6 +600,40 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); +static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, + unsigned len, u32 seed, bool write) +{ + struct bio_integrity_payload *bip; + int ret = -ENOMEM; + void *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + goto out; + + ret = -EFAULT; + if (write && copy_from_user(buf, ubuf, len)) + goto out_free_meta; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) + return buf; + ret = -ENOMEM; +out_free_meta: + kfree(buf); +out: + return ERR_PTR(ret); +} + int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, @@ -625,46 +659,17 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, if (ret) goto out; bio = req->bio; - - if (!disk) - goto submit; bio->bi_disk = disk; - - if (meta_buffer && meta_len) { - struct bio_integrity_payload *bip; - - meta = kmalloc(meta_len, GFP_KERNEL); - if (!meta) { - ret = -ENOMEM; + if (disk && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); goto out_unmap; } - - if (write) { - if (copy_from_user(meta, meta_buffer, - meta_len)) { - ret = -EFAULT; - goto out_free_meta; - } - } - - bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (IS_ERR(bip)) { - ret = PTR_ERR(bip); - goto out_free_meta; - } - - bip->bip_iter.bi_size = meta_len; - bip->bip_iter.bi_sector = meta_seed; - - ret = bio_integrity_add_page(bio, virt_to_page(meta), - meta_len, offset_in_page(meta)); - if (ret != meta_len) { - ret = -ENOMEM; - goto out_free_meta; - } } } - submit: + blk_execute_rq(req->q, disk, req, 0); if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ret = -EINTR; @@ -676,7 +681,6 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, if (copy_to_user(meta_buffer, meta, meta_len)) ret = -EFAULT; } - out_free_meta: kfree(meta); out_unmap: if (bio) -- cgit v1.2.3-59-g8ed1b From b5d8af5b521bdb4167808979df37e9defabeb707 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Aug 2017 17:46:02 -0400 Subject: nvme/pci: Use req_op to determine DIF remapping Only read and write commands need DIF remapping. Everything else uses a passthrough integrity payload. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 544805a2421b..11874afb2422 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -668,7 +668,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) goto out_unmap; - if (rq_data_dir(req)) + if (req_op(req) == REQ_OP_WRITE) nvme_dif_remap(req, nvme_dif_prep); if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) @@ -696,7 +696,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) if (iod->nents) { dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); if (blk_integrity_rq(req)) { - if (!rq_data_dir(req)) + if (req_op(req) == REQ_OP_READ) nvme_dif_remap(req, nvme_dif_complete); dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); } -- cgit v1.2.3-59-g8ed1b From 485783ca63e3a47fa5a30df8c6745a6299607e4d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Aug 2017 17:46:03 -0400 Subject: nvme: Make nvme user functions static These functions are used only locally in the nvme core. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 +++++----- drivers/nvme/host/nvme.h | 7 ------- 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a520c841c63c..01f39a977f95 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -634,10 +634,10 @@ out: return ERR_PTR(ret); } -int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, - void __user *meta_buffer, unsigned meta_len, u32 meta_seed, - u32 *result, unsigned timeout) +static int __nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u32 *result, unsigned timeout) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -690,7 +690,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, return ret; } -int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, +static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 936c4056d98e..a19a587d60ed 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -314,13 +314,6 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, int flags); -int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, u32 *result, - unsigned timeout); -int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, - void __user *meta_buffer, unsigned meta_len, u32 meta_seed, - u32 *result, unsigned timeout); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); -- cgit v1.2.3-59-g8ed1b From 63263d60e0f9f37bfd5e6a1e83a62f0e62fc459f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Aug 2017 17:46:04 -0400 Subject: nvme: Use metadata for passthrough commands The ioctls' struct allows the user to provide a metadata address and length for a passthrough command. This patch uses these values that were previously ignored and deletes the now unused wrapper function. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 01f39a977f95..277a7a02cba5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -634,7 +634,7 @@ out: return ERR_PTR(ret); } -static int __nvme_submit_user_cmd(struct request_queue *q, +static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout) @@ -690,14 +690,6 @@ static int __nvme_submit_user_cmd(struct request_queue *q, return ret; } -static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, u32 *result, - unsigned timeout) -{ - return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, - result, timeout); -} - static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; @@ -987,7 +979,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - return __nvme_submit_user_cmd(ns->queue, &c, + return nvme_submit_user_cmd(ns->queue, &c, (void __user *)(uintptr_t)io.addr, length, metadata, meta_len, io.slba, NULL, 0); } @@ -1025,7 +1017,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - &cmd.result, timeout); + (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, + 0, &cmd.result, timeout); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) return -EFAULT; -- cgit v1.2.3-59-g8ed1b From 28dd5cf70aaac2a12a16847ae0a978f0b0575194 Mon Sep 17 00:00:00 2001 From: Omri Mann Date: Wed, 30 Aug 2017 15:22:59 +0300 Subject: nvmet: add support for reporting the host identifier And fix the Get/Set Log Page implementation to take all 8 bits of the feature identifier into account. Signed-off-by: Omri Mann Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig [hch: used the UUID API, updated changelog] --- drivers/nvme/target/admin-cmd.c | 17 +++++++++++++++-- drivers/nvme/target/fabrics-cmd.c | 1 + drivers/nvme/target/nvmet.h | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9496c71d2257..c4a0bf36e752 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -443,7 +443,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req) u32 val32; u16 status = 0; - switch (cdw10 & 0xf) { + switch (cdw10 & 0xff) { case NVME_FEAT_NUM_QUEUES: nvmet_set_result(req, (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); @@ -453,6 +453,9 @@ static void nvmet_execute_set_features(struct nvmet_req *req) req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); nvmet_set_result(req, req->sq->ctrl->kato); break; + case NVME_FEAT_HOST_ID: + status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + break; default: status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; @@ -467,7 +470,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); u16 status = 0; - switch (cdw10 & 0xf) { + switch (cdw10 & 0xff) { /* * These features are mandatory in the spec, but we don't * have a useful way to implement them. We'll eventually @@ -501,6 +504,16 @@ static void nvmet_execute_get_features(struct nvmet_req *req) case NVME_FEAT_KATO: nvmet_set_result(req, req->sq->ctrl->kato * 1000); break; + case NVME_FEAT_HOST_ID: + /* need 128-bit host identifier flag */ + if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, + sizeof(req->sq->ctrl->hostid)); + break; default: status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 3cc17269504b..859a66725291 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -154,6 +154,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) le32_to_cpu(c->kato), &ctrl); if (status) goto out; + uuid_copy(&ctrl->hostid, &d->hostid); status = nvmet_install_queue(ctrl, req); if (status) { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e3b244c7e443..7d261ab894f4 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -115,6 +115,7 @@ struct nvmet_ctrl { u32 cc; u32 csts; + uuid_t hostid; u16 cntlid; u32 kato; -- cgit v1.2.3-59-g8ed1b From 8a0740c4109d646d8697d359962edea47301c652 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:41 -0700 Subject: loop: get rid of lo_blocksize This is only used for setting the soft block size on the struct block_device once and then never used again. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 ++-------- drivers/block/loop.h | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 407cb172d6e3..efad2d46a018 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -573,8 +573,6 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p) mapping = file->f_mapping; mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); loop_update_dio(lo); @@ -867,7 +865,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct file *file, *f; struct inode *inode; struct address_space *mapping; - unsigned lo_blocksize; int lo_flags = 0; int error; loff_t size; @@ -911,9 +908,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, !file->f_op->write_iter) lo_flags |= LO_FLAGS_READ_ONLY; - lo_blocksize = S_ISBLK(inode->i_mode) ? - inode->i_bdev->bd_block_size : PAGE_SIZE; - error = -EFBIG; size = get_loop_size(lo, file); if ((loff_t)(sector_t)size != size) @@ -927,7 +921,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->use_dio = false; - lo->lo_blocksize = lo_blocksize; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; @@ -947,7 +940,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, /* let user-space know about the new size */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - set_blocksize(bdev, lo_blocksize); + set_blocksize(bdev, S_ISBLK(inode->i_mode) ? + block_size(inode->i_bdev) : PAGE_SIZE); lo->lo_state = Lo_bound; if (part_shift) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index fecd3f97ef8c..efe57189d01e 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -48,7 +48,6 @@ struct loop_device { struct file * lo_backing_file; struct block_device *lo_device; - unsigned lo_blocksize; void *key_data; gfp_t old_gfp_mask; -- cgit v1.2.3-59-g8ed1b From 6c6b6f28b3335fd85ec833ee0005d9c9dca6c003 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:42 -0700 Subject: loop: set physical block size to PAGE_SIZE The physical block size is "the lowest possible sector size that the hardware can operate on without reverting to read-modify-write operations" (from the comment on blk_queue_physical_block_size()). Since loop does buffered I/O on the backing file by default, the RMW unit is a page. This isn't the case for direct I/O mode, but let's keep it simple. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index efad2d46a018..e3f190016d4f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1764,6 +1764,8 @@ static int loop_add(struct loop_device **l, int i) } lo->lo_queue->queuedata = lo; + blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); + /* * It doesn't make sense to enable merge because the I/O * submitted to backing file is handled page by page. -- cgit v1.2.3-59-g8ed1b From 89e4fdecb51cf5535867026274bc97de9480ade5 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:43 -0700 Subject: loop: add ioctl for changing logical block size This is a different approach from the first attempt in f2c6df7dbf9a ("loop: support 4k physical blocksize"). Rather than extending LOOP_{GET,SET}_STATUS, add a separate ioctl just for setting the block size. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 24 ++++++++++++++++++++++++ include/uapi/linux/loop.h | 1 + 2 files changed, 25 insertions(+) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e3f190016d4f..ac106b287d75 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1047,6 +1047,7 @@ static int loop_clr_fd(struct loop_device *lo) memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); + blk_queue_logical_block_size(lo->lo_queue, 512); if (bdev) { bdput(bdev); invalidate_bdev(bdev); @@ -1330,6 +1331,24 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) return error; } +static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +{ + if (lo->lo_state != Lo_bound) + return -ENXIO; + + if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg)) + return -EINVAL; + + blk_mq_freeze_queue(lo->lo_queue); + + blk_queue_logical_block_size(lo->lo_queue, arg); + loop_update_dio(lo); + + blk_mq_unfreeze_queue(lo->lo_queue); + + return 0; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1378,6 +1397,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_dio(lo, arg); break; + case LOOP_SET_BLOCK_SIZE: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_block_size(lo, arg); + break; default: err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; } diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index c8125ec1f4f2..23158dbe2424 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -88,6 +88,7 @@ struct loop_info64 { #define LOOP_CHANGE_FD 0x4C06 #define LOOP_SET_CAPACITY 0x4C07 #define LOOP_SET_DIRECT_IO 0x4C08 +#define LOOP_SET_BLOCK_SIZE 0x4C09 /* /dev/loop-control interface */ #define LOOP_CTL_ADD 0x4C80 -- cgit v1.2.3-59-g8ed1b From 43cade803ebeb002403d4b704e041ce800e5b0e1 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:44 -0700 Subject: loop: fold loop_switch() into callers The comments here are really outdated, and blk-mq made flushing much simpler, so just fold the two cases into the callers. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 76 ++++++++-------------------------------------------- 1 file changed, 11 insertions(+), 65 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ac106b287d75..f6c204f62b1e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -546,72 +546,12 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) } } -struct switch_request { - struct file *file; - struct completion wait; -}; - static inline void loop_update_dio(struct loop_device *lo) { __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | lo->use_dio); } -/* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) -{ - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - return; - - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - loop_update_dio(lo); -} - -/* - * loop_switch performs the hard work of switching a backing store. - * First it needs to flush existing IO, it does this by sending a magic - * BIO down the pipe. The completion of this BIO does the actual switch. - */ -static int loop_switch(struct loop_device *lo, struct file *file) -{ - struct switch_request w; - - w.file = file; - - /* freeze queue and wait for completion of scheduled requests */ - blk_mq_freeze_queue(lo->lo_queue); - - /* do the switch action */ - do_loop_switch(lo, &w); - - /* unfreeze */ - blk_mq_unfreeze_queue(lo->lo_queue); - - return 0; -} - -/* - * Helper to flush the IOs in loop, but keeping loop thread running - */ -static int loop_flush(struct loop_device *lo) -{ - /* loop not yet configured, no running thread, nothing to flush */ - if (lo->lo_state != Lo_bound) - return 0; - return loop_switch(lo, NULL); -} - static void loop_reread_partitions(struct loop_device *lo, struct block_device *bdev) { @@ -676,9 +616,14 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_putf; /* and ... switch */ - error = loop_switch(lo, file); - if (error) - goto out_putf; + blk_mq_freeze_queue(lo->lo_queue); + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); + mapping_set_gfp_mask(file->f_mapping, + lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_update_dio(lo); + blk_mq_unfreeze_queue(lo->lo_queue); fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) @@ -1601,12 +1546,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode) err = loop_clr_fd(lo); if (!err) return; - } else { + } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, * but flush possible ongoing bios in thread. */ - loop_flush(lo); + blk_mq_freeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue); } mutex_unlock(&lo->lo_ctl_mutex); -- cgit v1.2.3-59-g8ed1b From 40a5fce495715c48c2e02668144e68a507ac5a30 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Wed, 30 Aug 2017 15:18:19 -0700 Subject: nvme-fabrics: generate spec-compliant UUID NQNs The default host NQN, which is generated based on the host's UUID, does not follow the UUID-based NQN format laid out in the NVMe 1.3 specification. Remove the "NVMf:" portion of the NQN to match the spec. Signed-off-by: Daniel Verkamp Reviewed-by: Max Gurtovoy Cc: stable@vger.kernel.org Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 53a9598e3aee..47307752dc65 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -75,7 +75,7 @@ static struct nvmf_host *nvmf_host_default(void) kref_init(&host->ref); snprintf(host->nqn, NVMF_NQN_SIZE, - "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id); + "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id); mutex_lock(&nvmf_hosts_mutex); list_add_tail(&host->list, &nvmf_hosts); -- cgit v1.2.3-59-g8ed1b From 54bb0ade6627a183c211345761ec46e4bf0048fe Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 31 Aug 2017 22:09:45 -0700 Subject: block/loop: set hw_sectors Loop can handle any size of request. Limiting it to 255 sectors just burns the CPU for bio split and request merge for underlayer disk and also cause bad fs block allocation in directio mode. Reviewed-by: Omar Sandoval Reviewed-by: Ming Lei Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f6c204f62b1e..9eff4d3ab1f3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1736,6 +1736,7 @@ static int loop_add(struct loop_device **l, int i) blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); + blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); /* * It doesn't make sense to enable merge because the I/O * submitted to backing file is handled page by page. -- cgit v1.2.3-59-g8ed1b From 40326d8a33d5b70039849d233975b63c733d94a2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 31 Aug 2017 22:09:46 -0700 Subject: block/loop: allow request merge for directio mode Currently loop disables merge. While it makes sense for buffer IO mode, directio mode can benefit from request merge. Without merge, loop could send small size IO to underlayer disk and harm performance. Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 66 ++++++++++++++++++++++++++++++++++++++++------------ drivers/block/loop.h | 1 + 2 files changed, 52 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9eff4d3ab1f3..3a35121f8a6f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -213,10 +213,13 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) */ blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; - if (use_dio) + if (use_dio) { + queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags |= LO_FLAGS_DIRECT_IO; - else + } else { + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + } blk_mq_unfreeze_queue(lo->lo_queue); } @@ -464,6 +467,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + kfree(cmd->bvec); + cmd->bvec = NULL; cmd->ret = ret; blk_mq_complete_request(cmd->rq); } @@ -473,22 +478,50 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, { struct iov_iter iter; struct bio_vec *bvec; - struct bio *bio = cmd->rq->bio; + struct request *rq = cmd->rq; + struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; + unsigned int offset; + int segments = 0; int ret; - /* nomerge for loop request queue */ - WARN_ON(cmd->rq->bio != cmd->rq->biotail); + if (rq->bio != rq->biotail) { + struct req_iterator iter; + struct bio_vec tmp; + + __rq_for_each_bio(bio, rq) + segments += bio_segments(bio); + bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO); + if (!bvec) + return -EIO; + cmd->bvec = bvec; + + /* + * The bios of the request may be started from the middle of + * the 'bvec' because of bio splitting, so we can't directly + * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment + * API will take care of all details for us. + */ + rq_for_each_segment(tmp, rq, iter) { + *bvec = tmp; + bvec++; + } + bvec = cmd->bvec; + offset = 0; + } else { + /* + * Same here, this bio may be started from the middle of the + * 'bvec' because of bio splitting, so offset from the bvec + * must be passed to iov iterator + */ + offset = bio->bi_iter.bi_bvec_done; + bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + segments = bio_segments(bio); + } - bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, - bio_segments(bio), blk_rq_bytes(cmd->rq)); - /* - * This bio may be started from the middle of the 'bvec' - * because of bio splitting, so offset from the bvec must - * be passed to iov iterator - */ - iter.iov_offset = bio->bi_iter.bi_bvec_done; + segments, blk_rq_bytes(rq)); + iter.iov_offset = offset; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; @@ -1737,9 +1770,12 @@ static int loop_add(struct loop_device **l, int i) blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); + /* - * It doesn't make sense to enable merge because the I/O - * submitted to backing file is handled page by page. + * By default, we do buffer IO, so it doesn't make sense to enable + * merge because the I/O submitted to backing file is handled page by + * page. For directio mode, merge does help to dispatch bigger request + * to underlayer disk. We will enable merge once directio is enabled. */ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); diff --git a/drivers/block/loop.h b/drivers/block/loop.h index efe57189d01e..43d20d37b79a 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -71,6 +71,7 @@ struct loop_cmd { bool use_aio; /* use AIO interface to handle I/O */ long ret; struct kiocb iocb; + struct bio_vec *bvec; }; /* Support for loadable transfer modules */ -- cgit v1.2.3-59-g8ed1b From 92d773324b7edbd36bf0c28c1e0157763aeccc92 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 1 Sep 2017 11:15:17 -0700 Subject: block/loop: fix use after free lo_rw_aio->call_read_iter-> 1 aops->direct_IO 2 iov_iter_revert lo_rw_aio_complete could happen between 1 and 2, the bio and bvec could be freed before 2, which accesses bvec. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 16 +++++++++++++--- drivers/block/loop.h | 5 ++++- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3a35121f8a6f..78c47c4b584d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -463,14 +463,21 @@ static void lo_complete_rq(struct request *rq) blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK); } +static void lo_rw_aio_do_completion(struct loop_cmd *cmd) +{ + if (!atomic_dec_and_test(&cmd->ref)) + return; + kfree(cmd->bvec); + cmd->bvec = NULL; + blk_mq_complete_request(cmd->rq); +} + static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); - kfree(cmd->bvec); - cmd->bvec = NULL; cmd->ret = ret; - blk_mq_complete_request(cmd->rq); + lo_rw_aio_do_completion(cmd); } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, @@ -518,6 +525,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); segments = bio_segments(bio); } + atomic_set(&cmd->ref, 2); iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, segments, blk_rq_bytes(rq)); @@ -533,6 +541,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, else ret = call_read_iter(file, &cmd->iocb, &iter); + lo_rw_aio_do_completion(cmd); + if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); return 0; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 43d20d37b79a..b0ba4a5951c4 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -68,7 +68,10 @@ struct loop_cmd { struct kthread_work work; struct request *rq; struct list_head list; - bool use_aio; /* use AIO interface to handle I/O */ + union { + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + }; long ret; struct kiocb iocb; struct bio_vec *bvec; -- cgit v1.2.3-59-g8ed1b From bc75705d00637c5f7b0346bf63094a9899c3d516 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 1 Sep 2017 11:15:18 -0700 Subject: block/loop: remove unused field nobody uses the list. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/loop.h b/drivers/block/loop.h index b0ba4a5951c4..f68c1d50802f 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -67,7 +67,6 @@ struct loop_device { struct loop_cmd { struct kthread_work work; struct request *rq; - struct list_head list; union { bool use_aio; /* use AIO interface to handle I/O */ atomic_t ref; /* only for aio */ -- cgit v1.2.3-59-g8ed1b From 4b758df21ee7081ab41448d21d60367efaa625b3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 14:25:51 +0800 Subject: bcache: Fix leak of bdev reference If blkdev_get_by_path() in register_bcache() fails, we try to lookup the block device using lookup_bdev() to detect which situation we are in to properly report error. However we never drop the reference returned to us from lookup_bdev(). Fix that. Signed-off-by: Jan Kara Acked-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 974d832e54a6..c3fedd265d18 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (!IS_ERR(bdev)) + bdput(bdev); if (attr == &ksysfs_register_quiet) goto out; } -- cgit v1.2.3-59-g8ed1b From c81ffa32a214c84b08900fbc9d432187bd948eba Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:52 +0800 Subject: bcache: fix sequential large write IO bypass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sequential write IOs were tested with bs=1M by FIO in writeback cache mode, these IOs were expected to be bypassed, but actually they did not. We debug the code, and find in check_should_bypass(): if (!congested && mode == CACHE_MODE_WRITEBACK && op_is_write(bio_op(bio)) && (bio->bi_opf & REQ_SYNC)) goto rescale that means, If in writeback mode, a write IO with REQ_SYNC flag will not be bypassed though it is a sequential large IO, It's not a correct thing to do actually, so this patch remove these codes. Signed-off-by: tang.junhui Reviewed-by: Kent Overstreet Reviewed-by: Eric Wheeler Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 0e1463d0c334..de97d86ddff4 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -400,12 +400,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) if (!congested && !dc->sequential_cutoff) goto rescale; - if (!congested && - mode == CACHE_MODE_WRITEBACK && - op_is_write(bio->bi_opf) && - op_is_sync(bio->bi_opf)) - goto rescale; - spin_lock(&dc->io_lock); hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) -- cgit v1.2.3-59-g8ed1b From 69daf03adef5f7bc13e0ac86b4b8007df1767aab Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:53 +0800 Subject: bcache: do not subtract sectors_to_gc for bypassed IO Since bypassed IOs use no bucket, so do not subtract sectors_to_gc to trigger gc thread. Signed-off-by: tang.junhui Acked-by: Coly Li Reviewed-by: Eric Wheeler Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index de97d86ddff4..681b4f12b05a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) - wake_up_gc(op->c); - if (op->bypass) return bch_data_invalidate(cl); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) + wake_up_gc(op->c); + /* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. -- cgit v1.2.3-59-g8ed1b From 09b3efec81def807fb225359e34a8e72866dd9c4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 6 Sep 2017 14:25:54 +0800 Subject: bcache: Don't reinvent the wheel but use existing llist API Although llist provides proper APIs, they are not used. Make them used. Signed-off-by: Byungchul Park Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 864e673aec39..7d5286b05036 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -70,21 +70,10 @@ void __closure_wake_up(struct closure_waitlist *wait_list) list = llist_del_all(&wait_list->list); /* We first reverse the list to preserve FIFO ordering and fairness */ - - while (list) { - struct llist_node *t = list; - list = llist_next(list); - - t->next = reverse; - reverse = t; - } + reverse = llist_reverse_order(list); /* Then do the wakeups */ - - while (reverse) { - cl = container_of(reverse, struct closure, list); - reverse = llist_next(reverse); - + llist_for_each_entry(cl, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } -- cgit v1.2.3-59-g8ed1b From 0b43f49dc4d6d3789e936731dc16af94cb57d568 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:55 +0800 Subject: bcache: gc does not work when triggering by manual command I try to execute the following command to trigger gc thread: [root@localhost internal]# echo 1 > trigger_gc But it does not work, I debug the code in gc_should_run(), It works only if in invalidating or sectors_to_gc < 0. So set sectors_to_gc to -1 to meet the condition when we trigger gc by manual command. (Code comments aded by Coly Li) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index f90f13616980..09295c07ea88 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -615,8 +615,21 @@ STORE(__bch_cache_set) bch_cache_accounting_clear(&c->accounting); } - if (attr == &sysfs_trigger_gc) + if (attr == &sysfs_trigger_gc) { + /* + * Garbage collection thread only works when sectors_to_gc < 0, + * when users write to sysfs entry trigger_gc, most of time + * they want to forcibly triger gargage collection. Here -1 is + * set to c->sectors_to_gc, to make gc_should_run() give a + * chance to permit gc thread to run. "give a chance" means + * before going into gc_should_run(), there is still chance + * that c->sectors_to_gc being set to other positive value. So + * writing sysfs entry trigger_gc won't always make sure gc + * thread takes effect. + */ + atomic_set(&c->sectors_to_gc, -1); wake_up_gc(c); + } if (attr == &sysfs_prune_cache) { struct shrink_control sc; -- cgit v1.2.3-59-g8ed1b From a8394090a9129b40f9d90dcb7f4a49d60c727ca6 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:56 +0800 Subject: bcache: correct cache_dirty_target in __update_writeback_rate() __update_write_rate() uses a Proportion-Differentiation Controller algorithm to control writeback rate. A dirty target number is used in this PD controller to control writeback rate. A larger target number will make the writeback rate smaller, on the versus, a smaller target number will make the writeback rate larger. bcache uses the following steps to calculate the target number, 1) cache_sectors = all-buckets-of-cache-set * buckets-size 2) cache_dirty_target = cache_sectors * cached-device-writeback_percent 3) target = cache_dirty_target * (sectors-of-cached-device/sectors-of-all-cached-devices-of-this-cache-set) The calculation at step 1) for cache_sectors is incorrect, which does not consider dirty blocks occupied by flash only volume. A flash only volume can be took as a bcache device without cached device. All data sectors allocated for it are persistent on cache device and marked dirty, they are not touched by bcache writeback and garbage collection code. So data blocks of flash only volume should be ignore when calculating cache_sectors of cache set. Current code does not subtract dirty sectors of flash only volume, which results a larger target number from the above 3 steps. And in sequence the cache device's writeback rate is smaller then a correct value, writeback speed is slower on all cached devices. This patch fixes the incorrect slower writeback rate by subtracting dirty sectors of flash only volumes in __update_writeback_rate(). (Commit log composed by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 ++- drivers/md/bcache/writeback.h | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c49022a8dc9d..5abe6472b66b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -21,7 +21,8 @@ static void __update_writeback_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 629bd1a502fd..8789b9c8c484 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) +{ + uint64_t i, ret = 0; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + + if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) + continue; + ret += bcache_dev_sectors_dirty(d); + } + + mutex_unlock(&bch_register_lock); + + return ret; +} + static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { -- cgit v1.2.3-59-g8ed1b From 77fa100f27475d08a569b9d51c17722130f089e7 Mon Sep 17 00:00:00 2001 From: Tony Asleson Date: Wed, 6 Sep 2017 14:25:57 +0800 Subject: bcache: Correct return value for sysfs attach errors If you encounter any errors in bch_cached_dev_attach it will return a negative error code. The variable 'v' which stores the result is unsigned, thus user space sees a very large value returned for bytes written which can cause incorrect user space behavior. Utilize 1 signed variable to use throughout the function to preserve error return capability. Signed-off-by: Tony Asleson Acked-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 09295c07ea88..104c57cd666c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -192,7 +192,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - unsigned v = size; + ssize_t v = size; struct cache_set *c; struct kobj_uevent_env *env; @@ -227,7 +227,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + v = bch_read_string_list(buf, bch_cache_modes + 1); if (v < 0) return v; -- cgit v1.2.3-59-g8ed1b From 89b1fc54c257104df007c5888e3705e52b973d45 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:58 +0800 Subject: bcache: increase the number of open buckets In currently, we only alloc 6 open buckets for each cache set, but in usually, we always attach about 10 or so backend devices for each cache set, and the each bcache device are always accessed by about 10 or so threads in top application layer. So 6 open buckets are too few, It has led to that each of the same thread write data to different buckets, which would cause low efficiency write-back, and also cause buckets inefficient, and would be Very easy to run out of. I add debug message in bch_open_buckets_alloc() to print alloc bucket info, and test with ten bcache devices with a cache set, and each bcache device is accessed by ten threads. From the debug message, we can see that, after the modification, One bucket is more likely to assign to the same thread, and the data from the same thread are more likely to write the same bucket. Usually the same thread always write/read the same backend device, so it is good for write-back and also promote the usage efficiency of buckets. Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ca4abe1ccd8d..cacbe2dbd5c3 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -68,6 +68,8 @@ #include #include +#define MAX_OPEN_BUCKETS 128 + /* Bucket heap / gen */ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) @@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c) spin_lock_init(&c->data_bucket_lock); - for (i = 0; i < 6; i++) { + for (i = 0; i < MAX_OPEN_BUCKETS; i++) { struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); if (!b) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 9baf30972b5568d8b5bc8b3c46a6ec5b58100463 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:59 +0800 Subject: bcache: fix for gc and write-back race gc and write-back get raced (see the email "bcache get stucked" I sended before): gc thread write-back thread | |bch_writeback_thread() |bch_gc_thread() | | |==>read_dirty() |==>bch_btree_gc() | |==>btree_root() //get btree root | | //node write locker | |==>bch_btree_gc_root() | | |==>read_dirty_submit() | |==>write_dirty() | |==>continue_at(cl, | | write_dirty_finish, | | system_wq); | |==>write_dirty_finish()//excute | | //in system_wq | |==>bch_btree_insert() | |==>bch_btree_map_leaf_nodes() | |==>__bch_btree_map_nodes() | |==>btree_root //try to get btree | | //root node read | | //lock | |-----stuck here |==>bch_btree_set_root() |==>bch_journal_meta() |==>bch_journal() |==>journal_try_write() |==>journal_write_unlocked() //journal_full(&c->journal) | //condition satisfied |==>continue_at(cl, journal_write, system_wq); //try to excute | //journal_write in system_wq | //but work queue is excuting | //write_dirty_finish() |==>closure_sync(); //wait journal_write execute | //over and wake up gc, |-------------stuck here |==>release root node write locker This patch alloc a separate work-queue for write-back thread to avoid such race. (Commit log re-organized by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Acked-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 ++ drivers/md/bcache/writeback.c | 9 +++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dee542fff68e..2ed9bd231d84 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -333,6 +333,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; struct keybuf writeback_keys; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c3fedd265d18..3b724fa2b68d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1059,6 +1059,8 @@ static void cached_dev_free(struct closure *cl) cancel_delayed_work_sync(&dc->writeback_rate_update); if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); mutex_lock(&bch_register_lock); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 5abe6472b66b..34131439e28c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -187,7 +187,7 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty_finish, system_wq); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } static void read_dirty_endio(struct bio *bio) @@ -207,7 +207,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty, system_wq); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); } static void read_dirty(struct cached_dev *dc) @@ -516,6 +516,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM, 0); + if (!dc->writeback_write_wq) + return -ENOMEM; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) -- cgit v1.2.3-59-g8ed1b From da22f0eea555baf9b0a84b52afe56db2052cfe8d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Sep 2017 14:26:00 +0800 Subject: bcache: silence static checker warning In olden times, closure_return() used to have a hidden return built in. We removed the hidden return but forgot to add a new return here. If "c" were NULL we would oops on the next line, but fortunately "c" is never NULL. Let's just remove the if statement. Signed-off-by: Dan Carpenter Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3b724fa2b68d..1ae63374366c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1376,9 +1376,6 @@ static void cache_set_flush(struct closure *cl) struct btree *b; unsigned i; - if (!c) - closure_return(cl); - bch_cache_accounting_destroy(&c->accounting); kobject_put(&c->internal); -- cgit v1.2.3-59-g8ed1b From 7b6a8570e02a20d0b6cadb9689e4b2e6217c390c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Sep 2017 14:26:01 +0800 Subject: bcache: Update continue_at() documentation continue_at() doesn't have a return statement anymore. Signed-off-by: Dan Carpenter Acked-by: Coly Li Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 1ec84ca81146..295b7e43f92c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list) * been dropped with closure_put()), it will resume execution at @fn running out * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). * - * NOTE: This macro expands to a return in the calling function! - * * This is because after calling continue_at() you no longer have a ref on @cl, * and whatever @cl owns may be freed out from under you - a running closure fn * has a ref on its own closure which continue_at() drops. @@ -340,8 +338,6 @@ do { \ * Causes @fn to be executed out of @cl, in @wq context (or called directly if * @wq is NULL). * - * NOTE: like continue_at(), this macro expands to a return in the caller! - * * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, * thus it's not safe to touch anything protected by @cl after a * continue_at_nobarrier(). -- cgit v1.2.3-59-g8ed1b From 9276717b9e297a62d1151a43d1cd286213f68eb7 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Wed, 6 Sep 2017 14:26:02 +0800 Subject: bcache: fix bch_hprint crash and improve output Most importantly, solve a crash where %llu was used to format signed numbers. This would cause a buffer overflow when reading sysfs writeback_rate_debug, as only 20 bytes were allocated for this and %llu writes 20 characters plus a null. Always use the units mechanism rather than having different output paths for simplicity. Also, correct problems with display output where 1.10 was a larger number than 1.09, by multiplying by 10 and then dividing by 1024 instead of dividing by 100. (Remainders of >= 1000 would print as .10). Minor changes: Always display the decimal point instead of trying to omit it based on number of digits shown. Decide what units to use based on 1000 as a threshold, not 1024 (in other words, always print at most 3 digits before the decimal point). Signed-off-by: Michael Lyle Reported-by: Dmitry Yu Okunev Acked-by: Kent Overstreet Reviewed-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 50 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 8c3a938f4bf0..176d3c2ef5f5 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -74,24 +74,44 @@ STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +/** + * bch_hprint() - formats @v to human readable string for sysfs. + * + * @v - signed 64 bit integer + * @buf - the (at least 8 byte) buffer to format the result into. + * + * Returns the number of bytes used by format. + */ ssize_t bch_hprint(char *buf, int64_t v) { static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; - int u, t = 0; - - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0 << 10); - v >>= 10; - } - - if (!u) - return sprintf(buf, "%llu", v); - - if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 100); - - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + int u = 0, t; + + uint64_t q; + + if (v < 0) + q = -v; + else + q = v; + + /* For as long as the number is more than 3 digits, but at least + * once, shift right / divide by 1024. Keep the remainder for + * a digit after the decimal point. + */ + do { + u++; + + t = q & ~(~0 << 10); + q >>= 10; + } while (q >= 1000); + + if (v < 0) + /* '-', up to 3 digits, '.', 1 digit, 1 character, null; + * yields 8 bytes. + */ + return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]); + else + return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); } ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], -- cgit v1.2.3-59-g8ed1b From bf09375337077b692d21d062c30697c86f2872d3 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 5 Sep 2017 14:24:47 -0700 Subject: loop: set physical block size to logical block size Commit 6c6b6f28b333 ("loop: set physical block size to PAGE_SIZE") caused mkfs.xfs to barf on ppc64 [1]. Always using PAGE_SIZE as the physical block size still makes the most sense semantically, but let's just lie and always set it to the same value as the logical block size (same goes for io_min). In the future we might want to at least bump up io_min to PAGE_SIZE but I'm sick of these stupid changes so let's play it safe. 1: https://marc.info/?l=linux-xfs&m=150459024723753&w=2 Tested-by: Chandan Rajendra Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 78c47c4b584d..85de67334695 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1036,6 +1036,8 @@ static int loop_clr_fd(struct loop_device *lo) memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); blk_queue_logical_block_size(lo->lo_queue, 512); + blk_queue_physical_block_size(lo->lo_queue, 512); + blk_queue_io_min(lo->lo_queue, 512); if (bdev) { bdput(bdev); invalidate_bdev(bdev); @@ -1330,6 +1332,8 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) blk_mq_freeze_queue(lo->lo_queue); blk_queue_logical_block_size(lo->lo_queue, arg); + blk_queue_physical_block_size(lo->lo_queue, arg); + blk_queue_io_min(lo->lo_queue, arg); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); @@ -1777,8 +1781,6 @@ static int loop_add(struct loop_device **l, int i) } lo->lo_queue->queuedata = lo; - blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); - blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); /* -- cgit v1.2.3-59-g8ed1b From 175206cf9ab63161dec74d9cd7f9992e062491f5 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 7 Sep 2017 01:28:53 +0800 Subject: bcache: initialize dirty stripes in flash_dev_run() bcache uses a Proportion-Differentiation Controller algorithm to control writeback rate to cached devices. In the PD controller algorithm, dirty stripes of thin flash device should not be counted in, because flash only volumes never write back dirty data. Currently dirty stripe counter for thin flash device is not initialized when the thin flash device starts. Which means the following calculation in PD controller will reference an undefined dirty stripes number, and all cached devices attached to the same cache set where the thin flash device lies on may have an inaccurate writeback rate. This patch calles bch_sectors_dirty_init() in flash_dev_run(), to correctly initialize dirty stripe counter when the thin flash device starts to run. This patch also does following parameter data type change, -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); to call this function conveniently in flash_dev_run(). (Commit log is composed by Coly Li) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- drivers/md/bcache/writeback.c | 8 ++++---- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1ae63374366c..fc0a31b13ac4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1026,7 +1026,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(dc); + bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); @@ -1230,6 +1230,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err; bcache_device_attach(d, c, u - c->uuids); + bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); add_disk(d->disk); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 34131439e28c..e663ca082183 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct cached_dev *dc) +void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; bch_btree_op_init(&op.op, -1); - op.inode = dc->disk.id; + op.inode = d->id; - bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); + d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 8789b9c8c484..e35421d20d2e 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -103,7 +103,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); void bch_cached_dev_writeback_init(struct cached_dev *); int bch_cached_dev_writeback_start(struct cached_dev *); -- cgit v1.2.3-59-g8ed1b