diff options
Diffstat (limited to '')
-rw-r--r-- | drivers/nvme/host/apple.c | 26 | ||||
-rw-r--r-- | drivers/nvme/host/auth.c | 46 | ||||
-rw-r--r-- | drivers/nvme/host/constants.c | 16 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 161 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 19 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 3 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 18 | ||||
-rw-r--r-- | drivers/nvme/host/ioctl.c | 147 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 18 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 189 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 16 |
12 files changed, 391 insertions, 270 deletions
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index e36aeb50b4ed..b317ce6c4ec3 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -829,7 +829,23 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) apple_nvme_remove_cq(anv); } - nvme_disable_ctrl(&anv->ctrl, shutdown); + /* + * Always disable the NVMe controller after shutdown. + * We need to do this to bring it back up later anyway, and we + * can't do it while the firmware is not running (e.g. in the + * resume reset path before RTKit is initialized), so for Apple + * controllers it makes sense to unconditionally do it here. + * Additionally, this sequence of events is reliable, while + * others (like disabling after bringing back the firmware on + * resume) seem to run into trouble under some circumstances. + * + * Both U-Boot and m1n1 also use this convention (i.e. an ANS + * NVMe controller is handed off with firmware shut down, in an + * NVMe disabled state, after a clean shutdown). + */ + if (shutdown) + nvme_disable_ctrl(&anv->ctrl, shutdown); + nvme_disable_ctrl(&anv->ctrl, false); } WRITE_ONCE(anv->ioq.enabled, false); @@ -985,11 +1001,11 @@ static void apple_nvme_reset_work(struct work_struct *work) goto out; } - if (anv->ctrl.ctrl_config & NVME_CC_ENABLE) - apple_nvme_disable(anv, false); - /* RTKit must be shut down cleanly for the (soft)-reset to work */ if (apple_rtkit_is_running(anv->rtk)) { + /* reset the controller if it is enabled */ + if (anv->ctrl.ctrl_config & NVME_CC_ENABLE) + apple_nvme_disable(anv, false); dev_dbg(anv->dev, "Trying to shut down RTKit before reset."); ret = apple_rtkit_shutdown(anv->rtk); if (ret) @@ -1493,7 +1509,7 @@ static int apple_nvme_probe(struct platform_device *pdev) } ret = nvme_init_ctrl(&anv->ctrl, anv->dev, &nvme_ctrl_ops, - NVME_QUIRK_SKIP_CID_GEN); + NVME_QUIRK_SKIP_CID_GEN | NVME_QUIRK_IDENTIFY_CNS); if (ret) { dev_err_probe(dev, ret, "Failed to initialize nvme_ctrl"); goto put_dev; diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index bb0abbe4491c..ea16a0aba679 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -45,6 +45,8 @@ struct nvme_dhchap_queue_context { int sess_key_len; }; +static struct workqueue_struct *nvme_auth_wq; + #define nvme_auth_flags_from_qid(qid) \ (qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED #define nvme_auth_queue_from_qid(ctrl, qid) \ @@ -158,7 +160,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; - return NVME_SC_INVALID_FIELD; + return -EINVAL; } hmac_name = nvme_auth_hmac_name(data->hashid); @@ -167,7 +169,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, "qid %d: invalid HASH ID %d\n", chap->qid, data->hashid); chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } if (chap->hash_id == data->hashid && chap->shash_tfm && @@ -193,7 +195,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, chap->qid, hmac_name, PTR_ERR(chap->shash_tfm)); chap->shash_tfm = NULL; chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return NVME_SC_AUTH_REQUIRED; + return -ENOMEM; } if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) { @@ -203,7 +205,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, crypto_free_shash(chap->shash_tfm); chap->shash_tfm = NULL; chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; - return NVME_SC_AUTH_REQUIRED; + return -EPROTO; } chap->hash_id = data->hashid; @@ -219,7 +221,7 @@ select_kpp: chap->qid, data->dhgid); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; /* Leave previous dh_tfm intact */ - return NVME_SC_AUTH_REQUIRED; + return -EPROTO; } if (chap->dhgroup_id == data->dhgid && @@ -242,7 +244,7 @@ select_kpp: "qid %d: empty DH value\n", chap->qid); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0); @@ -254,7 +256,7 @@ select_kpp: chap->qid, ret, gid_name); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; chap->dh_tfm = NULL; - return NVME_SC_AUTH_REQUIRED; + return ret; } dev_dbg(ctrl->device, "qid %d: selected DH group %s\n", chap->qid, gid_name); @@ -263,7 +265,7 @@ select_kpp: "qid %d: invalid DH value for NULL DH\n", chap->qid); chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } chap->dhgroup_id = data->dhgid; @@ -274,7 +276,7 @@ skip_kpp: chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL); if (!chap->ctrl_key) { chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return NVME_SC_AUTH_REQUIRED; + return -ENOMEM; } chap->ctrl_key_len = dhvlen; memcpy(chap->ctrl_key, data->cval + chap->hash_len, @@ -344,7 +346,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; - return NVME_SC_INVALID_FIELD; + return -EINVAL; } if (data->hl != chap->hash_len) { @@ -352,7 +354,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, "qid %d: invalid hash length %u\n", chap->qid, data->hl); chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } /* Just print out information for the admin queue */ @@ -376,7 +378,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, "qid %d: controller authentication failed\n", chap->qid); chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return NVME_SC_AUTH_REQUIRED; + return -ECONNREFUSED; } /* Just print out information for the admin queue */ @@ -730,7 +732,7 @@ static void nvme_queue_auth_work(struct work_struct *work) NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE); if (ret) { chap->status = ret; - chap->error = NVME_SC_AUTH_REQUIRED; + chap->error = -ECONNREFUSED; return; } @@ -798,7 +800,7 @@ static void nvme_queue_auth_work(struct work_struct *work) NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1); if (ret) { chap->status = ret; - chap->error = NVME_SC_AUTH_REQUIRED; + chap->error = -ECONNREFUSED; return; } @@ -819,7 +821,7 @@ static void nvme_queue_auth_work(struct work_struct *work) ret = nvme_auth_process_dhchap_success1(ctrl, chap); if (ret) { /* Controller authentication failed */ - chap->error = NVME_SC_AUTH_REQUIRED; + chap->error = -ECONNREFUSED; goto fail2; } @@ -866,7 +868,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) chap = &ctrl->dhchap_ctxs[qid]; cancel_work_sync(&chap->auth_work); - queue_work(nvme_wq, &chap->auth_work); + queue_work(nvme_auth_wq, &chap->auth_work); return 0; } EXPORT_SYMBOL_GPL(nvme_auth_negotiate); @@ -953,7 +955,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) goto err_free_dhchap_secret; if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret) - return ret; + return 0; ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl), sizeof(*chap), GFP_KERNEL); @@ -1008,10 +1010,15 @@ EXPORT_SYMBOL_GPL(nvme_auth_free); int __init nvme_init_auth(void) { + nvme_auth_wq = alloc_workqueue("nvme-auth-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_auth_wq) + return -ENOMEM; + nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache", CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); if (!nvme_chap_buf_cache) - return -ENOMEM; + goto err_destroy_workqueue; nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab, mempool_free_slab, nvme_chap_buf_cache); @@ -1021,6 +1028,8 @@ int __init nvme_init_auth(void) return 0; err_destroy_chap_buf_cache: kmem_cache_destroy(nvme_chap_buf_cache); +err_destroy_workqueue: + destroy_workqueue(nvme_auth_wq); return -ENOMEM; } @@ -1028,4 +1037,5 @@ void __exit nvme_exit_auth(void) { mempool_destroy(nvme_chap_buf_pool); kmem_cache_destroy(nvme_chap_buf_cache); + destroy_workqueue(nvme_auth_wq); } diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index e958d5015585..bc523ca02254 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -54,6 +54,14 @@ static const char * const nvme_admin_ops[] = { [nvme_admin_get_lba_status] = "Get LBA Status", }; +static const char * const nvme_fabrics_ops[] = { + [nvme_fabrics_type_property_set] = "Property Set", + [nvme_fabrics_type_property_get] = "Property Get", + [nvme_fabrics_type_connect] = "Connect", + [nvme_fabrics_type_auth_send] = "Authentication Send", + [nvme_fabrics_type_auth_receive] = "Authentication Receive", +}; + static const char * const nvme_statuses[] = { [NVME_SC_SUCCESS] = "Success", [NVME_SC_INVALID_OPCODE] = "Invalid Command Opcode", @@ -185,3 +193,11 @@ const unsigned char *nvme_get_admin_opcode_str(u8 opcode) return nvme_admin_ops[opcode]; return "Unknown"; } +EXPORT_SYMBOL_GPL(nvme_get_admin_opcode_str); + +const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode) { + if (opcode < ARRAY_SIZE(nvme_fabrics_ops) && nvme_fabrics_ops[opcode]) + return nvme_fabrics_ops[opcode]; + return "Unknown"; +} +EXPORT_SYMBOL_GPL(nvme_get_fabrics_opcode_str); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 95c488ea91c3..c2730b116dc6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -38,6 +38,7 @@ struct nvme_ns_info { bool is_shared; bool is_readonly; bool is_ready; + bool is_removed; }; unsigned int admin_timeout = 60; @@ -806,9 +807,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, cmnd->dsm.nr = cpu_to_le32(segments - 1); cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - req->special_vec.bv_page = virt_to_page(range); - req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = alloc_size; + bvec_set_virt(&req->special_vec, range, alloc_size); req->rq_flags |= RQF_SPECIAL_PAYLOAD; return BLK_STS_OK; @@ -1004,7 +1003,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); * >0: nvme controller's cqe status response * <0: kernel error in lieu of controller response */ -static int nvme_execute_rq(struct request *rq, bool at_head) +int nvme_execute_rq(struct request *rq, bool at_head) { blk_status_t status; @@ -1015,6 +1014,7 @@ static int nvme_execute_rq(struct request *rq, bool at_head) return nvme_req(rq)->status; return blk_status_to_errno(status); } +EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU); /* * Returns 0 on success. If the result is negative, it's a Linux error code; @@ -1060,44 +1060,32 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); -static u32 nvme_known_admin_effects(u8 opcode) -{ - switch (opcode) { - case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | - NVME_CMD_EFFECTS_CSE_MASK; - case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; - default: - break; - } - return 0; -} - u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) { u32 effects = 0; if (ns) { - if (ns->head->effects) - effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn_once(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", + "IO command:%02x has unusual effects:%08x\n", opcode, effects); - return 0; - } - if (ctrl->effects) + /* + * NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues, + * which would deadlock when done on an I/O command. Note that + * We already warn about an unusual effect above. + */ + effects &= ~NVME_CMD_EFFECTS_CSE_MASK; + } else { effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); + } return effects; } EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); -static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode) +u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) { u32 effects = nvme_command_effects(ctrl, ns, opcode); @@ -1115,6 +1103,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } return effects; } +EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU); void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, struct nvme_command *cmd, int status) @@ -1156,17 +1145,6 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, } EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU); -int nvme_execute_passthru_rq(struct request *rq, u32 *effects) -{ - struct nvme_command *cmd = nvme_req(rq)->cmd; - struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; - struct nvme_ns *ns = rq->q->queuedata; - - *effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - return nvme_execute_rq(rq, false); -} -EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); - /* * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: * @@ -1425,16 +1403,8 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - goto out_free_id; + kfree(*id); } - - error = NVME_SC_INVALID_NS | NVME_SC_DNR; - if ((*id)->ncap == 0) /* namespace not allocated or attached */ - goto out_free_id; - return 0; - -out_free_id: - kfree(*id); return error; } @@ -1448,6 +1418,13 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl, ret = nvme_identify_ns(ctrl, info->nsid, &id); if (ret) return ret; + + if (id->ncap == 0) { + /* namespace not allocated or attached */ + info->is_removed = true; + return -ENODEV; + } + info->anagrpid = id->anagrpid; info->is_shared = id->nmic & NVME_NS_NMIC_SHARED; info->is_readonly = id->nsattr & NVME_NS_ATTR_RO; @@ -3102,6 +3079,62 @@ free_data: return ret; } +static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl) +{ + struct nvme_effects_log *log = ctrl->effects; + + log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_NCC | + NVME_CMD_EFFECTS_CSE_MASK); + log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK); + + /* + * The spec says the result of a security receive command depends on + * the previous security send command. As such, many vendors log this + * command as one to submitted only when no other commands to the same + * namespace are outstanding. The intention is to tell the host to + * prevent mixing security send and receive. + * + * This driver can only enforce such exclusive access against IO + * queues, though. We are not readily able to enforce such a rule for + * two commands to the admin queue, which is the only queue that + * matters for this command. + * + * Rather than blindly freezing the IO queues for this effect that + * doesn't even apply to IO, mask it off. + */ + log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK); + + log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); + log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); + log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); +} + +static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + int ret = 0; + + if (ctrl->effects) + return 0; + + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); + if (ret < 0) + return ret; + } + + if (!ctrl->effects) { + ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (!ctrl->effects) + return -ENOMEM; + xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL); + } + + nvme_init_known_nvm_effects(ctrl); + return 0; +} + static int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; @@ -3115,12 +3148,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } - if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); - if (ret < 0) - goto out_free; - } - if (!(ctrl->ops->flags & NVME_F_FABRICS)) ctrl->cntlid = le16_to_cpu(id->cntlid); @@ -3143,6 +3170,10 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ret = nvme_init_subsystem(ctrl, id); if (ret) goto out_free; + + ret = nvme_init_effects(ctrl, id); + if (ret) + goto out_free; } memcpy(ctrl->subsys->firmware_rev, id->fr, sizeof(ctrl->subsys->firmware_rev)); @@ -4398,6 +4429,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns_info info = { .nsid = nsid }; struct nvme_ns *ns; + int ret; if (nvme_identify_ns_descs(ctrl, &info)) return; @@ -4414,19 +4446,19 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) * set up a namespace. If not fall back to the legacy version. */ if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) || - (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) { - if (nvme_ns_info_from_id_cs_indep(ctrl, &info)) - return; - } else { - if (nvme_ns_info_from_identify(ctrl, &info)) - return; - } + (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) + ret = nvme_ns_info_from_id_cs_indep(ctrl, &info); + else + ret = nvme_ns_info_from_identify(ctrl, &info); + + if (info.is_removed) + nvme_ns_remove_by_nsid(ctrl, nsid); /* * Ignore the namespace if it is not ready. We will get an AEN once it * becomes ready and restart the scan. */ - if (!info.is_ready) + if (ret || !info.is_ready) return; ns = nvme_find_get_ns(ctrl, nsid); @@ -4901,7 +4933,9 @@ out_cleanup_admin_q: blk_mq_destroy_queue(ctrl->admin_q); blk_put_queue(ctrl->admin_q); out_free_tagset: - blk_mq_free_tag_set(ctrl->admin_tagset); + blk_mq_free_tag_set(set); + ctrl->admin_q = NULL; + ctrl->fabrics_q = NULL; return ret; } EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); @@ -4926,7 +4960,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; - set->queue_depth = ctrl->sqsize + 1; + set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1); /* * Some Apple controllers requires tags to be unique across admin and * the (only) I/O queue, so reserve the first 32 tags of the I/O queue. @@ -4963,6 +4997,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, out_free_tag_set: blk_mq_free_tag_set(set); + ctrl->connect_q = NULL; return ret; } EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index ce27276f552d..bbaa04a0c502 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -410,7 +410,14 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) result = le32_to_cpu(res.u32); ctrl->cntlid = result & 0xFFFF; - if ((result >> 16) & 0x3) { + if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) { + /* Secure concatenation is not implemented */ + if (result & NVME_CONNECT_AUTHREQ_ASCR) { + dev_warn(ctrl->device, + "qid 0: secure concatenation is not supported\n"); + ret = NVME_SC_AUTH_REQUIRED; + goto out_free_data; + } /* Authentication required */ ret = nvme_auth_negotiate(ctrl, 0); if (ret) { @@ -486,7 +493,14 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) &cmd, data); } result = le32_to_cpu(res.u32); - if ((result >> 16) & 2) { + if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) { + /* Secure concatenation is not implemented */ + if (result & NVME_CONNECT_AUTHREQ_ASCR) { + dev_warn(ctrl->device, + "qid 0: secure concatenation is not supported\n"); + ret = NVME_SC_AUTH_REQUIRED; + goto out_free_data; + } /* Authentication required */ ret = nvme_auth_negotiate(ctrl, qid); if (ret) { @@ -500,6 +514,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) "qid %u: authentication failed\n", qid); } } +out_free_data: kfree(data); return ret; } diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a6e22116e139..dcac3df8a5f7 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -189,7 +189,8 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl) { - if (!ctrl->subsys) + if (!ctrl->subsys || + !strcmp(ctrl->opts->subsysnqn, NVME_DISC_SUBSYS_NAME)) return ctrl->opts->subsysnqn; return ctrl->subsys->subnqn; } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 4564f16a0b20..456ee42a6133 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3521,13 +3521,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_fc_init_queue(ctrl, 0); - ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_fc_admin_mq_ops, - struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, - ctrl->lport->ops->fcprqst_priv_sz)); - if (ret) - goto out_free_queues; - /* * Would have been nice to init io queues tag set as well. * However, we require interaction from the controller @@ -3537,10 +3530,17 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0); if (ret) - goto out_cleanup_tagset; + goto out_free_queues; /* at this point, teardown path changes to ref counting on nvme ctrl */ + ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, + &nvme_fc_admin_mq_ops, + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, + ctrl->lport->ops->fcprqst_priv_sz)); + if (ret) + goto fail_ctrl; + spin_lock_irqsave(&rport->lock, flags); list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); spin_unlock_irqrestore(&rport->lock, flags); @@ -3592,8 +3592,6 @@ fail_ctrl: return ERR_PTR(-EIO); -out_cleanup_tagset: - nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_queues: kfree(ctrl->queues); out_free_ida: diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9ddda571f046..723e7d5b778f 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -8,13 +8,27 @@ #include <linux/io_uring.h> #include "nvme.h" +enum { + NVME_IOCTL_VEC = (1 << 0), + NVME_IOCTL_PARTITION = (1 << 1), +}; + static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, - fmode_t mode) + unsigned int flags, fmode_t mode) { + u32 effects; + if (capable(CAP_SYS_ADMIN)) return true; /* + * Do not allow unprivileged passthrough on partitions, as that allows an + * escape from the containment of the partition. + */ + if (flags & NVME_IOCTL_PARTITION) + return false; + + /* * Do not allow unprivileged processes to send vendor specific or fabrics * commands as we can't be sure about their effects. */ @@ -43,11 +57,29 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, } /* - * Only allow I/O commands that transfer data to the controller if the - * special file is open for writing, but always allow I/O commands that - * transfer data from the controller. + * Check if the controller provides a Commands Supported and Effects log + * and marks this command as supported. If not reject unprivileged + * passthrough. */ - if (nvme_is_write(c)) + effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); + if (!(effects & NVME_CMD_EFFECTS_CSUPP)) + return false; + + /* + * Don't allow passthrough for command that have intrusive (or unknown) + * effects. + */ + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_UUID_SEL | + NVME_CMD_EFFECTS_SCOPE_MASK)) + return false; + + /* + * Only allow I/O commands that transfer data to the controller or that + * change the logical block contents if the file descriptor is open for + * writing. + */ + if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) return mode & FMODE_WRITE; return true; } @@ -130,7 +162,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, - bool vec) + unsigned int flags) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; @@ -143,7 +175,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct iov_iter iter; /* fixedbufs is only for non-vectored io */ - if (WARN_ON_ONCE(vec)) + if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) return -EINVAL; ret = io_uring_cmd_import_fixed(ubuffer, bufflen, rq_data_dir(req), &iter, ioucmd); @@ -152,8 +184,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); } else { ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), - bufflen, GFP_KERNEL, vec, 0, 0, - rq_data_dir(req)); + bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, + 0, rq_data_dir(req)); } if (ret) @@ -183,10 +215,11 @@ out: } static int nvme_submit_user_cmd(struct request_queue *q, - struct nvme_command *cmd, u64 ubuffer, - unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout, bool vec) + struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u64 *result, unsigned timeout, unsigned int flags) { + struct nvme_ns *ns = q->queuedata; struct nvme_ctrl *ctrl; struct request *req; void *meta = NULL; @@ -201,7 +234,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, req->timeout = timeout; if (ubuffer && bufflen) { ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, - meta_len, meta_seed, &meta, NULL, vec); + meta_len, meta_seed, &meta, NULL, flags); if (ret) return ret; } @@ -209,8 +242,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, bio = req->bio; ctrl = nvme_req(req)->ctrl; - ret = nvme_execute_passthru_rq(req, &effects); - + effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + ret = nvme_execute_rq(req, false); if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); if (meta) @@ -284,10 +317,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - return nvme_submit_user_cmd(ns->queue, &c, - io.addr, length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0, - false); + return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, + meta_len, lower_32_bits(io.slba), NULL, 0, 0); } static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, @@ -305,7 +336,8 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd, fmode_t mode) + struct nvme_passthru_cmd __user *ucmd, unsigned int flags, + fmode_t mode) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -333,16 +365,15 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); - if (!nvme_cmd_allowed(ns, &c, mode)) + if (!nvme_cmd_allowed(ns, &c, 0, mode)) return -EACCES; if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - cmd.addr, cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout, false); + cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), + cmd.metadata_len, 0, &result, timeout, 0); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -353,8 +384,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd, bool vec, - fmode_t mode) + struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags, + fmode_t mode) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; @@ -381,16 +412,15 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); - if (!nvme_cmd_allowed(ns, &c, mode)) + if (!nvme_cmd_allowed(ns, &c, flags, mode)) return -EACCES; if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - cmd.addr, cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout, vec); + cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), + cmd.metadata_len, 0, &cmd.result, timeout, flags); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -525,7 +555,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_uring_data d; struct nvme_command c; struct request *req; - blk_opf_t rq_flags = 0; + blk_opf_t rq_flags = REQ_ALLOC_CACHE; blk_mq_req_flags_t blk_flags = 0; void *meta = NULL; int ret; @@ -551,7 +581,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); - if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode)) + if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode)) return -EACCES; d.metadata = READ_ONCE(cmd->metadata); @@ -561,7 +591,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, d.timeout_ms = READ_ONCE(cmd->timeout_ms); if (issue_flags & IO_URING_F_NONBLOCK) { - rq_flags = REQ_NOWAIT; + rq_flags |= REQ_NOWAIT; blk_flags = BLK_MQ_REQ_NOWAIT; } if (issue_flags & IO_URING_F_IOPOLL) @@ -621,9 +651,9 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, { switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp, mode); + return nvme_user_cmd(ctrl, NULL, argp, 0, mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false, mode); + return nvme_user_cmd64(ctrl, NULL, argp, 0, mode); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -648,14 +678,14 @@ struct nvme_user_io32 { #endif /* COMPAT_FOR_U64_ALIGNMENT */ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, fmode_t mode) + void __user *argp, unsigned int flags, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); return ns->head->ns_id; case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, argp, mode); + return nvme_user_cmd(ns->ctrl, ns, argp, flags, mode); /* * struct nvme_user_io can have different padding on some 32-bit ABIs. * Just accept the compat version as all fields that are used are the @@ -666,37 +696,40 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, #endif case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); - case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode); case NVME_IOCTL_IO64_CMD_VEC: - return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode); + flags |= NVME_IOCTL_VEC; + fallthrough; + case NVME_IOCTL_IO64_CMD: + return nvme_user_cmd64(ns->ctrl, ns, argp, flags, mode); default: return -ENOTTY; } } -static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg, - fmode_t mode) -{ - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode); - return nvme_ns_ioctl(ns, cmd, arg, mode); -} - int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; + void __user *argp = (void __user *)arg; + unsigned int flags = 0; + + if (bdev_is_partition(bdev)) + flags |= NVME_IOCTL_PARTITION; - return __nvme_ioctl(ns, cmd, (void __user *)arg, mode); + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode); + return nvme_ns_ioctl(ns, cmd, argp, flags, mode); } long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + void __user *argp = (void __user *)arg; - return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode); + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, file->f_mode); + return nvme_ns_ioctl(ns, cmd, argp, 0, file->f_mode); } static int nvme_uring_cmd_checks(unsigned int issue_flags) @@ -786,6 +819,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, void __user *argp = (void __user *)arg; struct nvme_ns *ns; int srcu_idx, ret = -EWOULDBLOCK; + unsigned int flags = 0; + + if (bdev_is_partition(bdev)) + flags |= NVME_IOCTL_PARTITION; srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); @@ -801,7 +838,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, mode); - ret = nvme_ns_ioctl(ns, cmd, argp, mode); + ret = nvme_ns_ioctl(ns, cmd, argp, flags, mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -826,7 +863,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, file->f_mode); - ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode); + ret = nvme_ns_ioctl(ns, cmd, argp, 0, file->f_mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -925,7 +962,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp, mode); + ret = nvme_user_cmd(ctrl, ns, argp, 0, mode); nvme_put_ns(ns); return ret; @@ -942,9 +979,9 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp, file->f_mode); + return nvme_user_cmd(ctrl, NULL, argp, 0, file->f_mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode); + return nvme_user_cmd64(ctrl, NULL, argp, 0, file->f_mode); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp, file->f_mode); case NVME_IOCTL_RESET: diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index c03093b6813c..fc39d01e7b63 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -376,6 +376,8 @@ static void nvme_ns_head_submit_bio(struct bio *bio) * pool from the original queue to allocate the bvecs from. */ bio = bio_split_to_limits(bio); + if (!bio) + return; srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6bbb73ef8b25..bf46f122e9e1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -893,7 +893,7 @@ static inline void nvme_trace_bio_complete(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - if (req->cmd_flags & REQ_NVME_MPATH) + if ((req->cmd_flags & REQ_NVME_MPATH) && req->bio) trace_block_bio_complete(ns->head->disk->queue, req->bio); } @@ -1070,7 +1070,8 @@ static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {}; u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); -int nvme_execute_passthru_rq(struct request *rq, u32 *effects); +u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); +int nvme_execute_rq(struct request *rq, bool at_head); void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, struct nvme_command *cmd, int status); struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); @@ -1086,6 +1087,7 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) const unsigned char *nvme_get_error_status_str(u16 status); const unsigned char *nvme_get_opcode_str(u8 opcode); const unsigned char *nvme_get_admin_opcode_str(u8 opcode); +const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode); #else /* CONFIG_NVME_VERBOSE_ERRORS */ static inline const unsigned char *nvme_get_error_status_str(u16 status) { @@ -1099,6 +1101,18 @@ static inline const unsigned char *nvme_get_admin_opcode_str(u8 opcode) { return "Admin Cmd"; } + +static inline const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode) +{ + return "Fabrics Cmd"; +} #endif /* CONFIG_NVME_VERBOSE_ERRORS */ +static inline const unsigned char *nvme_opcode_str(int qid, u8 opcode, u8 fctype) +{ + if (opcode == nvme_fabrics_command) + return nvme_get_fabrics_opcode_str(fctype); + return qid ? nvme_get_opcode_str(opcode) : + nvme_get_admin_opcode_str(opcode); +} #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f0f8027644bb..5b95c94ee40f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -36,14 +36,15 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) /* * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data. */ -#define NVME_MAX_KB_SZ 4096 -#define NVME_MAX_SEGS 127 +#define NVME_MAX_KB_SZ 8192 +#define NVME_MAX_SEGS 128 +#define NVME_MAX_NR_ALLOCATIONS 5 static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -110,6 +111,7 @@ struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); +static void nvme_update_attrs(struct nvme_dev *dev); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. @@ -144,9 +146,9 @@ struct nvme_dev { mempool_t *iod_mempool; /* shadow doorbell buffer support: */ - u32 *dbbuf_dbs; + __le32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; - u32 *dbbuf_eis; + __le32 *dbbuf_eis; dma_addr_t dbbuf_eis_dma_addr; /* host memory buffer support: */ @@ -208,13 +210,18 @@ struct nvme_queue { #define NVMEQ_SQ_CMB 1 #define NVMEQ_DELETE_ERROR 2 #define NVMEQ_POLLED 3 - u32 *dbbuf_sq_db; - u32 *dbbuf_cq_db; - u32 *dbbuf_sq_ei; - u32 *dbbuf_cq_ei; + __le32 *dbbuf_sq_db; + __le32 *dbbuf_cq_db; + __le32 *dbbuf_sq_ei; + __le32 *dbbuf_cq_ei; struct completion delete_done; }; +union nvme_descriptor { + struct nvme_sgl_desc *sg_list; + __le64 *prp_list; +}; + /* * The nvme_iod describes the data in an I/O. * @@ -224,7 +231,6 @@ struct nvme_queue { struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - bool use_sgl; bool aborted; s8 nr_allocations; /* PRP list pool allocations. 0 means small pool in use */ @@ -232,6 +238,7 @@ struct nvme_iod { dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; + union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -343,11 +350,11 @@ static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) } /* Update dbbuf and return true if an MMIO is required */ -static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, - volatile u32 *dbbuf_ei) +static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, + volatile __le32 *dbbuf_ei) { if (dbbuf_db) { - u16 old_value; + u16 old_value, event_idx; /* * Ensure that the queue is written before updating @@ -355,8 +362,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ wmb(); - old_value = *dbbuf_db; - *dbbuf_db = value; + old_value = le32_to_cpu(*dbbuf_db); + *dbbuf_db = cpu_to_le32(value); /* * Ensure that the doorbell is updated before reading the event @@ -366,7 +373,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ mb(); - if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) + event_idx = le32_to_cpu(*dbbuf_ei); + if (!nvme_dbbuf_need_event(event_idx, value, old_value)) return false; } @@ -380,19 +388,9 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); -} - -/* - * Calculates the number of pages needed for the SGL segments. For example a 4k - * page can accommodate 256 SGL descriptors. - */ -static int nvme_pci_npages_sgl(void) -{ - return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), - PAGE_SIZE); + unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; + unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); + return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -508,16 +506,10 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) spin_unlock(&nvmeq->sq_lock); } -static void **nvme_pci_iod_list(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req)); -} - -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, + int nseg) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); @@ -539,7 +531,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) int i; for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = nvme_pci_iod_list(req)[i]; + __le64 *prp_list = iod->list[i].prp_list; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); @@ -547,22 +539,6 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) } } -static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) -{ - const int last_sg = SGES_PER_PAGE - 1; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; - int i; - - for (i = 0; i < iod->nr_allocations; i++) { - struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; - dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); - - dma_pool_free(dev->prp_page_pool, sg_list, dma_addr); - dma_addr = next_dma_addr; - } -} - static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -578,10 +554,11 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); if (iod->nr_allocations == 0) - dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, + iod->first_dma); + else if (iod->nr_allocations == 1) + dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, iod->first_dma); - else if (iod->use_sgl) - nvme_free_sgls(dev, req); else nvme_free_prps(dev, req); mempool_free(iod->sgt.sgl, dev->iod_mempool); @@ -612,7 +589,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, u64 dma_addr = sg_dma_address(sg); int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; - void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; @@ -650,7 +626,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, iod->nr_allocations = -1; return BLK_STS_RESOURCE; } - list[0] = prp_list; + iod->list[0].prp_list = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { @@ -659,7 +635,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; - list[iod->nr_allocations++] = prp_list; + iod->list[iod->nr_allocations++].prp_list = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -704,13 +680,8 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, dma_addr_t dma_addr, int entries) { sge->addr = cpu_to_le64(dma_addr); - if (entries < SGES_PER_PAGE) { - sge->length = cpu_to_le32(entries * sizeof(*sge)); - sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; - } else { - sge->length = cpu_to_le32(PAGE_SIZE); - sge->type = NVME_SGL_FMT_SEG_DESC << 4; - } + sge->length = cpu_to_le32(entries * sizeof(*sge)); + sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, @@ -746,34 +717,16 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, return BLK_STS_RESOURCE; } - nvme_pci_iod_list(req)[0] = sg_list; + iod->list[0].sg_list = sg_list; iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); - do { - if (i == SGES_PER_PAGE) { - struct nvme_sgl_desc *old_sg_desc = sg_list; - struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; - - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) - goto free_sgls; - - i = 0; - nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list; - sg_list[i++] = *link; - nvme_pci_sgl_set_seg(link, sgl_dma, entries); - } - nvme_pci_sgl_set_data(&sg_list[i++], sg); sg = sg_next(sg); } while (--entries > 0); return BLK_STS_OK; -free_sgls: - nvme_free_sgls(dev, req); - return BLK_STS_RESOURCE; } static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, @@ -855,8 +808,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_free_sg; } - iod->use_sgl = nvme_pci_use_sgls(dev, req); - if (iod->use_sgl) + if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); else ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); @@ -1361,7 +1313,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) else nvme_poll_irqdisable(nvmeq); - if (blk_mq_request_completed(req)) { + if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); @@ -1922,6 +1874,8 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); + + nvme_update_attrs(dev); } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2208,6 +2162,11 @@ static const struct attribute_group *nvme_pci_dev_attr_groups[] = { NULL, }; +static void nvme_update_attrs(struct nvme_dev *dev) +{ + sysfs_update_group(&dev->ctrl.device->kobj, &nvme_pci_dev_attrs_group); +} + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. @@ -2332,10 +2291,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->cmb_use_sqes) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); - if (result > 0) + if (result > 0) { dev->q_depth = result; - else + dev->ctrl.sqsize = result - 1; + } else { dev->cmb_use_sqes = false; + } } do { @@ -2506,18 +2467,12 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); - int dma_address_bits = 64; if (pci_enable_device_mem(pdev)) return result; pci_set_master(pdev); - if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) - dma_address_bits = 48; - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits))) - goto disable; - if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; goto disable; @@ -2530,13 +2485,12 @@ static int nvme_pci_enable(struct nvme_dev *dev) */ result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); if (result < 0) - return result; + goto disable; dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); - dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; @@ -2577,15 +2531,20 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", dev->q_depth); } - + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); - return nvme_pci_configure_admin_queue(dev); + result = nvme_pci_configure_admin_queue(dev); + if (result) + goto free_irq; + return result; + free_irq: + pci_free_irq_vectors(pdev); disable: pci_disable_device(pdev); return result; @@ -2697,11 +2656,8 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - size_t alloc_size = sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; + size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; - WARN_ON_ONCE(alloc_size > PAGE_SIZE); dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, mempool_kfree, (void *)alloc_size, GFP_KERNEL, @@ -2963,7 +2919,7 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return NULL; + return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); mutex_init(&dev->shutdown_lock); @@ -2991,7 +2947,11 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, quirks); if (ret) goto out_put_device; - + + if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48)); + else + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); dma_set_max_seg_size(&pdev->dev, 0xffffffff); @@ -3024,8 +2984,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int result = -ENOMEM; dev = nvme_pci_alloc_dev(pdev, id); - if (!dev) - return -ENOMEM; + if (IS_ERR(dev)) + return PTR_ERR(dev); result = nvme_dev_map(dev); if (result) @@ -3095,6 +3055,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_start_ctrl(&dev->ctrl); nvme_put_ctrl(&dev->ctrl); + flush_work(&dev->ctrl.scan_work); return 0; out_disable: @@ -3415,6 +3376,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x10ec, 0x5763), /* ADATA SX6000PNP */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, @@ -3493,7 +3456,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_SINGLE_VECTOR | NVME_QUIRK_128_BYTES_SQES | NVME_QUIRK_SHARED_TAGS | - NVME_QUIRK_SKIP_CID_GEN }, + NVME_QUIRK_SKIP_CID_GEN | + NVME_QUIRK_IDENTIFY_CNS }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { 0, } }; @@ -3521,8 +3485,9 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) > - S8_MAX); + BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); + BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE); + BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8cedc1ef496c..7723a4989524 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -14,6 +14,7 @@ #include <linux/blk-mq.h> #include <crypto/hash.h> #include <net/busy_poll.h> +#include <trace/events/sock.h> #include "nvme.h" #include "fabrics.h" @@ -905,6 +906,8 @@ static void nvme_tcp_data_ready(struct sock *sk) { struct nvme_tcp_queue *queue; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (likely(queue && queue->rd_enabled) && @@ -2282,10 +2285,13 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; struct nvme_tcp_cmd_pdu *pdu = req->pdu; + u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype; + int qid = nvme_tcp_queue_id(req->queue); dev_warn(ctrl->device, - "queue %d: timeout request %#x type %d\n", - nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); + "queue %d: timeout cid %#x type %d opcode %#x (%s)\n", + nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type, + opc, nvme_opcode_str(qid, opc, fctype)); if (ctrl->state != NVME_CTRL_LIVE) { /* @@ -2486,6 +2492,10 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) len = nvmf_get_address(ctrl, buf, size); + mutex_lock(&queue->queue_lock); + + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + goto done; ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr); if (ret > 0) { if (len > 0) @@ -2493,6 +2503,8 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n", (len) ? "," : "", &src_addr); } +done: + mutex_unlock(&queue->queue_lock); return len; } |