aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/nvme/host
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--drivers/nvme/host/Kconfig1
-rw-r--r--drivers/nvme/host/core.c217
-rw-r--r--drivers/nvme/host/fabrics.c5
-rw-r--r--drivers/nvme/host/fc.c34
-rw-r--r--drivers/nvme/host/lightnvm.c1
-rw-r--r--drivers/nvme/host/multipath.c17
-rw-r--r--drivers/nvme/host/nvme.h4
-rw-r--r--drivers/nvme/host/pci.c327
-rw-r--r--drivers/nvme/host/rdma.c196
-rw-r--r--drivers/nvme/host/tcp.c80
-rw-r--r--drivers/nvme/host/trace.h1
11 files changed, 516 insertions, 367 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 0f345e207675..ec43ac9199e2 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config NVME_CORE
tristate
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 470601980794..120fb593d1da 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -288,7 +288,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
"Cancelling I/O %d", req->tag);
nvme_req(req)->status = NVME_SC_ABORT_REQ;
- blk_mq_complete_request(req);
+ blk_mq_complete_request_sync(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -388,7 +388,7 @@ static void nvme_free_ns_head(struct kref *ref)
nvme_mpath_remove_disk(head);
ida_simple_remove(&head->subsys->ns_ida, head->instance);
list_del_init(&head->entry);
- cleanup_srcu_struct_quiesced(&head->srcu);
+ cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
kfree(head);
}
@@ -1105,7 +1105,7 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
if (error) {
- dev_warn(ctrl->device, "Identify namespace failed\n");
+ dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
kfree(id);
return NULL;
}
@@ -1259,8 +1259,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (ctrl->effects)
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
- else
- effects = nvme_known_admin_effects(opcode);
+ effects |= nvme_known_admin_effects(opcode);
/*
* For simplicity, IO to all namespaces is quiesced even if the command
@@ -1362,9 +1361,14 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
{
#ifdef CONFIG_NVME_MULTIPATH
if (disk->fops == &nvme_ns_head_ops) {
+ struct nvme_ns *ns;
+
*head = disk->private_data;
*srcu_idx = srcu_read_lock(&(*head)->srcu);
- return nvme_find_path(*head);
+ ns = nvme_find_path(*head);
+ if (!ns)
+ srcu_read_unlock(&(*head)->srcu, *srcu_idx);
+ return ns;
}
#endif
*head = NULL;
@@ -1378,42 +1382,56 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
srcu_read_unlock(&head->srcu, idx);
}
-static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
{
+ struct nvme_ns_head *head = NULL;
+ void __user *argp = (void __user *)arg;
+ struct nvme_ns *ns;
+ int srcu_idx, ret;
+
+ ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+ if (unlikely(!ns))
+ return -EWOULDBLOCK;
+
+ /*
+ * Handle ioctls that apply to the controller instead of the namespace
+ * seperately and drop the ns SRCU reference early. This avoids a
+ * deadlock when deleting namespaces using the passthrough interface.
+ */
+ if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) {
+ struct nvme_ctrl *ctrl = ns->ctrl;
+
+ nvme_get_ctrl(ns->ctrl);
+ nvme_put_ns_from_disk(head, srcu_idx);
+
+ if (cmd == NVME_IOCTL_ADMIN_CMD)
+ ret = nvme_user_cmd(ctrl, NULL, argp);
+ else
+ ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+
+ nvme_put_ctrl(ctrl);
+ return ret;
+ }
+
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
- return ns->head->ns_id;
- case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
+ ret = ns->head->ns_id;
+ break;
case NVME_IOCTL_IO_CMD:
- return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
+ ret = nvme_user_cmd(ns->ctrl, ns, argp);
+ break;
case NVME_IOCTL_SUBMIT_IO:
- return nvme_submit_io(ns, (void __user *)arg);
+ ret = nvme_submit_io(ns, argp);
+ break;
default:
-#ifdef CONFIG_NVM
if (ns->ndev)
- return nvme_nvm_ioctl(ns, cmd, arg);
-#endif
- if (is_sed_ioctl(cmd))
- return sed_ioctl(ns->ctrl->opal_dev, cmd,
- (void __user *) arg);
- return -ENOTTY;
+ ret = nvme_nvm_ioctl(ns, cmd, arg);
+ else
+ ret = -ENOTTY;
}
-}
-
-static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- struct nvme_ns_head *head = NULL;
- struct nvme_ns *ns;
- int srcu_idx, ret;
- ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
- if (unlikely(!ns))
- ret = -EWOULDBLOCK;
- else
- ret = nvme_ns_ioctl(ns, cmd, arg);
nvme_put_ns_from_disk(head, srcu_idx);
return ret;
}
@@ -1588,9 +1606,13 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
- sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+ sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
unsigned short bs = 1 << ns->lba_shift;
+ if (ns->lba_shift > PAGE_SHIFT) {
+ /* unsupported block size, set capacity to 0 later */
+ bs = (1 << 9);
+ }
blk_mq_freeze_queue(disk->queue);
blk_integrity_unregister(disk);
@@ -1601,7 +1623,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
if (ns->ms && !ns->ext &&
(ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
nvme_init_integrity(disk, ns->ms, ns->pi_type);
- if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
+ if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) ||
+ ns->lba_shift > PAGE_SHIFT)
capacity = 0;
set_capacity(disk, capacity);
@@ -2337,20 +2360,35 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
NULL,
};
-static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
+ struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
- int count = 0;
- struct nvme_ctrl *ctrl;
+ struct nvme_ctrl *tmp;
- mutex_lock(&subsys->lock);
- list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
- if (ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DEAD)
- count++;
+ lockdep_assert_held(&nvme_subsystems_lock);
+
+ list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
+ if (ctrl->state == NVME_CTRL_DELETING ||
+ ctrl->state == NVME_CTRL_DEAD)
+ continue;
+
+ if (tmp->cntlid == ctrl->cntlid) {
+ dev_err(ctrl->device,
+ "Duplicate cntlid %u with %s, rejecting\n",
+ ctrl->cntlid, dev_name(tmp->device));
+ return false;
+ }
+
+ if ((id->cmic & (1 << 1)) ||
+ (ctrl->opts && ctrl->opts->discovery_nqn))
+ continue;
+
+ dev_err(ctrl->device,
+ "Subsystem does not support multiple controllers\n");
+ return false;
}
- mutex_unlock(&subsys->lock);
- return count;
+ return true;
}
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
@@ -2390,22 +2428,13 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
mutex_lock(&nvme_subsystems_lock);
found = __nvme_find_get_subsystem(subsys->subnqn);
if (found) {
- /*
- * Verify that the subsystem actually supports multiple
- * controllers, else bail out.
- */
- if (!(ctrl->opts && ctrl->opts->discovery_nqn) &&
- nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
- dev_err(ctrl->device,
- "ignoring ctrl due to duplicate subnqn (%s).\n",
- found->subnqn);
- nvme_put_subsystem(found);
- ret = -EINVAL;
- goto out_unlock;
- }
-
__nvme_release_subsystem(subsys);
subsys = found;
+
+ if (!nvme_validate_cntlid(subsys, ctrl, id)) {
+ ret = -EINVAL;
+ goto out_put_subsystem;
+ }
} else {
ret = device_add(&subsys->dev);
if (ret) {
@@ -2417,23 +2446,20 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
list_add_tail(&subsys->entry, &nvme_subsystems);
}
- ctrl->subsys = subsys;
- mutex_unlock(&nvme_subsystems_lock);
-
if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
dev_name(ctrl->device))) {
dev_err(ctrl->device,
"failed to create sysfs link from subsystem.\n");
- /* the transport driver will eventually put the subsystem */
- return -EINVAL;
+ goto out_put_subsystem;
}
- mutex_lock(&subsys->lock);
+ ctrl->subsys = subsys;
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
- mutex_unlock(&subsys->lock);
-
+ mutex_unlock(&nvme_subsystems_lock);
return 0;
+out_put_subsystem:
+ nvme_put_subsystem(subsys);
out_unlock:
mutex_unlock(&nvme_subsystems_lock);
put_device(&subsys->dev);
@@ -2549,7 +2575,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->crdt[2] = le16_to_cpu(id->crdt3);
ctrl->oacs = le16_to_cpu(id->oacs);
- ctrl->oncs = le16_to_cpup(&id->oncs);
+ ctrl->oncs = le16_to_cpu(id->oncs);
+ ctrl->mtfa = le16_to_cpu(id->mtfa);
ctrl->oaes = le32_to_cpu(id->oaes);
atomic_set(&ctrl->abort_limit, id->acl + 1);
ctrl->vwc = id->vwc;
@@ -3373,7 +3400,8 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
{
struct nvme_ns *ns;
__le32 *ns_list;
- unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
+ unsigned i, j, nsid, prev = 0;
+ unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024);
int ret = 0;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
@@ -3600,19 +3628,18 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
u32 aer_notice_type = (result & 0xff00) >> 8;
+ trace_nvme_async_event(ctrl, aer_notice_type);
+
switch (aer_notice_type) {
case NVME_AER_NOTICE_NS_CHANGED:
- trace_nvme_async_event(ctrl, aer_notice_type);
set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
nvme_queue_scan(ctrl);
break;
case NVME_AER_NOTICE_FW_ACT_STARTING:
- trace_nvme_async_event(ctrl, aer_notice_type);
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
- trace_nvme_async_event(ctrl, aer_notice_type);
if (!ctrl->ana_log_buf)
break;
queue_work(nvme_wq, &ctrl->ana_work);
@@ -3675,6 +3702,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
+ dev_pm_qos_hide_latency_tolerance(ctrl->device);
cdev_device_del(&ctrl->cdev, ctrl->device);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
@@ -3691,10 +3719,10 @@ static void nvme_free_ctrl(struct device *dev)
__free_page(ctrl->discard_page);
if (subsys) {
- mutex_lock(&subsys->lock);
+ mutex_lock(&nvme_subsystems_lock);
list_del(&ctrl->subsys_entry);
- mutex_unlock(&subsys->lock);
sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
+ mutex_unlock(&nvme_subsystems_lock);
}
ctrl->ops->free_ctrl(ctrl);
@@ -3874,10 +3902,49 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
-int __init nvme_core_init(void)
+
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ down_read(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_sync_queue(ns->queue);
+ up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
+/*
+ * Check we didn't inadvertently grow the command structure sizes:
+ */
+static inline void _nvme_check_size(void)
+{
+ BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
+}
+
+
+static int __init nvme_core_init(void)
{
int result = -ENOMEM;
+ _nvme_check_size();
+
nvme_wq = alloc_workqueue("nvme-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_wq)
@@ -3924,7 +3991,7 @@ out:
return result;
}
-void __exit nvme_core_exit(void)
+static void __exit nvme_core_exit(void)
{
ida_destroy(&nvme_subsystems_ida);
class_destroy(nvme_subsys_class);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index d4cb826f58ff..5838f7cd53ac 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -978,7 +978,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
NVMF_OPT_DISABLE_SQFLOW)
static struct nvme_ctrl *
-nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
+nvmf_create_ctrl(struct device *dev, const char *buf)
{
struct nvmf_ctrl_options *opts;
struct nvmf_transport_ops *ops;
@@ -1073,7 +1073,7 @@ static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf,
goto out_unlock;
}
- ctrl = nvmf_create_ctrl(nvmf_device, buf, count);
+ ctrl = nvmf_create_ctrl(nvmf_device, buf);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
goto out_unlock;
@@ -1188,6 +1188,7 @@ static void __exit nvmf_exit(void)
class_destroy(nvmf_class);
nvmf_host_put(nvmf_default_host);
+ BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 31637f8ef22e..dd8169bbf0d2 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -202,7 +202,7 @@ static LIST_HEAD(nvme_fc_lport_list);
static DEFINE_IDA(nvme_fc_local_port_cnt);
static DEFINE_IDA(nvme_fc_ctrl_cnt);
-
+static struct workqueue_struct *nvme_fc_wq;
/*
* These items are short-term. They will eventually be moved into
@@ -1845,7 +1845,7 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx)
memset(queue, 0, sizeof(*queue));
queue->ctrl = ctrl;
queue->qnum = idx;
- atomic_set(&queue->csn, 1);
+ atomic_set(&queue->csn, 0);
queue->dev = ctrl->dev;
if (idx > 0)
@@ -1887,7 +1887,7 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue)
*/
queue->connection_id = 0;
- atomic_set(&queue->csn, 1);
+ atomic_set(&queue->csn, 0);
}
static void
@@ -2054,7 +2054,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
*/
if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
active = atomic_xchg(&ctrl->err_work_active, 1);
- if (!active && !schedule_work(&ctrl->err_work)) {
+ if (!active && !queue_work(nvme_fc_wq, &ctrl->err_work)) {
atomic_set(&ctrl->err_work_active, 0);
WARN_ON(1);
}
@@ -2183,7 +2183,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
{
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
struct nvme_command *sqe = &cmdiu->sqe;
- u32 csn;
int ret, opstate;
/*
@@ -2198,8 +2197,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
/* format the FC-NVME CMD IU and fcp_req */
cmdiu->connection_id = cpu_to_be64(queue->connection_id);
- csn = atomic_inc_return(&queue->csn);
- cmdiu->csn = cpu_to_be32(csn);
cmdiu->data_len = cpu_to_be32(data_len);
switch (io_dir) {
case NVMEFC_FCP_WRITE:
@@ -2257,11 +2254,24 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
if (!(op->flags & FCOP_FLAGS_AEN))
blk_mq_start_request(op->rq);
+ cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn));
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
&ctrl->rport->remoteport,
queue->lldd_handle, &op->fcp_req);
if (ret) {
+ /*
+ * If the lld fails to send the command is there an issue with
+ * the csn value? If the command that fails is the Connect,
+ * no - as the connection won't be live. If it is a command
+ * post-connect, it's possible a gap in csn may be created.
+ * Does this matter? As Linux initiators don't send fused
+ * commands, no. The gap would exist, but as there's nothing
+ * that depends on csn order to be delivered on the target
+ * side, it shouldn't hurt. It would be difficult for a
+ * target to even detect the csn gap as it has no idea when the
+ * cmd with the csn was supposed to arrive.
+ */
opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
@@ -3389,6 +3399,10 @@ static int __init nvme_fc_init_module(void)
{
int ret;
+ nvme_fc_wq = alloc_workqueue("nvme_fc_wq", WQ_MEM_RECLAIM, 0);
+ if (!nvme_fc_wq)
+ return -ENOMEM;
+
/*
* NOTE:
* It is expected that in the future the kernel will combine
@@ -3406,7 +3420,7 @@ static int __init nvme_fc_init_module(void)
ret = class_register(&fc_class);
if (ret) {
pr_err("couldn't register class fc\n");
- return ret;
+ goto out_destroy_wq;
}
/*
@@ -3430,6 +3444,9 @@ out_destroy_device:
device_destroy(&fc_class, MKDEV(0, 0));
out_destroy_class:
class_unregister(&fc_class);
+out_destroy_wq:
+ destroy_workqueue(nvme_fc_wq);
+
return ret;
}
@@ -3446,6 +3463,7 @@ static void __exit nvme_fc_exit_module(void)
device_destroy(&fc_class, MKDEV(0, 0));
class_unregister(&fc_class);
+ destroy_workqueue(nvme_fc_wq);
}
module_init(nvme_fc_init_module);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 949e29e1d782..4f20a10b39d3 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -977,6 +977,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
geo->csecs = 1 << ns->lba_shift;
geo->sos = ns->ms;
geo->ext = ns->ext;
+ geo->mdts = ns->ctrl->max_hw_sectors;
dev->q = q;
memcpy(dev->name, disk_name, DISK_NAME_LEN);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 2839bb70badf..499acf07d61a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -31,7 +31,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
} else if (ns->head->disk) {
sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
- ctrl->cntlid, ns->head->instance);
+ ctrl->instance, ns->head->instance);
*flags = GENHD_FL_HIDDEN;
} else {
sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
@@ -232,6 +232,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
+ /*
+ * The namespace might be going away and the bio might
+ * be moved to a different queue via blk_steal_bios(),
+ * so we need to use the bio_split pool from the original
+ * queue to allocate the bvecs from.
+ */
+ blk_queue_split(q, &bio);
+
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
if (likely(ns)) {
@@ -404,15 +412,12 @@ static inline bool nvme_state_is_live(enum nvme_ana_state state)
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
struct nvme_ns *ns)
{
- enum nvme_ana_state old;
-
mutex_lock(&ns->head->lock);
- old = ns->ana_state;
ns->ana_grpid = le32_to_cpu(desc->grpid);
ns->ana_state = desc->state;
clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
- if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
+ if (nvme_state_is_live(ns->ana_state))
nvme_mpath_set_live(ns);
mutex_unlock(&ns->head->lock);
}
@@ -424,7 +429,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
unsigned *nr_change_groups = data;
struct nvme_ns *ns;
- dev_info(ctrl->device, "ANA group %d: %s.\n",
+ dev_dbg(ctrl->device, "ANA group %d: %s.\n",
le32_to_cpu(desc->grpid),
nvme_ana_state_names[desc->state]);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 527d64545023..55553d293a98 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -441,6 +441,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
@@ -577,7 +578,4 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
return dev_to_disk(dev)->private_data;
}
-int __init nvme_core_init(void);
-void __exit nvme_core_exit(void);
-
#endif /* _NVME_H */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a90cf5d63aac..524d6bd6d095 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -146,7 +146,7 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
static int queue_count_set(const char *val, const struct kernel_param *kp)
{
- int n = 0, ret;
+ int n, ret;
ret = kstrtoint(val, 10, &n);
if (ret)
@@ -177,7 +177,6 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
* commands and one for I/O commands).
*/
struct nvme_queue {
- struct device *q_dmadev;
struct nvme_dev *dev;
spinlock_t sq_lock;
struct nvme_command *sq_cmds;
@@ -189,7 +188,7 @@ struct nvme_queue {
dma_addr_t cq_dma_addr;
u32 __iomem *q_db;
u16 q_depth;
- s16 cq_vector;
+ u16 cq_vector;
u16 sq_tail;
u16 last_sq_tail;
u16 cq_head;
@@ -200,6 +199,7 @@ struct nvme_queue {
#define NVMEQ_ENABLED 0
#define NVMEQ_SQ_CMB 1
#define NVMEQ_DELETE_ERROR 2
+#define NVMEQ_POLLED 3
u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
@@ -208,10 +208,10 @@ struct nvme_queue {
};
/*
- * The nvme_iod describes the data in an I/O, including the list of PRP
- * entries. You can't see it in this data structure because C doesn't let
- * me express that. Use nvme_init_iod to ensure there's enough space
- * allocated to store the PRP list.
+ * The nvme_iod describes the data in an I/O.
+ *
+ * The sg pointer contains the list of PRP/SGL chunk allocations in addition
+ * to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
@@ -220,33 +220,12 @@ struct nvme_iod {
int aborted;
int npages; /* In the PRP list. 0 means small pool in use */
int nents; /* Used in scatterlist */
- int length; /* Of data, in bytes */
dma_addr_t first_dma;
- struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
+ unsigned int dma_len; /* length of single DMA segment mapping */
+ dma_addr_t meta_dma;
struct scatterlist *sg;
- struct scatterlist inline_sg[0];
};
-/*
- * Check we didin't inadvertently grow the command struct
- */
-static inline void _nvme_check_size(void)
-{
- BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
- BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
- BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
- BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
-}
-
static unsigned int max_io_queues(void)
{
return num_possible_cpus() + write_queues + poll_queues;
@@ -372,12 +351,6 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
}
/*
- * Max size of iod being embedded in the request payload
- */
-#define NVME_INT_PAGES 2
-#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size)
-
-/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
@@ -411,15 +384,6 @@ static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
return alloc_size + sizeof(struct scatterlist) * nseg;
}
-static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
-{
- unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
- NVME_INT_BYTES(dev), NVME_INT_PAGES,
- use_sgl);
-
- return sizeof(struct nvme_iod) + alloc_size;
-}
-
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
@@ -500,7 +464,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
* affinity), so use the regular blk-mq cpu mapping
*/
map->queue_offset = qoff;
- if (i != HCTX_TYPE_POLL)
+ if (i != HCTX_TYPE_POLL && offset)
blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
else
blk_mq_map_queues(map);
@@ -584,37 +548,26 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
return true;
}
-static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
+static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
- int nseg = blk_rq_nr_phys_segments(rq);
- unsigned int size = blk_rq_payload_bytes(rq);
-
- iod->use_sgl = nvme_pci_use_sgls(dev, rq);
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ enum dma_data_direction dma_dir = rq_data_dir(req) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE;
+ const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
+ dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
+ int i;
- if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
- iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
- if (!iod->sg)
- return BLK_STS_RESOURCE;
- } else {
- iod->sg = iod->inline_sg;
+ if (iod->dma_len) {
+ dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
+ return;
}
- iod->aborted = 0;
- iod->npages = -1;
- iod->nents = 0;
- iod->length = size;
-
- return BLK_STS_OK;
-}
+ WARN_ON_ONCE(!iod->nents);
-static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
- dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
+ /* P2PDMA requests do not need to be unmapped */
+ if (!is_pci_p2pdma_page(sg_page(iod->sg)))
+ dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
- int i;
if (iod->npages == 0)
dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
@@ -638,8 +591,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
dma_addr = next_dma_addr;
}
- if (iod->sg != iod->inline_sg)
- mempool_free(iod->sg, dev->iod_mempool);
+ mempool_free(iod->sg, dev->iod_mempool);
}
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
@@ -829,80 +781,104 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
return BLK_STS_OK;
}
+static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
+ struct request *req, struct nvme_rw_command *cmnd,
+ struct bio_vec *bv)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset;
+
+ iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iod->first_dma))
+ return BLK_STS_RESOURCE;
+ iod->dma_len = bv->bv_len;
+
+ cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
+ if (bv->bv_len > first_prp_len)
+ cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
+ return 0;
+}
+
+static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
+ struct request *req, struct nvme_rw_command *cmnd,
+ struct bio_vec *bv)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iod->first_dma))
+ return BLK_STS_RESOURCE;
+ iod->dma_len = bv->bv_len;
+
+ cmnd->flags = NVME_CMD_SGL_METABUF;
+ cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
+ cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
+ cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+ return 0;
+}
+
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct request_queue *q = req->q;
- enum dma_data_direction dma_dir = rq_data_dir(req) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
- blk_status_t ret = BLK_STS_IOERR;
+ blk_status_t ret = BLK_STS_RESOURCE;
int nr_mapped;
+ if (blk_rq_nr_phys_segments(req) == 1) {
+ struct bio_vec bv = req_bvec(req);
+
+ if (!is_pci_p2pdma_page(bv.bv_page)) {
+ if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
+ return nvme_setup_prp_simple(dev, req,
+ &cmnd->rw, &bv);
+
+ if (iod->nvmeq->qid &&
+ dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
+ return nvme_setup_sgl_simple(dev, req,
+ &cmnd->rw, &bv);
+ }
+ }
+
+ iod->dma_len = 0;
+ iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+ if (!iod->sg)
+ return BLK_STS_RESOURCE;
sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
- iod->nents = blk_rq_map_sg(q, req, iod->sg);
+ iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
goto out;
- ret = BLK_STS_RESOURCE;
-
if (is_pci_p2pdma_page(sg_page(iod->sg)))
nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
- dma_dir);
+ rq_dma_dir(req));
else
nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
- dma_dir, DMA_ATTR_NO_WARN);
+ rq_dma_dir(req), DMA_ATTR_NO_WARN);
if (!nr_mapped)
goto out;
+ iod->use_sgl = nvme_pci_use_sgls(dev, req);
if (iod->use_sgl)
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
-
- if (ret != BLK_STS_OK)
- goto out_unmap;
-
- ret = BLK_STS_IOERR;
- if (blk_integrity_rq(req)) {
- if (blk_rq_count_integrity_sg(q, req->bio) != 1)
- goto out_unmap;
-
- sg_init_table(&iod->meta_sg, 1);
- if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
- goto out_unmap;
-
- if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
- goto out_unmap;
-
- cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
- }
-
- return BLK_STS_OK;
-
-out_unmap:
- dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
+ if (ret != BLK_STS_OK)
+ nvme_unmap_data(dev, req);
return ret;
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
+ struct nvme_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- enum dma_data_direction dma_dir = rq_data_dir(req) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
-
- if (iod->nents) {
- /* P2PDMA requests do not need to be unmapped */
- if (!is_pci_p2pdma_page(sg_page(iod->sg)))
- dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
- if (blk_integrity_rq(req))
- dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
- }
-
- nvme_cleanup_cmd(req);
- nvme_free_iod(dev, req);
+ iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
+ rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iod->meta_dma))
+ return BLK_STS_IOERR;
+ cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+ return 0;
}
/*
@@ -915,9 +891,14 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct nvme_queue *nvmeq = hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_command cmnd;
blk_status_t ret;
+ iod->aborted = 0;
+ iod->npages = -1;
+ iod->nents = 0;
+
/*
* We should not need to do this, but we're still using this to
* ensure we can drain requests on a dying queue.
@@ -929,21 +910,23 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
if (ret)
return ret;
- ret = nvme_init_iod(req, dev);
- if (ret)
- goto out_free_cmd;
-
if (blk_rq_nr_phys_segments(req)) {
ret = nvme_map_data(dev, req, &cmnd);
if (ret)
- goto out_cleanup_iod;
+ goto out_free_cmd;
+ }
+
+ if (blk_integrity_rq(req)) {
+ ret = nvme_map_metadata(dev, req, &cmnd);
+ if (ret)
+ goto out_unmap_data;
}
blk_mq_start_request(req);
nvme_submit_cmd(nvmeq, &cmnd, bd->last);
return BLK_STS_OK;
-out_cleanup_iod:
- nvme_free_iod(dev, req);
+out_unmap_data:
+ nvme_unmap_data(dev, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
@@ -952,8 +935,14 @@ out_free_cmd:
static void nvme_pci_complete_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_dev *dev = iod->nvmeq->dev;
- nvme_unmap_data(iod->nvmeq->dev, req);
+ nvme_cleanup_cmd(req);
+ if (blk_integrity_rq(req))
+ dma_unmap_page(dev->dev, iod->meta_dma,
+ rq_integrity_vec(req)->bv_len, rq_data_dir(req));
+ if (blk_rq_nr_phys_segments(req))
+ nvme_unmap_data(dev, req);
nvme_complete_rq(req);
}
@@ -1088,7 +1077,7 @@ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
* using the CQ lock. For normal interrupt driven threads we have
* to disable the interrupt to avoid racing with it.
*/
- if (nvmeq->cq_vector == -1) {
+ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) {
spin_lock(&nvmeq->cq_poll_lock);
found = nvme_process_cq(nvmeq, &start, &end, tag);
spin_unlock(&nvmeq->cq_poll_lock);
@@ -1148,7 +1137,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG;
- if (vector != -1)
+ if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
flags |= NVME_CQ_IRQ_ENABLED;
/*
@@ -1161,10 +1150,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
c.create_cq.cqid = cpu_to_le16(qid);
c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_cq.cq_flags = cpu_to_le16(flags);
- if (vector != -1)
- c.create_cq.irq_vector = cpu_to_le16(vector);
- else
- c.create_cq.irq_vector = 0;
+ c.create_cq.irq_vector = cpu_to_le16(vector);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
@@ -1308,13 +1294,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
*/
switch (dev->ctrl.state) {
case NVME_CTRL_CONNECTING:
- case NVME_CTRL_RESETTING:
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ /* fall through */
+ case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, true);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_DONE;
+ case NVME_CTRL_RESETTING:
+ return BLK_EH_RESET_TIMER;
default:
break;
}
@@ -1371,16 +1361,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
- dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+ dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (!nvmeq->sq_cmds)
return;
if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
- pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
+ pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
} else {
- dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
}
}
@@ -1410,10 +1400,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
- if (nvmeq->cq_vector == -1)
- return 0;
- pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
- nvmeq->cq_vector = -1;
+ if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
+ pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
return 0;
}
@@ -1498,7 +1486,6 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
goto free_cqdma;
- nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
spin_lock_init(&nvmeq->sq_lock);
spin_lock_init(&nvmeq->cq_poll_lock);
@@ -1507,7 +1494,6 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->qid = qid;
- nvmeq->cq_vector = -1;
dev->ctrl.queue_count++;
return 0;
@@ -1552,7 +1538,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
- s16 vector;
+ u16 vector = 0;
clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
@@ -1563,7 +1549,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
if (!polled)
vector = dev->num_vecs == 1 ? 0 : qid;
else
- vector = -1;
+ set_bit(NVMEQ_POLLED, &nvmeq->flags);
result = adapter_alloc_cq(dev, qid, nvmeq, vector);
if (result)
@@ -1578,7 +1564,8 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
nvmeq->cq_vector = vector;
nvme_init_queue(nvmeq, qid);
- if (vector != -1) {
+ if (!polled) {
+ nvmeq->cq_vector = vector;
result = queue_request_irq(nvmeq);
if (result < 0)
goto release_sq;
@@ -1588,7 +1575,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
return result;
release_sq:
- nvmeq->cq_vector = -1;
dev->online_queues--;
adapter_delete_sq(dev, qid);
release_cq:
@@ -1639,7 +1625,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(dev->dev);
- dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
+ dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
dev->admin_tagset.driver_data = dev;
@@ -1730,7 +1716,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
nvme_init_queue(nvmeq, 0);
result = queue_request_irq(nvmeq);
if (result) {
- nvmeq->cq_vector = -1;
+ dev->online_queues--;
return result;
}
@@ -2171,10 +2157,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
* number of interrupts.
*/
result = queue_request_irq(adminq);
- if (result) {
- adminq->cq_vector = -1;
+ if (result)
return result;
- }
set_bit(NVMEQ_ENABLED, &adminq->flags);
result = nvme_create_io_queues(dev);
@@ -2286,11 +2270,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
- if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
- dev->tagset.cmd_size = max(dev->tagset.cmd_size,
- nvme_pci_cmd_size(dev, true));
- }
+ dev->tagset.cmd_size = sizeof(struct nvme_iod);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
@@ -2301,8 +2281,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
return ret;
}
dev->ctrl.tagset = &dev->tagset;
-
- nvme_dbbuf_set(dev);
} else {
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
@@ -2310,6 +2288,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
nvme_free_queues(dev, dev->online_queues);
}
+ nvme_dbbuf_set(dev);
return 0;
}
@@ -2397,7 +2376,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
- bool dead = true;
+ bool dead = true, freeze = false;
struct pci_dev *pdev = to_pci_dev(dev->dev);
mutex_lock(&dev->shutdown_lock);
@@ -2405,8 +2384,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
u32 csts = readl(dev->bar + NVME_REG_CSTS);
if (dev->ctrl.state == NVME_CTRL_LIVE ||
- dev->ctrl.state == NVME_CTRL_RESETTING)
+ dev->ctrl.state == NVME_CTRL_RESETTING) {
+ freeze = true;
nvme_start_freeze(&dev->ctrl);
+ }
dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
pdev->error_state != pci_channel_io_normal);
}
@@ -2415,10 +2396,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* Give the controller a chance to complete all entered requests if
* doing a safe shutdown.
*/
- if (!dead) {
- if (shutdown)
- nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
- }
+ if (!dead && shutdown && freeze)
+ nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
nvme_stop_queues(&dev->ctrl);
@@ -2438,8 +2417,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* must flush all entered requests to their failed completion to avoid
* deadlocking blk-mq hot-cpu notifier.
*/
- if (shutdown)
+ if (shutdown) {
nvme_start_queues(&dev->ctrl);
+ if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
+ blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+ }
mutex_unlock(&dev->shutdown_lock);
}
@@ -2510,6 +2492,7 @@ static void nvme_reset_work(struct work_struct *work)
*/
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
+ nvme_sync_queues(&dev->ctrl);
mutex_lock(&dev->shutdown_lock);
result = nvme_pci_enable(dev);
@@ -2530,6 +2513,12 @@ static void nvme_reset_work(struct work_struct *work)
*/
dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
dev->ctrl.max_segments = NVME_MAX_SEGS;
+
+ /*
+ * Don't limit the IOMMU merged segment size.
+ */
+ dma_set_max_seg_size(dev->dev, 0xffffffff);
+
mutex_unlock(&dev->shutdown_lock);
/*
@@ -2979,6 +2968,9 @@ static struct pci_driver nvme_driver = {
static int __init nvme_init(void)
{
+ BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
return pci_register_driver(&nvme_driver);
}
@@ -2987,7 +2979,6 @@ static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
flush_workqueue(nvme_wq);
- _nvme_check_size();
}
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 11a5ecae78c8..97f668a39ae1 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -213,6 +213,11 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
if (!ring)
return NULL;
+ /*
+ * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
+ * lifetime. It's safe, since any chage in the underlying RDMA device
+ * will issue error recovery and queue re-creation.
+ */
for (i = 0; i < ib_queue_size; i++) {
if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
goto out_free_ring;
@@ -274,14 +279,9 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx)
{
- struct nvme_rdma_ctrl *ctrl = set->driver_data;
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
- struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
+ kfree(req->sqe.data);
}
static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
@@ -292,15 +292,11 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- struct ib_device *ibdev = dev->dev;
- int ret;
nvme_req(rq)->ctrl = &ctrl->ctrl;
- ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
- if (ret)
- return ret;
+ req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!req->sqe.data)
+ return -ENOMEM;
req->queue = queue;
@@ -641,34 +637,16 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
struct ib_device *ibdev = ctrl->device->dev;
- unsigned int nr_io_queues;
+ unsigned int nr_io_queues, nr_default_queues;
+ unsigned int nr_read_queues, nr_poll_queues;
int i, ret;
- nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
-
- /*
- * we map queues according to the device irq vectors for
- * optimal locality so we don't need more queues than
- * completion vectors.
- */
- nr_io_queues = min_t(unsigned int, nr_io_queues,
- ibdev->num_comp_vectors);
-
- if (opts->nr_write_queues) {
- ctrl->io_queues[HCTX_TYPE_DEFAULT] =
- min(opts->nr_write_queues, nr_io_queues);
- nr_io_queues += ctrl->io_queues[HCTX_TYPE_DEFAULT];
- } else {
- ctrl->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues;
- }
-
- ctrl->io_queues[HCTX_TYPE_READ] = nr_io_queues;
-
- if (opts->nr_poll_queues) {
- ctrl->io_queues[HCTX_TYPE_POLL] =
- min(opts->nr_poll_queues, num_online_cpus());
- nr_io_queues += ctrl->io_queues[HCTX_TYPE_POLL];
- }
+ nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_io_queues, num_online_cpus()));
+ nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_write_queues, num_online_cpus()));
+ nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
+ nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret)
@@ -681,6 +659,34 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
dev_info(ctrl->ctrl.device,
"creating %d I/O queues.\n", nr_io_queues);
+ if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
+ /*
+ * separate read/write queues
+ * hand out dedicated default queues only after we have
+ * sufficient read queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_default_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /*
+ * shared read/write queues
+ * either no write queues were requested, or we don't have
+ * sufficient queue count to have dedicated default queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_read_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ }
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(nr_poll_queues, nr_io_queues);
+ }
+
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
ret = nvme_rdma_alloc_queue(ctrl, i,
ctrl->ctrl.sqsize + 1);
@@ -697,15 +703,6 @@ out_free_queues:
return ret;
}
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
- struct blk_mq_tag_set *set)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- blk_mq_free_tag_set(set);
- nvme_rdma_dev_put(ctrl->device);
-}
-
static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
bool admin)
{
@@ -744,24 +741,9 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
ret = blk_mq_alloc_tag_set(set);
if (ret)
- goto out;
-
- /*
- * We need a reference on the device as long as the tag_set is alive,
- * as the MRs in the request structures need a valid ib_device.
- */
- ret = nvme_rdma_dev_get(ctrl->device);
- if (!ret) {
- ret = -EINVAL;
- goto out_free_tagset;
- }
+ return ERR_PTR(ret);
return set;
-
-out_free_tagset:
- blk_mq_free_tag_set(set);
-out:
- return ERR_PTR(ret);
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@@ -769,7 +751,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
@@ -793,6 +775,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
+ /*
+ * Bind the async event SQE DMA mapping to the admin queue lifetime.
+ * It's safe, since any chage in the underlying RDMA device will issue
+ * error recovery and queue re-creation.
+ */
error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
if (error)
@@ -847,7 +834,7 @@ out_cleanup_queue:
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_free_tagset:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
@@ -862,7 +849,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
}
nvme_rdma_free_io_queues(ctrl);
}
@@ -903,7 +890,7 @@ out_cleanup_connect_q:
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
out_free_io_queues:
nvme_rdma_free_io_queues(ctrl);
return ret;
@@ -914,8 +901,9 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
- &ctrl->ctrl);
+ if (ctrl->ctrl.admin_tagset)
+ blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
+ nvme_cancel_request, &ctrl->ctrl);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
@@ -926,8 +914,9 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
- &ctrl->ctrl);
+ if (ctrl->ctrl.tagset)
+ blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
+ nvme_cancel_request, &ctrl->ctrl);
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
@@ -1731,12 +1720,20 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
dev = queue->device->dev;
+
+ req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
+ sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ err = ib_dma_mapping_error(dev, req->sqe.dma);
+ if (unlikely(err))
+ return BLK_STS_RESOURCE;
+
ib_dma_sync_single_for_cpu(dev, sqe->dma,
sizeof(struct nvme_command), DMA_TO_DEVICE);
ret = nvme_setup_cmd(ns, rq, c);
if (ret)
- return ret;
+ goto unmap_qe;
blk_mq_start_request(rq);
@@ -1761,10 +1758,16 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
}
return BLK_STS_OK;
+
err:
if (err == -ENOMEM || err == -EAGAIN)
- return BLK_STS_RESOURCE;
- return BLK_STS_IOERR;
+ ret = BLK_STS_RESOURCE;
+ else
+ ret = BLK_STS_IOERR;
+unmap_qe:
+ ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ return ret;
}
static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
@@ -1777,25 +1780,36 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+ struct ib_device *ibdev = queue->device->dev;
- nvme_rdma_unmap_data(req->queue, rq);
+ nvme_rdma_unmap_data(queue, rq);
+ ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
nvme_complete_rq(rq);
}
static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_rdma_ctrl *ctrl = set->driver_data;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
- set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
- set->map[HCTX_TYPE_DEFAULT].nr_queues =
- ctrl->io_queues[HCTX_TYPE_DEFAULT];
- set->map[HCTX_TYPE_READ].nr_queues = ctrl->io_queues[HCTX_TYPE_READ];
- if (ctrl->ctrl.opts->nr_write_queues) {
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
/* separate read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_READ];
set->map[HCTX_TYPE_READ].queue_offset =
- ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
} else {
- /* mixed read/write queues */
+ /* shared read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
set->map[HCTX_TYPE_READ].queue_offset = 0;
}
blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
@@ -1803,16 +1817,22 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
ctrl->device->dev, 0);
- if (ctrl->ctrl.opts->nr_poll_queues) {
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+ /* map dedicated poll queues only if we have queues left */
set->map[HCTX_TYPE_POLL].nr_queues =
ctrl->io_queues[HCTX_TYPE_POLL];
set->map[HCTX_TYPE_POLL].queue_offset =
- ctrl->io_queues[HCTX_TYPE_DEFAULT];
- if (ctrl->ctrl.opts->nr_write_queues)
- set->map[HCTX_TYPE_POLL].queue_offset +=
- ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
}
+
+ dev_info(ctrl->ctrl.device,
+ "mapped %d/%d/%d default/read/poll queues.\n",
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
+ ctrl->io_queues[HCTX_TYPE_READ],
+ ctrl->io_queues[HCTX_TYPE_POLL]);
+
return 0;
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index e7e08889865e..08a2501b9357 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -111,6 +111,7 @@ struct nvme_tcp_ctrl {
struct work_struct err_work;
struct delayed_work connect_work;
struct nvme_tcp_request async_req;
+ u32 io_queues[HCTX_MAX_TYPES];
};
static LIST_HEAD(nvme_tcp_ctrl_list);
@@ -473,7 +474,6 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
}
return 0;
-
}
static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
@@ -627,14 +627,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
return ret;
}
-static inline void nvme_tcp_end_request(struct request *rq, __le16 status)
+static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{
union nvme_result res = {};
nvme_end_request(rq, cpu_to_le16(status << 1), res);
}
-
static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
unsigned int *offset, size_t *len)
{
@@ -1425,7 +1424,8 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
if (!ret) {
set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
} else {
- __nvme_tcp_stop_queue(&ctrl->queues[idx]);
+ if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
+ __nvme_tcp_stop_queue(&ctrl->queues[idx]);
dev_err(nctrl->device,
"failed to connect queue: %d ret=%d\n", idx, ret);
}
@@ -1535,7 +1535,7 @@ out_free_queue:
return ret;
}
-static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
+static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
int i, ret;
@@ -1565,7 +1565,36 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
return nr_io_queues;
}
-static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
+static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
+ unsigned int nr_io_queues)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvmf_ctrl_options *opts = nctrl->opts;
+
+ if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
+ /*
+ * separate read/write queues
+ * hand out dedicated default queues only after we have
+ * sufficient read queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(opts->nr_write_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /*
+ * shared read/write queues
+ * either no write queues were requested, or we don't have
+ * sufficient queue count to have dedicated default queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(opts->nr_io_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ }
+}
+
+static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
unsigned int nr_io_queues;
int ret;
@@ -1582,7 +1611,9 @@ static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
dev_info(ctrl->device,
"creating %d I/O queues.\n", nr_io_queues);
- return nvme_tcp_alloc_io_queues(ctrl);
+ nvme_tcp_set_io_queues(ctrl, nr_io_queues);
+
+ return __nvme_tcp_alloc_io_queues(ctrl);
}
static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
@@ -1599,7 +1630,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{
int ret;
- ret = nvme_alloc_io_queues(ctrl);
+ ret = nvme_tcp_alloc_io_queues(ctrl);
if (ret)
return ret;
@@ -1710,7 +1741,9 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
- blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
+ if (ctrl->admin_tagset)
+ blk_mq_tagset_busy_iter(ctrl->admin_tagset,
+ nvme_cancel_request, ctrl);
blk_mq_unquiesce_queue(ctrl->admin_q);
nvme_tcp_destroy_admin_queue(ctrl, remove);
}
@@ -1722,7 +1755,9 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
return;
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
+ if (ctrl->tagset)
+ blk_mq_tagset_busy_iter(ctrl->tagset,
+ nvme_cancel_request, ctrl);
if (remove)
nvme_start_queues(ctrl);
nvme_tcp_destroy_io_queues(ctrl, remove);
@@ -2086,23 +2121,34 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_tcp_ctrl *ctrl = set->driver_data;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
- set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
- set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
- if (ctrl->ctrl.opts->nr_write_queues) {
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
/* separate read/write queues */
set->map[HCTX_TYPE_DEFAULT].nr_queues =
- ctrl->ctrl.opts->nr_write_queues;
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_READ];
set->map[HCTX_TYPE_READ].queue_offset =
- ctrl->ctrl.opts->nr_write_queues;
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
} else {
- /* mixed read/write queues */
+ /* shared read/write queues */
set->map[HCTX_TYPE_DEFAULT].nr_queues =
- ctrl->ctrl.opts->nr_io_queues;
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
set->map[HCTX_TYPE_READ].queue_offset = 0;
}
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+
+ dev_info(ctrl->ctrl.device,
+ "mapped %d/%d default/read queues.\n",
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
+ ctrl->io_queues[HCTX_TYPE_READ]);
+
return 0;
}
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 97d3c77365b8..e71502d141ed 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -167,6 +167,7 @@ TRACE_EVENT(nvme_async_event,
aer_name(NVME_AER_NOTICE_NS_CHANGED),
aer_name(NVME_AER_NOTICE_ANA),
aer_name(NVME_AER_NOTICE_FW_ACT_STARTING),
+ aer_name(NVME_AER_NOTICE_DISC_CHANGED),
aer_name(NVME_AER_ERROR),
aer_name(NVME_AER_SMART),
aer_name(NVME_AER_CSS),