path: root/drivers/nvme/host
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--  drivers/nvme/host/Kconfig     |  10
-rw-r--r--  drivers/nvme/host/Makefile    |   1
-rw-r--r--  drivers/nvme/host/core.c      | 260
-rw-r--r--  drivers/nvme/host/fabrics.h   |   3
-rw-r--r--  drivers/nvme/host/fc.c        |  49
-rw-r--r--  drivers/nvme/host/hwmon.c     | 259
-rw-r--r--  drivers/nvme/host/multipath.c |  24
-rw-r--r--  drivers/nvme/host/nvme.h      |  40
-rw-r--r--  drivers/nvme/host/pci.c       | 112
-rw-r--r--  drivers/nvme/host/rdma.c      |  51
-rw-r--r--  drivers/nvme/host/tcp.c       |  21
11 files changed, 666 insertions, 164 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 2b36f052bfb9..c6439638a419 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -23,6 +23,16 @@ config NVME_MULTIPATH
/dev/nvmeXnY device will show up for each NVMe namespace,
even if it is accessible through multiple controllers.
+config NVME_HWMON
+ bool "NVMe hardware monitoring"
+ depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
+ help
+ This provides support for NVMe hardware monitoring. If enabled,
+ a hardware monitoring device will be created for each NVMe drive
+ in the system.
+
+ If unsure, say N.
+
config NVME_FABRICS
tristate
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 8a4b671c5f0c..fc7b26be692d 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -14,6 +14,7 @@ nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
+nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
nvme-y += pci.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 108f60b46804..dfe37a525f3a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -102,10 +102,13 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
*/
if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
- revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
blk_mq_unquiesce_queue(ns->queue);
+ /*
+ * Revalidate after unblocking dispatchers that may be holding bd_mutex
+ */
+ revalidate_disk(ns->disk);
}
static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@ -113,10 +116,26 @@ static void nvme_queue_scan(struct nvme_ctrl *ctrl)
/*
* Only new queue scan work when admin and IO queues are both alive
*/
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
queue_work(nvme_wq, &ctrl->scan_work);
}
+/*
+ * Use this function to proceed with scheduling reset_work for a controller
+ * that had previously been set to the resetting state. This is intended for
+ * code paths that can't be interrupted by other reset attempts. A hot removal
+ * may prevent this from succeeding.
+ */
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->state != NVME_CTRL_RESETTING)
+ return -EBUSY;
+ if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
+ return -EBUSY;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
+
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -134,8 +153,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
ret = nvme_reset_ctrl(ctrl);
if (!ret) {
flush_work(&ctrl->reset_work);
- if (ctrl->state != NVME_CTRL_LIVE &&
- ctrl->state != NVME_CTRL_ADMIN_ONLY)
+ if (ctrl->state != NVME_CTRL_LIVE)
ret = -ENETRESET;
}
@@ -265,6 +283,8 @@ void nvme_complete_rq(struct request *req)
trace_nvme_complete_rq(req);
+ nvme_cleanup_cmd(req);
+
if (nvme_req(req)->ctrl->kas)
nvme_req(req)->ctrl->comp_seen = true;
@@ -295,7 +315,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
if (blk_mq_request_completed(req))
return true;
- nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
+ nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
blk_mq_complete_request(req);
return true;
}
@@ -312,15 +332,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
old_state = ctrl->state;
switch (new_state) {
- case NVME_CTRL_ADMIN_ONLY:
- switch (old_state) {
- case NVME_CTRL_CONNECTING:
- changed = true;
- /* FALLTHRU */
- default:
- break;
- }
- break;
case NVME_CTRL_LIVE:
switch (old_state) {
case NVME_CTRL_NEW:
@@ -336,7 +347,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
- case NVME_CTRL_ADMIN_ONLY:
changed = true;
/* FALLTHRU */
default:
@@ -356,7 +366,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_DELETING:
switch (old_state) {
case NVME_CTRL_LIVE:
- case NVME_CTRL_ADMIN_ONLY:
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
@@ -378,8 +387,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
break;
}
- if (changed)
+ if (changed) {
ctrl->state = new_state;
+ wake_up_all(&ctrl->state_wq);
+ }
spin_unlock_irqrestore(&ctrl->lock, flags);
if (changed && ctrl->state == NVME_CTRL_LIVE)
@@ -388,6 +399,39 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
+/*
+ * Returns true for sink states that can't ever transition back to live.
+ */
+static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
+{
+ switch (ctrl->state) {
+ case NVME_CTRL_NEW:
+ case NVME_CTRL_LIVE:
+ case NVME_CTRL_RESETTING:
+ case NVME_CTRL_CONNECTING:
+ return false;
+ case NVME_CTRL_DELETING:
+ case NVME_CTRL_DEAD:
+ return true;
+ default:
+ WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
+ return true;
+ }
+}
+
+/*
+ * Waits for the controller state to be resetting, or returns false if it is
+ * not possible to ever transition to that state.
+ */
+bool nvme_wait_reset(struct nvme_ctrl *ctrl)
+{
+ wait_event(ctrl->state_wq,
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
+ nvme_state_terminal(ctrl));
+ return ctrl->state == NVME_CTRL_RESETTING;
+}
+EXPORT_SYMBOL_GPL(nvme_wait_reset);
+
static void nvme_free_ns_head(struct kref *ref)
{
struct nvme_ns_head *head =
@@ -569,8 +613,14 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
struct nvme_dsm_range *range;
struct bio *bio;
- range = kmalloc_array(segments, sizeof(*range),
- GFP_ATOMIC | __GFP_NOWARN);
+ /*
+ * Some devices do not consider the DSM 'Number of Ranges' field when
+ * determining how much data to DMA. Always allocate memory for maximum
+ * number of segments to prevent device reading beyond end of buffer.
+ */
+ static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
+
+ range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
if (!range) {
/*
* If we fail allocation our range, fallback to the controller
@@ -584,7 +634,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
}
__rq_for_each_bio(bio, req) {
- u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
+ u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
if (n < segments) {
@@ -610,7 +660,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
req->special_vec.bv_page = virt_to_page(range);
req->special_vec.bv_offset = offset_in_page(range);
- req->special_vec.bv_len = sizeof(*range) * segments;
+ req->special_vec.bv_len = alloc_size;
req->rq_flags |= RQF_SPECIAL_PAYLOAD;
return BLK_STS_OK;
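
A minimal user-space sketch of the worst-case allocation above, assuming the spec-defined NVME_DSM_MAX_RANGES of 256 and the 16-byte Dataset Management range descriptor; the struct and constant names below are stand-ins for the kernel definitions:

    /* Sketch only: shows why every discard request now carries a fixed 4 KiB payload. */
    #include <stdio.h>

    struct dsm_range {              /* mirrors struct nvme_dsm_range: cattr, nlb, slba */
        unsigned int cattr;
        unsigned int nlb;
        unsigned long long slba;
    };

    #define DSM_MAX_RANGES 256      /* assumed value of NVME_DSM_MAX_RANGES */

    int main(void)
    {
        size_t alloc_size = sizeof(struct dsm_range) * DSM_MAX_RANGES;

        printf("per-request DSM payload: %zu bytes\n", alloc_size); /* 4096 */
        return 0;
    }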
@@ -625,7 +675,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->write_zeroes.slba =
- cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
cmnd->write_zeroes.control = 0;
@@ -649,7 +699,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
- cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
@@ -847,7 +897,7 @@ out:
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
- u32 meta_seed, u32 *result, unsigned timeout)
+ u32 meta_seed, u64 *result, unsigned timeout)
{
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
@@ -888,7 +938,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
else
ret = nvme_req(req)->status;
if (result)
- *result = le32_to_cpu(nvme_req(req)->result.u32);
+ *result = le64_to_cpu(nvme_req(req)->result.u64);
if (meta && !ret && !write) {
if (copy_to_user(meta_buffer, meta, meta_len))
ret = -EFAULT;
@@ -1303,8 +1353,6 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
if (ns->disk && nvme_revalidate_disk(ns->disk))
nvme_set_queue_dying(ns);
up_read(&ctrl->namespaces_rwsem);
-
- nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
}
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
@@ -1320,6 +1368,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
+ nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
if (effects & NVME_CMD_EFFECTS_CCC)
@@ -1335,6 +1384,54 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_command c;
unsigned timeout = 0;
u32 effects;
+ u64 result;
+ int status;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+ return -EFAULT;
+ if (cmd.flags)
+ return -EINVAL;
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = cmd.opcode;
+ c.common.flags = cmd.flags;
+ c.common.nsid = cpu_to_le32(cmd.nsid);
+ c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+ c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+
+ if (cmd.timeout_ms)
+ timeout = msecs_to_jiffies(cmd.timeout_ms);
+
+ effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
+ status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+ (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
+ (void __user *)(uintptr_t)cmd.metadata,
+ cmd.metadata_len, 0, &result, timeout);
+ nvme_passthru_end(ctrl, effects);
+
+ if (status >= 0) {
+ if (put_user(result, &ucmd->result))
+ return -EFAULT;
+ }
+
+ return status;
+}
+
+static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ struct nvme_passthru_cmd64 __user *ucmd)
+{
+ struct nvme_passthru_cmd64 cmd;
+ struct nvme_command c;
+ unsigned timeout = 0;
+ u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@@ -1405,6 +1502,41 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
srcu_read_unlock(&head->srcu, idx);
}
+static bool is_ctrl_ioctl(unsigned int cmd)
+{
+ if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
+ return true;
+ if (is_sed_ioctl(cmd))
+ return true;
+ return false;
+}
+
+static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
+ void __user *argp,
+ struct nvme_ns_head *head,
+ int srcu_idx)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ int ret;
+
+ nvme_get_ctrl(ns->ctrl);
+ nvme_put_ns_from_disk(head, srcu_idx);
+
+ switch (cmd) {
+ case NVME_IOCTL_ADMIN_CMD:
+ ret = nvme_user_cmd(ctrl, NULL, argp);
+ break;
+ case NVME_IOCTL_ADMIN64_CMD:
+ ret = nvme_user_cmd64(ctrl, NULL, argp);
+ break;
+ default:
+ ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+ break;
+ }
+ nvme_put_ctrl(ctrl);
+ return ret;
+}
+
static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@@ -1422,20 +1554,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
* separately and drop the ns SRCU reference early. This avoids a
* deadlock when deleting namespaces using the passthrough interface.
*/
- if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) {
- struct nvme_ctrl *ctrl = ns->ctrl;
-
- nvme_get_ctrl(ns->ctrl);
- nvme_put_ns_from_disk(head, srcu_idx);
-
- if (cmd == NVME_IOCTL_ADMIN_CMD)
- ret = nvme_user_cmd(ctrl, NULL, argp);
- else
- ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
-
- nvme_put_ctrl(ctrl);
- return ret;
- }
+ if (is_ctrl_ioctl(cmd))
+ return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
switch (cmd) {
case NVME_IOCTL_ID:
@@ -1448,6 +1568,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
case NVME_IOCTL_SUBMIT_IO:
ret = nvme_submit_io(ns, argp);
break;
+ case NVME_IOCTL_IO64_CMD:
+ ret = nvme_user_cmd64(ns->ctrl, ns, argp);
+ break;
default:
if (ns->ndev)
ret = nvme_nvm_ioctl(ns, cmd, arg);
@@ -1532,7 +1655,7 @@ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
static void nvme_set_chunk_size(struct nvme_ns *ns)
{
- u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
+ u32 chunk_size = nvme_lba_to_sect(ns, ns->noiob);
blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
}
@@ -1569,8 +1692,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
{
- u32 max_sectors;
- unsigned short bs = 1 << ns->lba_shift;
+ u64 max_blocks;
if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
(ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
@@ -1586,11 +1708,12 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
* nvme_init_identify() if available.
*/
if (ns->ctrl->max_hw_sectors == UINT_MAX)
- max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
+ max_blocks = (u64)USHRT_MAX + 1;
else
- max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
+ max_blocks = ns->ctrl->max_hw_sectors + 1;
- blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
+ blk_queue_max_write_zeroes_sectors(disk->queue,
+ nvme_lba_to_sect(ns, max_blocks));
}
static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
@@ -1633,7 +1756,7 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
- sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
+ sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
unsigned short bs = 1 << ns->lba_shift;
u32 atomic_bs, phys_bs, io_opt;
@@ -2540,8 +2663,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
list_add_tail(&subsys->entry, &nvme_subsystems);
}
- if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
- dev_name(ctrl->device))) {
+ ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
+ dev_name(ctrl->device));
+ if (ret) {
dev_err(ctrl->device,
"failed to create sysfs link from subsystem.\n");
goto out_put_subsystem;
@@ -2670,6 +2794,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->oncs = le16_to_cpu(id->oncs);
ctrl->mtfa = le16_to_cpu(id->mtfa);
ctrl->oaes = le32_to_cpu(id->oaes);
+ ctrl->wctemp = le16_to_cpu(id->wctemp);
+ ctrl->cctemp = le16_to_cpu(id->cctemp);
+
atomic_set(&ctrl->abort_limit, id->acl + 1);
ctrl->vwc = id->vwc;
if (id->mdts)
@@ -2769,6 +2896,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
+ if (!ctrl->identified)
+ nvme_hwmon_init(ctrl);
+
ctrl->identified = true;
return 0;
@@ -2786,7 +2916,6 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
switch (ctrl->state) {
case NVME_CTRL_LIVE:
- case NVME_CTRL_ADMIN_ONLY:
break;
default:
return -EWOULDBLOCK;
@@ -2838,6 +2967,8 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
+ case NVME_IOCTL_ADMIN64_CMD:
+ return nvme_user_cmd64(ctrl, NULL, argp);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
case NVME_IOCTL_RESET:
@@ -2857,7 +2988,7 @@ static const struct file_operations nvme_dev_fops = {
.owner = THIS_MODULE,
.open = nvme_dev_open,
.unlocked_ioctl = nvme_dev_ioctl,
- .compat_ioctl = nvme_dev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
static ssize_t nvme_sysfs_reset(struct device *dev,
@@ -3045,6 +3176,8 @@ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_int_function(cntlid);
nvme_show_int_function(numa_node);
+nvme_show_int_function(queue_count);
+nvme_show_int_function(sqsize);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
@@ -3076,7 +3209,6 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live",
- [NVME_CTRL_ADMIN_ONLY] = "only-admin",
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_CONNECTING] = "connecting",
[NVME_CTRL_DELETING] = "deleting",
@@ -3125,6 +3257,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_address.attr,
&dev_attr_state.attr,
&dev_attr_numa_node.attr,
+ &dev_attr_queue_count.attr,
+ &dev_attr_sqsize.attr,
NULL
};
@@ -3585,11 +3719,10 @@ static void nvme_scan_work(struct work_struct *work)
struct nvme_id_ctrl *id;
unsigned nn;
- if (ctrl->state != NVME_CTRL_LIVE)
+ /* No tagset on a live ctrl means IO queues could not be created */
+ if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
return;
- WARN_ON_ONCE(!ctrl->tagset);
-
if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
dev_info(ctrl->device, "rescanning namespaces.\n");
nvme_clear_changed_ns_log(ctrl);
@@ -3750,13 +3883,13 @@ static void nvme_fw_act_work(struct work_struct *work)
if (time_after(jiffies, fw_act_timeout)) {
dev_warn(ctrl->device,
"Fw activation timeout, reset controller\n");
- nvme_reset_ctrl(ctrl);
- break;
+ nvme_try_sched_reset(ctrl);
+ return;
}
msleep(100);
}
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
return;
nvme_start_queues(ctrl);
@@ -3776,7 +3909,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
nvme_queue_scan(ctrl);
break;
case NVME_AER_NOTICE_FW_ACT_STARTING:
- queue_work(nvme_wq, &ctrl->fw_act_work);
+ /*
+ * We are (ab)using the RESETTING state to prevent subsequent
+ * recovery actions from interfering with the controller's
+ * firmware activation.
+ */
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_wq, &ctrl->fw_act_work);
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
@@ -3899,6 +4038,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
+ init_waitqueue_head(&ctrl->state_wq);
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 93f08d77c896..a0ec40ab62ee 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -182,8 +182,7 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
- if (likely(ctrl->state == NVME_CTRL_LIVE ||
- ctrl->state == NVME_CTRL_ADMIN_ONLY))
+ if (likely(ctrl->state == NVME_CTRL_LIVE))
return true;
return __nvmf_check_ready(ctrl, rq, queue_live);
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 265f89e11d8b..679a721ae229 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1224,7 +1224,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
lsreq->rqstlen = sizeof(*assoc_rqst);
lsreq->rspaddr = assoc_acc;
lsreq->rsplen = sizeof(*assoc_acc);
- lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+ lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC;
ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
if (ret)
@@ -1264,7 +1264,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
if (fcret) {
ret = -EBADF;
dev_err(ctrl->dev,
- "q %d connect failed: %s\n",
+ "q %d Create Association LS failed: %s\n",
queue->qnum, validation_errors[fcret]);
} else {
ctrl->association_id =
@@ -1332,7 +1332,7 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
lsreq->rqstlen = sizeof(*conn_rqst);
lsreq->rspaddr = conn_acc;
lsreq->rsplen = sizeof(*conn_acc);
- lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+ lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC;
ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
if (ret)
@@ -1363,7 +1363,7 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
if (fcret) {
ret = -EBADF;
dev_err(ctrl->dev,
- "q %d connect failed: %s\n",
+ "q %d Create I/O Connection LS failed: %s\n",
queue->qnum, validation_errors[fcret]);
} else {
queue->connection_id =
@@ -1376,7 +1376,7 @@ out_free_buffer:
out_no_memory:
if (ret)
dev_err(ctrl->dev,
- "queue %d connect command failed (%d).\n",
+ "queue %d connect I/O queue failed (%d).\n",
queue->qnum, ret);
return ret;
}
@@ -1413,8 +1413,8 @@ nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
static void
nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
{
- struct fcnvme_ls_disconnect_rqst *discon_rqst;
- struct fcnvme_ls_disconnect_acc *discon_acc;
+ struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst;
+ struct fcnvme_ls_disconnect_assoc_acc *discon_acc;
struct nvmefc_ls_req_op *lsop;
struct nvmefc_ls_req *lsreq;
int ret;
@@ -1430,11 +1430,11 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
lsreq = &lsop->ls_req;
lsreq->private = (void *)&lsop[1];
- discon_rqst = (struct fcnvme_ls_disconnect_rqst *)
+ discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *)
(lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
- discon_acc = (struct fcnvme_ls_disconnect_acc *)&discon_rqst[1];
+ discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1];
- discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT;
+ discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT_ASSOC;
discon_rqst->desc_list_len = cpu_to_be32(
sizeof(struct fcnvme_lsdesc_assoc_id) +
sizeof(struct fcnvme_lsdesc_disconn_cmd));
@@ -1451,22 +1451,17 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
discon_rqst->discon_cmd.desc_len =
fcnvme_lsdesc_len(
sizeof(struct fcnvme_lsdesc_disconn_cmd));
- discon_rqst->discon_cmd.scope = FCNVME_DISCONN_ASSOCIATION;
- discon_rqst->discon_cmd.id = cpu_to_be64(ctrl->association_id);
lsreq->rqstaddr = discon_rqst;
lsreq->rqstlen = sizeof(*discon_rqst);
lsreq->rspaddr = discon_acc;
lsreq->rsplen = sizeof(*discon_acc);
- lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+ lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC;
ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop,
nvme_fc_disconnect_assoc_done);
if (ret)
kfree(lsop);
-
- /* only meaningful part to terminating the association */
- ctrl->association_id = 0;
}
@@ -1662,7 +1657,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
(freq->rcv_rsplen / 4) ||
be32_to_cpu(op->rsp_iu.xfrd_len) !=
freq->transferred_length ||
- op->rsp_iu.status_code ||
+ op->rsp_iu.ersp_result ||
sqe->common.command_id != cqe->command_id)) {
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
dev_info(ctrl->ctrl.device,
@@ -1672,7 +1667,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len),
be32_to_cpu(op->rsp_iu.xfrd_len),
freq->transferred_length,
- op->rsp_iu.status_code,
+ op->rsp_iu.ersp_result,
sqe->common.command_id,
cqe->command_id);
goto done;
@@ -1731,9 +1726,14 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
op->rq = rq;
op->rqno = rqno;
- cmdiu->scsi_id = NVME_CMD_SCSI_ID;
+ cmdiu->format_id = NVME_CMD_FORMAT_ID;
cmdiu->fc_id = NVME_CMD_FC_ID;
cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32));
+ if (queue->qnum)
+ cmdiu->rsv_cat = fccmnd_set_cat_css(0,
+ (NVME_CC_CSS_NVM >> NVME_CC_CSS_SHIFT));
+ else
+ cmdiu->rsv_cat = fccmnd_set_cat_admin(0);
op->fcp_req.cmddma = fc_dma_map_single(ctrl->lport->dev,
&op->cmd_iu, sizeof(op->cmd_iu), DMA_TO_DEVICE);
@@ -2173,8 +2173,6 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
rq_dma_dir(rq));
- nvme_cleanup_cmd(rq);
-
sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
freq->sg_cnt = 0;
@@ -2305,6 +2303,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
if (!(op->flags & FCOP_FLAGS_AEN))
nvme_fc_unmap_data(ctrl, op->rq, op);
+ nvme_cleanup_cmd(op->rq);
nvme_fc_ctrl_put(ctrl);
if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE &&
@@ -2695,7 +2694,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
/* warn if maxcmd is lower than queue_size */
dev_warn(ctrl->ctrl.device,
"queue_size %zu > ctrl maxcmd %u, reducing "
- "to queue_size\n",
+ "to maxcmd\n",
opts->queue_size, ctrl->ctrl.maxcmd);
opts->queue_size = ctrl->ctrl.maxcmd;
}
@@ -2703,7 +2702,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
/* warn if sqsize is lower than queue_size */
dev_warn(ctrl->ctrl.device,
- "queue_size %zu > ctrl sqsize %u, clamping down\n",
+ "queue_size %zu > ctrl sqsize %u, reducing "
+ "to sqsize\n",
opts->queue_size, ctrl->ctrl.sqsize + 1);
opts->queue_size = ctrl->ctrl.sqsize + 1;
}
@@ -2739,6 +2739,7 @@ out_term_aen_ops:
out_disconnect_admin_queue:
/* send a Disconnect(association) LS to fc-nvme target */
nvme_fc_xmt_disconnect_assoc(ctrl);
+ ctrl->association_id = 0;
out_delete_hw_queue:
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
out_free_queue:
@@ -2830,6 +2831,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
if (ctrl->association_id)
nvme_fc_xmt_disconnect_assoc(ctrl);
+ ctrl->association_id = 0;
+
if (ctrl->ctrl.tagset) {
nvme_fc_delete_hw_io_queues(ctrl);
nvme_fc_free_io_queues(ctrl);
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
new file mode 100644
index 000000000000..a5af21f5d370
--- /dev/null
+++ b/drivers/nvme/host/hwmon.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVM Express hardware monitoring support
+ * Copyright (c) 2019, Guenter Roeck
+ */
+
+#include <linux/hwmon.h>
+#include <asm/unaligned.h>
+
+#include "nvme.h"
+
+/* These macros should be moved to linux/temperature.h */
+#define MILLICELSIUS_TO_KELVIN(t) DIV_ROUND_CLOSEST((t) + 273150, 1000)
+#define KELVIN_TO_MILLICELSIUS(t) ((t) * 1000L - 273150)
+
+struct nvme_hwmon_data {
+ struct nvme_ctrl *ctrl;
+ struct nvme_smart_log log;
+ struct mutex read_lock;
+};
+
+static int nvme_get_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under,
+ long *temp)
+{
+ unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT;
+ u32 status;
+ int ret;
+
+ if (under)
+ threshold |= NVME_TEMP_THRESH_TYPE_UNDER;
+
+ ret = nvme_get_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0,
+ &status);
+ if (ret > 0)
+ return -EIO;
+ if (ret < 0)
+ return ret;
+ *temp = KELVIN_TO_MILLICELSIUS(status & NVME_TEMP_THRESH_MASK);
+
+ return 0;
+}
+
+static int nvme_set_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under,
+ long temp)
+{
+ unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT;
+ int ret;
+
+ temp = MILLICELSIUS_TO_KELVIN(temp);
+ threshold |= clamp_val(temp, 0, NVME_TEMP_THRESH_MASK);
+
+ if (under)
+ threshold |= NVME_TEMP_THRESH_TYPE_UNDER;
+
+ ret = nvme_set_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0,
+ NULL);
+ if (ret > 0)
+ return -EIO;
+
+ return ret;
+}
+
+static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
+{
+ int ret;
+
+ ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
+ &data->log, sizeof(data->log), 0);
+
+ return ret <= 0 ? ret : -EIO;
+}
+
+static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long *val)
+{
+ struct nvme_hwmon_data *data = dev_get_drvdata(dev);
+ struct nvme_smart_log *log = &data->log;
+ int temp;
+ int err;
+
+ /*
+ * First handle attributes which don't require us to read
+ * the smart log.
+ */
+ switch (attr) {
+ case hwmon_temp_max:
+ return nvme_get_temp_thresh(data->ctrl, channel, false, val);
+ case hwmon_temp_min:
+ return nvme_get_temp_thresh(data->ctrl, channel, true, val);
+ case hwmon_temp_crit:
+ *val = KELVIN_TO_MILLICELSIUS(data->ctrl->cctemp);
+ return 0;
+ default:
+ break;
+ }
+
+ mutex_lock(&data->read_lock);
+ err = nvme_hwmon_get_smart_log(data);
+ if (err)
+ goto unlock;
+
+ switch (attr) {
+ case hwmon_temp_input:
+ if (!channel)
+ temp = get_unaligned_le16(log->temperature);
+ else
+ temp = le16_to_cpu(log->temp_sensor[channel - 1]);
+ *val = KELVIN_TO_MILLICELSIUS(temp);
+ break;
+ case hwmon_temp_alarm:
+ *val = !!(log->critical_warning & NVME_SMART_CRIT_TEMPERATURE);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+unlock:
+ mutex_unlock(&data->read_lock);
+ return err;
+}
+
+static int nvme_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long val)
+{
+ struct nvme_hwmon_data *data = dev_get_drvdata(dev);
+
+ switch (attr) {
+ case hwmon_temp_max:
+ return nvme_set_temp_thresh(data->ctrl, channel, false, val);
+ case hwmon_temp_min:
+ return nvme_set_temp_thresh(data->ctrl, channel, true, val);
+ default:
+ break;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static const char * const nvme_hwmon_sensor_names[] = {
+ "Composite",
+ "Sensor 1",
+ "Sensor 2",
+ "Sensor 3",
+ "Sensor 4",
+ "Sensor 5",
+ "Sensor 6",
+ "Sensor 7",
+ "Sensor 8",
+};
+
+static int nvme_hwmon_read_string(struct device *dev,
+ enum hwmon_sensor_types type, u32 attr,
+ int channel, const char **str)
+{
+ *str = nvme_hwmon_sensor_names[channel];
+ return 0;
+}
+
+static umode_t nvme_hwmon_is_visible(const void *_data,
+ enum hwmon_sensor_types type,
+ u32 attr, int channel)
+{
+ const struct nvme_hwmon_data *data = _data;
+
+ switch (attr) {
+ case hwmon_temp_crit:
+ if (!channel && data->ctrl->cctemp)
+ return 0444;
+ break;
+ case hwmon_temp_max:
+ case hwmon_temp_min:
+ if ((!channel && data->ctrl->wctemp) ||
+ (channel && data->log.temp_sensor[channel - 1])) {
+ if (data->ctrl->quirks &
+ NVME_QUIRK_NO_TEMP_THRESH_CHANGE)
+ return 0444;
+ return 0644;
+ }
+ break;
+ case hwmon_temp_alarm:
+ if (!channel)
+ return 0444;
+ break;
+ case hwmon_temp_input:
+ case hwmon_temp_label:
+ if (!channel || data->log.temp_sensor[channel - 1])
+ return 0444;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static const struct hwmon_channel_info *nvme_hwmon_info[] = {
+ HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ),
+ HWMON_CHANNEL_INFO(temp,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_CRIT | HWMON_T_LABEL | HWMON_T_ALARM,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL,
+ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
+ HWMON_T_LABEL),
+ NULL
+};
+
+static const struct hwmon_ops nvme_hwmon_ops = {
+ .is_visible = nvme_hwmon_is_visible,
+ .read = nvme_hwmon_read,
+ .read_string = nvme_hwmon_read_string,
+ .write = nvme_hwmon_write,
+};
+
+static const struct hwmon_chip_info nvme_hwmon_chip_info = {
+ .ops = &nvme_hwmon_ops,
+ .info = nvme_hwmon_info,
+};
+
+void nvme_hwmon_init(struct nvme_ctrl *ctrl)
+{
+ struct device *dev = ctrl->dev;
+ struct nvme_hwmon_data *data;
+ struct device *hwmon;
+ int err;
+
+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return;
+
+ data->ctrl = ctrl;
+ mutex_init(&data->read_lock);
+
+ err = nvme_hwmon_get_smart_log(data);
+ if (err) {
+ dev_warn(dev, "Failed to read smart log (error %d)\n", err);
+ devm_kfree(dev, data);
+ return;
+ }
+
+ hwmon = devm_hwmon_device_register_with_info(dev, "nvme", data,
+ &nvme_hwmon_chip_info,
+ NULL);
+ if (IS_ERR(hwmon)) {
+ dev_warn(dev, "Failed to instantiate hwmon device\n");
+ devm_kfree(dev, data);
+ }
+}
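
A minimal user-space sketch of the Kelvin/millicelsius conversions the new hwmon code is built on, assuming the kernel's DIV_ROUND_CLOSEST rounding for non-negative inputs; the open-coded macros below are stand-ins for the ones added above:

    /* Sketch only: round-trip between NVMe temperatures (Kelvin) and hwmon units (m°C). */
    #include <stdio.h>

    #define MILLICELSIUS_TO_KELVIN(t) (((t) + 273150 + 500) / 1000) /* rounded divide, t >= 0 assumed */
    #define KELVIN_TO_MILLICELSIUS(t) ((t) * 1000L - 273150)

    int main(void)
    {
        /* A 358 K warning threshold reads back as 84850 m°C (84.85 °C) in sysfs. */
        printf("358 K     -> %ld m°C\n", KELVIN_TO_MILLICELSIUS(358));
        /* Writing 85000 m°C to temp1_max becomes a 358 K threshold in the feature. */
        printf("85000 m°C -> %d K\n", MILLICELSIUS_TO_KELVIN(85000));
        return 0;
    }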
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 30de7efef003..797c18337d96 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -95,6 +95,7 @@ void nvme_failover_req(struct request *req)
}
break;
case NVME_SC_HOST_PATH_ERROR:
+ case NVME_SC_HOST_ABORTED_CMD:
/*
* Temporary transport disruption in talking to the controller.
* Try to send on a new path.
@@ -158,9 +159,11 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
struct nvme_ns *ns;
mutex_lock(&ctrl->scan_lock);
+ down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
if (nvme_mpath_clear_current_path(ns))
kblockd_schedule_work(&ns->head->requeue_work);
+ up_read(&ctrl->namespaces_rwsem);
mutex_unlock(&ctrl->scan_lock);
}
@@ -444,8 +447,14 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
struct nvme_ana_group_desc *desc = base + offset;
- u32 nr_nsids = le32_to_cpu(desc->nnsids);
- size_t nsid_buf_size = nr_nsids * sizeof(__le32);
+ u32 nr_nsids;
+ size_t nsid_buf_size;
+
+ if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
+ return -EINVAL;
+
+ nr_nsids = le32_to_cpu(desc->nnsids);
+ nsid_buf_size = nr_nsids * sizeof(__le32);
if (WARN_ON_ONCE(desc->grpid == 0))
return -EINVAL;
@@ -465,8 +474,6 @@ static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
return error;
offset += nsid_buf_size;
- if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
- return -EINVAL;
}
return 0;
@@ -522,14 +529,13 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
return 0;
}
-static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
u32 nr_change_groups = 0;
int error;
mutex_lock(&ctrl->ana_lock);
- error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
- groups_only ? NVME_ANA_LOG_RGO : 0,
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
ctrl->ana_log_buf, ctrl->ana_log_size, 0);
if (error) {
dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
@@ -565,7 +571,7 @@ static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
- nvme_read_ana_log(ctrl, false);
+ nvme_read_ana_log(ctrl);
}
static void nvme_anatt_timeout(struct timer_list *t)
@@ -715,7 +721,7 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
goto out;
}
- error = nvme_read_ana_log(ctrl, true);
+ error = nvme_read_ana_log(ctrl);
if (error)
goto out_free_ana_log_buf;
return 0;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b5013c101b35..3b9cbe0668fa 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -15,6 +15,7 @@
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
+#include <linux/wait.h>
#include <trace/events/block.h>
@@ -114,6 +115,11 @@ enum nvme_quirks {
* Prevent tag overlap between queues
*/
NVME_QUIRK_SHARED_TAGS = (1 << 13),
+
+ /*
+ * Don't change the value of the temperature threshold feature
+ */
+ NVME_QUIRK_NO_TEMP_THRESH_CHANGE = (1 << 14),
};
/*
@@ -161,7 +167,6 @@ static inline u16 nvme_req_qid(struct request *req)
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
- NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
NVME_CTRL_RESETTING,
NVME_CTRL_CONNECTING,
NVME_CTRL_DELETING,
@@ -199,6 +204,7 @@ struct nvme_ctrl {
struct cdev cdev;
struct work_struct reset_work;
struct work_struct delete_work;
+ wait_queue_head_t state_wq;
struct nvme_subsystem *subsys;
struct list_head subsys_entry;
@@ -221,6 +227,7 @@ struct nvme_ctrl {
u16 oacs;
u16 nssa;
u16 nr_streams;
+ u16 sqsize;
u32 max_namespaces;
atomic_t abort_limit;
u8 vwc;
@@ -229,6 +236,8 @@ struct nvme_ctrl {
u16 kas;
u8 npss;
u8 apsta;
+ u16 wctemp;
+ u16 cctemp;
u32 oaes;
u32 aen_result;
u32 ctratt;
@@ -269,7 +278,6 @@ struct nvme_ctrl {
u16 hmmaxd;
/* Fabrics only */
- u16 sqsize;
u32 ioccsz;
u32 iorcsz;
u16 icdoff;
@@ -418,9 +426,20 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
}
-static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
+/*
+ * Convert a 512B sector number to a device logical block number.
+ */
+static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
+{
+ return sector >> (ns->lba_shift - SECTOR_SHIFT);
+}
+
+/*
+ * Convert a device logical block number to a 512B sector number.
+ */
+static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
{
- return (sector >> (ns->lba_shift - 9));
+ return lba << (ns->lba_shift - SECTOR_SHIFT);
}
static inline void nvme_end_request(struct request *req, __le16 status,
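
A minimal user-space sketch of the two new helpers, assuming a namespace formatted with 4096-byte logical blocks (lba_shift == 12) and the usual 512-byte kernel sector (SECTOR_SHIFT == 9); the function names below are stand-ins:

    /* Sketch only: 512B sectors <-> logical blocks for a 4 KiB-formatted namespace. */
    #include <stdio.h>

    #define SECTOR_SHIFT 9

    static unsigned long long sect_to_lba(unsigned int lba_shift, unsigned long long sector)
    {
        return sector >> (lba_shift - SECTOR_SHIFT);
    }

    static unsigned long long lba_to_sect(unsigned int lba_shift, unsigned long long lba)
    {
        return lba << (lba_shift - SECTOR_SHIFT);
    }

    int main(void)
    {
        printf("sector 80 -> LBA %llu\n", sect_to_lba(12, 80));    /* 10 */
        printf("LBA 10    -> sector %llu\n", lba_to_sect(12, 10)); /* 80 */
        return 0;
    }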
@@ -445,10 +464,16 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
put_device(ctrl->device);
}
+static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
+{
+ return !qid && command_id >= NVME_AQ_BLK_MQ_DEPTH;
+}
+
void nvme_complete_rq(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
+bool nvme_wait_reset(struct nvme_ctrl *ctrl);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
@@ -499,6 +524,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
@@ -649,4 +675,10 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
return dev_to_disk(dev)->private_data;
}
+#ifdef CONFIG_NVME_HWMON
+void nvme_hwmon_init(struct nvme_ctrl *ctrl);
+#else
+static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
+#endif
+
#endif /* _NVME_H */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c0808f9eb8ab..dcaad5831cee 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -773,7 +773,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
struct bio_vec *bv)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset;
+ unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
+ unsigned int first_prp_len = dev->ctrl.page_size - offset;
iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->first_dma))
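
A minimal user-space sketch of the masking fix above, assuming a 4096-byte controller page size; the bv_offset value is made up to show a bvec that starts past a page boundary:

    /* Sketch only: first-PRP length when the bvec offset exceeds one page. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int page_size = 4096;
        unsigned int bv_offset = 5000;                     /* offset within a multi-page segment */
        unsigned int offset = bv_offset & (page_size - 1); /* 904: in-page remainder only */
        unsigned int first_prp_len = page_size - offset;   /* 3192 */

        printf("offset=%u first_prp_len=%u\n", offset, first_prp_len);
        return 0;
    }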
@@ -924,7 +925,6 @@ static void nvme_pci_complete_rq(struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_dev *dev = iod->nvmeq->dev;
- nvme_cleanup_cmd(req);
if (blk_integrity_rq(req))
dma_unmap_page(dev->dev, iod->meta_dma,
rq_integrity_vec(req)->bv_len, rq_data_dir(req));
@@ -967,8 +967,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvmeq->qid == 0 &&
- cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
+ if (unlikely(nvme_is_aen_req(nvmeq->qid, cqe->command_id))) {
nvme_complete_async_event(&nvmeq->dev->ctrl,
cqe->status, &cqe->result);
return;
@@ -2263,10 +2262,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
-/*
- * return error value only when tagset allocation failed
- */
-static int nvme_dev_add(struct nvme_dev *dev)
+static void nvme_dev_add(struct nvme_dev *dev)
{
int ret;
@@ -2296,7 +2292,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (ret) {
dev_warn(dev->ctrl.device,
"IO queues tagset allocation failed %d\n", ret);
- return ret;
+ return;
}
dev->ctrl.tagset = &dev->tagset;
} else {
@@ -2307,7 +2303,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
}
nvme_dbbuf_set(dev);
- return 0;
}
static int nvme_pci_enable(struct nvme_dev *dev)
@@ -2467,6 +2462,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
mutex_unlock(&dev->shutdown_lock);
}
+static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
+{
+ if (!nvme_wait_reset(&dev->ctrl))
+ return -EBUSY;
+ nvme_dev_disable(dev, shutdown);
+ return 0;
+}
+
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
@@ -2490,14 +2493,20 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
dma_pool_destroy(dev->prp_small_pool);
}
+static void nvme_free_tagset(struct nvme_dev *dev)
+{
+ if (dev->tagset.tags)
+ blk_mq_free_tag_set(&dev->tagset);
+ dev->ctrl.tagset = NULL;
+}
+
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
nvme_dbbuf_dma_free(dev);
put_device(dev->dev);
- if (dev->tagset.tags)
- blk_mq_free_tag_set(&dev->tagset);
+ nvme_free_tagset(dev);
if (dev->ctrl.admin_q)
blk_put_queue(dev->ctrl.admin_q);
kfree(dev->queues);
@@ -2508,6 +2517,11 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
+ /*
+ * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+ * may be holding this pci_dev's device lock.
+ */
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_get_ctrl(&dev->ctrl);
nvme_dev_disable(dev, false);
nvme_kill_queues(&dev->ctrl);
@@ -2521,7 +2535,6 @@ static void nvme_reset_work(struct work_struct *work)
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
- enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
result = -ENODEV;
@@ -2615,13 +2628,11 @@ static void nvme_reset_work(struct work_struct *work)
dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
- new_state = NVME_CTRL_ADMIN_ONLY;
+ nvme_free_tagset(dev);
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- /* hit this only when allocate tagset fails */
- if (nvme_dev_add(dev))
- new_state = NVME_CTRL_ADMIN_ONLY;
+ nvme_dev_add(dev);
nvme_unfreeze(&dev->ctrl);
}
@@ -2629,9 +2640,9 @@ static void nvme_reset_work(struct work_struct *work)
* If only admin queue live, keep it to do further investigation or
* recovery.
*/
- if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device,
- "failed to mark controller state %d\n", new_state);
+ "failed to mark controller live state\n");
result = -ENODEV;
goto out;
}
@@ -2672,7 +2683,7 @@ static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
- *val = readq(to_nvme_dev(ctrl)->bar + off);
+ *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
return 0;
}
@@ -2836,19 +2847,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
static void nvme_reset_prepare(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_dev_disable(dev, false);
+
+ /*
+ * We don't need to check the return value from waiting for the reset
+ * state as pci_dev device lock is held, making it impossible to race
+ * with ->remove().
+ */
+ nvme_disable_prepare_reset(dev, false);
+ nvme_sync_queues(&dev->ctrl);
}
static void nvme_reset_done(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_reset_ctrl_sync(&dev->ctrl);
+
+ if (!nvme_try_sched_reset(&dev->ctrl))
+ flush_work(&dev->ctrl.reset_work);
}
static void nvme_shutdown(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_dev_disable(dev, true);
+ nvme_disable_prepare_reset(dev, true);
}
/*
@@ -2901,7 +2921,7 @@ static int nvme_resume(struct device *dev)
if (ndev->last_ps == U32_MAX ||
nvme_set_power_state(ctrl, ndev->last_ps) != 0)
- nvme_reset_ctrl(ctrl);
+ return nvme_try_sched_reset(&ndev->ctrl);
return 0;
}
@@ -2929,43 +2949,42 @@ static int nvme_suspend(struct device *dev)
*/
if (pm_suspend_via_firmware() || !ctrl->npss ||
!pcie_aspm_enabled(pdev) ||
- (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) {
- nvme_dev_disable(ndev, true);
- return 0;
- }
+ (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
+ return nvme_disable_prepare_reset(ndev, true);
nvme_start_freeze(ctrl);
nvme_wait_freeze(ctrl);
nvme_sync_queues(ctrl);
- if (ctrl->state != NVME_CTRL_LIVE &&
- ctrl->state != NVME_CTRL_ADMIN_ONLY)
+ if (ctrl->state != NVME_CTRL_LIVE)
goto unfreeze;
ret = nvme_get_power_state(ctrl, &ndev->last_ps);
if (ret < 0)
goto unfreeze;
+ /*
+ * A saved state prevents pci pm from generically controlling the
+ * device's power. If we're using protocol specific settings, we don't
+ * want pci interfering.
+ */
+ pci_save_state(pdev);
+
ret = nvme_set_power_state(ctrl, ctrl->npss);
if (ret < 0)
goto unfreeze;
if (ret) {
+ /* discard the saved state */
+ pci_load_saved_state(pdev, NULL);
+
/*
* Clearing npss forces a controller reset on resume. The
- * correct value will be resdicovered then.
+ * correct value will be rediscovered then.
*/
- nvme_dev_disable(ndev, true);
+ ret = nvme_disable_prepare_reset(ndev, true);
ctrl->npss = 0;
- ret = 0;
- goto unfreeze;
}
- /*
- * A saved state prevents pci pm from generically controlling the
- * device's power. If we're using protocol specific settings, we don't
- * want pci interfering.
- */
- pci_save_state(pdev);
unfreeze:
nvme_unfreeze(ctrl);
return ret;
@@ -2974,9 +2993,7 @@ unfreeze:
static int nvme_simple_suspend(struct device *dev)
{
struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
-
- nvme_dev_disable(ndev, true);
- return 0;
+ return nvme_disable_prepare_reset(ndev, true);
}
static int nvme_simple_resume(struct device *dev)
@@ -2984,8 +3001,7 @@ static int nvme_simple_resume(struct device *dev)
struct pci_dev *pdev = to_pci_dev(dev);
struct nvme_dev *ndev = pci_get_drvdata(pdev);
- nvme_reset_ctrl(&ndev->ctrl);
- return 0;
+ return nvme_try_sched_reset(&ndev->ctrl);
}
static const struct dev_pm_ops nvme_dev_pm_ops = {
@@ -3064,7 +3080,8 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
- NVME_QUIRK_MEDIUM_PRIO_SQ },
+ NVME_QUIRK_MEDIUM_PRIO_SQ |
+ NVME_QUIRK_NO_TEMP_THRESH_CHANGE },
{ PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
@@ -3090,6 +3107,9 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+ NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index dfa07bb9dfeb..dce59459ed41 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -427,7 +427,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
{
return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
- ibdev->attrs.max_fast_reg_page_list_len);
+ ibdev->attrs.max_fast_reg_page_list_len - 1);
}
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
@@ -437,7 +437,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
const int cq_factor = send_wr_factor + 1; /* + RECV */
int comp_vector, idx = nvme_rdma_queue_idx(queue);
enum ib_poll_context poll_ctx;
- int ret;
+ int ret, pages_per_mr;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
if (!queue->device) {
@@ -479,10 +479,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
goto out_destroy_qp;
}
+ /*
+ * Currently we don't use SG_GAPS MR's so if the first entry is
+ * misaligned we'll end up using two entries for a single data page,
+ * so one additional entry is required.
+ */
+ pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
queue->queue_size,
IB_MR_TYPE_MEM_REG,
- nvme_rdma_get_max_fr_pages(ibdev), 0);
+ pages_per_mr, 0);
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize MR pool sized %d for QID %d\n",
@@ -614,7 +620,8 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
if (!ret) {
set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
} else {
- __nvme_rdma_stop_queue(queue);
+ if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
+ __nvme_rdma_stop_queue(queue);
dev_info(ctrl->ctrl.device,
"failed to connect queue: %d ret=%d\n", idx, ret);
}
@@ -820,8 +827,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error)
goto out_stop_queue;
- ctrl->ctrl.max_hw_sectors =
- (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
+ ctrl->ctrl.max_segments = ctrl->max_fr_pages;
+ ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
@@ -1153,8 +1160,6 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
}
ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
-
- nvme_cleanup_cmd(rq);
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
}
@@ -1494,8 +1499,8 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
- cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
+ if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
+ cqe->command_id)))
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
@@ -1694,6 +1699,14 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
rq->tag, nvme_rdma_queue_idx(queue));
+ /*
+ * Restart the timer if a controller reset is already scheduled. Any
+ * timed out commands would be handled before entering the connecting
+ * state.
+ */
+ if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
+ return BLK_EH_RESET_TIMER;
+
if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
/*
* Teardown immediately if controller times out while starting
@@ -1753,7 +1766,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(err < 0)) {
dev_err(queue->ctrl->ctrl.device,
"Failed to map data (%d)\n", err);
- nvme_cleanup_cmd(rq);
goto err;
}
@@ -1764,18 +1776,19 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
req->mr ? &req->reg_wr.wr : NULL);
- if (unlikely(err)) {
- nvme_rdma_unmap_data(queue, rq);
- goto err;
- }
+ if (unlikely(err))
+ goto err_unmap;
return BLK_STS_OK;
+err_unmap:
+ nvme_rdma_unmap_data(queue, rq);
err:
if (err == -ENOMEM || err == -EAGAIN)
ret = BLK_STS_RESOURCE;
else
ret = BLK_STS_IOERR;
+ nvme_cleanup_cmd(rq);
unmap_qe:
ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
DMA_TO_DEVICE);
@@ -2118,8 +2131,16 @@ err_unreg_client:
static void __exit nvme_rdma_cleanup_module(void)
{
+ struct nvme_rdma_ctrl *ctrl;
+
nvmf_unregister_transport(&nvme_rdma_transport);
ib_unregister_client(&nvme_rdma_ib_client);
+
+ mutex_lock(&nvme_rdma_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
+ nvme_delete_ctrl(&ctrl->ctrl);
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
+ flush_workqueue(nvme_delete_wq);
}
module_init(nvme_rdma_init_module);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 4ffd5957637a..6d43b23a0fc8 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -491,8 +491,8 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
- cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
+ if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
+ cqe->command_id)))
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
@@ -1042,7 +1042,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
{
struct nvme_tcp_queue *queue =
container_of(w, struct nvme_tcp_queue, io_work);
- unsigned long start = jiffies + msecs_to_jiffies(1);
+ unsigned long deadline = jiffies + msecs_to_jiffies(1);
do {
bool pending = false;
@@ -1067,7 +1067,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
if (!pending)
return;
- } while (time_after(jiffies, start)); /* quota is exhausted */
+ } while (!time_after(jiffies, deadline)); /* quota is exhausted */
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
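
A minimal user-space sketch of why the loop condition is inverted, assuming the standard jiffies time_after() definition:

    /* Sketch only: time_after(now, deadline) is false until the deadline passes. */
    #include <stdio.h>

    #define time_after(a, b) ((long)((b) - (a)) < 0)

    int main(void)
    {
        unsigned long jiffies = 1000, deadline = jiffies + 1;

        /* Old form, while (time_after(jiffies, start)): false on entry, so one pass only. */
        printf("time_after(now, deadline)  = %d\n", time_after(jiffies, deadline));  /* 0 */
        /* New form keeps polling while the 1 ms budget has not yet expired. */
        printf("!time_after(now, deadline) = %d\n", !time_after(jiffies, deadline)); /* 1 */
        return 0;
    }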
@@ -1386,7 +1386,9 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+#ifdef CONFIG_NET_RX_BUSY_POLL
queue->sock->sk->sk_ll_usec = 1;
+#endif
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
return 0;
@@ -2044,6 +2046,14 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ /*
+ * Restart the timer if a controller reset is already scheduled. Any
+ * timed out commands would be handled before entering the connecting
+ * state.
+ */
+ if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
+ return BLK_EH_RESET_TIMER;
+
dev_warn(ctrl->ctrl.device,
"queue %d: timeout request %#x type %d\n",
nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
@@ -2126,6 +2136,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
ret = nvme_tcp_map_data(queue, rq);
if (unlikely(ret)) {
+ nvme_cleanup_cmd(rq);
dev_err(queue->ctrl->ctrl.device,
"Failed to map data (%d)\n", ret);
return ret;
@@ -2208,7 +2219,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
- if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
+ if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
return queue->nr_cqe;