aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/nvme/host/pci.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r--drivers/nvme/host/pci.c258
1 files changed, 198 insertions, 60 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d3c5086673bc..b82492cd7503 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold,
"Use SGLs when average request segment size is larger or equal to "
"this size. Use 0 to disable SGLs.");
+#define NVME_PCI_MIN_QUEUE_SIZE 2
+#define NVME_PCI_MAX_QUEUE_SIZE 4095
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
.set = io_queue_depth_set,
@@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = {
static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
-MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
+MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096");
static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
@@ -135,6 +137,7 @@ struct nvme_dev {
u32 cmbloc;
struct nvme_ctrl ctrl;
u32 last_ps;
+ bool hmb;
mempool_t *iod_mempool;
@@ -153,18 +156,14 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+
+ bool attrs_added;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
- int ret;
- u32 n;
-
- ret = kstrtou32(val, 10, &n);
- if (ret != 0 || n < 2)
- return -EINVAL;
-
- return param_set_uint(val, kp);
+ return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
+ NVME_PCI_MAX_QUEUE_SIZE);
}
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@@ -1014,7 +1013,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
return;
}
- req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id);
+ req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
if (unlikely(!req)) {
dev_warn(nvmeq->dev->ctrl.device,
"invalid id %d completed on queue %d\n",
@@ -1554,6 +1553,28 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
wmb(); /* ensure the first interrupt sees the initialization */
}
+/*
+ * Try getting shutdown_lock while setting up IO queues.
+ */
+static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
+{
+ /*
+ * Give up if the lock is being held by nvme_dev_disable.
+ */
+ if (!mutex_trylock(&dev->shutdown_lock))
+ return -ENODEV;
+
+ /*
+ * Controller is in wrong state, fail early.
+ */
+ if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
+ mutex_unlock(&dev->shutdown_lock);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
struct nvme_dev *dev = nvmeq->dev;
@@ -1582,8 +1603,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
goto release_cq;
nvmeq->cq_vector = vector;
- nvme_init_queue(nvmeq, qid);
+ result = nvme_setup_io_queues_trylock(dev);
+ if (result)
+ return result;
+ nvme_init_queue(nvmeq, qid);
if (!polled) {
result = queue_request_irq(nvmeq);
if (result < 0)
@@ -1591,10 +1615,12 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
}
set_bit(NVMEQ_ENABLED, &nvmeq->flags);
+ mutex_unlock(&dev->shutdown_lock);
return result;
release_sq:
dev->online_queues--;
+ mutex_unlock(&dev->shutdown_lock);
adapter_delete_sq(dev, qid);
release_cq:
adapter_delete_cq(dev, qid);
@@ -1781,17 +1807,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
return ret >= 0 ? 0 : ret;
}
-static ssize_t nvme_cmb_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
-
- return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n",
- ndev->cmbloc, ndev->cmbsz);
-}
-static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
-
static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
@@ -1860,20 +1875,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
pci_p2pmem_publish(pdev, true);
-
- if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
- &dev_attr_cmb.attr, NULL))
- dev_warn(dev->ctrl.device,
- "failed to add sysfs attribute for CMB\n");
-}
-
-static inline void nvme_release_cmb(struct nvme_dev *dev)
-{
- if (dev->cmb_size) {
- sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
- &dev_attr_cmb.attr, NULL);
- dev->cmb_size = 0;
- }
}
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
@@ -1896,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
dev_warn(dev->ctrl.device,
"failed to set host mem (err %d, flags %#x).\n",
ret, bits);
- }
+ } else
+ dev->hmb = bits & NVME_HOST_MEM_ENABLE;
+
return ret;
}
@@ -2053,6 +2056,102 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
return ret;
}
+static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
+
+ return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n",
+ ndev->cmbloc, ndev->cmbsz);
+}
+static DEVICE_ATTR_RO(cmb);
+
+static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
+
+ return sysfs_emit(buf, "%u\n", ndev->cmbloc);
+}
+static DEVICE_ATTR_RO(cmbloc);
+
+static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
+
+ return sysfs_emit(buf, "%u\n", ndev->cmbsz);
+}
+static DEVICE_ATTR_RO(cmbsz);
+
+static ssize_t hmb_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
+
+ return sysfs_emit(buf, "%d\n", ndev->hmb);
+}
+
+static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
+ bool new;
+ int ret;
+
+ if (strtobool(buf, &new) < 0)
+ return -EINVAL;
+
+ if (new == ndev->hmb)
+ return count;
+
+ if (new) {
+ ret = nvme_setup_host_mem(ndev);
+ } else {
+ ret = nvme_set_host_mem(ndev, 0);
+ if (!ret)
+ nvme_free_host_mem(ndev);
+ }
+
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+static DEVICE_ATTR_RW(hmb);
+
+static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct nvme_ctrl *ctrl =
+ dev_get_drvdata(container_of(kobj, struct device, kobj));
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+
+ if (a == &dev_attr_cmb.attr ||
+ a == &dev_attr_cmbloc.attr ||
+ a == &dev_attr_cmbsz.attr) {
+ if (!dev->cmbsz)
+ return 0;
+ }
+ if (a == &dev_attr_hmb.attr && !ctrl->hmpre)
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute *nvme_pci_attrs[] = {
+ &dev_attr_cmb.attr,
+ &dev_attr_cmbloc.attr,
+ &dev_attr_cmbsz.attr,
+ &dev_attr_hmb.attr,
+ NULL,
+};
+
+static const struct attribute_group nvme_pci_attr_group = {
+ .attrs = nvme_pci_attrs,
+ .is_visible = nvme_pci_attrs_are_visible,
+};
+
/*
* nirqs is the number of interrupts available for write and read
* queues. The core already reserved an interrupt for the admin queue.
@@ -2167,7 +2266,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (nr_io_queues == 0)
return 0;
- clear_bit(NVMEQ_ENABLED, &adminq->flags);
+ /*
+ * Free IRQ resources as soon as NVMEQ_ENABLED bit transitions
+ * from set to unset. If there is a window to it is truely freed,
+ * pci_free_irq_vectors() jumping into this window will crash.
+ * And take lock to avoid racing with pci_free_irq_vectors() in
+ * nvme_dev_disable() path.
+ */
+ result = nvme_setup_io_queues_trylock(dev);
+ if (result)
+ return result;
+ if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
+ pci_free_irq(pdev, 0, adminq);
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -2183,14 +2293,17 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
result = nvme_remap_bar(dev, size);
if (!result)
break;
- if (!--nr_io_queues)
- return -ENOMEM;
+ if (!--nr_io_queues) {
+ result = -ENOMEM;
+ goto out_unlock;
+ }
} while (1);
adminq->q_db = dev->dbs;
retry:
/* Deregister the admin queue's interrupt */
- pci_free_irq(pdev, 0, adminq);
+ if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
+ pci_free_irq(pdev, 0, adminq);
/*
* If we enable msix early due to not intx, disable it again before
@@ -2199,8 +2312,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
pci_free_irq_vectors(pdev);
result = nvme_setup_irqs(dev, nr_io_queues);
- if (result <= 0)
- return -EIO;
+ if (result <= 0) {
+ result = -EIO;
+ goto out_unlock;
+ }
dev->num_vecs = result;
result = max(result - 1, 1);
@@ -2214,8 +2329,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
*/
result = queue_request_irq(adminq);
if (result)
- return result;
+ goto out_unlock;
set_bit(NVMEQ_ENABLED, &adminq->flags);
+ mutex_unlock(&dev->shutdown_lock);
result = nvme_create_io_queues(dev);
if (result || dev->online_queues < 2)
@@ -2224,6 +2340,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
nvme_disable_io_queues(dev);
+ result = nvme_setup_io_queues_trylock(dev);
+ if (result)
+ return result;
nvme_suspend_io_queues(dev);
goto retry;
}
@@ -2232,6 +2351,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
dev->io_queues[HCTX_TYPE_READ],
dev->io_queues[HCTX_TYPE_POLL]);
return 0;
+out_unlock:
+ mutex_unlock(&dev->shutdown_lock);
+ return result;
}
static void nvme_del_queue_end(struct request *req, blk_status_t error)
@@ -2581,7 +2703,9 @@ static void nvme_reset_work(struct work_struct *work)
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
- if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
+ if (dev->ctrl.state != NVME_CTRL_RESETTING) {
+ dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
+ dev->ctrl.state);
result = -ENODEV;
goto out;
}
@@ -2699,6 +2823,10 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
+ if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
+ &nvme_pci_attr_group))
+ dev->attrs_added = true;
+
nvme_start_ctrl(&dev->ctrl);
return;
@@ -2947,6 +3075,13 @@ static void nvme_shutdown(struct pci_dev *pdev)
nvme_disable_prepare_reset(dev, true);
}
+static void nvme_remove_attrs(struct nvme_dev *dev)
+{
+ if (dev->attrs_added)
+ sysfs_remove_group(&dev->ctrl.device->kobj,
+ &nvme_pci_attr_group);
+}
+
/*
* The driver's remove may be called on a device in a partially initialized
* state. This function must not have any dependencies on the device state in
@@ -2962,14 +3097,13 @@ static void nvme_remove(struct pci_dev *pdev)
if (!pci_device_is_present(pdev)) {
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
nvme_dev_disable(dev, true);
- nvme_dev_remove_admin(dev);
}
flush_work(&dev->ctrl.reset_work);
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_dev_disable(dev, true);
- nvme_release_cmb(dev);
+ nvme_remove_attrs(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
@@ -2996,8 +3130,13 @@ static int nvme_resume(struct device *dev)
if (ndev->last_ps == U32_MAX ||
nvme_set_power_state(ctrl, ndev->last_ps) != 0)
- return nvme_try_sched_reset(&ndev->ctrl);
+ goto reset;
+ if (ctrl->hmpre && nvme_setup_host_mem(ndev))
+ goto reset;
+
return 0;
+reset:
+ return nvme_try_sched_reset(ctrl);
}
static int nvme_suspend(struct device *dev)
@@ -3021,15 +3160,9 @@ static int nvme_suspend(struct device *dev)
* the PCI bus layer to put it into D3 in order to take the PCIe link
* down, so as to allow the platform to achieve its minimum low-power
* state (which may not be possible if the link is up).
- *
- * If a host memory buffer is enabled, shut down the device as the NVMe
- * specification allows the device to access the host memory buffer in
- * host DRAM from all power states, but hosts will fail access to DRAM
- * during S3.
*/
if (pm_suspend_via_firmware() || !ctrl->npss ||
!pcie_aspm_enabled(pdev) ||
- ndev->nr_host_mem_descs ||
(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
return nvme_disable_prepare_reset(ndev, true);
@@ -3040,6 +3173,17 @@ static int nvme_suspend(struct device *dev)
if (ctrl->state != NVME_CTRL_LIVE)
goto unfreeze;
+ /*
+ * Host memory access may not be successful in a system suspend state,
+ * but the specification allows the controller to access memory in a
+ * non-operational power state.
+ */
+ if (ndev->hmb) {
+ ret = nvme_set_host_mem(ndev, 0);
+ if (ret < 0)
+ goto unfreeze;
+ }
+
ret = nvme_get_power_state(ctrl, &ndev->last_ps);
if (ret < 0)
goto unfreeze;
@@ -3192,12 +3336,6 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */
.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
- { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */
- .driver_data = NVME_QUIRK_LIGHTNVM, },
- { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */
- .driver_data = NVME_QUIRK_LIGHTNVM, },
- { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */
- .driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */