Diffstat (limited to 'drivers/nvme/host/pci.c')
 -rw-r--r--  drivers/nvme/host/pci.c | 272
 1 file changed, 105 insertions(+), 167 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4fd733ff72b1..78dca3193ca4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -54,8 +54,7 @@
* We handle AEN commands ourselves and don't even let the
* block layer know about them.
*/
-#define NVME_NR_AEN_COMMANDS 1
-#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
+#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS)
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
@@ -92,9 +91,7 @@ struct nvme_dev {
struct msix_entry *entry;
void __iomem *bar;
struct work_struct reset_work;
- struct work_struct scan_work;
struct work_struct remove_work;
- struct work_struct async_work;
struct timer_list watchdog_timer;
struct mutex shutdown_lock;
bool subsystem;
@@ -102,11 +99,6 @@ struct nvme_dev {
dma_addr_t cmb_dma_addr;
u64 cmb_size;
u32 cmbsz;
- unsigned long flags;
-
-#define NVME_CTRL_RESETTING 0
-#define NVME_CTRL_REMOVING 1
-
struct nvme_ctrl ctrl;
struct completion ioq_wait;
};
@@ -271,40 +263,6 @@ static int nvme_init_request(void *data, struct request *req,
return 0;
}
-static void nvme_queue_scan(struct nvme_dev *dev)
-{
- /*
- * Do not queue new scan work when a controller is reset during
- * removal.
- */
- if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
- return;
- queue_work(nvme_workq, &dev->scan_work);
-}
-
-static void nvme_complete_async_event(struct nvme_dev *dev,
- struct nvme_completion *cqe)
-{
- u16 status = le16_to_cpu(cqe->status) >> 1;
- u32 result = le32_to_cpu(cqe->result);
-
- if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) {
- ++dev->ctrl.event_limit;
- queue_work(nvme_workq, &dev->async_work);
- }
-
- if (status != NVME_SC_SUCCESS)
- return;
-
- switch (result & 0xff07) {
- case NVME_AER_NOTICE_NS_CHANGED:
- dev_info(dev->ctrl.device, "rescanning\n");
- nvme_queue_scan(dev);
- default:
- dev_warn(dev->ctrl.device, "async event result %08x\n", result);
- }
-}
-
/**
* __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
@@ -334,16 +292,11 @@ static __le64 **iod_list(struct request *req)
return (__le64 **)(iod->sg + req->nr_phys_segments);
}
-static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
+static int nvme_init_iod(struct request *rq, unsigned size,
+ struct nvme_dev *dev)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
int nseg = rq->nr_phys_segments;
- unsigned size;
-
- if (rq->cmd_flags & REQ_DISCARD)
- size = sizeof(struct nvme_dsm_range);
- else
- size = blk_rq_bytes(rq);
if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
@@ -368,6 +321,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
__le64 **list = iod_list(req);
dma_addr_t prp_dma = iod->first_dma;
+ nvme_cleanup_cmd(req);
+
if (iod->npages == 0)
dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
for (i = 0; i < iod->npages; i++) {
@@ -529,7 +484,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req,
}
static int nvme_map_data(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+ unsigned size, struct nvme_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct request_queue *q = req->q;
@@ -546,7 +501,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
goto out;
- if (!nvme_setup_prps(dev, req, blk_rq_bytes(req)))
+ if (!nvme_setup_prps(dev, req, size))
goto out_unmap;
ret = BLK_MQ_RQ_QUEUE_ERROR;
@@ -596,37 +551,6 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
}
/*
- * We reuse the small pool to allocate the 16-byte range here as it is not
- * worth having a special pool for these or additional cases to handle freeing
- * the iod.
- */
-static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- struct request *req, struct nvme_command *cmnd)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_dsm_range *range;
-
- range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
- &iod->first_dma);
- if (!range)
- return BLK_MQ_RQ_QUEUE_BUSY;
- iod_list(req)[0] = (__le64 *)range;
- iod->npages = 0;
-
- range->cattr = cpu_to_le32(0);
- range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
- range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
-
- memset(cmnd, 0, sizeof(*cmnd));
- cmnd->dsm.opcode = nvme_cmd_dsm;
- cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
- cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
- cmnd->dsm.nr = 0;
- cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
- return BLK_MQ_RQ_QUEUE_OK;
-}
-
-/*
* NOTE: ns is NULL when called on the admin queue.
*/
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -637,6 +561,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
struct nvme_command cmnd;
+ unsigned map_len;
int ret = BLK_MQ_RQ_QUEUE_OK;
/*
@@ -652,23 +577,17 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
}
}
- ret = nvme_init_iod(req, dev);
+ map_len = nvme_map_len(req);
+ ret = nvme_init_iod(req, map_len, dev);
if (ret)
return ret;
- if (req->cmd_flags & REQ_DISCARD) {
- ret = nvme_setup_discard(nvmeq, ns, req, &cmnd);
- } else {
- if (req->cmd_type == REQ_TYPE_DRV_PRIV)
- memcpy(&cmnd, req->cmd, sizeof(cmnd));
- else if (req->cmd_flags & REQ_FLUSH)
- nvme_setup_flush(ns, &cmnd);
- else
- nvme_setup_rw(ns, req, &cmnd);
+ ret = nvme_setup_cmd(ns, req, &cmnd);
+ if (ret)
+ goto out;
- if (req->nr_phys_segments)
- ret = nvme_map_data(dev, req, &cmnd);
- }
+ if (req->nr_phys_segments)
+ ret = nvme_map_data(dev, req, map_len, &cmnd);
if (ret)
goto out;
@@ -764,7 +683,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
*/
if (unlikely(nvmeq->qid == 0 &&
cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
- nvme_complete_async_event(nvmeq->dev, &cqe);
+ nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe);
continue;
}
@@ -833,21 +752,18 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
return 0;
}
-static void nvme_async_event_work(struct work_struct *work)
+static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
{
- struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work);
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
struct nvme_queue *nvmeq = dev->queues[0];
struct nvme_command c;
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_async_event;
+ c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx;
spin_lock_irq(&nvmeq->q_lock);
- while (dev->ctrl.event_limit > 0) {
- c.common.command_id = NVME_AQ_BLKMQ_DEPTH +
- --dev->ctrl.event_limit;
- __nvme_submit_cmd(nvmeq, &c);
- }
+ __nvme_submit_cmd(nvmeq, &c);
spin_unlock_irq(&nvmeq->q_lock);
}
@@ -939,7 +855,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* cancellation error. All outstanding requests are completed on
* shutdown, so we return BLK_EH_HANDLED.
*/
- if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
+ if (dev->ctrl.state == NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
@@ -1003,16 +919,15 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
return BLK_EH_RESET_TIMER;
}
-static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
+static void nvme_cancel_io(struct request *req, void *data, bool reserved)
{
- struct nvme_queue *nvmeq = data;
int status;
if (!blk_mq_request_started(req))
return;
- dev_dbg_ratelimited(nvmeq->dev->ctrl.device,
- "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);
+ dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device,
+ "Cancelling I/O %d", req->tag);
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
@@ -1069,14 +984,6 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
return 0;
}
-static void nvme_clear_queue(struct nvme_queue *nvmeq)
-{
- spin_lock_irq(&nvmeq->q_lock);
- if (nvmeq->tags && *nvmeq->tags)
- blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
- spin_unlock_irq(&nvmeq->q_lock);
-}
-
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
struct nvme_queue *nvmeq = dev->queues[0];
@@ -1350,22 +1257,44 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
return result;
}
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+
+ /* If true, indicates loss of adapter communication, possibly by a
+ * NVMe Subsystem reset.
+ */
+ bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+ /* If there is a reset ongoing, we shouldn't reset again. */
+ if (work_busy(&dev->reset_work))
+ return false;
+
+ /* We shouldn't reset unless the controller is on fatal error state
+ * _or_ if we lost the communication with it.
+ */
+ if (!(csts & NVME_CSTS_CFS) && !nssro)
+ return false;
+
+ /* If PCI error recovery process is happening, we cannot reset or
+ * the recovery mechanism will surely fail.
+ */
+ if (pci_channel_offline(to_pci_dev(dev->dev)))
+ return false;
+
+ return true;
+}
+
static void nvme_watchdog_timer(unsigned long data)
{
struct nvme_dev *dev = (struct nvme_dev *)data;
u32 csts = readl(dev->bar + NVME_REG_CSTS);
- /*
- * Skip controllers currently under reset.
- */
- if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) &&
- ((csts & NVME_CSTS_CFS) ||
- (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) {
- if (queue_work(nvme_workq, &dev->reset_work)) {
+ /* Skip controllers under certain specific conditions. */
+ if (nvme_should_reset(dev, csts)) {
+ if (queue_work(nvme_workq, &dev->reset_work))
dev_warn(dev->dev,
"Failed status: 0x%x, reset controller.\n",
csts);
- }
return;
}
@@ -1465,7 +1394,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, i, vecs, nr_io_queues, size;
- nr_io_queues = num_possible_cpus();
+ nr_io_queues = num_online_cpus();
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
@@ -1551,8 +1480,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
return result;
}
-static void nvme_set_irq_hints(struct nvme_dev *dev)
+static void nvme_pci_post_scan(struct nvme_ctrl *ctrl)
{
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
struct nvme_queue *nvmeq;
int i;
@@ -1567,16 +1497,6 @@ static void nvme_set_irq_hints(struct nvme_dev *dev)
}
}
-static void nvme_dev_scan(struct work_struct *work)
-{
- struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
-
- if (!dev->tagset.tags)
- return;
- nvme_scan_namespaces(&dev->ctrl);
- nvme_set_irq_hints(dev);
-}
-
static void nvme_del_queue_end(struct request *req, int error)
{
struct nvme_queue *nvmeq = req->end_io_data;
@@ -1592,7 +1512,13 @@ static void nvme_del_cq_end(struct request *req, int error)
if (!error) {
unsigned long flags;
- spin_lock_irqsave(&nvmeq->q_lock, flags);
+ /*
+ * We might be called with the AQ q_lock held
+ * and the I/O queue q_lock should always
+ * nest inside the AQ one.
+ */
+ spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
+ SINGLE_DEPTH_NESTING);
nvme_process_cq(nvmeq);
spin_unlock_irqrestore(&nvmeq->q_lock, flags);
}
@@ -1625,12 +1551,12 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
static void nvme_disable_io_queues(struct nvme_dev *dev)
{
- int pass;
+ int pass, queues = dev->online_queues - 1;
unsigned long timeout;
u8 opcode = nvme_admin_delete_sq;
for (pass = 0; pass < 2; pass++) {
- int sent = 0, i = dev->queue_count - 1;
+ int sent = 0, i = queues;
reinit_completion(&dev->ioq_wait);
retry:
@@ -1684,7 +1610,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
nvme_free_queues(dev, dev->online_queues);
}
- nvme_queue_scan(dev);
return 0;
}
@@ -1797,8 +1722,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
}
nvme_pci_disable(dev);
- for (i = dev->queue_count - 1; i >= 0; i--)
- nvme_clear_queue(dev->queues[i]);
+ blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev);
+ blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev);
mutex_unlock(&dev->shutdown_lock);
}
@@ -1854,7 +1779,7 @@ static void nvme_reset_work(struct work_struct *work)
struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
int result = -ENODEV;
- if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
+ if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
goto out;
/*
@@ -1864,11 +1789,9 @@ static void nvme_reset_work(struct work_struct *work)
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
- if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
goto out;
- set_bit(NVME_CTRL_RESETTING, &dev->flags);
-
result = nvme_pci_enable(dev);
if (result)
goto out;
@@ -1890,8 +1813,14 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out;
- dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
- queue_work(nvme_workq, &dev->async_work);
+ /*
+ * A controller that can not execute IO typically requires user
+ * intervention to correct. For such degraded controllers, the driver
+ * should not submit commands the user did not request, so skip
+ * registering for asynchronous event notification on this condition.
+ */
+ if (dev->online_queues > 1)
+ nvme_queue_async_events(&dev->ctrl);
mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
@@ -1901,13 +1830,20 @@ static void nvme_reset_work(struct work_struct *work)
*/
if (dev->online_queues < 2) {
dev_warn(dev->ctrl.device, "IO queues not created\n");
+ nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
} else {
nvme_start_queues(&dev->ctrl);
nvme_dev_add(dev);
}
- clear_bit(NVME_CTRL_RESETTING, &dev->flags);
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+ dev_warn(dev->ctrl.device, "failed to mark controller live\n");
+ goto out;
+ }
+
+ if (dev->online_queues > 1)
+ nvme_queue_scan(&dev->ctrl);
return;
out:
@@ -1921,7 +1857,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
nvme_kill_queues(&dev->ctrl);
if (pci_get_drvdata(pdev))
- pci_stop_and_remove_bus_device_locked(pdev);
+ device_release_driver(&pdev->dev);
nvme_put_ctrl(&dev->ctrl);
}
@@ -1955,13 +1891,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
return 0;
}
-static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl)
-{
- struct nvme_dev *dev = to_nvme_dev(ctrl);
-
- return !dev->bar || dev->online_queues < 2;
-}
-
static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
{
return nvme_reset(to_nvme_dev(ctrl));
@@ -1972,9 +1901,10 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
- .io_incapable = nvme_pci_io_incapable,
.reset_ctrl = nvme_pci_reset_ctrl,
.free_ctrl = nvme_pci_free_ctrl,
+ .post_scan = nvme_pci_post_scan,
+ .submit_async_event = nvme_pci_submit_async_event,
};
static int nvme_dev_map(struct nvme_dev *dev)
@@ -2026,10 +1956,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto free;
- INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
- INIT_WORK(&dev->async_work, nvme_async_event_work);
setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
(unsigned long)dev);
mutex_init(&dev->shutdown_lock);
@@ -2086,15 +2014,16 @@ static void nvme_remove(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- set_bit(NVME_CTRL_REMOVING, &dev->flags);
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+
pci_set_drvdata(pdev, NULL);
- flush_work(&dev->async_work);
+
+ if (!pci_device_is_present(pdev))
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
+
flush_work(&dev->reset_work);
- flush_work(&dev->scan_work);
- nvme_remove_namespaces(&dev->ctrl);
nvme_uninit_ctrl(&dev->ctrl);
nvme_dev_disable(dev, true);
- flush_work(&dev->reset_work);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
nvme_release_cmb(dev);
@@ -2135,14 +2064,17 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
* shutdown the controller to quiesce. The controller will be restarted
* after the slot reset through driver's slot_reset callback.
*/
- dev_warn(dev->ctrl.device, "error detected: state:%d\n", state);
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
+ dev_warn(dev->ctrl.device,
+ "frozen state error detected, reset controller\n");
nvme_dev_disable(dev, false);
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
+ dev_warn(dev->ctrl.device,
+ "failure state error detected, request disconnect\n");
return PCI_ERS_RESULT_DISCONNECT;
}
return PCI_ERS_RESULT_NEED_RESET;
@@ -2177,6 +2109,12 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0x0953),
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DISCARD_ZEROES, },
+ { PCI_VDEVICE(INTEL, 0x0a53),
+ .driver_data = NVME_QUIRK_STRIPE_SIZE |
+ NVME_QUIRK_DISCARD_ZEROES, },
+ { PCI_VDEVICE(INTEL, 0x0a54),
+ .driver_data = NVME_QUIRK_STRIPE_SIZE |
+ NVME_QUIRK_DISCARD_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },