NVMe: Move error handling to failed reset handler

This moves failed queue handling out of the namespace removal path and into the reset failure path, fixing a hanging condition if the controller fails or link down during del_gendisk. Previously the driver had to see the controller as degraded prior to calling del_gendisk to setup the queues to fail. But, if the controller happened to fail after this, there was no task to end outstanding requests. On failure, all namespace states are set to dead. This has capacity revalidate to 0, and ends all new requests with error status. Signed-off-by: Keith Busch <keith.busch@intel.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Jens Axboe <axboe@fb.com>
author: Keith Busch <keith.busch@intel.com> 2016-02-24 09:15:56 -0700
committer: Jens Axboe <axboe@fb.com> 2016-03-03 14:42:50 -0700
commit: 69d9a99c258eb1d6478fd9608a2070890797eed7 (patch)
tree: c431aebc0a314c5f84604f4bf609482df5248938 /drivers/nvme/host/pci.c
parent: NVMe: Simplify device reset failure (diff)
download: linux-dev-69d9a99c258eb1d6478fd9608a2070890797eed7.tar.xz
linux-dev-69d9a99c258eb1d6478fd9608a2070890797eed7.zip
1 files changed, 12 insertions, 1 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6d2e4257308b..680f5780750c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -690,7 +690,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	spin_lock_irq(&nvmeq->q_lock);
 	if (unlikely(nvmeq->cq_vector < 0)) {
-		ret = BLK_MQ_RQ_QUEUE_BUSY;
+		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+			ret = BLK_MQ_RQ_QUEUE_BUSY;
+		else
+			ret = BLK_MQ_RQ_QUEUE_ERROR;
 		spin_unlock_irq(&nvmeq->q_lock);
 		goto out;
 	}
@@ -1261,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = {
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
 	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+		/*
+		 * If the controller was reset during removal, it's possible
+		 * user requests may be waiting on a stopped queue. Start the
+		 * queue to flush these to completion.
+		 */
+		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
 		blk_cleanup_queue(dev->ctrl.admin_q);
 		blk_mq_free_tag_set(&dev->admin_tagset);
 	}
@@ -1901,6 +1910,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
 	dev_warn(dev->dev, "Removing after probe failure status: %d\n", status);
 
 	kref_get(&dev->ctrl.kref);
+	nvme_dev_disable(dev, false);
 	if (!schedule_work(&dev->remove_work))
 		nvme_put_ctrl(&dev->ctrl);
 }
@@ -1973,6 +1983,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
+	nvme_kill_queues(&dev->ctrl);
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
 	nvme_put_ctrl(&dev->ctrl);
author	Keith Busch <keith.busch@intel.com>	2016-02-24 09:15:56 -0700
committer	Jens Axboe <axboe@fb.com>	2016-03-03 14:42:50 -0700
commit	69d9a99c258eb1d6478fd9608a2070890797eed7 (patch)
tree	c431aebc0a314c5f84604f4bf609482df5248938 /drivers/nvme/host/pci.c
parent	NVMe: Simplify device reset failure (diff)
download	linux-dev-69d9a99c258eb1d6478fd9608a2070890797eed7.tar.xz linux-dev-69d9a99c258eb1d6478fd9608a2070890797eed7.zip