Diffstat (limited to 'drivers/block/nvme-core.c')
 -rw-r--r--  drivers/block/nvme-core.c  906
 1 file changed, 494 insertions(+), 412 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 683dff272562..34338d7438f5 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -29,6 +29,7 @@
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
+#include <linux/list_sort.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
@@ -80,6 +81,7 @@ static wait_queue_head_t nvme_kthread_wait;
static struct class *nvme_class;
static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_reset(struct nvme_dev *dev);
static int nvme_process_cq(struct nvme_queue *nvmeq);
struct async_cmd_info {
@@ -102,6 +104,7 @@ struct nvme_queue {
spinlock_t q_lock;
struct nvme_command *sq_cmds;
volatile struct nvme_completion *cqes;
+ struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
u32 __iomem *q_db;
@@ -114,7 +117,6 @@ struct nvme_queue {
u8 cq_phase;
u8 cqe_seen;
struct async_cmd_info cmdinfo;
- struct blk_mq_hw_ctx *hctx;
};
/*
@@ -182,12 +184,22 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = dev->queues[0];
- WARN_ON(nvmeq->hctx);
- nvmeq->hctx = hctx;
+ WARN_ON(hctx_idx != 0);
+ WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+ WARN_ON(nvmeq->tags);
+
hctx->driver_data = nvmeq;
+ nvmeq->tags = &dev->admin_tagset.tags[0];
return 0;
}
+static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+
+ nvmeq->tags = NULL;
+}
+
static int nvme_admin_init_request(void *data, struct request *req,
unsigned int hctx_idx, unsigned int rq_idx,
unsigned int numa_node)
@@ -201,27 +213,16 @@ static int nvme_admin_init_request(void *data, struct request *req,
return 0;
}
-static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
- struct nvme_queue *nvmeq = hctx->driver_data;
-
- nvmeq->hctx = NULL;
-}
-
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = dev->queues[
- (hctx_idx % dev->queue_count) + 1];
-
- if (!nvmeq->hctx)
- nvmeq->hctx = hctx;
+ struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
- /* nvmeq queues are shared between namespaces. We assume here that
- * blk-mq map the tags so they match up with the nvme queue tags. */
- WARN_ON(nvmeq->hctx->tags != hctx->tags);
+ if (!nvmeq->tags)
+ nvmeq->tags = &dev->tagset.tags[hctx_idx];
+ WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
hctx->driver_data = nvmeq;
return 0;
}
@@ -307,9 +308,16 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
++nvmeq->dev->event_limit;
- if (status == NVME_SC_SUCCESS)
- dev_warn(nvmeq->q_dmadev,
- "async event result %08x\n", result);
+ if (status != NVME_SC_SUCCESS)
+ return;
+
+ switch (result & 0xff07) {
+ case NVME_AER_NOTICE_NS_CHANGED:
+ dev_info(nvmeq->q_dmadev, "rescanning\n");
+ schedule_work(&nvmeq->dev->scan_work);
+ default:
+ dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
+ }
}
static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
@@ -320,7 +328,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
u16 status = le16_to_cpup(&cqe->status) >> 1;
u32 result = le32_to_cpup(&cqe->result);
- blk_mq_free_hctx_request(nvmeq->hctx, req);
+ blk_mq_free_request(req);
dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
++nvmeq->dev->abort_limit;
@@ -333,14 +341,13 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx,
cmdinfo->result = le32_to_cpup(&cqe->result);
cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
- blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req);
+ blk_mq_free_request(cmdinfo->req);
}
static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
unsigned int tag)
{
- struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
- struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
+ struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
return blk_mq_rq_to_pdu(req);
}
@@ -445,7 +452,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
(unsigned long) rq, gfp);
}
-void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
const int last_prp = dev->page_size / 8 - 1;
int i;
@@ -605,22 +612,33 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
spin_unlock_irqrestore(req->q->queue_lock, flags);
return;
}
- req->errors = nvme_error_status(status);
+ if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+ if (cmd_rq->ctx == CMD_CTX_CANCELLED)
+ req->errors = -EINTR;
+ else
+ req->errors = status;
+ } else {
+ req->errors = nvme_error_status(status);
+ }
} else
req->errors = 0;
+ if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+ u32 result = le32_to_cpup(&cqe->result);
+ req->special = (void *)(uintptr_t)result;
+ }
if (cmd_rq->aborted)
- dev_warn(&nvmeq->dev->pci_dev->dev,
+ dev_warn(nvmeq->dev->dev,
"completing aborted command with status:%04x\n",
status);
if (iod->nents) {
- dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+ dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
if (blk_integrity_rq(req)) {
if (!rq_data_dir(req))
nvme_dif_remap(req, nvme_dif_complete);
- dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1,
+ dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
}
}
@@ -630,8 +648,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
}
/* length is in bytes. gfp flags indicates whether we may sleep. */
-int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
- gfp_t gfp)
+static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
+ int total_len, gfp_t gfp)
{
struct dma_pool *pool;
int length = total_len;
@@ -709,6 +727,23 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
return total_len;
}
+static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
+ struct nvme_iod *iod)
+{
+ struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+ memcpy(cmnd, req->cmd, sizeof(struct nvme_command));
+ cmnd->rw.command_id = req->tag;
+ if (req->nr_phys_segments) {
+ cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
+ }
+
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
/*
* We reuse the small pool to allocate the 16-byte range here as it is not
* worth having a special pool for these or additional cases to handle freeing
@@ -807,11 +842,15 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
return 0;
}
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_queue *nvmeq = hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
@@ -822,15 +861,16 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
* unless this namespace is formated such that the metadata can be
* stripped/generated by the controller with PRACT=1.
*/
- if (ns->ms && !blk_integrity_rq(req)) {
- if (!(ns->pi_type && ns->ms == 8)) {
+ if (ns && ns->ms && !blk_integrity_rq(req)) {
+ if (!(ns->pi_type && ns->ms == 8) &&
+ req->cmd_type != REQ_TYPE_DRV_PRIV) {
req->errors = -EFAULT;
blk_mq_complete_request(req);
return BLK_MQ_RQ_QUEUE_OK;
}
}
- iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
+ iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
if (!iod)
return BLK_MQ_RQ_QUEUE_BUSY;
@@ -841,8 +881,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
* as it is not worth having a special pool for these or
* additional cases to handle freeing the iod.
*/
- range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
- GFP_ATOMIC,
+ range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
&iod->first_dma);
if (!range)
goto retry_cmd;
@@ -860,9 +899,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto retry_cmd;
if (blk_rq_bytes(req) !=
- nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
- dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg,
- iod->nents, dma_dir);
+ nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
+ dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
goto retry_cmd;
}
if (blk_integrity_rq(req)) {
@@ -884,7 +922,9 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_set_info(cmd, iod, req_completion);
spin_lock_irq(&nvmeq->q_lock);
- if (req->cmd_flags & REQ_DISCARD)
+ if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+ nvme_submit_priv(nvmeq, req, iod);
+ else if (req->cmd_flags & REQ_DISCARD)
nvme_submit_discard(nvmeq, ns, req, iod);
else if (req->cmd_flags & REQ_FLUSH)
nvme_submit_flush(nvmeq, ns, req->tag);
@@ -896,10 +936,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_MQ_RQ_QUEUE_OK;
error_cmd:
- nvme_free_iod(nvmeq->dev, iod);
+ nvme_free_iod(dev, iod);
return BLK_MQ_RQ_QUEUE_ERROR;
retry_cmd:
- nvme_free_iod(nvmeq->dev, iod);
+ nvme_free_iod(dev, iod);
return BLK_MQ_RQ_QUEUE_BUSY;
}
@@ -942,15 +982,6 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
return 1;
}
-/* Admin queue isn't initialized as a request queue. If at some point this
- * happens anyway, make sure to notify the user */
-static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
-{
- WARN_ON_ONCE(1);
- return BLK_MQ_RQ_QUEUE_ERROR;
-}
-
static irqreturn_t nvme_irq(int irq, void *data)
{
irqreturn_t result;
@@ -972,46 +1003,61 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_WAKE_THREAD;
}
-struct sync_cmd_info {
- struct task_struct *task;
- u32 result;
- int status;
-};
-
-static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
- struct nvme_completion *cqe)
-{
- struct sync_cmd_info *cmdinfo = ctx;
- cmdinfo->result = le32_to_cpup(&cqe->result);
- cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
- wake_up_process(cmdinfo->task);
-}
-
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
*/
-static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
- u32 *result, unsigned timeout)
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+ void *buffer, void __user *ubuffer, unsigned bufflen,
+ u32 *result, unsigned timeout)
{
- struct sync_cmd_info cmdinfo;
- struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
- struct nvme_queue *nvmeq = cmd_rq->nvmeq;
+ bool write = cmd->common.opcode & 1;
+ struct bio *bio = NULL;
+ struct request *req;
+ int ret;
- cmdinfo.task = current;
- cmdinfo.status = -EINTR;
+ req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- cmd->common.command_id = req->tag;
+ req->cmd_type = REQ_TYPE_DRV_PRIV;
+ req->cmd_flags |= REQ_FAILFAST_DRIVER;
+ req->__data_len = 0;
+ req->__sector = (sector_t) -1;
+ req->bio = req->biotail = NULL;
+
+ req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
- nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
+ req->cmd = (unsigned char *)cmd;
+ req->cmd_len = sizeof(struct nvme_command);
+ req->special = (void *)0;
- set_current_state(TASK_UNINTERRUPTIBLE);
- nvme_submit_cmd(nvmeq, cmd);
- schedule();
+ if (buffer && bufflen) {
+ ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+ if (ret)
+ goto out;
+ } else if (ubuffer && bufflen) {
+ ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+ if (ret)
+ goto out;
+ bio = req->bio;
+ }
+ blk_execute_rq(req->q, NULL, req, 0);
+ if (bio)
+ blk_rq_unmap_user(bio);
if (result)
- *result = cmdinfo.result;
- return cmdinfo.status;
+ *result = (u32)(uintptr_t)req->special;
+ ret = req->errors;
+ out:
+ blk_mq_free_request(req);
+ return ret;
+}
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+ void *buffer, unsigned bufflen)
+{
+ return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
}
static int nvme_submit_async_admin_req(struct nvme_dev *dev)
@@ -1033,7 +1079,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev)
c.common.opcode = nvme_admin_async_event;
c.common.command_id = req->tag;
- blk_mq_free_hctx_request(nvmeq->hctx, req);
+ blk_mq_free_request(req);
return __nvme_submit_cmd(nvmeq, &c);
}
@@ -1060,41 +1106,6 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
return nvme_submit_cmd(nvmeq, cmd);
}
-static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
- u32 *result, unsigned timeout)
-{
- int res;
- struct request *req;
-
- req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
- if (IS_ERR(req))
- return PTR_ERR(req);
- res = nvme_submit_sync_cmd(req, cmd, result, timeout);
- blk_mq_free_request(req);
- return res;
-}
-
-int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
- u32 *result)
-{
- return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
-}
-
-int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
- struct nvme_command *cmd, u32 *result)
-{
- int res;
- struct request *req;
-
- req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
- res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
- blk_mq_free_request(req);
- return res;
-}
-
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
struct nvme_command c;
@@ -1103,7 +1114,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
c.delete_queue.opcode = opcode;
c.delete_queue.qid = cpu_to_le16(id);
- return nvme_submit_admin_cmd(dev, &c, NULL);
+ return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
@@ -1112,6 +1123,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+ /*
+ * Note: we (ab)use the fact that the prp fields survive if no data
+ * is attached to the request.
+ */
memset(&c, 0, sizeof(c));
c.create_cq.opcode = nvme_admin_create_cq;
c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
@@ -1120,7 +1135,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
c.create_cq.cq_flags = cpu_to_le16(flags);
c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
- return nvme_submit_admin_cmd(dev, &c, NULL);
+ return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
@@ -1129,6 +1144,10 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+ /*
+ * Note: we (ab)use the fact that the prp fields survive if no data
+ * is attached to the request.
+ */
memset(&c, 0, sizeof(c));
c.create_sq.opcode = nvme_admin_create_sq;
c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
@@ -1137,7 +1156,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
c.create_sq.sq_flags = cpu_to_le16(flags);
c.create_sq.cqid = cpu_to_le16(qid);
- return nvme_submit_admin_cmd(dev, &c, NULL);
+ return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1150,18 +1169,45 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
-int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
- dma_addr_t dma_addr)
+int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
{
- struct nvme_command c;
+ struct nvme_command c = { };
+ int error;
- memset(&c, 0, sizeof(c));
+ /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(nsid);
- c.identify.prp1 = cpu_to_le64(dma_addr);
- c.identify.cns = cpu_to_le32(cns);
+ c.identify.cns = cpu_to_le32(1);
- return nvme_submit_admin_cmd(dev, &c, NULL);
+ *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+ if (!*id)
+ return -ENOMEM;
+
+ error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+ sizeof(struct nvme_id_ctrl));
+ if (error)
+ kfree(*id);
+ return error;
+}
+
+int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+ struct nvme_id_ns **id)
+{
+ struct nvme_command c = { };
+ int error;
+
+ /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+ c.identify.opcode = nvme_admin_identify,
+ c.identify.nsid = cpu_to_le32(nsid),
+
+ *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+ if (!*id)
+ return -ENOMEM;
+
+ error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+ sizeof(struct nvme_id_ns));
+ if (error)
+ kfree(*id);
+ return error;
}
int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
@@ -1175,7 +1221,8 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
c.features.prp1 = cpu_to_le64(dma_addr);
c.features.fid = cpu_to_le32(fid);
- return nvme_submit_admin_cmd(dev, &c, result);
+ return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+ result, 0);
}
int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
@@ -1189,7 +1236,30 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
- return nvme_submit_admin_cmd(dev, &c, result);
+ return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+ result, 0);
+}
+
+int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
+{
+ struct nvme_command c = { };
+ int error;
+
+ c.common.opcode = nvme_admin_get_log_page,
+ c.common.nsid = cpu_to_le32(0xFFFFFFFF),
+ c.common.cdw10[0] = cpu_to_le32(
+ (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
+ NVME_LOG_SMART),
+
+ *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
+ if (!*log)
+ return -ENOMEM;
+
+ error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
+ sizeof(struct nvme_smart_log));
+ if (error)
+ kfree(*log);
+ return error;
}
/**
@@ -1214,8 +1284,7 @@ static void nvme_abort_req(struct request *req)
if (work_busy(&dev->reset_work))
goto out;
list_del_init(&dev->node);
- dev_warn(&dev->pci_dev->dev,
- "I/O %d QID %d timeout, reset controller\n",
+ dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
req->tag, nvmeq->qid);
dev->reset_workfn = nvme_reset_failed_dev;
queue_work(nvme_workq, &dev->reset_work);
@@ -1254,8 +1323,7 @@ static void nvme_abort_req(struct request *req)
}
}
-static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx,
- struct request *req, void *data, bool reserved)
+static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
struct nvme_queue *nvmeq = data;
void *ctx;
@@ -1352,11 +1420,9 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
- struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
-
spin_lock_irq(&nvmeq->q_lock);
- if (hctx && hctx->tags)
- blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
+ if (nvmeq->tags && *nvmeq->tags)
+ blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
}
@@ -1384,22 +1450,21 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth)
{
- struct device *dmadev = &dev->pci_dev->dev;
struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
if (!nvmeq)
return NULL;
- nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),
+ nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
- nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
+ nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
goto free_cqdma;
- nvmeq->q_dmadev = dmadev;
+ nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
dev->instance, qid);
@@ -1409,13 +1474,16 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->qid = qid;
- dev->queue_count++;
dev->queues[qid] = nvmeq;
+ /* make sure queue descriptor is set before queue count, for kthread */
+ mb();
+ dev->queue_count++;
+
return nvmeq;
free_cqdma:
- dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
+ dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
kfree(nvmeq);
@@ -1487,7 +1555,7 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
- dev_err(&dev->pci_dev->dev,
+ dev_err(dev->dev,
"Device not ready; aborting %s\n", enabled ?
"initialisation" : "reset");
return -ENODEV;
@@ -1537,7 +1605,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
- dev_err(&dev->pci_dev->dev,
+ dev_err(dev->dev,
"Device shutdown incomplete; abort shutdown\n");
return -ENODEV;
}
@@ -1547,10 +1615,10 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
}
static struct blk_mq_ops nvme_mq_admin_ops = {
- .queue_rq = nvme_admin_queue_rq,
+ .queue_rq = nvme_queue_rq,
.map_queue = blk_mq_map_queue,
.init_hctx = nvme_admin_init_hctx,
- .exit_hctx = nvme_exit_hctx,
+ .exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_admin_init_request,
.timeout = nvme_timeout,
};
@@ -1559,7 +1627,6 @@ static struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
.map_queue = blk_mq_map_queue,
.init_hctx = nvme_init_hctx,
- .exit_hctx = nvme_exit_hctx,
.init_request = nvme_init_request,
.timeout = nvme_timeout,
};
@@ -1580,7 +1647,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
dev->admin_tagset.reserved_tags = 1;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
- dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+ dev->admin_tagset.numa_node = dev_to_node(dev->dev);
dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
dev->admin_tagset.driver_data = dev;
@@ -1594,6 +1661,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
}
if (!blk_get_queue(dev->admin_q)) {
nvme_dev_remove_admin(dev);
+ dev->admin_q = NULL;
return -ENODEV;
}
} else
@@ -1613,14 +1681,14 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
if (page_shift < dev_page_min) {
- dev_err(&dev->pci_dev->dev,
+ dev_err(dev->dev,
"Minimum device page size (%u) too large for "
"host (%u)\n", 1 << dev_page_min,
1 << page_shift);
return -ENODEV;
}
if (page_shift > dev_page_max) {
- dev_info(&dev->pci_dev->dev,
+ dev_info(dev->dev,
"Device maximum page size (%u) smaller than "
"host (%u); enabling work-around\n",
1 << dev_page_max, 1 << page_shift);
@@ -1668,126 +1736,43 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
return result;
}
-struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
- unsigned long addr, unsigned length)
-{
- int i, err, count, nents, offset;
- struct scatterlist *sg;
- struct page **pages;
- struct nvme_iod *iod;
-
- if (addr & 3)
- return ERR_PTR(-EINVAL);
- if (!length || length > INT_MAX - PAGE_SIZE)
- return ERR_PTR(-EINVAL);
-
- offset = offset_in_page(addr);
- count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
- pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- err = get_user_pages_fast(addr, count, 1, pages);
- if (err < count) {
- count = err;
- err = -EFAULT;
- goto put_pages;
- }
-
- err = -ENOMEM;
- iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
- if (!iod)
- goto put_pages;
-
- sg = iod->sg;
- sg_init_table(sg, count);
- for (i = 0; i < count; i++) {
- sg_set_page(&sg[i], pages[i],
- min_t(unsigned, length, PAGE_SIZE - offset),
- offset);
- length -= (PAGE_SIZE - offset);
- offset = 0;
- }
- sg_mark_end(&sg[i - 1]);
- iod->nents = count;
-
- nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
- write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- if (!nents)
- goto free_iod;
-
- kfree(pages);
- return iod;
-
- free_iod:
- kfree(iod);
- put_pages:
- for (i = 0; i < count; i++)
- put_page(pages[i]);
- kfree(pages);
- return ERR_PTR(err);
-}
-
-void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
- struct nvme_iod *iod)
-{
- int i;
-
- dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
- write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
- for (i = 0; i < iod->nents; i++)
- put_page(sg_page(&iod->sg[i]));
-}
-
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_dev *dev = ns->dev;
struct nvme_user_io io;
struct nvme_command c;
- unsigned length, meta_len, prp_len;
+ unsigned length, meta_len;
int status, write;
- struct nvme_iod *iod;
dma_addr_t meta_dma = 0;
void *meta = NULL;
void __user *metadata;
if (copy_from_user(&io, uio, sizeof(io)))
return -EFAULT;
- length = (io.nblocks + 1) << ns->lba_shift;
- meta_len = (io.nblocks + 1) * ns->ms;
-
- if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext)
- return -EINVAL;
- else if (meta_len && ns->ext) {
- length += meta_len;
- meta_len = 0;
- }
-
- metadata = (void __user *)(unsigned long)io.metadata;
-
- write = io.opcode & 1;
switch (io.opcode) {
case nvme_cmd_write:
case nvme_cmd_read:
case nvme_cmd_compare:
- iod = nvme_map_user_pages(dev, write, io.addr, length);
break;
default:
return -EINVAL;
}
- if (IS_ERR(iod))
- return PTR_ERR(iod);
+ length = (io.nblocks + 1) << ns->lba_shift;
+ meta_len = (io.nblocks + 1) * ns->ms;
+ metadata = (void __user *)(unsigned long)io.metadata;
+ write = io.opcode & 1;
- prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
- if (length != prp_len) {
- status = -ENOMEM;
- goto unmap;
+ if (ns->ext) {
+ length += meta_len;
+ meta_len = 0;
}
if (meta_len) {
- meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
+ if (((io.metadata & 3) || !io.metadata) && !ns->ext)
+ return -EINVAL;
+
+ meta = dma_alloc_coherent(dev->dev, meta_len,
&meta_dma, GFP_KERNEL);
if (!meta) {
@@ -1813,19 +1798,17 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.reftag = cpu_to_le32(io.reftag);
c.rw.apptag = cpu_to_le16(io.apptag);
c.rw.appmask = cpu_to_le16(io.appmask);
- c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
- c.rw.prp2 = cpu_to_le64(iod->first_dma);
c.rw.metadata = cpu_to_le64(meta_dma);
- status = nvme_submit_io_cmd(dev, ns, &c, NULL);
+
+ status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
+ (void __user *)io.addr, length, NULL, 0);
unmap:
- nvme_unmap_user_pages(dev, write, iod);
- nvme_free_iod(dev, iod);
if (meta) {
if (status == NVME_SC_SUCCESS && !write) {
if (copy_to_user(metadata, meta, meta_len))
status = -EFAULT;
}
- dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma);
+ dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
}
return status;
}
@@ -1835,9 +1818,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
- int status, length;
- struct nvme_iod *uninitialized_var(iod);
- unsigned timeout;
+ unsigned timeout = 0;
+ int status;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -1857,46 +1839,17 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
- length = cmd.data_len;
- if (cmd.data_len) {
- iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
- length);
- if (IS_ERR(iod))
- return PTR_ERR(iod);
- length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
- c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
- c.common.prp2 = cpu_to_le64(iod->first_dma);
- }
-
- timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
- ADMIN_TIMEOUT;
-
- if (length != cmd.data_len)
- status = -ENOMEM;
- else if (ns) {
- struct request *req;
-
- req = blk_mq_alloc_request(ns->queue, WRITE,
- (GFP_KERNEL|__GFP_WAIT), false);
- if (IS_ERR(req))
- status = PTR_ERR(req);
- else {
- status = nvme_submit_sync_cmd(req, &c, &cmd.result,
- timeout);
- blk_mq_free_request(req);
- }
- } else
- status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
+ if (cmd.timeout_ms)
+ timeout = msecs_to_jiffies(cmd.timeout_ms);
- if (cmd.data_len) {
- nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
- nvme_free_iod(dev, iod);
+ status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
+ NULL, (void __user *)cmd.addr, cmd.data_len,
+ &cmd.result, timeout);
+ if (status >= 0) {
+ if (put_user(cmd.result, &ucmd->result))
+ return -EFAULT;
}
- if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
- sizeof(cmd.result)))
- status = -EFAULT;
-
return status;
}
@@ -1988,23 +1941,18 @@ static int nvme_revalidate_disk(struct gendisk *disk)
struct nvme_ns *ns = disk->private_data;
struct nvme_dev *dev = ns->dev;
struct nvme_id_ns *id;
- dma_addr_t dma_addr;
u8 lbaf, pi_type;
u16 old_ms;
unsigned short bs;
- id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
- GFP_KERNEL);
- if (!id) {
- dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n",
- __func__);
- return 0;
+ if (nvme_identify_ns(dev, ns->ns_id, &id)) {
+ dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__,
+ dev->instance, ns->ns_id);
+ return -ENODEV;
}
- if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
- dev_warn(&dev->pci_dev->dev,
- "identify failed ns:%d, setting capacity to 0\n",
- ns->ns_id);
- memset(id, 0, sizeof(*id));
+ if (id->ncap == 0) {
+ kfree(id);
+ return -ENODEV;
}
old_ms = ns->ms;
@@ -2038,7 +1986,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
!ns->ext)
nvme_init_integrity(ns);
- if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk)))
+ if (ns->ms && !blk_get_integrity(disk))
set_capacity(disk, 0);
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
@@ -2046,7 +1994,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
if (dev->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
- dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
+ kfree(id);
return 0;
}
@@ -2073,7 +2021,7 @@ static int nvme_kthread(void *data)
if (work_busy(&dev->reset_work))
continue;
list_del_init(&dev->node);
- dev_warn(&dev->pci_dev->dev,
+ dev_warn(dev->dev,
"Failed status: %x, reset controller\n",
readl(&dev->bar->csts));
dev->reset_workfn = nvme_reset_failed_dev;
@@ -2105,7 +2053,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
{
struct nvme_ns *ns;
struct gendisk *disk;
- int node = dev_to_node(&dev->pci_dev->dev);
+ int node = dev_to_node(dev->dev);
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
@@ -2153,11 +2101,16 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
* requires it.
*/
set_capacity(disk, 0);
- nvme_revalidate_disk(ns->disk);
+ if (nvme_revalidate_disk(ns->disk))
+ goto out_free_disk;
+
add_disk(ns->disk);
if (ns->ms)
revalidate_disk(ns->disk);
return;
+ out_free_disk:
+ kfree(disk);
+ list_del(&ns->list);
out_free_queue:
blk_cleanup_queue(ns->queue);
out_free_ns:
@@ -2188,8 +2141,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
if (status < 0)
return status;
if (status > 0) {
- dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
- status);
+ dev_err(dev->dev, "Could not set queue count (%d)\n", status);
return 0;
}
return min(result & 0xffff, result >> 16) + 1;
@@ -2203,7 +2155,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = dev->queues[0];
- struct pci_dev *pdev = dev->pci_dev;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, i, vecs, nr_io_queues, size;
nr_io_queues = num_possible_cpus();
@@ -2275,6 +2227,99 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
return result;
}
+static void nvme_free_namespace(struct nvme_ns *ns)
+{
+ list_del(&ns->list);
+
+ spin_lock(&dev_list_lock);
+ ns->disk->private_data = NULL;
+ spin_unlock(&dev_list_lock);
+
+ put_disk(ns->disk);
+ kfree(ns);
+}
+
+static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
+ struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
+
+ return nsa->ns_id - nsb->ns_id;
+}
+
+static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid)
+{
+ struct nvme_ns *ns;
+
+ list_for_each_entry(ns, &dev->namespaces, list) {
+ if (ns->ns_id == nsid)
+ return ns;
+ if (ns->ns_id > nsid)
+ break;
+ }
+ return NULL;
+}
+
+static inline bool nvme_io_incapable(struct nvme_dev *dev)
+{
+ return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS ||
+ dev->online_queues < 2);
+}
+
+static void nvme_ns_remove(struct nvme_ns *ns)
+{
+ bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue);
+
+ if (kill)
+ blk_set_queue_dying(ns->queue);
+ if (ns->disk->flags & GENHD_FL_UP) {
+ if (blk_get_integrity(ns->disk))
+ blk_integrity_unregister(ns->disk);
+ del_gendisk(ns->disk);
+ }
+ if (kill || !blk_queue_dying(ns->queue)) {
+ blk_mq_abort_requeue_list(ns->queue);
+ blk_cleanup_queue(ns->queue);
+ }
+}
+
+static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
+{
+ struct nvme_ns *ns, *next;
+ unsigned i;
+
+ for (i = 1; i <= nn; i++) {
+ ns = nvme_find_ns(dev, i);
+ if (ns) {
+ if (revalidate_disk(ns->disk)) {
+ nvme_ns_remove(ns);
+ nvme_free_namespace(ns);
+ }
+ } else
+ nvme_alloc_ns(dev, i);
+ }
+ list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+ if (ns->ns_id > nn) {
+ nvme_ns_remove(ns);
+ nvme_free_namespace(ns);
+ }
+ }
+ list_sort(NULL, &dev->namespaces, ns_cmp);
+}
+
+static void nvme_dev_scan(struct work_struct *work)
+{
+ struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
+ struct nvme_id_ctrl *ctrl;
+
+ if (!dev->tagset.tags)
+ return;
+ if (nvme_identify_ctrl(dev, &ctrl))
+ return;
+ nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
+ kfree(ctrl);
+}
+
/*
* Return: error value if an error occurred setting up the queues or calling
* Identify Device. 0 if these succeeded, even if adding some of the
@@ -2283,26 +2328,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
*/
static int nvme_dev_add(struct nvme_dev *dev)
{
- struct pci_dev *pdev = dev->pci_dev;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
int res;
- unsigned nn, i;
+ unsigned nn;
struct nvme_id_ctrl *ctrl;
- void *mem;
- dma_addr_t dma_addr;
int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
- mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
- if (!mem)
- return -ENOMEM;
-
- res = nvme_identify(dev, 0, 1, dma_addr);
+ res = nvme_identify_ctrl(dev, &ctrl);
if (res) {
- dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
- dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+ dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
return -EIO;
}
- ctrl = mem;
nn = le32_to_cpup(&ctrl->nn);
dev->oncs = le16_to_cpup(&ctrl->oncs);
dev->abort_limit = ctrl->acl + 1;
@@ -2324,24 +2361,23 @@ static int nvme_dev_add(struct nvme_dev *dev)
} else
dev->max_hw_sectors = max_hw_sectors;
}
- dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+ kfree(ctrl);
- dev->tagset.ops = &nvme_mq_ops;
- dev->tagset.nr_hw_queues = dev->online_queues - 1;
- dev->tagset.timeout = NVME_IO_TIMEOUT;
- dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
- dev->tagset.queue_depth =
+ if (!dev->tagset.tags) {
+ dev->tagset.ops = &nvme_mq_ops;
+ dev->tagset.nr_hw_queues = dev->online_queues - 1;
+ dev->tagset.timeout = NVME_IO_TIMEOUT;
+ dev->tagset.numa_node = dev_to_node(dev->dev);
+ dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = nvme_cmd_size(dev);
- dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
- dev->tagset.driver_data = dev;
-
- if (blk_mq_alloc_tag_set(&dev->tagset))
- return 0;
-
- for (i = 1; i <= nn; i++)
- nvme_alloc_ns(dev, i);
+ dev->tagset.cmd_size = nvme_cmd_size(dev);
+ dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tagset.driver_data = dev;
+ if (blk_mq_alloc_tag_set(&dev->tagset))
+ return 0;
+ }
+ schedule_work(&dev->scan_work);
return 0;
}
@@ -2349,7 +2385,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
{
u64 cap;
int bars, result = -ENOMEM;
- struct pci_dev *pdev = dev->pci_dev;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pci_enable_device_mem(pdev))
return result;
@@ -2363,8 +2399,8 @@ static int nvme_dev_map(struct nvme_dev *dev)
if (pci_request_selected_regions(pdev, bars, "nvme"))
goto disable_pci;
- if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
- dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
+ if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
+ dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
goto disable;
dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
@@ -2405,19 +2441,21 @@ static int nvme_dev_map(struct nvme_dev *dev)
static void nvme_dev_unmap(struct nvme_dev *dev)
{
- if (dev->pci_dev->msi_enabled)
- pci_disable_msi(dev->pci_dev);
- else if (dev->pci_dev->msix_enabled)
- pci_disable_msix(dev->pci_dev);
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+ if (pdev->msi_enabled)
+ pci_disable_msi(pdev);
+ else if (pdev->msix_enabled)
+ pci_disable_msix(pdev);
if (dev->bar) {
iounmap(dev->bar);
dev->bar = NULL;
- pci_release_regions(dev->pci_dev);
+ pci_release_regions(pdev);
}
- if (pci_is_enabled(dev->pci_dev))
- pci_disable_device(dev->pci_dev);
+ if (pci_is_enabled(pdev))
+ pci_disable_device(pdev);
}
struct nvme_delq_ctx {
@@ -2536,7 +2574,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
&worker, "nvme%d", dev->instance);
if (IS_ERR(kworker_task)) {
- dev_err(&dev->pci_dev->dev,
+ dev_err(dev->dev,
"Failed to create queue del task\n");
for (i = dev->queue_count - 1; i > 0; i--)
nvme_disable_queue(dev, i);
@@ -2587,9 +2625,9 @@ static void nvme_freeze_queues(struct nvme_dev *dev)
list_for_each_entry(ns, &dev->namespaces, list) {
blk_mq_freeze_queue_start(ns->queue);
- spin_lock(ns->queue->queue_lock);
+ spin_lock_irq(ns->queue->queue_lock);
queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
- spin_unlock(ns->queue->queue_lock);
+ spin_unlock_irq(ns->queue->queue_lock);
blk_mq_cancel_requeue_work(ns->queue);
blk_mq_stop_hw_queues(ns->queue);
@@ -2639,29 +2677,19 @@ static void nvme_dev_remove(struct nvme_dev *dev)
{
struct nvme_ns *ns;
- list_for_each_entry(ns, &dev->namespaces, list) {
- if (ns->disk->flags & GENHD_FL_UP) {
- if (blk_get_integrity(ns->disk))
- blk_integrity_unregister(ns->disk);
- del_gendisk(ns->disk);
- }
- if (!blk_queue_dying(ns->queue)) {
- blk_mq_abort_requeue_list(ns->queue);
- blk_cleanup_queue(ns->queue);
- }
- }
+ list_for_each_entry(ns, &dev->namespaces, list)
+ nvme_ns_remove(ns);
}
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
- struct device *dmadev = &dev->pci_dev->dev;
- dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
+ dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
PAGE_SIZE, PAGE_SIZE, 0);
if (!dev->prp_page_pool)
return -ENOMEM;
/* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
+ dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
256, 256, 0);
if (!dev->prp_small_pool) {
dma_pool_destroy(dev->prp_page_pool);
@@ -2709,28 +2737,22 @@ static void nvme_free_namespaces(struct nvme_dev *dev)
{
struct nvme_ns *ns, *next;
- list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
- list_del(&ns->list);
-
- spin_lock(&dev_list_lock);
- ns->disk->private_data = NULL;
- spin_unlock(&dev_list_lock);
-
- put_disk(ns->disk);
- kfree(ns);
- }
+ list_for_each_entry_safe(ns, next, &dev->namespaces, list)
+ nvme_free_namespace(ns);
}
static void nvme_free_dev(struct kref *kref)
{
struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
- pci_dev_put(dev->pci_dev);
+ put_device(dev->dev);
put_device(dev->device);
nvme_free_namespaces(dev);
nvme_release_instance(dev);
- blk_mq_free_tag_set(&dev->tagset);
- blk_put_queue(dev->admin_q);
+ if (dev->tagset.tags)
+ blk_mq_free_tag_set(&dev->tagset);
+ if (dev->admin_q)
+ blk_put_queue(dev->admin_q);
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
@@ -2781,6 +2803,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
return -ENOTTY;
ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
return nvme_user_cmd(dev, ns, (void __user *)arg);
+ case NVME_IOCTL_RESET:
+ dev_warn(dev->dev, "resetting controller\n");
+ return nvme_reset(dev);
default:
return -ENOTTY;
}
@@ -2802,11 +2827,11 @@ static void nvme_set_irq_hints(struct nvme_dev *dev)
for (i = 0; i < dev->online_queues; i++) {
nvmeq = dev->queues[i];
- if (!nvmeq->hctx)
+ if (!nvmeq->tags || !(*nvmeq->tags))
continue;
irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
- nvmeq->hctx->cpumask);
+ blk_mq_tags_cpumask(*nvmeq->tags));
}
}
@@ -2858,6 +2883,9 @@ static int nvme_dev_start(struct nvme_dev *dev)
free_tags:
nvme_dev_remove_admin(dev);
+ blk_put_queue(dev->admin_q);
+ dev->admin_q = NULL;
+ dev->queues[0]->tags = NULL;
disable:
nvme_disable_queue(dev, 0);
nvme_dev_list_remove(dev);
@@ -2869,7 +2897,7 @@ static int nvme_dev_start(struct nvme_dev *dev)
static int nvme_remove_dead_ctrl(void *arg)
{
struct nvme_dev *dev = (struct nvme_dev *)arg;
- struct pci_dev *pdev = dev->pci_dev;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pci_get_drvdata(pdev))
pci_stop_and_remove_bus_device_locked(pdev);
@@ -2899,24 +2927,43 @@ static int nvme_dev_resume(struct nvme_dev *dev)
spin_unlock(&dev_list_lock);
} else {
nvme_unfreeze_queues(dev);
+ nvme_dev_add(dev);
nvme_set_irq_hints(dev);
}
return 0;
}
+static void nvme_dead_ctrl(struct nvme_dev *dev)
+{
+ dev_warn(dev->dev, "Device failed to resume\n");
+ kref_get(&dev->kref);
+ if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
+ dev->instance))) {
+ dev_err(dev->dev,
+ "Failed to start controller remove task\n");
+ kref_put(&dev->kref, nvme_free_dev);
+ }
+}
+
static void nvme_dev_reset(struct nvme_dev *dev)
{
+ bool in_probe = work_busy(&dev->probe_work);
+
nvme_dev_shutdown(dev);
- if (nvme_dev_resume(dev)) {
- dev_warn(&dev->pci_dev->dev, "Device failed to resume\n");
- kref_get(&dev->kref);
- if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
- dev->instance))) {
- dev_err(&dev->pci_dev->dev,
- "Failed to start controller remove task\n");
- kref_put(&dev->kref, nvme_free_dev);
- }
+
+ /* Synchronize with device probe so that work will see failure status
+ * and exit gracefully without trying to schedule another reset */
+ flush_work(&dev->probe_work);
+
+ /* Fail this device if reset occurred during probe to avoid
+ * infinite initialization loops. */
+ if (in_probe) {
+ nvme_dead_ctrl(dev);
+ return;
}
+ /* Schedule device resume asynchronously so the reset work is available
+ * to cleanup errors that may occur during reinitialization */
+ schedule_work(&dev->probe_work);
}
static void nvme_reset_failed_dev(struct work_struct *ws)
@@ -2931,6 +2978,45 @@ static void nvme_reset_workfn(struct work_struct *work)
dev->reset_workfn(work);
}
+static int nvme_reset(struct nvme_dev *dev)
+{
+ int ret = -EBUSY;
+
+ if (!dev->admin_q || blk_queue_dying(dev->admin_q))
+ return -ENODEV;
+
+ spin_lock(&dev_list_lock);
+ if (!work_pending(&dev->reset_work)) {
+ dev->reset_workfn = nvme_reset_failed_dev;
+ queue_work(nvme_workq, &dev->reset_work);
+ ret = 0;
+ }
+ spin_unlock(&dev_list_lock);
+
+ if (!ret) {
+ flush_work(&dev->reset_work);
+ flush_work(&dev->probe_work);
+ return 0;
+ }
+
+ return ret;
+}
+
+static ssize_t nvme_sysfs_reset(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct nvme_dev *ndev = dev_get_drvdata(dev);
+ int ret;
+
+ ret = nvme_reset(ndev);
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
+
static void nvme_async_probe(struct work_struct *work);
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
@@ -2956,7 +3042,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_LIST_HEAD(&dev->namespaces);
dev->reset_workfn = nvme_reset_failed_dev;
INIT_WORK(&dev->reset_work, nvme_reset_workfn);
- dev->pci_dev = pci_dev_get(pdev);
+ dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
result = nvme_set_instance(dev);
if (result)
@@ -2975,18 +3061,27 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto release_pools;
}
get_device(dev->device);
+ dev_set_drvdata(dev->device, dev);
+
+ result = device_create_file(dev->device, &dev_attr_reset_controller);
+ if (result)
+ goto put_dev;
INIT_LIST_HEAD(&dev->node);
+ INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->probe_work, nvme_async_probe);
schedule_work(&dev->probe_work);
return 0;
+ put_dev:
+ device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
+ put_device(dev->device);
release_pools:
nvme_release_prp_pools(dev);
release:
nvme_release_instance(dev);
put_pci:
- pci_dev_put(dev->pci_dev);
+ put_device(dev->dev);
free:
kfree(dev->queues);
kfree(dev->entry);
@@ -2997,24 +3092,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
static void nvme_async_probe(struct work_struct *work)
{
struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
- int result;
-
- result = nvme_dev_start(dev);
- if (result)
- goto reset;
- if (dev->online_queues > 1)
- result = nvme_dev_add(dev);
- if (result)
- goto reset;
-
- nvme_set_irq_hints(dev);
- return;
- reset:
- if (!work_busy(&dev->reset_work)) {
- dev->reset_workfn = nvme_reset_failed_dev;
- queue_work(nvme_workq, &dev->reset_work);
- }
+ if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work))
+ nvme_dead_ctrl(dev);
}
static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
@@ -3044,8 +3124,10 @@ static void nvme_remove(struct pci_dev *pdev)
pci_set_drvdata(pdev, NULL);
flush_work(&dev->probe_work);
flush_work(&dev->reset_work);
- nvme_dev_shutdown(dev);
+ flush_work(&dev->scan_work);
+ device_remove_file(dev->device, &dev_attr_reset_controller);
nvme_dev_remove(dev);
+ nvme_dev_shutdown(dev);
nvme_dev_remove_admin(dev);
device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
nvme_free_queues(dev, 0);