aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/nvme/host/pci.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--drivers/nvme/host/pci.c (renamed from drivers/block/nvme-core.c)497
1 files changed, 300 insertions, 197 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/nvme/host/pci.c
index 6f04771f1019..f3b53af789ef 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/nvme/host/pci.c
@@ -12,7 +12,6 @@
* more details.
*/
-#include <linux/nvme.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
@@ -40,8 +39,13 @@
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
+#include <linux/pr.h>
#include <scsi/sg.h>
-#include <asm-generic/io-64-nonatomic-lo-hi.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <asm/unaligned.h>
+
+#include <uapi/linux/nvme_ioctl.h>
+#include "nvme.h"
#define NVME_MINORS (1U << MINORBITS)
#define NVME_Q_DEPTH 1024
@@ -84,9 +88,10 @@ static wait_queue_head_t nvme_kthread_wait;
static struct class *nvme_class;
-static void nvme_reset_failed_dev(struct work_struct *ws);
+static int __nvme_reset(struct nvme_dev *dev);
static int nvme_reset(struct nvme_dev *dev);
-static int nvme_process_cq(struct nvme_queue *nvmeq);
+static void nvme_process_cq(struct nvme_queue *nvmeq);
+static void nvme_dead_ctrl(struct nvme_dev *dev);
struct async_cmd_info {
struct kthread_work work;
@@ -535,7 +540,7 @@ static void nvme_dif_remap(struct request *req,
virt = bip_get_seed(bip);
phys = nvme_block_nr(ns, blk_rq_pos(req));
nlb = (blk_rq_bytes(req) >> ns->lba_shift);
- ts = ns->disk->integrity->tuple_size;
+ ts = ns->disk->queue->integrity.tuple_size;
for (i = 0; i < nlb; i++, virt++, phys++) {
pi = (struct t10_pi_tuple *)p;
@@ -545,36 +550,20 @@ static void nvme_dif_remap(struct request *req,
kunmap_atomic(pmap);
}
-static int nvme_noop_verify(struct blk_integrity_iter *iter)
-{
- return 0;
-}
-
-static int nvme_noop_generate(struct blk_integrity_iter *iter)
-{
- return 0;
-}
-
-struct blk_integrity nvme_meta_noop = {
- .name = "NVME_META_NOOP",
- .generate_fn = nvme_noop_generate,
- .verify_fn = nvme_noop_verify,
-};
-
static void nvme_init_integrity(struct nvme_ns *ns)
{
struct blk_integrity integrity;
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
- integrity = t10_pi_type3_crc;
+ integrity.profile = &t10_pi_type3_crc;
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- integrity = t10_pi_type1_crc;
+ integrity.profile = &t10_pi_type1_crc;
break;
default:
- integrity = nvme_meta_noop;
+ integrity.profile = NULL;
break;
}
integrity.tuple_size = ns->ms;
@@ -603,27 +592,31 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_iod *iod = ctx;
struct request *req = iod_get_private(iod);
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-
u16 status = le16_to_cpup(&cqe->status) >> 1;
+ bool requeue = false;
+ int error = 0;
if (unlikely(status)) {
if (!(status & NVME_SC_DNR || blk_noretry_request(req))
&& (jiffies - req->start_time) < req->timeout) {
unsigned long flags;
+ requeue = true;
blk_mq_requeue_request(req);
spin_lock_irqsave(req->q->queue_lock, flags);
if (!blk_queue_stopped(req->q))
blk_mq_kick_requeue_list(req->q);
spin_unlock_irqrestore(req->q->queue_lock, flags);
- return;
+ goto release_iod;
}
if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
if (cmd_rq->ctx == CMD_CTX_CANCELLED)
- status = -EINTR;
+ error = -EINTR;
+ else
+ error = status;
} else {
- status = nvme_error_status(status);
+ error = nvme_error_status(status);
}
}
@@ -635,8 +628,9 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
if (cmd_rq->aborted)
dev_warn(nvmeq->dev->dev,
"completing aborted command with status:%04x\n",
- status);
+ error);
+release_iod:
if (iod->nents) {
dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
@@ -649,7 +643,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
}
nvme_free_iod(nvmeq->dev, iod);
- blk_mq_complete_request(req, status);
+ if (likely(!requeue))
+ blk_mq_complete_request(req, error);
}
/* length is in bytes. gfp flags indicates whether we may sleep. */
@@ -901,19 +896,28 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto retry_cmd;
}
if (blk_integrity_rq(req)) {
- if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
+ if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) {
+ dma_unmap_sg(dev->dev, iod->sg, iod->nents,
+ dma_dir);
goto error_cmd;
+ }
sg_init_table(iod->meta_sg, 1);
if (blk_rq_map_integrity_sg(
- req->q, req->bio, iod->meta_sg) != 1)
+ req->q, req->bio, iod->meta_sg) != 1) {
+ dma_unmap_sg(dev->dev, iod->sg, iod->nents,
+ dma_dir);
goto error_cmd;
+ }
if (rq_data_dir(req))
nvme_dif_remap(req, nvme_dif_prep);
- if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
+ if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) {
+ dma_unmap_sg(dev->dev, iod->sg, iod->nents,
+ dma_dir);
goto error_cmd;
+ }
}
}
@@ -940,7 +944,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_MQ_RQ_QUEUE_BUSY;
}
-static int nvme_process_cq(struct nvme_queue *nvmeq)
+static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
u16 head, phase;
@@ -958,6 +962,8 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
head = 0;
phase = !phase;
}
+ if (tag && *tag == cqe.command_id)
+ *tag = -1;
ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
fn(nvmeq, ctx, &cqe);
}
@@ -969,14 +975,19 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
* a big problem.
*/
if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
- return 0;
+ return;
- writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+ if (likely(nvmeq->cq_vector >= 0))
+ writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
nvmeq->cqe_seen = 1;
- return 1;
+}
+
+static void nvme_process_cq(struct nvme_queue *nvmeq)
+{
+ __nvme_process_cq(nvmeq, NULL);
}
static irqreturn_t nvme_irq(int irq, void *data)
@@ -1000,6 +1011,23 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_WAKE_THREAD;
}
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+
+ if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
+ nvmeq->cq_phase) {
+ spin_lock_irq(&nvmeq->q_lock);
+ __nvme_process_cq(nvmeq, &tag);
+ spin_unlock_irq(&nvmeq->q_lock);
+
+ if (tag == -1)
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
@@ -1030,11 +1058,13 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
req->special = (void *)0;
if (buffer && bufflen) {
- ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+ ret = blk_rq_map_kern(q, req, buffer, bufflen,
+ __GFP_DIRECT_RECLAIM);
if (ret)
goto out;
} else if (ubuffer && bufflen) {
- ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+ ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
+ __GFP_DIRECT_RECLAIM);
if (ret)
goto out;
bio = req->bio;
@@ -1277,18 +1307,13 @@ static void nvme_abort_req(struct request *req)
struct nvme_command cmd;
if (!nvmeq->qid || cmd_rq->aborted) {
- unsigned long flags;
-
- spin_lock_irqsave(&dev_list_lock, flags);
- if (work_busy(&dev->reset_work))
- goto out;
- list_del_init(&dev->node);
- dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
- req->tag, nvmeq->qid);
- dev->reset_workfn = nvme_reset_failed_dev;
- queue_work(nvme_workq, &dev->reset_work);
- out:
- spin_unlock_irqrestore(&dev_list_lock, flags);
+ spin_lock(&dev_list_lock);
+ if (!__nvme_reset(dev)) {
+ dev_warn(dev->dev,
+ "I/O %d QID %d timeout, reset controller\n",
+ req->tag, nvmeq->qid);
+ }
+ spin_unlock(&dev_list_lock);
return;
}
@@ -1664,6 +1689,7 @@ static struct blk_mq_ops nvme_mq_ops = {
.init_hctx = nvme_init_hctx,
.init_request = nvme_init_request,
.timeout = nvme_timeout,
+ .poll = nvme_poll,
};
static void nvme_dev_remove_admin(struct nvme_dev *dev)
@@ -1709,11 +1735,15 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
int result;
u32 aqa;
- u64 cap = readq(&dev->bar->cap);
+ u64 cap = lo_hi_readq(&dev->bar->cap);
struct nvme_queue *nvmeq;
- unsigned page_shift = PAGE_SHIFT;
+ /*
+ * default to a 4K page size, with the intention to update this
+ * path in the future to accomodate architectures with differing
+ * kernel and IO page sizes.
+ */
+ unsigned page_shift = 12;
unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
- unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
if (page_shift < dev_page_min) {
dev_err(dev->dev,
@@ -1722,13 +1752,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
1 << page_shift);
return -ENODEV;
}
- if (page_shift > dev_page_max) {
- dev_info(dev->dev,
- "Device maximum page size (%u) smaller than "
- "host (%u); enabling work-around\n",
- 1 << dev_page_max, 1 << page_shift);
- page_shift = dev_page_max;
- }
dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
NVME_CAP_NSSRC(cap) : 0;
@@ -1758,8 +1781,8 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
writel(aqa, &dev->bar->aqa);
- writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
- writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
+ lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
+ lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
result = nvme_enable_ctrl(dev, cap);
if (result)
@@ -1804,7 +1827,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
length = (io.nblocks + 1) << ns->lba_shift;
meta_len = (io.nblocks + 1) * ns->ms;
- metadata = (void __user *)(unsigned long)io.metadata;
+ metadata = (void __user *)(uintptr_t)io.metadata;
write = io.opcode & 1;
if (ns->ext) {
@@ -1844,7 +1867,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.metadata = cpu_to_le64(meta_dma);
status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
- (void __user *)io.addr, length, NULL, 0);
+ (void __user *)(uintptr_t)io.addr, length, NULL, 0);
unmap:
if (meta) {
if (status == NVME_SC_SUCCESS && !write) {
@@ -1886,7 +1909,7 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
timeout = msecs_to_jiffies(cmd.timeout_ms);
status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
- NULL, (void __user *)cmd.addr, cmd.data_len,
+ NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
&cmd.result, timeout);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
@@ -1943,6 +1966,23 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
#define nvme_compat_ioctl NULL
#endif
+static void nvme_free_dev(struct kref *kref);
+static void nvme_free_ns(struct kref *kref)
+{
+ struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
+
+ if (ns->type == NVME_NS_LIGHTNVM)
+ nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
+
+ spin_lock(&dev_list_lock);
+ ns->disk->private_data = NULL;
+ spin_unlock(&dev_list_lock);
+
+ kref_put(&ns->dev->kref, nvme_free_dev);
+ put_disk(ns->disk);
+ kfree(ns);
+}
+
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
int ret = 0;
@@ -1952,21 +1992,17 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
ns = bdev->bd_disk->private_data;
if (!ns)
ret = -ENXIO;
- else if (!kref_get_unless_zero(&ns->dev->kref))
+ else if (!kref_get_unless_zero(&ns->kref))
ret = -ENXIO;
spin_unlock(&dev_list_lock);
return ret;
}
-static void nvme_free_dev(struct kref *kref);
-
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
struct nvme_ns *ns = disk->private_data;
- struct nvme_dev *dev = ns->dev;
-
- kref_put(&dev->kref, nvme_free_dev);
+ kref_put(&ns->kref, nvme_free_ns);
}
static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
@@ -2007,6 +2043,16 @@ static int nvme_revalidate_disk(struct gendisk *disk)
return -ENODEV;
}
+ if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
+ if (nvme_nvm_register(ns->queue, disk->disk_name)) {
+ dev_warn(dev->dev,
+ "%s: LightNVM init failure\n", __func__);
+ kfree(id);
+ return -ENODEV;
+ }
+ ns->type = NVME_NS_LIGHTNVM;
+ }
+
old_ms = ns->ms;
lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
ns->lba_shift = id->lbaf[lbaf].ds;
@@ -2025,6 +2071,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
id->dps & NVME_NS_DPS_PI_MASK : 0;
+ blk_mq_freeze_queue(disk->queue);
if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
ns->ms != old_ms ||
bs != queue_logical_block_size(disk->queue) ||
@@ -2034,22 +2081,116 @@ static int nvme_revalidate_disk(struct gendisk *disk)
ns->pi_type = pi_type;
blk_queue_logical_block_size(ns->queue, bs);
- if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
- !ns->ext)
+ if (ns->ms && !ns->ext)
nvme_init_integrity(ns);
- if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
+ if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
+ !blk_get_integrity(disk)) ||
+ ns->type == NVME_NS_LIGHTNVM)
set_capacity(disk, 0);
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
if (dev->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
+ blk_mq_unfreeze_queue(disk->queue);
kfree(id);
return 0;
}
+static char nvme_pr_type(enum pr_type type)
+{
+ switch (type) {
+ case PR_WRITE_EXCLUSIVE:
+ return 1;
+ case PR_EXCLUSIVE_ACCESS:
+ return 2;
+ case PR_WRITE_EXCLUSIVE_REG_ONLY:
+ return 3;
+ case PR_EXCLUSIVE_ACCESS_REG_ONLY:
+ return 4;
+ case PR_WRITE_EXCLUSIVE_ALL_REGS:
+ return 5;
+ case PR_EXCLUSIVE_ACCESS_ALL_REGS:
+ return 6;
+ default:
+ return 0;
+ }
+};
+
+static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
+ u64 key, u64 sa_key, u8 op)
+{
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+ struct nvme_command c;
+ u8 data[16] = { 0, };
+
+ put_unaligned_le64(key, &data[0]);
+ put_unaligned_le64(sa_key, &data[8]);
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = op;
+ c.common.nsid = cpu_to_le32(ns->ns_id);
+ c.common.cdw10[0] = cpu_to_le32(cdw10);
+
+ return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+}
+
+static int nvme_pr_register(struct block_device *bdev, u64 old,
+ u64 new, unsigned flags)
+{
+ u32 cdw10;
+
+ if (flags & ~PR_FL_IGNORE_KEY)
+ return -EOPNOTSUPP;
+
+ cdw10 = old ? 2 : 0;
+ cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
+ cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
+ return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_reserve(struct block_device *bdev, u64 key,
+ enum pr_type type, unsigned flags)
+{
+ u32 cdw10;
+
+ if (flags & ~PR_FL_IGNORE_KEY)
+ return -EOPNOTSUPP;
+
+ cdw10 = nvme_pr_type(type) << 8;
+ cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
+ return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
+ enum pr_type type, bool abort)
+{
+ u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
+ return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_clear(struct block_device *bdev, u64 key)
+{
+ u32 cdw10 = 1 | (key ? 1 << 3 : 0);
+ return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+ u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
+ return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+}
+
+static const struct pr_ops nvme_pr_ops = {
+ .pr_register = nvme_pr_register,
+ .pr_reserve = nvme_pr_reserve,
+ .pr_release = nvme_pr_release,
+ .pr_preempt = nvme_pr_preempt,
+ .pr_clear = nvme_pr_clear,
+};
+
static const struct block_device_operations nvme_fops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
@@ -2058,6 +2199,7 @@ static const struct block_device_operations nvme_fops = {
.release = nvme_release,
.getgeo = nvme_getgeo,
.revalidate_disk= nvme_revalidate_disk,
+ .pr_ops = &nvme_pr_ops,
};
static int nvme_kthread(void *data)
@@ -2073,14 +2215,11 @@ static int nvme_kthread(void *data)
if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
csts & NVME_CSTS_CFS) {
- if (work_busy(&dev->reset_work))
- continue;
- list_del_init(&dev->node);
- dev_warn(dev->dev,
- "Failed status: %x, reset controller\n",
- readl(&dev->bar->csts));
- dev->reset_workfn = nvme_reset_failed_dev;
- queue_work(nvme_workq, &dev->reset_work);
+ if (!__nvme_reset(dev)) {
+ dev_warn(dev->dev,
+ "Failed status: %x, reset controller\n",
+ readl(&dev->bar->csts));
+ }
continue;
}
for (i = 0; i < dev->queue_count; i++) {
@@ -2126,6 +2265,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
if (!disk)
goto out_free_queue;
+ kref_init(&ns->kref);
ns->ns_id = nsid;
ns->disk = disk;
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
@@ -2135,7 +2275,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
if (dev->max_hw_sectors) {
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
blk_queue_max_segments(ns->queue,
- ((dev->max_hw_sectors << 9) / dev->page_size) + 1);
+ (dev->max_hw_sectors / (dev->page_size >> 9)) + 1);
}
if (dev->stripe_size)
blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
@@ -2162,17 +2302,20 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
if (nvme_revalidate_disk(ns->disk))
goto out_free_disk;
- add_disk(ns->disk);
- if (ns->ms) {
- struct block_device *bd = bdget_disk(ns->disk, 0);
- if (!bd)
- return;
- if (blkdev_get(bd, FMODE_READ, NULL)) {
- bdput(bd);
- return;
+ kref_get(&dev->kref);
+ if (ns->type != NVME_NS_LIGHTNVM) {
+ add_disk(ns->disk);
+ if (ns->ms) {
+ struct block_device *bd = bdget_disk(ns->disk, 0);
+ if (!bd)
+ return;
+ if (blkdev_get(bd, FMODE_READ, NULL)) {
+ bdput(bd);
+ return;
+ }
+ blkdev_reread_part(bd);
+ blkdev_put(bd, FMODE_READ);
}
- blkdev_reread_part(bd);
- blkdev_put(bd, FMODE_READ);
}
return;
out_free_disk:
@@ -2184,6 +2327,13 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
kfree(ns);
}
+/*
+ * Create I/O queues. Failing to create an I/O queue is not an issue,
+ * we can continue with less than the desired amount of queues, and
+ * even a controller without I/O queues an still be used to issue
+ * admin commands. This might be useful to upgrade a buggy firmware
+ * for example.
+ */
static void nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i;
@@ -2193,8 +2343,10 @@ static void nvme_create_io_queues(struct nvme_dev *dev)
break;
for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
- if (nvme_create_queue(dev->queues[i], i))
+ if (nvme_create_queue(dev->queues[i], i)) {
+ nvme_free_queues(dev, i);
break;
+ }
}
static int set_queue_count(struct nvme_dev *dev, int count)
@@ -2357,18 +2509,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
return result;
}
-static void nvme_free_namespace(struct nvme_ns *ns)
-{
- list_del(&ns->list);
-
- spin_lock(&dev_list_lock);
- ns->disk->private_data = NULL;
- spin_unlock(&dev_list_lock);
-
- put_disk(ns->disk);
- kfree(ns);
-}
-
static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
@@ -2402,15 +2542,14 @@ static void nvme_ns_remove(struct nvme_ns *ns)
if (kill)
blk_set_queue_dying(ns->queue);
- if (ns->disk->flags & GENHD_FL_UP) {
- if (blk_get_integrity(ns->disk))
- blk_integrity_unregister(ns->disk);
+ if (ns->disk->flags & GENHD_FL_UP)
del_gendisk(ns->disk);
- }
if (kill || !blk_queue_dying(ns->queue)) {
blk_mq_abort_requeue_list(ns->queue);
blk_cleanup_queue(ns->queue);
- }
+ }
+ list_del_init(&ns->list);
+ kref_put(&ns->kref, nvme_free_ns);
}
static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
@@ -2421,18 +2560,14 @@ static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
for (i = 1; i <= nn; i++) {
ns = nvme_find_ns(dev, i);
if (ns) {
- if (revalidate_disk(ns->disk)) {
+ if (revalidate_disk(ns->disk))
nvme_ns_remove(ns);
- nvme_free_namespace(ns);
- }
} else
nvme_alloc_ns(dev, i);
}
list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
- if (ns->ns_id > nn) {
+ if (ns->ns_id > nn)
nvme_ns_remove(ns);
- nvme_free_namespace(ns);
- }
}
list_sort(NULL, &dev->namespaces, ns_cmp);
}
@@ -2478,7 +2613,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
struct pci_dev *pdev = to_pci_dev(dev->dev);
int res;
struct nvme_id_ctrl *ctrl;
- int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
+ int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12;
res = nvme_identify_ctrl(dev, &ctrl);
if (res) {
@@ -2494,6 +2629,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
if (ctrl->mdts)
dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
+ else
+ dev->max_hw_sectors = UINT_MAX;
if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
(pdev->device == 0x0953) && ctrl->vs[3]) {
unsigned int max_hw_sectors;
@@ -2567,7 +2704,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
goto unmap;
}
- cap = readq(&dev->bar->cap);
+ cap = lo_hi_readq(&dev->bar->cap);
dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
dev->dbs = ((void __iomem *)dev->bar) + 4096;
@@ -2630,7 +2767,7 @@ static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
* queues than admin tags.
*/
set_current_state(TASK_RUNNING);
- nvme_disable_ctrl(dev, readq(&dev->bar->cap));
+ nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap));
nvme_clear_queue(dev->queues[0]);
flush_kthread_worker(dq->worker);
nvme_disable_queue(dev, 0);
@@ -2657,6 +2794,10 @@ static void nvme_del_queue_end(struct nvme_queue *nvmeq)
{
struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
nvme_put_dq(dq);
+
+ spin_lock_irq(&nvmeq->q_lock);
+ nvme_process_cq(nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
}
static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
@@ -2822,9 +2963,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
static void nvme_dev_remove(struct nvme_dev *dev)
{
- struct nvme_ns *ns;
+ struct nvme_ns *ns, *next;
- list_for_each_entry(ns, &dev->namespaces, list)
+ list_for_each_entry_safe(ns, next, &dev->namespaces, list)
nvme_ns_remove(ns);
}
@@ -2880,21 +3021,12 @@ static void nvme_release_instance(struct nvme_dev *dev)
spin_unlock(&dev_list_lock);
}
-static void nvme_free_namespaces(struct nvme_dev *dev)
-{
- struct nvme_ns *ns, *next;
-
- list_for_each_entry_safe(ns, next, &dev->namespaces, list)
- nvme_free_namespace(ns);
-}
-
static void nvme_free_dev(struct kref *kref)
{
struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
put_device(dev->dev);
put_device(dev->device);
- nvme_free_namespaces(dev);
nvme_release_instance(dev);
if (dev->tagset.tags)
blk_mq_free_tag_set(&dev->tagset);
@@ -2968,14 +3100,15 @@ static const struct file_operations nvme_dev_fops = {
.compat_ioctl = nvme_dev_ioctl,
};
-static int nvme_dev_start(struct nvme_dev *dev)
+static void nvme_probe_work(struct work_struct *work)
{
- int result;
+ struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
bool start_thread = false;
+ int result;
result = nvme_dev_map(dev);
if (result)
- return result;
+ goto out;
result = nvme_configure_admin_queue(dev);
if (result)
@@ -3010,7 +3143,20 @@ static int nvme_dev_start(struct nvme_dev *dev)
goto free_tags;
dev->event_limit = 1;
- return result;
+
+ /*
+ * Keep the controller around but remove all namespaces if we don't have
+ * any working I/O queue.
+ */
+ if (dev->online_queues < 2) {
+ dev_warn(dev->dev, "IO queues not created\n");
+ nvme_dev_remove(dev);
+ } else {
+ nvme_unfreeze_queues(dev);
+ nvme_dev_add(dev);
+ }
+
+ return;
free_tags:
nvme_dev_remove_admin(dev);
@@ -3022,7 +3168,9 @@ static int nvme_dev_start(struct nvme_dev *dev)
nvme_dev_list_remove(dev);
unmap:
nvme_dev_unmap(dev);
- return result;
+ out:
+ if (!work_busy(&dev->reset_work))
+ nvme_dead_ctrl(dev);
}
static int nvme_remove_dead_ctrl(void *arg)
@@ -3036,33 +3184,6 @@ static int nvme_remove_dead_ctrl(void *arg)
return 0;
}
-static void nvme_remove_disks(struct work_struct *ws)
-{
- struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
-
- nvme_free_queues(dev, 1);
- nvme_dev_remove(dev);
-}
-
-static int nvme_dev_resume(struct nvme_dev *dev)
-{
- int ret;
-
- ret = nvme_dev_start(dev);
- if (ret)
- return ret;
- if (dev->online_queues < 2) {
- spin_lock(&dev_list_lock);
- dev->reset_workfn = nvme_remove_disks;
- queue_work(nvme_workq, &dev->reset_work);
- spin_unlock(&dev_list_lock);
- } else {
- nvme_unfreeze_queues(dev);
- nvme_dev_add(dev);
- }
- return 0;
-}
-
static void nvme_dead_ctrl(struct nvme_dev *dev)
{
dev_warn(dev->dev, "Device failed to resume\n");
@@ -3075,8 +3196,9 @@ static void nvme_dead_ctrl(struct nvme_dev *dev)
}
}
-static void nvme_dev_reset(struct nvme_dev *dev)
+static void nvme_reset_work(struct work_struct *ws)
{
+ struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
bool in_probe = work_busy(&dev->probe_work);
nvme_dev_shutdown(dev);
@@ -3096,31 +3218,24 @@ static void nvme_dev_reset(struct nvme_dev *dev)
schedule_work(&dev->probe_work);
}
-static void nvme_reset_failed_dev(struct work_struct *ws)
-{
- struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
- nvme_dev_reset(dev);
-}
-
-static void nvme_reset_workfn(struct work_struct *work)
+static int __nvme_reset(struct nvme_dev *dev)
{
- struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
- dev->reset_workfn(work);
+ if (work_pending(&dev->reset_work))
+ return -EBUSY;
+ list_del_init(&dev->node);
+ queue_work(nvme_workq, &dev->reset_work);
+ return 0;
}
static int nvme_reset(struct nvme_dev *dev)
{
- int ret = -EBUSY;
+ int ret;
if (!dev->admin_q || blk_queue_dying(dev->admin_q))
return -ENODEV;
spin_lock(&dev_list_lock);
- if (!work_pending(&dev->reset_work)) {
- dev->reset_workfn = nvme_reset_failed_dev;
- queue_work(nvme_workq, &dev->reset_work);
- ret = 0;
- }
+ ret = __nvme_reset(dev);
spin_unlock(&dev_list_lock);
if (!ret) {
@@ -3147,7 +3262,6 @@ static ssize_t nvme_sysfs_reset(struct device *dev,
}
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
-static void nvme_async_probe(struct work_struct *work);
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
@@ -3170,8 +3284,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto free;
INIT_LIST_HEAD(&dev->namespaces);
- dev->reset_workfn = nvme_reset_failed_dev;
- INIT_WORK(&dev->reset_work, nvme_reset_workfn);
+ INIT_WORK(&dev->reset_work, nvme_reset_work);
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
result = nvme_set_instance(dev);
@@ -3199,7 +3312,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_LIST_HEAD(&dev->node);
INIT_WORK(&dev->scan_work, nvme_dev_scan);
- INIT_WORK(&dev->probe_work, nvme_async_probe);
+ INIT_WORK(&dev->probe_work, nvme_probe_work);
schedule_work(&dev->probe_work);
return 0;
@@ -3219,14 +3332,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return result;
}
-static void nvme_async_probe(struct work_struct *work)
-{
- struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
-
- if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work))
- nvme_dead_ctrl(dev);
-}
-
static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
@@ -3234,7 +3339,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
if (prepare)
nvme_dev_shutdown(dev);
else
- nvme_dev_resume(dev);
+ schedule_work(&dev->probe_work);
}
static void nvme_shutdown(struct pci_dev *pdev)
@@ -3288,10 +3393,7 @@ static int nvme_resume(struct device *dev)
struct pci_dev *pdev = to_pci_dev(dev);
struct nvme_dev *ndev = pci_get_drvdata(pdev);
- if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
- ndev->reset_workfn = nvme_reset_failed_dev;
- queue_work(nvme_workq, &ndev->reset_work);
- }
+ schedule_work(&ndev->probe_work);
return 0;
}
#endif
@@ -3312,6 +3414,7 @@ static const struct pci_error_handlers nvme_err_handler = {
static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
+ { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);