about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/nvme/host/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r-- drivers/nvme/host/core.c 703
1 files changed, 348 insertions, 355 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88cff309d8e4..56e2a22e8a02 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,26 +89,38 @@ static dev_t nvme_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;
-static int _nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
+static void nvme_update_bdev_size(struct gendisk *disk)
+{
+ struct block_device *bdev = bdget_disk(disk, 0);
+
+ if (bdev) {
+ bd_set_nr_sectors(bdev, get_capacity(disk));
+ bdput(bdev);
+ }
+}
+
+/*
+ * Prepare a queue for teardown.
+ *
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
+ * holding bd_mutex. This will end buffered writers dirtying pages that can't
+ * be synced.
+ */
static void nvme_set_queue_dying(struct nvme_ns *ns)
{
- /*
- * Revalidating a dead namespace sets capacity to 0. This will end
- * buffered writers dirtying pages that can't be synced.
- */
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
+
blk_set_queue_dying(ns->queue);
- /* Forcibly unquiesce queues to avoid blocking dispatch */
blk_mq_unquiesce_queue(ns->queue);
- /*
- * Revalidate after unblocking dispatchers that may be holding bd_butex
- */
- revalidate_disk(ns->disk);
+
+ set_capacity(ns->disk, 0);
+ nvme_update_bdev_size(ns->disk);
}
static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@ -241,17 +253,6 @@ static blk_status_t nvme_error_status(u16 status)
}
}
-static inline bool nvme_req_needs_retry(struct request *req)
-{
- if (blk_noretry_request(req))
- return false;
- if (nvme_req(req)->status & NVME_SC_DNR)
- return false;
- if (nvme_req(req)->retries >= nvme_max_retries)
- return false;
- return true;
-}
-
static void nvme_retry_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
@@ -268,34 +269,67 @@ static void nvme_retry_req(struct request *req)
blk_mq_delay_kick_requeue_list(req->q, delay);
}
-void nvme_complete_rq(struct request *req)
+enum nvme_disposition {
+ COMPLETE,
+ RETRY,
+ FAILOVER,
+};
+
+static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
{
- blk_status_t status = nvme_error_status(nvme_req(req)->status);
+ if (likely(nvme_req(req)->status == 0))
+ return COMPLETE;
- trace_nvme_complete_rq(req);
+ if (blk_noretry_request(req) ||
+ (nvme_req(req)->status & NVME_SC_DNR) ||
+ nvme_req(req)->retries >= nvme_max_retries)
+ return COMPLETE;
- nvme_cleanup_cmd(req);
+ if (req->cmd_flags & REQ_NVME_MPATH) {
+ if (nvme_is_path_error(nvme_req(req)->status) ||
+ blk_queue_dying(req->q))
+ return FAILOVER;
+ } else {
+ if (blk_queue_dying(req->q))
+ return COMPLETE;
+ }
- if (nvme_req(req)->ctrl->kas)
- nvme_req(req)->ctrl->comp_seen = true;
+ return RETRY;
+}
- if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
- if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
- return;
+static inline void nvme_end_req(struct request *req)
+{
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
- if (!blk_queue_dying(req->q)) {
- nvme_retry_req(req);
- return;
- }
- } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- req_op(req) == REQ_OP_ZONE_APPEND) {
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = nvme_lba_to_sect(req->q->queuedata,
le64_to_cpu(nvme_req(req)->result.u64));
- }
nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
}
+
+void nvme_complete_rq(struct request *req)
+{
+ trace_nvme_complete_rq(req);
+ nvme_cleanup_cmd(req);
+
+ if (nvme_req(req)->ctrl->kas)
+ nvme_req(req)->ctrl->comp_seen = true;
+
+ switch (nvme_decide_disposition(req)) {
+ case COMPLETE:
+ nvme_end_req(req);
+ return;
+ case RETRY:
+ nvme_retry_req(req);
+ return;
+ case FAILOVER:
+ nvme_failover_req(req);
+ return;
+ }
+}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
bool nvme_cancel_request(struct request *req, void *data, bool reserved)
@@ -330,7 +364,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -340,7 +374,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -350,7 +384,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_NEW:
case NVME_CTRL_RESETTING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -361,7 +395,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -371,7 +405,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_DELETING:
case NVME_CTRL_DEAD:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -380,7 +414,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (old_state) {
case NVME_CTRL_DELETING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -933,10 +967,10 @@ static u32 nvme_known_admin_effects(u8 opcode)
{
switch (opcode) {
case nvme_admin_format_nvm:
- return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+ return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
NVME_CMD_EFFECTS_CSE_MASK;
case nvme_admin_sanitize_nvm:
- return NVME_CMD_EFFECTS_CSE_MASK;
+ return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
default:
break;
}
@@ -974,7 +1008,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
* For simplicity, IO to all namespaces is quiesced even if the command
* effects say only one namespace is affected.
*/
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
mutex_lock(&ctrl->scan_lock);
mutex_lock(&ctrl->subsys->lock);
nvme_mpath_start_freeze(ctrl->subsys);
@@ -985,36 +1019,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return effects;
}
-static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
-{
- struct nvme_ns *ns;
-
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
- if (_nvme_revalidate_disk(ns->disk))
- nvme_set_queue_dying(ns);
- else if (blk_queue_is_zoned(ns->disk->queue)) {
- /*
- * IO commands are required to fully revalidate a zoned
- * device. Force the command effects to trigger rescan
- * work so report zones can run in a context with
- * unfrozen IO queues.
- */
- *effects |= NVME_CMD_EFFECTS_NCC;
- }
- up_read(&ctrl->namespaces_rwsem);
-}
-
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
{
- /*
- * Revalidate LBA changes prior to unfreezing. This is necessary to
- * prevent memory corruption if a logical block size was changed by
- * this command.
- */
- if (effects & NVME_CMD_EFFECTS_LBCC)
- nvme_update_formats(ctrl, &effects);
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
@@ -1274,6 +1281,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
int status, pos, len;
void *data;
+ if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
+ return 0;
if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
return 0;
@@ -1317,19 +1326,8 @@ free_data:
return status;
}
-static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
-{
- struct nvme_command c = { };
-
- c.identify.opcode = nvme_admin_identify;
- c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
- c.identify.nsid = cpu_to_le32(nsid);
- return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
- NVME_IDENTIFY_DATA_SIZE);
-}
-
-static int nvme_identify_ns(struct nvme_ctrl *ctrl,
- unsigned nsid, struct nvme_id_ns **id)
+static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+ struct nvme_ns_ids *ids, struct nvme_id_ns **id)
{
struct nvme_command c = { };
int error;
@@ -1346,9 +1344,24 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl,
error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
- kfree(*id);
+ goto out_free_id;
}
+ error = -ENODEV;
+ if ((*id)->ncap == 0) /* namespace not allocated or attached */
+ goto out_free_id;
+
+ if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+ !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+ memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
+ if (ctrl->vs >= NVME_VS(1, 2, 0) &&
+ !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+ memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
+
+ return 0;
+
+out_free_id:
+ kfree(*id);
return error;
}
@@ -1870,20 +1883,6 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
nvme_lba_to_sect(ns, max_blocks));
}
-static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
- struct nvme_id_ns *id, struct nvme_ns_ids *ids)
-{
- memset(ids, 0, sizeof(*ids));
-
- if (ctrl->vs >= NVME_VS(1, 1, 0))
- memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
- if (ctrl->vs >= NVME_VS(1, 2, 0))
- memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
- if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
- return nvme_identify_ns_descs(ctrl, nsid, ids);
- return 0;
-}
-
static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
{
return !uuid_is_null(&ids->uuid) ||
@@ -1924,6 +1923,68 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return 0;
}
+static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+
+ /*
+ * The PI implementation requires the metadata size to be equal to the
+ * t10 pi tuple size.
+ */
+ ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+ if (ns->ms == sizeof(struct t10_pi_tuple))
+ ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ else
+ ns->pi_type = 0;
+
+ ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+ return 0;
+ if (ctrl->ops->flags & NVME_F_FABRICS) {
+ /*
+ * The NVMe over Fabrics specification only supports metadata as
+ * part of the extended data LBA. We rely on HCA/HBA support to
+ * remap the separate metadata buffer from the block layer.
+ */
+ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
+ return -EINVAL;
+ if (ctrl->max_integrity_segments)
+ ns->features |=
+ (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ } else {
+ /*
+ * For PCIe controllers, we can't easily remap the separate
+ * metadata buffer from the block layer and thus require a
+ * separate metadata buffer for block layer metadata/PI support.
+ * We allow extended LBAs for the passthrough interface, though.
+ */
+ if (id->flbas & NVME_NS_FLBAS_META_EXT)
+ ns->features |= NVME_NS_EXT_LBAS;
+ else
+ ns->features |= NVME_NS_METADATA_SUPPORTED;
+ }
+
+ return 0;
+}
+
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+ struct request_queue *q)
+{
+ bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
+
+ if (ctrl->max_hw_sectors) {
+ u32 max_segments =
+ (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+
+ max_segments = min_not_zero(max_segments, ctrl->max_segments);
+ blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+ blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+ }
+ blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
+ blk_queue_dma_alignment(q, 7);
+ blk_queue_write_cache(q, vwc, vwc);
+}
+
static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
@@ -1931,11 +1992,15 @@ static void nvme_update_disk_info(struct gendisk *disk,
unsigned short bs = 1 << ns->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
+ /*
+ * The block layer can't support LBA sizes larger than the page size
+ * yet, so catch this early and don't allow block I/O.
+ */
if (ns->lba_shift > PAGE_SHIFT) {
- /* unsupported block size, set capacity to 0 later */
+ capacity = 0;
bs = (1 << 9);
}
- blk_mq_freeze_queue(disk->queue);
+
blk_integrity_unregister(disk);
atomic_bs = phys_bs = bs;
@@ -1970,13 +2035,6 @@ static void nvme_update_disk_info(struct gendisk *disk,
blk_queue_io_opt(disk->queue, io_opt);
/*
- * The block layer can't support LBA sizes larger than the page size
- * yet, so catch this early and don't allow block I/O.
- */
- if (ns->lba_shift > PAGE_SHIFT)
- capacity = 0;
-
- /*
* Register a metadata profile for PI, or the plain non-integrity NVMe
* metadata masquerading as Type 0 if supported, otherwise reject block
* I/O to namespaces with metadata except when the namespace supports
@@ -2000,162 +2058,88 @@ static void nvme_update_disk_info(struct gendisk *disk,
set_disk_ro(disk, true);
else
set_disk_ro(disk, false);
+}
- blk_mq_unfreeze_queue(disk->queue);
+static inline bool nvme_first_scan(struct gendisk *disk)
+{
+ /* nvme_alloc_ns() scans the disk prior to adding it */
+ return !(disk->flags & GENHD_FL_UP);
}
-static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
{
- unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
- struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
- int ret;
u32 iob;
- /*
- * If identify namespace failed, use default 512 byte block size so
- * block layer can use before failing read/write for 0 capacity.
- */
- ns->lba_shift = id->lbaf[lbaf].ds;
- if (ns->lba_shift == 0)
- ns->lba_shift = 9;
-
- switch (ns->head->ids.csi) {
- case NVME_CSI_NVM:
- break;
- case NVME_CSI_ZNS:
- ret = nvme_update_zone_info(disk, ns, lbaf);
- if (ret) {
- dev_warn(ctrl->device,
- "failed to add zoned namespace:%u ret:%d\n",
- ns->head->ns_id, ret);
- return ret;
- }
- break;
- default:
- dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
- ns->head->ids.csi, ns->head->ns_id);
- return -ENODEV;
- }
-
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
else
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
- ns->features = 0;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
- /* the PI implementation requires metadata equal t10 pi tuple size */
- if (ns->ms == sizeof(struct t10_pi_tuple))
- ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
- else
- ns->pi_type = 0;
+ if (!iob)
+ return;
- if (ns->ms) {
- /*
- * For PCIe only the separate metadata pointer is supported,
- * as the block layer supplies metadata in a separate bio_vec
- * chain. For Fabrics, only metadata as part of extended data
- * LBA is supported on the wire per the Fabrics specification,
- * but the HBA/HCA will do the remapping from the separate
- * metadata buffers for us.
- */
- if (id->flbas & NVME_NS_FLBAS_META_EXT) {
- ns->features |= NVME_NS_EXT_LBAS;
- if ((ctrl->ops->flags & NVME_F_FABRICS) &&
- (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) &&
- ctrl->max_integrity_segments)
- ns->features |= NVME_NS_METADATA_SUPPORTED;
- } else {
- if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS))
- return -EINVAL;
- if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
- ns->features |= NVME_NS_METADATA_SUPPORTED;
- }
+ if (!is_power_of_2(iob)) {
+ if (nvme_first_scan(ns->disk))
+ pr_warn("%s: ignoring unaligned IO boundary:%u\n",
+ ns->disk->disk_name, iob);
+ return;
}
- if (iob)
- blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
- nvme_update_disk_info(disk, ns, id);
-#ifdef CONFIG_NVME_MULTIPATH
- if (ns->head->disk) {
- nvme_update_disk_info(ns->head->disk, ns, id);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- nvme_mpath_update_disk_size(ns->head->disk);
+ if (blk_queue_is_zoned(ns->disk->queue)) {
+ if (nvme_first_scan(ns->disk))
+ pr_warn("%s: ignoring zoned namespace IO boundary\n",
+ ns->disk->disk_name);
+ return;
}
-#endif
- return 0;
+
+ blk_queue_chunk_sectors(ns->queue, iob);
}
-static int _nvme_revalidate_disk(struct gendisk *disk)
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
{
- struct nvme_ns *ns = disk->private_data;
- struct nvme_ctrl *ctrl = ns->ctrl;
- struct nvme_id_ns *id;
- struct nvme_ns_ids ids;
- int ret = 0;
-
- if (test_bit(NVME_NS_DEAD, &ns->flags)) {
- set_capacity(disk, 0);
- return -ENODEV;
- }
+ unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+ int ret;
- ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
- if (ret)
- goto out;
+ blk_mq_freeze_queue(ns->disk->queue);
+ ns->lba_shift = id->lbaf[lbaf].ds;
+ nvme_set_queue_limits(ns->ctrl, ns->queue);
- if (id->ncap == 0) {
- ret = -ENODEV;
- goto free_id;
+ if (ns->head->ids.csi == NVME_CSI_ZNS) {
+ ret = nvme_update_zone_info(ns, lbaf);
+ if (ret)
+ goto out_unfreeze;
}
- ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ ret = nvme_configure_metadata(ns, id);
if (ret)
- goto free_id;
+ goto out_unfreeze;
+ nvme_set_chunk_sectors(ns, id);
+ nvme_update_disk_info(ns->disk, ns, id);
+ blk_mq_unfreeze_queue(ns->disk->queue);
- if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
- dev_err(ctrl->device,
- "identifiers changed for nsid %d\n", ns->head->ns_id);
- ret = -ENODEV;
- goto free_id;
+ if (blk_queue_is_zoned(ns->queue)) {
+ ret = nvme_revalidate_zones(ns);
+ if (ret)
+ return ret;
}
- ret = __nvme_revalidate_disk(disk, id);
-free_id:
- kfree(id);
-out:
- /*
- * Only fail the function if we got a fatal error back from the
- * device, otherwise ignore the error and just move on.
- */
- if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
- ret = 0;
- else if (ret > 0)
- ret = blk_status_to_errno(nvme_error_status(ret));
- return ret;
-}
-
-static int nvme_revalidate_disk(struct gendisk *disk)
-{
- int ret;
-
- ret = _nvme_revalidate_disk(disk);
- if (ret)
- return ret;
-
-#ifdef CONFIG_BLK_DEV_ZONED
- if (blk_queue_is_zoned(disk->queue)) {
- struct nvme_ns *ns = disk->private_data;
- struct nvme_ctrl *ctrl = ns->ctrl;
-
- ret = blk_revalidate_disk_zones(disk, NULL);
- if (!ret)
- blk_queue_max_zone_append_sectors(disk->queue,
- ctrl->max_zone_append);
+#ifdef CONFIG_NVME_MULTIPATH
+ if (ns->head->disk) {
+ blk_mq_freeze_queue(ns->head->disk->queue);
+ nvme_update_disk_info(ns->head->disk, ns, id);
+ blk_stack_limits(&ns->head->disk->queue->limits,
+ &ns->queue->limits, 0);
+ blk_queue_update_readahead(ns->head->disk->queue);
+ nvme_update_bdev_size(ns->head->disk);
+ blk_mq_unfreeze_queue(ns->head->disk->queue);
}
#endif
+ return 0;
+
+out_unfreeze:
+ blk_mq_unfreeze_queue(ns->disk->queue);
return ret;
}
@@ -2288,7 +2272,6 @@ static const struct block_device_operations nvme_fops = {
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
- .revalidate_disk= nvme_revalidate_disk,
.report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -2438,26 +2421,6 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
- struct request_queue *q)
-{
- bool vwc = false;
-
- if (ctrl->max_hw_sectors) {
- u32 max_segments =
- (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
-
- max_segments = min_not_zero(max_segments, ctrl->max_segments);
- blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
- }
- blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
- blk_queue_dma_alignment(q, 7);
- if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
- vwc = true;
- blk_queue_write_cache(q, vwc, vwc);
-}
-
static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
{
__le64 ts;
@@ -2961,26 +2924,10 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
-static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
-{
- struct nvme_cel *cel, *ret = NULL;
-
- spin_lock(&ctrl->lock);
- list_for_each_entry(cel, &ctrl->cels, entry) {
- if (cel->csi == csi) {
- ret = cel;
- break;
- }
- }
- spin_unlock(&ctrl->lock);
-
- return ret;
-}
-
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
- struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
+ struct nvme_cel *cel = xa_load(&ctrl->cels, csi);
int ret;
if (cel)
@@ -2990,7 +2937,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
if (!cel)
return -ENOMEM;
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
+ ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
&cel->log, sizeof(cel->log), 0);
if (ret) {
kfree(cel);
@@ -2998,10 +2945,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
}
cel->csi = csi;
-
- spin_lock(&ctrl->lock);
- list_add_tail(&cel->entry, &ctrl->cels);
- spin_unlock(&ctrl->lock);
+ xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL);
out:
*log = &cel->log;
return 0;
@@ -3185,8 +3129,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
- if (!ctrl->identified)
- nvme_hwmon_init(ctrl);
+ if (!ctrl->identified) {
+ ret = nvme_hwmon_init(ctrl);
+ if (ret < 0)
+ return ret;
+ }
ctrl->identified = true;
@@ -3210,10 +3157,26 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
return -EWOULDBLOCK;
}
+ nvme_get_ctrl(ctrl);
+ if (!try_module_get(ctrl->ops->module)) {
+ nvme_put_ctrl(ctrl);
+ return -EINVAL;
+ }
+
file->private_data = ctrl;
return 0;
}
+static int nvme_dev_release(struct inode *inode, struct file *file)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(inode->i_cdev, struct nvme_ctrl, cdev);
+
+ module_put(ctrl->ops->module);
+ nvme_put_ctrl(ctrl);
+ return 0;
+}
+
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
struct nvme_ns *ns;
@@ -3276,6 +3239,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
static const struct file_operations nvme_dev_fops = {
.owner = THIS_MODULE,
.open = nvme_dev_open,
+ .release = nvme_dev_release,
.unlocked_ioctl = nvme_dev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
@@ -3474,10 +3438,6 @@ static ssize_t nvme_sysfs_delete(struct device *dev,
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
- /* Can't delete non-created controllers */
- if (!ctrl->created)
- return -EBUSY;
-
if (device_remove_file_self(dev, attr))
nvme_delete_ctrl_sync(ctrl);
return count;
@@ -3654,6 +3614,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_hostid.attr && !ctrl->opts)
return 0;
+ if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
+ return 0;
+ if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
+ return 0;
return a->mode;
}
@@ -3762,25 +3726,16 @@ out:
}
static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
- struct nvme_id_ns *id)
+ struct nvme_ns_ids *ids, bool is_shared)
{
struct nvme_ctrl *ctrl = ns->ctrl;
- bool is_shared = id->nmic & NVME_NS_NMIC_SHARED;
struct nvme_ns_head *head = NULL;
- struct nvme_ns_ids ids;
int ret = 0;
- ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
- if (ret) {
- if (ret < 0)
- return ret;
- return blk_status_to_errno(nvme_error_status(ret));
- }
-
mutex_lock(&ctrl->subsys->lock);
head = nvme_find_ns_head(ctrl->subsys, nsid);
if (!head) {
- head = nvme_alloc_ns_head(ctrl, nsid, &ids);
+ head = nvme_alloc_ns_head(ctrl, nsid, ids);
if (IS_ERR(head)) {
ret = PTR_ERR(head);
goto out_unlock;
@@ -3793,7 +3748,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
"Duplicate unshared namespace %d\n", nsid);
goto out_put_ns_head;
}
- if (!nvme_ns_ids_equal(&head->ids, &ids)) {
+ if (!nvme_ns_ids_equal(&head->ids, ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
nsid);
@@ -3841,7 +3796,8 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+ struct nvme_ns_ids *ids)
{
struct nvme_ns *ns;
struct gendisk *disk;
@@ -3849,17 +3805,19 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
char disk_name[DISK_NAME_LEN];
int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
+ if (nvme_identify_ns(ctrl, nsid, ids, &id))
+ return;
+
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
- return;
+ goto out_free_id;
ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue))
goto out_free_ns;
if (ctrl->opts && ctrl->opts->data_digest)
- ns->queue->backing_dev_info->capabilities
- |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
@@ -3867,23 +3825,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
-
kref_init(&ns->kref);
- ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
-
- blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- nvme_set_queue_limits(ctrl, ns->queue);
- ret = nvme_identify_ns(ctrl, nsid, &id);
+ ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED);
if (ret)
goto out_free_queue;
-
- if (id->ncap == 0) /* no namespace (legacy quirk) */
- goto out_free_id;
-
- ret = nvme_init_ns_head(ns, nsid, id);
- if (ret)
- goto out_free_id;
nvme_set_disk_name(disk_name, ns, ctrl, &flags);
disk = alloc_disk_node(0, node);
@@ -3897,7 +3843,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
ns->disk = disk;
- if (__nvme_revalidate_disk(disk, id))
+ if (nvme_update_ns_info(ns, id))
goto out_put_disk;
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
@@ -3932,12 +3878,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
list_del_init(&ns->head->entry);
mutex_unlock(&ctrl->subsys->lock);
nvme_put_ns_head(ns->head);
- out_free_id:
- kfree(id);
out_free_queue:
blk_cleanup_queue(ns->queue);
out_free_ns:
kfree(ns);
+ out_free_id:
+ kfree(id);
}
static void nvme_ns_remove(struct nvme_ns *ns)
@@ -3945,6 +3891,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
+ set_capacity(ns->disk, 0);
nvme_fault_inject_fini(&ns->fault_inject);
mutex_lock(&ns->ctrl->subsys->lock);
@@ -3982,17 +3929,75 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
}
}
-static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+{
+ struct nvme_id_ns *id;
+ int ret = -ENODEV;
+
+ if (test_bit(NVME_NS_DEAD, &ns->flags))
+ goto out;
+
+ ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
+ if (ret)
+ goto out;
+
+ ret = -ENODEV;
+ if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
+ dev_err(ns->ctrl->device,
+ "identifiers changed for nsid %d\n", ns->head->ns_id);
+ goto out_free_id;
+ }
+
+ ret = nvme_update_ns_info(ns, id);
+
+out_free_id:
+ kfree(id);
+out:
+ /*
+ * Only remove the namespace if we got a fatal error back from the
+ * device, otherwise ignore the error and just move on.
+ *
+ * TODO: we should probably schedule a delayed retry here.
+ */
+ if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR)))
+ nvme_ns_remove(ns);
+ else
+ revalidate_disk_size(ns->disk, true);
+}
+
+static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
+ struct nvme_ns_ids ids = { };
struct nvme_ns *ns;
+ if (nvme_identify_ns_descs(ctrl, nsid, &ids))
+ return;
+
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
- if (revalidate_disk(ns->disk))
- nvme_ns_remove(ns);
+ nvme_validate_ns(ns, &ids);
nvme_put_ns(ns);
- } else
- nvme_alloc_ns(ctrl, nsid);
+ return;
+ }
+
+ switch (ids.csi) {
+ case NVME_CSI_NVM:
+ nvme_alloc_ns(ctrl, nsid, &ids);
+ break;
+ case NVME_CSI_ZNS:
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+ dev_warn(ctrl->device,
+ "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+ nsid);
+ break;
+ }
+ nvme_alloc_ns(ctrl, nsid, &ids);
+ break;
+ default:
+ dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
+ ids.csi, nsid);
+ break;
+ }
}
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -4028,7 +4033,14 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
return -ENOMEM;
for (;;) {
- ret = nvme_identify_ns_list(ctrl, prev, ns_list);
+ struct nvme_command cmd = {
+ .identify.opcode = nvme_admin_identify,
+ .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST,
+ .identify.nsid = cpu_to_le32(prev),
+ };
+
+ ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
+ NVME_IDENTIFY_DATA_SIZE);
if (ret)
goto free;
@@ -4037,7 +4049,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
if (!nsid) /* end of the list? */
goto out;
- nvme_validate_ns(ctrl, nsid);
+ nvme_validate_or_alloc_ns(ctrl, nsid);
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
@@ -4060,7 +4072,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
kfree(id);
for (i = 1; i <= nn; i++)
- nvme_validate_ns(ctrl, i);
+ nvme_validate_or_alloc_ns(ctrl, i);
nvme_remove_invalid_namespaces(ctrl, nn);
}
@@ -4348,7 +4360,6 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
nvme_queue_scan(ctrl);
nvme_start_queues(ctrl);
}
- ctrl->created = true;
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
@@ -4366,15 +4377,11 @@ static void nvme_free_ctrl(struct device *dev)
struct nvme_ctrl *ctrl =
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
- struct nvme_cel *cel, *next;
- if (subsys && ctrl->instance != subsys->instance)
+ if (!subsys || ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
- list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
- list_del(&cel->entry);
- kfree(cel);
- }
+ xa_destroy(&ctrl->cels);
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
@@ -4406,7 +4413,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
- INIT_LIST_HEAD(&ctrl->cels);
+ xa_init(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
@@ -4512,7 +4519,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
struct nvme_ns *ns;
@@ -4523,6 +4530,7 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
break;
}
up_read(&ctrl->namespaces_rwsem);
+ return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4585,28 +4593,13 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
-struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
{
- struct nvme_ctrl *ctrl;
- struct file *f;
-
- f = filp_open(path, O_RDWR, 0);
- if (IS_ERR(f))
- return ERR_CAST(f);
-
- if (f->f_op != &nvme_dev_fops) {
- ctrl = ERR_PTR(-EINVAL);
- goto out_close;
- }
-
- ctrl = f->private_data;
- nvme_get_ctrl(ctrl);
-
-out_close:
- filp_close(f, NULL);
- return ctrl;
+ if (file->f_op != &nvme_dev_fops)
+ return NULL;
+ return file->private_data;
}
-EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
/*
* Check we didn't inadvertently grow the command structure sizes: