aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/nvme
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme')
-rw-r--r--drivers/nvme/host/Makefile1
-rw-r--r--drivers/nvme/host/core.c582
-rw-r--r--drivers/nvme/host/fabrics.c2
-rw-r--r--drivers/nvme/host/fabrics.h3
-rw-r--r--drivers/nvme/host/fc.c10
-rw-r--r--drivers/nvme/host/hwmon.c5
-rw-r--r--drivers/nvme/host/lightnvm.c4
-rw-r--r--drivers/nvme/host/multipath.c55
-rw-r--r--drivers/nvme/host/nvme.h100
-rw-r--r--drivers/nvme/host/pci.c197
-rw-r--r--drivers/nvme/host/rdma.c134
-rw-r--r--drivers/nvme/host/tcp.c109
-rw-r--r--drivers/nvme/host/zns.c256
-rw-r--r--drivers/nvme/target/Kconfig12
-rw-r--r--drivers/nvme/target/Makefile1
-rw-r--r--drivers/nvme/target/admin-cmd.c26
-rw-r--r--drivers/nvme/target/configfs.c117
-rw-r--r--drivers/nvme/target/core.c81
-rw-r--r--drivers/nvme/target/discovery.c2
-rw-r--r--drivers/nvme/target/fc.c30
-rw-r--r--drivers/nvme/target/fcloop.c29
-rw-r--r--drivers/nvme/target/loop.c17
-rw-r--r--drivers/nvme/target/nvmet.h60
-rw-r--r--drivers/nvme/target/passthru.c544
-rw-r--r--drivers/nvme/target/rdma.c17
-rw-r--r--drivers/nvme/target/tcp.c13
26 files changed, 1936 insertions, 471 deletions
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index fc7b26be692d..d7f6a87687b8 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -13,6 +13,7 @@ nvme-core-y := core.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
+nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index add040168e67..88cff309d8e4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;
-static int nvme_revalidate_disk(struct gendisk *disk);
+static int _nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
@@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
* Revalidating a dead namespace sets capacity to 0. This will end
* buffered writers dirtying pages that can't be synced.
*/
- if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
@@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
nvme_retry_req(req);
return;
}
+ } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ req_op(req) == REQ_OP_ZONE_APPEND) {
+ req->__sector = nvme_lba_to_sect(req->q->queuedata,
+ le64_to_cpu(nvme_req(req)->result.u64));
}
nvme_trace_bio_complete(req, status);
@@ -304,7 +308,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
return true;
nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
- blk_mq_force_complete_rq(req);
+ blk_mq_complete_request(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -362,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
break;
}
break;
+ case NVME_CTRL_DELETING_NOIO:
+ switch (old_state) {
+ case NVME_CTRL_DELETING:
+ case NVME_CTRL_DEAD:
+ changed = true;
+ /* FALLTHRU */
+ default:
+ break;
+ }
+ break;
case NVME_CTRL_DEAD:
switch (old_state) {
case NVME_CTRL_DELETING:
@@ -399,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
case NVME_CTRL_CONNECTING:
return false;
case NVME_CTRL_DELETING:
+ case NVME_CTRL_DELETING_NOIO:
case NVME_CTRL_DEAD:
return true;
default:
@@ -450,10 +465,11 @@ static void nvme_free_ns(struct kref *kref)
kfree(ns);
}
-static void nvme_put_ns(struct nvme_ns *ns)
+void nvme_put_ns(struct nvme_ns *ns)
{
kref_put(&ns->kref, nvme_free_ns);
}
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
static inline void nvme_clear_nvme_request(struct request *req)
{
@@ -555,7 +571,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
goto out_disable_stream;
}
- ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+ ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
return 0;
@@ -589,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
}
+static void nvme_setup_passthrough(struct request *req,
+ struct nvme_command *cmd)
+{
+ memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+ /* passthru commands should let the driver set the SGL flags */
+ cmd->common.flags &= ~NVME_CMD_SGL_ALL;
+}
+
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
@@ -673,7 +697,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
}
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
- struct request *req, struct nvme_command *cmnd)
+ struct request *req, struct nvme_command *cmnd,
+ enum nvme_opcode op)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
@@ -687,7 +712,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
- cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+ cmnd->rw.opcode = op;
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -716,6 +741,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
+ if (op == nvme_cmd_zone_append)
+ control |= NVME_RW_APPEND_PIREMAP;
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
}
@@ -751,11 +778,24 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
- memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+ nvme_setup_passthrough(req, cmd);
break;
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
+ case REQ_OP_ZONE_RESET_ALL:
+ case REQ_OP_ZONE_RESET:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
+ break;
+ case REQ_OP_ZONE_OPEN:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
+ break;
case REQ_OP_WRITE_ZEROES:
ret = nvme_setup_write_zeroes(ns, req, cmd);
break;
@@ -763,8 +803,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
ret = nvme_setup_discard(ns, req, cmd);
break;
case REQ_OP_READ:
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
+ break;
case REQ_OP_WRITE:
- ret = nvme_setup_rw(ns, req, cmd);
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+ break;
+ case REQ_OP_ZONE_APPEND:
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
break;
default:
WARN_ON_ONCE(1);
@@ -884,6 +929,120 @@ out:
return ERR_PTR(ret);
}
+static u32 nvme_known_admin_effects(u8 opcode)
+{
+ switch (opcode) {
+ case nvme_admin_format_nvm:
+ return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+ NVME_CMD_EFFECTS_CSE_MASK;
+ case nvme_admin_sanitize_nvm:
+ return NVME_CMD_EFFECTS_CSE_MASK;
+ default:
+ break;
+ }
+ return 0;
+}
+
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
+{
+ u32 effects = 0;
+
+ if (ns) {
+ if (ns->head->effects)
+ effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
+ if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
+ dev_warn(ctrl->device,
+ "IO command:%02x has unhandled effects:%08x\n",
+ opcode, effects);
+ return 0;
+ }
+
+ if (ctrl->effects)
+ effects = le32_to_cpu(ctrl->effects->acs[opcode]);
+ effects |= nvme_known_admin_effects(opcode);
+
+ return effects;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
+
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ u8 opcode)
+{
+ u32 effects = nvme_command_effects(ctrl, ns, opcode);
+
+ /*
+ * For simplicity, IO to all namespaces is quiesced even if the command
+ * effects say only one namespace is affected.
+ */
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ mutex_lock(&ctrl->scan_lock);
+ mutex_lock(&ctrl->subsys->lock);
+ nvme_mpath_start_freeze(ctrl->subsys);
+ nvme_mpath_wait_freeze(ctrl->subsys);
+ nvme_start_freeze(ctrl);
+ nvme_wait_freeze(ctrl);
+ }
+ return effects;
+}
+
+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
+{
+ struct nvme_ns *ns;
+
+ down_read(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ if (_nvme_revalidate_disk(ns->disk))
+ nvme_set_queue_dying(ns);
+ else if (blk_queue_is_zoned(ns->disk->queue)) {
+ /*
+ * IO commands are required to fully revalidate a zoned
+ * device. Force the command effects to trigger rescan
+ * work so report zones can run in a context with
+ * unfrozen IO queues.
+ */
+ *effects |= NVME_CMD_EFFECTS_NCC;
+ }
+ up_read(&ctrl->namespaces_rwsem);
+}
+
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+{
+ /*
+ * Revalidate LBA changes prior to unfreezing. This is necessary to
+ * prevent memory corruption if a logical block size was changed by
+ * this command.
+ */
+ if (effects & NVME_CMD_EFFECTS_LBCC)
+ nvme_update_formats(ctrl, &effects);
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ nvme_unfreeze(ctrl);
+ nvme_mpath_unfreeze(ctrl->subsys);
+ mutex_unlock(&ctrl->subsys->lock);
+ nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
+ mutex_unlock(&ctrl->scan_lock);
+ }
+ if (effects & NVME_CMD_EFFECTS_CCC)
+ nvme_init_identify(ctrl);
+ if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
+ nvme_queue_scan(ctrl);
+ flush_work(&ctrl->scan_work);
+ }
+}
+
+void nvme_execute_passthru_rq(struct request *rq)
+{
+ struct nvme_command *cmd = nvme_req(rq)->cmd;
+ struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
+ struct nvme_ns *ns = rq->q->queuedata;
+ struct gendisk *disk = ns ? ns->disk : NULL;
+ u32 effects;
+
+ effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
+ blk_execute_rq(rq->q, disk, rq, 0);
+ nvme_passthru_end(ctrl, effects);
+}
+EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
+
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
@@ -922,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
}
}
- blk_execute_rq(req->q, disk, req, 0);
+ nvme_execute_passthru_rq(req);
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
else
@@ -1056,8 +1215,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
return error;
}
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
+{
+ return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
+}
+
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
- struct nvme_ns_id_desc *cur)
+ struct nvme_ns_id_desc *cur, bool *csi_seen)
{
const char *warn_str = "ctrl returned bogus length:";
void *data = cur;
@@ -1087,6 +1251,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
}
uuid_copy(&ids->uuid, data + sizeof(*cur));
return NVME_NIDT_UUID_LEN;
+ case NVME_NIDT_CSI:
+ if (cur->nidl != NVME_NIDT_CSI_LEN) {
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
+ warn_str, cur->nidl);
+ return -1;
+ }
+ memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
+ *csi_seen = true;
+ return NVME_NIDT_CSI_LEN;
default:
/* Skip unknown types */
return cur->nidl;
@@ -1097,10 +1270,12 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids)
{
struct nvme_command c = { };
- int status;
+ bool csi_seen = false;
+ int status, pos, len;
void *data;
- int pos;
- int len;
+
+ if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
+ return 0;
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(nsid);
@@ -1115,18 +1290,6 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (status) {
dev_warn(ctrl->device,
"Identify Descriptors failed (%d)\n", status);
- /*
- * Don't treat non-retryable errors as fatal, as we potentially
- * already have a NGUID or EUI-64. If we failed with DNR set,
- * we want to silently ignore the error as we can still
- * identify the device, but if the status has DNR set, we want
- * to propagate the error back specifically for the disk
- * revalidation flow to make sure we don't abandon the
- * device just because of a temporal retry-able error (such
- * as path of transport errors).
- */
- if (status > 0 && (status & NVME_SC_DNR))
- status = 0;
goto free_data;
}
@@ -1136,12 +1299,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (cur->nidl == 0)
break;
- len = nvme_process_ns_desc(ctrl, ids, cur);
+ len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
if (len < 0)
- goto free_data;
+ break;
len += sizeof(*cur);
}
+
+ if (nvme_multi_css(ctrl) && !csi_seen) {
+ dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
+ nsid);
+ status = -EINVAL;
+ }
+
free_data:
kfree(data);
return status;
@@ -1330,96 +1500,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
-static u32 nvme_known_admin_effects(u8 opcode)
-{
- switch (opcode) {
- case nvme_admin_format_nvm:
- return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
- NVME_CMD_EFFECTS_CSE_MASK;
- case nvme_admin_sanitize_nvm:
- return NVME_CMD_EFFECTS_CSE_MASK;
- default:
- break;
- }
- return 0;
-}
-
-static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- u8 opcode)
-{
- u32 effects = 0;
-
- if (ns) {
- if (ctrl->effects)
- effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
- if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
- dev_warn(ctrl->device,
- "IO command:%02x has unhandled effects:%08x\n",
- opcode, effects);
- return 0;
- }
-
- if (ctrl->effects)
- effects = le32_to_cpu(ctrl->effects->acs[opcode]);
- effects |= nvme_known_admin_effects(opcode);
-
- /*
- * For simplicity, IO to all namespaces is quiesced even if the command
- * effects say only one namespace is affected.
- */
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
- mutex_lock(&ctrl->scan_lock);
- mutex_lock(&ctrl->subsys->lock);
- nvme_mpath_start_freeze(ctrl->subsys);
- nvme_mpath_wait_freeze(ctrl->subsys);
- nvme_start_freeze(ctrl);
- nvme_wait_freeze(ctrl);
- }
- return effects;
-}
-
-static void nvme_update_formats(struct nvme_ctrl *ctrl)
-{
- struct nvme_ns *ns;
-
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
- if (ns->disk && nvme_revalidate_disk(ns->disk))
- nvme_set_queue_dying(ns);
- up_read(&ctrl->namespaces_rwsem);
-}
-
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
-{
- /*
- * Revalidate LBA changes prior to unfreezing. This is necessary to
- * prevent memory corruption if a logical block size was changed by
- * this command.
- */
- if (effects & NVME_CMD_EFFECTS_LBCC)
- nvme_update_formats(ctrl);
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
- nvme_unfreeze(ctrl);
- nvme_mpath_unfreeze(ctrl->subsys);
- mutex_unlock(&ctrl->subsys->lock);
- nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
- mutex_unlock(&ctrl->scan_lock);
- }
- if (effects & NVME_CMD_EFFECTS_CCC)
- nvme_init_identify(ctrl);
- if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
- nvme_queue_scan(ctrl);
- flush_work(&ctrl->scan_work);
- }
-}
-
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
unsigned timeout = 0;
- u32 effects;
u64 result;
int status;
@@ -1446,12 +1532,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
- effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &result, timeout);
- nvme_passthru_end(ctrl, effects);
if (status >= 0) {
if (put_user(result, &ucmd->result))
@@ -1467,7 +1551,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
- u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@@ -1493,12 +1576,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
- effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &cmd.result, timeout);
- nvme_passthru_end(ctrl, effects);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
@@ -1512,7 +1593,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
* Issue ioctl requests on the first available path. Note that unlike normal
* block layer requests we will not retry failed request on another controller.
*/
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
struct nvme_ns_head **head, int *srcu_idx)
{
#ifdef CONFIG_NVME_MULTIPATH
@@ -1532,7 +1613,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
return disk->private_data;
}
-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
{
if (head)
srcu_read_unlock(&head->srcu, idx);
@@ -1798,7 +1879,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
- if (ctrl->vs >= NVME_VS(1, 3, 0))
+ if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
return nvme_identify_ns_descs(ctrl, nsid, ids);
return 0;
}
@@ -1814,7 +1895,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
return uuid_equal(&a->uuid, &b->uuid) &&
memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
- memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+ memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
+ a->csi == b->csi;
}
static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1924,18 +2006,38 @@ static void nvme_update_disk_info(struct gendisk *disk,
static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
+ unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
+ int ret;
u32 iob;
/*
* If identify namespace failed, use default 512 byte block size so
* block layer can use before failing read/write for 0 capacity.
*/
- ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
+ ns->lba_shift = id->lbaf[lbaf].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
+ switch (ns->head->ids.csi) {
+ case NVME_CSI_NVM:
+ break;
+ case NVME_CSI_ZNS:
+ ret = nvme_update_zone_info(disk, ns, lbaf);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "failed to add zoned namespace:%u ret:%d\n",
+ ns->head->ns_id, ret);
+ return ret;
+ }
+ break;
+ default:
+ dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
+ ns->head->ids.csi, ns->head->ns_id);
+ return -ENODEV;
+ }
+
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
@@ -1943,7 +2045,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
ns->features = 0;
- ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
/* the PI implementation requires metadata equal t10 pi tuple size */
if (ns->ms == sizeof(struct t10_pi_tuple))
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
@@ -1979,14 +2081,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
#ifdef CONFIG_NVME_MULTIPATH
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
- blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
+ blk_stack_limits(&ns->head->disk->queue->limits,
+ &ns->queue->limits, 0);
nvme_mpath_update_disk_size(ns->head->disk);
}
#endif
return 0;
}
-static int nvme_revalidate_disk(struct gendisk *disk)
+static int _nvme_revalidate_disk(struct gendisk *disk)
{
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -2034,6 +2137,28 @@ out:
return ret;
}
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+ int ret;
+
+ ret = _nvme_revalidate_disk(disk);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (blk_queue_is_zoned(disk->queue)) {
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+
+ ret = blk_revalidate_disk_zones(disk, NULL);
+ if (!ret)
+ blk_queue_max_zone_append_sectors(disk->queue,
+ ctrl->max_zone_append);
+ }
+#endif
+ return ret;
+}
+
static char nvme_pr_type(enum pr_type type)
{
switch (type) {
@@ -2164,6 +2289,7 @@ static const struct block_device_operations nvme_fops = {
.release = nvme_release,
.getgeo = nvme_getgeo,
.revalidate_disk= nvme_revalidate_disk,
+ .report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -2184,11 +2310,13 @@ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
const struct block_device_operations nvme_ns_head_ops = {
.owner = THIS_MODULE,
+ .submit_bio = nvme_ns_head_submit_bio,
.open = nvme_ns_head_open,
.release = nvme_ns_head_release,
.ioctl = nvme_ioctl,
.compat_ioctl = nvme_compat_ioctl,
.getgeo = nvme_getgeo,
+ .report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
#endif /* CONFIG_NVME_MULTIPATH */
@@ -2246,12 +2374,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
- /*
- * Default to a 4K page size, with the intention to update this
- * path in the future to accomodate architectures with differing
- * kernel and IO page sizes.
- */
- unsigned dev_page_min, page_shift = 12;
+ unsigned dev_page_min;
int ret;
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
@@ -2261,17 +2384,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
}
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
- if (page_shift < dev_page_min) {
+ if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
- 1 << dev_page_min, 1 << page_shift);
+ 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
return -ENODEV;
}
- ctrl->page_size = 1 << page_shift;
-
- ctrl->ctrl_config = NVME_CC_CSS_NVM;
- ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
+ if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
+ ctrl->ctrl_config = NVME_CC_CSS_CSI;
+ else
+ ctrl->ctrl_config = NVME_CC_CSS_NVM;
+ ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
ctrl->ctrl_config |= NVME_CC_ENABLE;
@@ -2321,13 +2445,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
if (ctrl->max_hw_sectors) {
u32 max_segments =
- (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+ (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
max_segments = min_not_zero(max_segments, ctrl->max_segments);
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
- blk_queue_virt_boundary(q, ctrl->page_size - 1);
+ blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
blk_queue_dma_alignment(q, 7);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
@@ -2818,7 +2942,7 @@ out_unlock:
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
@@ -2832,27 +2956,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
+ c.get_log_page.csi = csi;
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
-static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
+static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
{
+ struct nvme_cel *cel, *ret = NULL;
+
+ spin_lock(&ctrl->lock);
+ list_for_each_entry(cel, &ctrl->cels, entry) {
+ if (cel->csi == csi) {
+ ret = cel;
+ break;
+ }
+ }
+ spin_unlock(&ctrl->lock);
+
+ return ret;
+}
+
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
+ struct nvme_effects_log **log)
+{
+ struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
int ret;
- if (!ctrl->effects)
- ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+ if (cel)
+ goto out;
- if (!ctrl->effects)
- return 0;
+ cel = kzalloc(sizeof(*cel), GFP_KERNEL);
+ if (!cel)
+ return -ENOMEM;
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
- ctrl->effects, sizeof(*ctrl->effects), 0);
+ ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
+ &cel->log, sizeof(cel->log), 0);
if (ret) {
- kfree(ctrl->effects);
- ctrl->effects = NULL;
+ kfree(cel);
+ return ret;
}
- return ret;
+
+ cel->csi = csi;
+
+ spin_lock(&ctrl->lock);
+ list_add_tail(&cel->entry, &ctrl->cels);
+ spin_unlock(&ctrl->lock);
+out:
+ *log = &cel->log;
+ return 0;
}
/*
@@ -2873,7 +3025,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
return ret;
}
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
- ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+ ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
@@ -2885,7 +3037,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
- ret = nvme_get_effects_log(ctrl);
+ ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
if (ret < 0)
goto out_free;
}
@@ -2947,7 +3099,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (id->rtd3e) {
/* us -> s */
- u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
+ u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
shutdown_timeout, 60);
@@ -3353,6 +3505,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_CONNECTING] = "connecting",
[NVME_CTRL_DELETING] = "deleting",
+ [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
[NVME_CTRL_DEAD] = "dead",
};
@@ -3405,6 +3558,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
+static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+
+ if (ctrl->opts->max_reconnects == -1)
+ return sprintf(buf, "off\n");
+ return sprintf(buf, "%d\n",
+ opts->max_reconnects * opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ctrl_loss_tmo, err;
+
+ err = kstrtoint(buf, 10, &ctrl_loss_tmo);
+ if (err)
+ return -EINVAL;
+
+ else if (ctrl_loss_tmo < 0)
+ opts->max_reconnects = -1;
+ else
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+ opts->reconnect_delay);
+ return count;
+}
+static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
+ nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
+
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (ctrl->opts->reconnect_delay == -1)
+ return sprintf(buf, "off\n");
+ return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ unsigned int v;
+ int err;
+
+ err = kstrtou32(buf, 10, &v);
+ if (err)
+ return err;
+
+ ctrl->opts->reconnect_delay = v;
+ return count;
+}
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
+ nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
+
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -3422,6 +3635,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_sqsize.attr,
&dev_attr_hostnqn.attr,
&dev_attr_hostid.attr,
+ &dev_attr_ctrl_loss_tmo.attr,
+ &dev_attr_reconnect_delay.attr,
NULL
};
@@ -3518,6 +3733,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
goto out_cleanup_srcu;
}
+ if (head->ids.csi) {
+ ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
+ if (ret)
+ goto out_cleanup_srcu;
+ } else
+ head->effects = ctrl->effects;
+
ret = nvme_mpath_alloc_disk(ctrl, head);
if (ret)
goto out_cleanup_srcu;
@@ -3599,7 +3821,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
return nsa->head->ns_id - nsb->head->ns_id;
}
-static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns, *ret = NULL;
@@ -3617,6 +3839,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
up_read(&ctrl->namespaces_rwsem);
return ret;
}
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
@@ -3734,7 +3957,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_mpath_clear_current_path(ns);
synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
- if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
+ if (ns->disk->flags & GENHD_FL_UP) {
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
if (blk_get_integrity(ns->disk))
@@ -3765,7 +3988,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
- if (ns->disk && revalidate_disk(ns->disk))
+ if (revalidate_disk(ns->disk))
nvme_ns_remove(ns);
nvme_put_ns(ns);
} else
@@ -3858,8 +4081,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
* raced with us in reading the log page, which could cause us to miss
* updates.
*/
- error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
- log_size, 0);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
+ NVME_CSI_NVM, log, log_size, 0);
if (error)
dev_warn(ctrl->device,
"reading changed ns log failed: %d\n", error);
@@ -3920,6 +4143,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
if (ctrl->state == NVME_CTRL_DEAD)
nvme_kill_queues(ctrl);
+ /* this is a no-op when called from the controller reset handler */
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
+
down_write(&ctrl->namespaces_rwsem);
list_splice_init(&ctrl->namespaces, &ns_list);
up_write(&ctrl->namespaces_rwsem);
@@ -4003,8 +4229,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
- if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
- sizeof(*log), 0))
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
+ log, sizeof(*log), 0))
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
}
@@ -4114,8 +4340,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
- if (ctrl->kato)
- nvme_start_keep_alive(ctrl);
+ nvme_start_keep_alive(ctrl);
nvme_enable_aen(ctrl);
@@ -4141,11 +4366,16 @@ static void nvme_free_ctrl(struct device *dev)
struct nvme_ctrl *ctrl =
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
+ struct nvme_cel *cel, *next;
if (subsys && ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
- kfree(ctrl->effects);
+ list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
+ list_del(&cel->entry);
+ kfree(cel);
+ }
+
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
@@ -4176,6 +4406,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
+ INIT_LIST_HEAD(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
@@ -4354,6 +4585,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
+struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
+{
+ struct nvme_ctrl *ctrl;
+ struct file *f;
+
+ f = filp_open(path, O_RDWR, 0);
+ if (IS_ERR(f))
+ return ERR_CAST(f);
+
+ if (f->f_op != &nvme_dev_fops) {
+ ctrl = ERR_PTR(-EINVAL);
+ goto out_close;
+ }
+
+ ctrl = f->private_data;
+ nvme_get_ctrl(ctrl);
+
+out_close:
+ filp_close(f, NULL);
+ return ctrl;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
+
/*
* Check we didn't inadvertently grow the command structure sizes:
*/
@@ -4372,6 +4626,8 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 2a6c8190eeb7..4ec4829d6233 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
struct request *rq)
{
- if (ctrl->state != NVME_CTRL_DELETING &&
+ if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
ctrl->state != NVME_CTRL_DEAD &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index a0ec40ab62ee..a9c1e3b4585e 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
- if (likely(ctrl->state == NVME_CTRL_LIVE))
+ if (likely(ctrl->state == NVME_CTRL_LIVE ||
+ ctrl->state == NVME_CTRL_DELETING))
return true;
return __nvmf_check_ready(ctrl, rq, queue_live);
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index e999a8c4b7e8..eae43bb444e0 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -227,6 +227,7 @@ static DECLARE_COMPLETION(nvme_fc_unload_proceed);
*/
static struct device *fc_udev_device;
+static void nvme_fc_complete_rq(struct request *rq);
/* *********************** FC-NVME Port Management ************************ */
@@ -825,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
break;
case NVME_CTRL_DELETING:
+ case NVME_CTRL_DELETING_NOIO:
default:
/* no action to take - let it delete */
break;
@@ -2033,7 +2035,8 @@ done:
}
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
- nvme_end_request(rq, status, result);
+ if (!nvme_end_request(rq, status, result))
+ nvme_fc_complete_rq(rq);
check_error:
if (terminate_assoc)
@@ -2999,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_disconnect_admin_queue;
- ctrl->ctrl.max_hw_sectors =
- (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
+ ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments;
+ ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
+ (ilog2(SZ_4K) - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 2e6477ed420f..412a6c97c0d8 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
int ret;
ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
- &data->log, sizeof(data->log), 0);
+ NVME_CSI_NVM, &data->log, sizeof(data->log), 0);
return ret <= 0 ? ret : -EIO;
}
@@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl)
err = nvme_hwmon_get_smart_log(data);
if (err) {
- dev_warn(dev, "Failed to read smart log (error %d)\n", err);
+ dev_warn(ctrl->device,
+ "Failed to read smart log (error %d)\n", err);
devm_kfree(dev, data);
return;
}
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 69608755d415..8e562d0f2c30 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
dev_meta_off = dev_meta;
ret = nvme_get_log(ctrl, ns->head->ns_id,
- NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
- offset);
+ NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM,
+ dev_meta, len, offset);
if (ret) {
dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
break;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 66509472fe06..3ded54d2c9c6 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
- return ns->ctrl->state != NVME_CTRL_LIVE ||
- test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
- test_bit(NVME_NS_REMOVING, &ns->flags);
+ /*
+ * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
+ * still be able to complete assuming that the controller is connected.
+ * Otherwise it will fail immediately and return to the requeue list.
+ */
+ if (ns->ctrl->state != NVME_CTRL_LIVE &&
+ ns->ctrl->state != NVME_CTRL_DELETING)
+ return true;
+ if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
+ test_bit(NVME_NS_REMOVING, &ns->flags))
+ return true;
+ return false;
}
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
@@ -246,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
fallback = ns;
}
+ /* No optimized path found, re-check the current path */
+ if (!nvme_path_is_disabled(old) &&
+ old->ana_state == NVME_ANA_OPTIMIZED) {
+ found = old;
+ goto out;
+ }
if (!fallback)
return NULL;
found = fallback;
@@ -266,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu);
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
- ns = nvme_round_robin_path(head, node, ns);
- if (unlikely(!ns || !nvme_path_is_optimized(ns)))
- ns = __nvme_find_path(head, node);
+ if (unlikely(!ns))
+ return __nvme_find_path(head, node);
+
+ if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+ return nvme_round_robin_path(head, node, ns);
+ if (unlikely(!nvme_path_is_optimized(ns)))
+ return __nvme_find_path(head, node);
return ns;
}
@@ -291,8 +309,7 @@ static bool nvme_available_path(struct nvme_ns_head *head)
return false;
}
-static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
- struct bio *bio)
+blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
{
struct nvme_ns_head *head = bio->bi_disk->private_data;
struct device *dev = disk_to_dev(head->disk);
@@ -301,12 +318,11 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
int srcu_idx;
/*
- * The namespace might be going away and the bio might
- * be moved to a different queue via blk_steal_bios(),
- * so we need to use the bio_split pool from the original
- * queue to allocate the bvecs from.
+ * The namespace might be going away and the bio might be moved to a
+ * different queue via blk_steal_bios(), so we need to use the bio_split
+ * pool from the original queue to allocate the bvecs from.
*/
- blk_queue_split(q, &bio);
+ blk_queue_split(&bio);
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
@@ -316,7 +332,7 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
trace_block_bio_remap(bio->bi_disk->queue, bio,
disk_devt(ns->head->disk),
bio->bi_iter.bi_sector);
- ret = direct_make_request(bio);
+ ret = submit_bio_noacct(bio);
} else if (nvme_available_path(head)) {
dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
@@ -353,7 +369,7 @@ static void nvme_requeue_work(struct work_struct *work)
* path.
*/
bio->bi_disk = head->disk;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
@@ -375,7 +391,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
return 0;
- q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node);
+ q = blk_alloc_queue(ctrl->numa_node);
if (!q)
goto out;
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
@@ -529,7 +545,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
int error;
mutex_lock(&ctrl->ana_lock);
- error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
ctrl->ana_log_buf, ctrl->ana_log_size, 0);
if (error) {
dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
@@ -565,6 +581,9 @@ static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
+ if (ctrl->state != NVME_CTRL_LIVE)
+ return;
+
nvme_read_ana_log(ctrl);
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1de3f9b827aa..ebb8c3ed3885 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -37,6 +37,14 @@ extern unsigned int admin_timeout;
#define NVME_INLINE_METADATA_SG_CNT 1
#endif
+/*
+ * Default to a 4K page size, with the intention to update this
+ * path in the future to accommodate architectures with differing
+ * kernel and IO page sizes.
+ */
+#define NVME_CTRL_PAGE_SHIFT 12
+#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT)
+
extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
@@ -129,6 +137,13 @@ enum nvme_quirks {
* Don't change the value of the temperature threshold feature
*/
NVME_QUIRK_NO_TEMP_THRESH_CHANGE = (1 << 14),
+
+ /*
+ * The controller doesn't handle the Identify Namespace
+ * Identification Descriptor list subcommand despite claiming
+ * NVMe 1.3 compliance.
+ */
+ NVME_QUIRK_NO_NS_DESC_LIST = (1 << 15),
};
/*
@@ -173,12 +188,32 @@ static inline u16 nvme_req_qid(struct request *req)
*/
#define NVME_QUIRK_DELAY_AMOUNT 2300
+/*
+ * enum nvme_ctrl_state: Controller state
+ *
+ * @NVME_CTRL_NEW: New controller just allocated, initial state
+ * @NVME_CTRL_LIVE: Controller is connected and I/O capable
+ * @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset)
+ * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the
+ * transport
+ * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion)
+ * @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not
+ * disabled/failed immediately. This state comes
+ * after all async event processing took place and
+ * before ns removal and the controller deletion
+ * progress
+ * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during
+ * shutdown or removal. In this case we forcibly
+ * kill all inflight I/O as they have no chance to
+ * complete
+ */
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
NVME_CTRL_RESETTING,
NVME_CTRL_CONNECTING,
NVME_CTRL_DELETING,
+ NVME_CTRL_DELETING_NOIO,
NVME_CTRL_DEAD,
};
@@ -191,6 +226,12 @@ struct nvme_fault_inject {
#endif
};
+struct nvme_cel {
+ struct list_head entry;
+ struct nvme_effects_log log;
+ u8 csi;
+};
+
struct nvme_ctrl {
bool comp_seen;
enum nvme_ctrl_state state;
@@ -228,10 +269,12 @@ struct nvme_ctrl {
u32 queue_count;
u64 cap;
- u32 page_size;
u32 max_hw_sectors;
u32 max_segments;
u32 max_integrity_segments;
+#ifdef CONFIG_BLK_DEV_ZONED
+ u32 max_zone_append;
+#endif
u16 crdt[3];
u16 oncs;
u16 oacs;
@@ -257,6 +300,7 @@ struct nvme_ctrl {
unsigned long quirks;
struct nvme_id_power_state psd[32];
struct nvme_effects_log *effects;
+ struct list_head cels;
struct work_struct scan_work;
struct work_struct async_event_work;
struct delayed_work ka_work;
@@ -339,6 +383,7 @@ struct nvme_ns_ids {
u8 eui64[8];
u8 nguid[16];
uuid_t uuid;
+ u8 csi;
};
/*
@@ -358,6 +403,7 @@ struct nvme_ns_head {
struct kref ref;
bool shared;
int instance;
+ struct nvme_effects_log *effects;
#ifdef CONFIG_NVME_MULTIPATH
struct gendisk *disk;
struct bio_list requeue_list;
@@ -395,6 +441,9 @@ struct nvme_ns {
u16 sgs;
u32 sws;
u8 pi_type;
+#ifdef CONFIG_BLK_DEV_ZONED
+ u64 zsze;
+#endif
unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
@@ -474,7 +523,7 @@ static inline u32 nvme_bytes_to_numd(size_t len)
return (len >> 2) - 1;
}
-static inline void nvme_end_request(struct request *req, __le16 status,
+static inline bool nvme_end_request(struct request *req, __le16 status,
union nvme_result result)
{
struct nvme_request *rq = nvme_req(req);
@@ -483,7 +532,9 @@ static inline void nvme_end_request(struct request *req, __le16 status,
rq->result = result;
/* inject error when permitted by fault injection framework */
nvme_should_fail(req);
- blk_mq_complete_request(req);
+ if (unlikely(blk_should_fake_timeout(req->q)))
+ return true;
+ return blk_mq_complete_request_remote(req);
}
static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl)
@@ -558,8 +609,11 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset);
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+ struct nvme_ns_head **head, int *srcu_idx);
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct block_device_operations nvme_ns_head_ops;
@@ -586,6 +640,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
+blk_qc_t nvme_ns_head_submit_bio(struct bio *bio);
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
@@ -694,6 +749,36 @@ static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
}
#endif /* CONFIG_NVME_MULTIPATH */
+#ifdef CONFIG_BLK_DEV_ZONED
+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
+ unsigned lbaf);
+
+int nvme_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data);
+
+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
+ struct nvme_command *cmnd,
+ enum nvme_zone_mgmt_action action);
+#else
+#define nvme_report_zones NULL
+
+static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
+ struct request *req, struct nvme_command *cmnd,
+ enum nvme_zone_mgmt_action action)
+{
+ return BLK_STS_NOTSUPP;
+}
+
+static inline int nvme_update_zone_info(struct gendisk *disk,
+ struct nvme_ns *ns,
+ unsigned lbaf)
+{
+ dev_warn(ns->ctrl->device,
+ "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
+ return -EPROTONOSUPPORT;
+}
+#endif
+
#ifdef CONFIG_NVM
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
void nvme_nvm_unregister(struct nvme_ns *ns);
@@ -725,4 +810,11 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl);
static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
#endif
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ u8 opcode);
+void nvme_execute_passthru_rq(struct request *rq);
+struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path);
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
+void nvme_put_ns(struct nvme_ns *ns);
+
#endif /* _NVME_H */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b1d18f0633c7..ba725ae47305 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
*/
+#include <linux/acpi.h>
#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
@@ -61,10 +62,10 @@ MODULE_PARM_DESC(sgl_threshold,
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
.set = io_queue_depth_set,
- .get = param_get_int,
+ .get = param_get_uint,
};
-static int io_queue_depth = 1024;
+static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
@@ -94,6 +95,10 @@ static unsigned int poll_queues;
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
+static bool noacpi;
+module_param(noacpi, bool, 0444);
+MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
+
struct nvme_dev;
struct nvme_queue;
@@ -115,7 +120,7 @@ struct nvme_dev {
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
- int q_depth;
+ u16 q_depth;
int io_sqes;
u32 db_stride;
void __iomem *bar;
@@ -151,13 +156,14 @@ struct nvme_dev {
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
- int n = 0, ret;
+ int ret;
+ u16 n;
- ret = kstrtoint(val, 10, &n);
+ ret = kstrtou16(val, 10, &n);
if (ret != 0 || n < 2)
return -EINVAL;
- return param_set_int(val, kp);
+ return param_set_ushort(val, kp);
}
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@@ -345,10 +351,10 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static int nvme_pci_npages_prp(void)
{
- unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
- dev->ctrl.page_size);
+ unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
+ NVME_CTRL_PAGE_SIZE);
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
@@ -356,22 +362,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev)
* Calculates the number of pages needed for the SGL segments. For example a 4k
* page can accommodate 256 SGL descriptors.
*/
-static int nvme_pci_npages_sgl(unsigned int num_seg)
+static int nvme_pci_npages_sgl(void)
{
- return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
+ return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
+ PAGE_SIZE);
}
-static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
- unsigned int size, unsigned int nseg, bool use_sgl)
+static size_t nvme_pci_iod_alloc_size(void)
{
- size_t alloc_size;
+ size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
- if (use_sgl)
- alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
- else
- alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
-
- return alloc_size + sizeof(struct scatterlist) * nseg;
+ return sizeof(__le64 *) * npages +
+ sizeof(struct scatterlist) * NVME_MAX_SEGS;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -500,9 +502,6 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
int nseg = blk_rq_nr_phys_segments(req);
unsigned int avg_seg_size;
- if (nseg == 0)
- return false;
-
avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
@@ -517,7 +516,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
+ const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;
@@ -584,34 +583,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
struct scatterlist *sg = iod->sg;
int dma_len = sg_dma_len(sg);
u64 dma_addr = sg_dma_address(sg);
- u32 page_size = dev->ctrl.page_size;
- int offset = dma_addr & (page_size - 1);
+ int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
void **list = nvme_pci_iod_list(req);
dma_addr_t prp_dma;
int nprps, i;
- length -= (page_size - offset);
+ length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
iod->first_dma = 0;
goto done;
}
- dma_len -= (page_size - offset);
+ dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
if (dma_len) {
- dma_addr += (page_size - offset);
+ dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
} else {
sg = sg_next(sg);
dma_addr = sg_dma_address(sg);
dma_len = sg_dma_len(sg);
}
- if (length <= page_size) {
+ if (length <= NVME_CTRL_PAGE_SIZE) {
iod->first_dma = dma_addr;
goto done;
}
- nprps = DIV_ROUND_UP(length, page_size);
+ nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
if (nprps <= (256 / 8)) {
pool = dev->prp_small_pool;
iod->npages = 0;
@@ -630,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
iod->first_dma = prp_dma;
i = 0;
for (;;) {
- if (i == page_size >> 3) {
+ if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list)
@@ -641,9 +639,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
i = 1;
}
prp_list[i++] = cpu_to_le64(dma_addr);
- dma_len -= page_size;
- dma_addr += page_size;
- length -= page_size;
+ dma_len -= NVME_CTRL_PAGE_SIZE;
+ dma_addr += NVME_CTRL_PAGE_SIZE;
+ length -= NVME_CTRL_PAGE_SIZE;
if (length <= 0)
break;
if (dma_len > 0)
@@ -753,8 +751,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
struct bio_vec *bv)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
- unsigned int first_prp_len = dev->ctrl.page_size - offset;
+ unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+ unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->first_dma))
@@ -764,7 +762,7 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
if (bv->bv_len > first_prp_len)
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
- return 0;
+ return BLK_STS_OK;
}
static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
@@ -782,7 +780,7 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
- return 0;
+ return BLK_STS_OK;
}
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
@@ -796,7 +794,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
+ if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
@@ -846,7 +844,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
if (dma_mapping_error(dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
- return 0;
+ return BLK_STS_OK;
}
/*
@@ -963,7 +961,8 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
- nvme_end_request(req, cqe->status, cqe->result);
+ if (!nvme_end_request(req, cqe->status, cqe->result))
+ nvme_pci_complete_rq(req);
}
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1018,6 +1017,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
static irqreturn_t nvme_irq_check(int irq, void *data)
{
struct nvme_queue *nvmeq = data;
+
if (nvme_cqe_pending(nvmeq))
return IRQ_WAKE_THREAD;
return IRQ_NONE;
@@ -1153,7 +1153,6 @@ static void abort_endio(struct request *req, blk_status_t error)
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
-
/* If true, indicates loss of adapter communication, possibly by a
* NVMe Subsystem reset.
*/
@@ -1260,9 +1259,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
}
/*
- * Shutdown the controller immediately and schedule a reset if the
- * command was already aborted once before and still hasn't been
- * returned to the driver, or if this is the admin queue.
+ * Shutdown the controller immediately and schedule a reset if the
+ * command was already aborted once before and still hasn't been
+ * returned to the driver, or if this is the admin queue.
*/
if (!nvmeq->qid || iod->aborted) {
dev_warn(dev->ctrl.device,
@@ -1397,11 +1396,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
{
int q_depth = dev->q_depth;
unsigned q_size_aligned = roundup(q_depth * entry_size,
- dev->ctrl.page_size);
+ NVME_CTRL_PAGE_SIZE);
if (q_size_aligned * nr_io_queues > dev->cmb_size) {
u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
- mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
+
+ mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
q_depth = div_u64(mem_per_q, entry_size);
/*
@@ -1816,6 +1816,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
+ u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
u64 dma_addr = dev->host_mem_descs_dma;
struct nvme_command c;
int ret;
@@ -1824,8 +1825,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
c.features.opcode = nvme_admin_set_features;
c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
c.features.dword11 = cpu_to_le32(bits);
- c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
- ilog2(dev->ctrl.page_size));
+ c.features.dword12 = cpu_to_le32(host_mem_size);
c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
@@ -1845,7 +1845,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
for (i = 0; i < dev->nr_host_mem_descs; i++) {
struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
- size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
+ size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
le64_to_cpu(desc->addr),
@@ -1897,7 +1897,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
break;
descs[i].addr = cpu_to_le64(dma_addr);
- descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
+ descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
i++;
}
@@ -1913,7 +1913,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
out_free_bufs:
while (--i >= 0) {
- size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
+ size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
dma_free_attrs(dev->dev, size, bufs[i],
le64_to_cpu(descs[i].addr),
@@ -1931,12 +1931,12 @@ out:
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
- u32 chunk_size;
+ u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
+ u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
+ u64 chunk_size;
/* start big and work our way down */
- for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
- chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
- chunk_size /= 2) {
+ for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
if (!min || dev->host_mem_size >= min)
return 0;
@@ -2002,7 +2002,7 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
/*
- * If there is no interupt available for queues, ensure that
+ * If there is no interrupt available for queues, ensure that
* the default queue is set to 1. The affinity set size is
* also set to one, but the irq core ignores it for this case.
*
@@ -2260,8 +2260,8 @@ static void nvme_dev_add(struct nvme_dev *dev)
dev->tagset.nr_maps++;
dev->tagset.timeout = NVME_IO_TIMEOUT;
dev->tagset.numa_node = dev->ctrl.numa_node;
- dev->tagset.queue_depth =
- min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
+ dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
+ BLK_MQ_MAX_DEPTH) - 1;
dev->tagset.cmd_size = sizeof(struct nvme_iod);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
@@ -2320,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
- dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
+ dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
@@ -2759,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
+#ifdef CONFIG_ACPI
+static bool nvme_acpi_storage_d3(struct pci_dev *dev)
+{
+ struct acpi_device *adev;
+ struct pci_dev *root;
+ acpi_handle handle;
+ acpi_status status;
+ u8 val;
+
+ /*
+ * Look for _DSD property specifying that the storage device on the port
+ * must use D3 to support deep platform power savings during
+ * suspend-to-idle.
+ */
+ root = pcie_find_root_port(dev);
+ if (!root)
+ return false;
+
+ adev = ACPI_COMPANION(&root->dev);
+ if (!adev)
+ return false;
+
+ /*
+ * The property is defined in the PXSX device for South complex ports
+ * and in the PEGP device for North complex ports.
+ */
+ status = acpi_get_handle(adev->handle, "PXSX", &handle);
+ if (ACPI_FAILURE(status)) {
+ status = acpi_get_handle(adev->handle, "PEGP", &handle);
+ if (ACPI_FAILURE(status))
+ return false;
+ }
+
+ if (acpi_bus_get_device(handle, &adev))
+ return false;
+
+ if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
+ &val))
+ return false;
+ return val == 1;
+}
+#else
+static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
+{
+ return false;
+}
+#endif /* CONFIG_ACPI */
+
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
struct nvme_dev *dev = data;
@@ -2808,12 +2856,21 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
quirks |= check_vendor_combination_bug(pdev);
+ if (!noacpi && nvme_acpi_storage_d3(pdev)) {
+ /*
+ * Some systems use a bios work around to ask for D3 on
+ * platforms that support kernel managed suspend.
+ */
+ dev_info(&pdev->dev,
+ "platform quirk: setting simple suspend\n");
+ quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
+ }
+
/*
* Double check that our mempool alloc size will cover the biggest
* command we support.
*/
- alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
- NVME_MAX_SEGS, true);
+ alloc_size = nvme_pci_iod_alloc_size();
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
@@ -2875,6 +2932,7 @@ static void nvme_reset_done(struct pci_dev *pdev)
static void nvme_shutdown(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
+
nvme_disable_prepare_reset(dev, true);
}
@@ -3005,6 +3063,7 @@ unfreeze:
static int nvme_simple_suspend(struct device *dev)
{
struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+
return nvme_disable_prepare_reset(ndev, true);
}
@@ -3078,16 +3137,16 @@ static const struct pci_error_handlers nvme_err_handler = {
};
static const struct pci_device_id nvme_id_table[] = {
- { PCI_VDEVICE(INTEL, 0x0953),
+ { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
- { PCI_VDEVICE(INTEL, 0x0a53),
+ { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
- { PCI_VDEVICE(INTEL, 0x0a54),
+ { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
- { PCI_VDEVICE(INTEL, 0x0a55),
+ { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
@@ -3099,6 +3158,8 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS |
NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */
+ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
{ PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
@@ -3122,6 +3183,8 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
.driver_data = NVME_QUIRK_SINGLE_VECTOR },
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 13506a87a444..44c76ffbb264 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -96,6 +96,7 @@ struct nvme_rdma_queue {
int cm_error;
struct completion cm_done;
bool pi_support;
+ int cq_size;
};
struct nvme_rdma_ctrl {
@@ -149,6 +150,7 @@ MODULE_PARM_DESC(register_always,
static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void nvme_rdma_complete_rq(struct request *rq);
static const struct blk_mq_ops nvme_rdma_mq_ops;
static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
@@ -274,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
init_attr.recv_cq = queue->ib_cq;
if (queue->pi_support)
init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
+ init_attr.qp_context = queue;
ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
@@ -408,6 +411,14 @@ out_err:
return NULL;
}
+static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
+{
+ if (nvme_rdma_poll_queue(queue))
+ ib_free_cq(queue->ib_cq);
+ else
+ ib_cq_pool_put(queue->ib_cq, queue->cq_size);
+}
+
static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
{
struct nvme_rdma_device *dev;
@@ -429,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
* the destruction of the QP shouldn't use rdma_cm API.
*/
ib_destroy_qp(queue->qp);
- ib_free_cq(queue->ib_cq);
+ nvme_rdma_free_cq(queue);
nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
sizeof(struct nvme_completion), DMA_FROM_DEVICE);
@@ -449,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
}
+static int nvme_rdma_create_cq(struct ib_device *ibdev,
+ struct nvme_rdma_queue *queue)
+{
+ int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
+ enum ib_poll_context poll_ctx;
+
+ /*
+ * Spread I/O queues completion vectors according their queue index.
+ * Admin queues can always go on completion vector 0.
+ */
+ comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
+
+ /* Polling queues need direct cq polling context */
+ if (nvme_rdma_poll_queue(queue)) {
+ poll_ctx = IB_POLL_DIRECT;
+ queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
+ comp_vector, poll_ctx);
+ } else {
+ poll_ctx = IB_POLL_SOFTIRQ;
+ queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
+ comp_vector, poll_ctx);
+ }
+
+ if (IS_ERR(queue->ib_cq)) {
+ ret = PTR_ERR(queue->ib_cq);
+ return ret;
+ }
+
+ return 0;
+}
+
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
{
struct ib_device *ibdev;
const int send_wr_factor = 3; /* MR, SEND, INV */
const int cq_factor = send_wr_factor + 1; /* + RECV */
- int comp_vector, idx = nvme_rdma_queue_idx(queue);
- enum ib_poll_context poll_ctx;
int ret, pages_per_mr;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
@@ -466,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
}
ibdev = queue->device->dev;
- /*
- * Spread I/O queues completion vectors according their queue index.
- * Admin queues can always go on completion vector 0.
- */
- comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
-
- /* Polling queues need direct cq polling context */
- if (nvme_rdma_poll_queue(queue))
- poll_ctx = IB_POLL_DIRECT;
- else
- poll_ctx = IB_POLL_SOFTIRQ;
-
/* +1 for ib_stop_cq */
- queue->ib_cq = ib_alloc_cq(ibdev, queue,
- cq_factor * queue->queue_size + 1,
- comp_vector, poll_ctx);
- if (IS_ERR(queue->ib_cq)) {
- ret = PTR_ERR(queue->ib_cq);
+ queue->cq_size = cq_factor * queue->queue_size + 1;
+
+ ret = nvme_rdma_create_cq(ibdev, queue);
+ if (ret)
goto out_put_dev;
- }
ret = nvme_rdma_create_qp(queue, send_wr_factor);
if (ret)
@@ -511,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize MR pool sized %d for QID %d\n",
- queue->queue_size, idx);
+ queue->queue_size, nvme_rdma_queue_idx(queue));
goto out_destroy_ring;
}
@@ -522,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize PI MR pool sized %d for QID %d\n",
- queue->queue_size, idx);
+ queue->queue_size, nvme_rdma_queue_idx(queue));
goto out_destroy_mr_pool;
}
}
@@ -539,7 +565,7 @@ out_destroy_ring:
out_destroy_qp:
rdma_destroy_qp(queue->cm_id);
out_destroy_ib_cq:
- ib_free_cq(queue->ib_cq);
+ nvme_rdma_free_cq(queue);
out_put_dev:
nvme_rdma_dev_put(queue->device);
return ret;
@@ -941,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->ctrl.connect_q);
goto out_free_tag_set;
}
- } else {
- blk_mq_update_nr_hw_queues(&ctrl->tag_set,
- ctrl->ctrl.queue_count - 1);
}
ret = nvme_rdma_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
+ if (!new) {
+ nvme_start_queues(&ctrl->ctrl);
+ nvme_wait_freeze(&ctrl->ctrl);
+ blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
+ ctrl->ctrl.queue_count - 1);
+ nvme_unfreeze(&ctrl->ctrl);
+ }
+
return 0;
out_cleanup_connect_q:
@@ -982,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
+ nvme_start_freeze(&ctrl->ctrl);
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
if (ctrl->ctrl.tagset) {
@@ -1076,11 +1108,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
if (!changed) {
/*
- * state change failure is ok if we're in DELETING state,
+ * state change failure is ok if we started ctrl delete,
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
+ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
+ ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
@@ -1133,8 +1166,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
+ /* state change failure is ok if we started ctrl delete */
+ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
+ ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
return;
}
@@ -1149,10 +1183,20 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
queue_work(nvme_reset_wq, &ctrl->err_work);
}
+static void nvme_rdma_end_request(struct nvme_rdma_request *req)
+{
+ struct request *rq = blk_mq_rq_from_pdu(req);
+
+ if (!refcount_dec_and_test(&req->ref))
+ return;
+ if (!nvme_end_request(rq, req->status, req->result))
+ nvme_rdma_complete_rq(rq);
+}
+
static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
const char *op)
{
- struct nvme_rdma_queue *queue = cq->cq_context;
+ struct nvme_rdma_queue *queue = wc->qp->qp_context;
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
@@ -1173,16 +1217,11 @@ static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvme_rdma_request *req =
container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
- struct request *rq = blk_mq_rq_from_pdu(req);
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if (unlikely(wc->status != IB_WC_SUCCESS))
nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
- return;
- }
-
- if (refcount_dec_and_test(&req->ref))
- nvme_end_request(rq, req->status, req->result);
-
+ else
+ nvme_rdma_end_request(req);
}
static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
@@ -1547,15 +1586,11 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
struct nvme_rdma_request *req =
container_of(qe, struct nvme_rdma_request, sqe);
- struct request *rq = blk_mq_rq_from_pdu(req);
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if (unlikely(wc->status != IB_WC_SUCCESS))
nvme_rdma_wr_error(cq, wc, "SEND");
- return;
- }
-
- if (refcount_dec_and_test(&req->ref))
- nvme_end_request(rq, req->status, req->result);
+ else
+ nvme_rdma_end_request(req);
}
static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
@@ -1697,15 +1732,14 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
return;
}
- if (refcount_dec_and_test(&req->ref))
- nvme_end_request(rq, req->status, req->result);
+ nvme_rdma_end_request(req);
}
static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvme_rdma_qe *qe =
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
- struct nvme_rdma_queue *queue = cq->cq_context;
+ struct nvme_rdma_queue *queue = wc->qp->qp_context;
struct ib_device *ibdev = queue->device->dev;
struct nvme_completion *cqe = qe->data;
const size_t len = sizeof(struct nvme_completion);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 79ef2b8e2b3c..62fbaecdc960 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -46,6 +46,7 @@ struct nvme_tcp_request {
u32 pdu_sent;
u16 ttag;
struct list_head entry;
+ struct llist_node lentry;
__le32 ddgst;
struct bio *curr_bio;
@@ -75,9 +76,10 @@ struct nvme_tcp_queue {
struct work_struct io_work;
int io_cpu;
- spinlock_t lock;
struct mutex send_mutex;
+ struct llist_head req_list;
struct list_head send_list;
+ bool more_requests;
/* recv state */
void *pdu;
@@ -261,15 +263,13 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
- bool sync)
+ bool sync, bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
- spin_lock(&queue->lock);
- empty = list_empty(&queue->send_list) && !queue->request;
- list_add_tail(&req->entry, &queue->send_list);
- spin_unlock(&queue->lock);
+ empty = llist_add(&req->lentry, &queue->req_list) &&
+ list_empty(&queue->send_list) && !queue->request;
/*
* if we're the first on the send_list and we can try to send
@@ -278,25 +278,42 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
*/
if (queue->io_cpu == smp_processor_id() &&
sync && empty && mutex_trylock(&queue->send_mutex)) {
+ queue->more_requests = !last;
nvme_tcp_try_send(queue);
+ queue->more_requests = false;
mutex_unlock(&queue->send_mutex);
- } else {
+ } else if (last) {
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
}
+static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_request *req;
+ struct llist_node *node;
+
+ for (node = llist_del_all(&queue->req_list); node; node = node->next) {
+ req = llist_entry(node, struct nvme_tcp_request, lentry);
+ list_add(&req->entry, &queue->send_list);
+ }
+}
+
static inline struct nvme_tcp_request *
nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
- spin_lock(&queue->lock);
req = list_first_entry_or_null(&queue->send_list,
struct nvme_tcp_request, entry);
- if (req)
- list_del(&req->entry);
- spin_unlock(&queue->lock);
+ if (!req) {
+ nvme_tcp_process_req_list(queue);
+ req = list_first_entry_or_null(&queue->send_list,
+ struct nvme_tcp_request, entry);
+ if (unlikely(!req))
+ return NULL;
+ }
+ list_del(&req->entry);
return req;
}
@@ -464,7 +481,8 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
return -EINVAL;
}
- nvme_end_request(rq, cqe->status, cqe->result);
+ if (!nvme_end_request(rq, cqe->status, cqe->result))
+ nvme_complete_rq(rq);
queue->nr_cqe++;
return 0;
@@ -595,7 +613,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
- nvme_tcp_queue_request(req, false);
+ nvme_tcp_queue_request(req, false, true);
return 0;
}
@@ -654,7 +672,8 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{
union nvme_result res = {};
- nvme_end_request(rq, cpu_to_le16(status << 1), res);
+ if (!nvme_end_request(rq, cpu_to_le16(status << 1), res))
+ nvme_complete_rq(rq);
}
static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
@@ -861,6 +880,12 @@ done:
read_unlock(&sk->sk_callback_lock);
}
+static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
+{
+ return !list_empty(&queue->send_list) ||
+ !llist_empty(&queue->req_list) || queue->more_requests;
+}
+
static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
{
queue->request = NULL;
@@ -882,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
bool last = nvme_tcp_pdu_last_send(req, len);
int ret, flags = MSG_DONTWAIT;
- if (last && !queue->data_digest)
+ if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
flags |= MSG_EOR;
else
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
@@ -929,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
int flags = MSG_DONTWAIT;
int ret;
- if (inline_data)
+ if (inline_data || nvme_tcp_queue_more(queue))
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
@@ -994,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
int ret;
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
struct kvec iov = {
.iov_base = &req->ddgst + req->offset,
.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
};
+ if (nvme_tcp_queue_more(queue))
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR;
+
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
if (unlikely(ret <= 0))
return ret;
@@ -1342,8 +1372,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
int ret, rcv_pdu_size;
queue->ctrl = ctrl;
+ init_llist_head(&queue->req_list);
INIT_LIST_HEAD(&queue->send_list);
- spin_lock_init(&queue->lock);
mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
queue->queue_size = queue_size;
@@ -1382,6 +1412,9 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
if (nctrl->opts->tos >= 0)
ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
+ /* Set 10 seconds timeout for icresp recvmsg */
+ queue->sock->sk->sk_rcvtimeo = 10 * HZ;
+
queue->sock->sk->sk_allocation = GFP_ATOMIC;
nvme_tcp_set_queue_io_cpu(queue);
queue->request = NULL;
@@ -1741,15 +1774,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
- } else {
- blk_mq_update_nr_hw_queues(ctrl->tagset,
- ctrl->queue_count - 1);
}
ret = nvme_tcp_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
+ if (!new) {
+ nvme_start_queues(ctrl);
+ nvme_wait_freeze(ctrl);
+ blk_mq_update_nr_hw_queues(ctrl->tagset,
+ ctrl->queue_count - 1);
+ nvme_unfreeze(ctrl);
+ }
+
return 0;
out_cleanup_connect_q:
@@ -1854,6 +1892,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
{
if (ctrl->queue_count <= 1)
return;
+ nvme_start_freeze(ctrl);
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
if (ctrl->tagset) {
@@ -1920,11 +1959,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
/*
- * state change failure is ok if we're in DELETING state,
+ * state change failure is ok if we started ctrl delete,
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
@@ -1980,8 +2020,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
blk_mq_unquiesce_queue(ctrl->admin_q);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ /* state change failure is ok if we started ctrl delete */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
@@ -2016,8 +2057,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
nvme_tcp_teardown_ctrl(ctrl, false);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ /* state change failure is ok if we started ctrl delete */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
@@ -2104,7 +2146,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req, true);
+ nvme_tcp_queue_request(&ctrl->async_req, true, true);
}
static enum blk_eh_timer_return
@@ -2216,6 +2258,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
return 0;
}
+static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_tcp_queue *queue = hctx->driver_data;
+
+ if (!llist_empty(&queue->req_list))
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -2235,7 +2285,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(rq);
- nvme_tcp_queue_request(req, true);
+ nvme_tcp_queue_request(req, true, bd->last);
return BLK_STS_OK;
}
@@ -2303,6 +2353,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
+ .commit_rqs = nvme_tcp_commit_rqs,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,
.exit_request = nvme_tcp_exit_request,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
new file mode 100644
index 000000000000..57cfd78731fb
--- /dev/null
+++ b/drivers/nvme/host/zns.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include "nvme.h"
+
+static int nvme_set_max_append(struct nvme_ctrl *ctrl)
+{
+ struct nvme_command c = { };
+ struct nvme_id_ctrl_zns *id;
+ int status;
+
+ id = kzalloc(sizeof(*id), GFP_KERNEL);
+ if (!id)
+ return -ENOMEM;
+
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.cns = NVME_ID_CNS_CS_CTRL;
+ c.identify.csi = NVME_CSI_ZNS;
+
+ status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+ if (status) {
+ kfree(id);
+ return status;
+ }
+
+ if (id->zasl)
+ ctrl->max_zone_append = 1 << (id->zasl + 3);
+ else
+ ctrl->max_zone_append = ctrl->max_hw_sectors;
+ kfree(id);
+ return 0;
+}
+
+int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
+ unsigned lbaf)
+{
+ struct nvme_effects_log *log = ns->head->effects;
+ struct request_queue *q = disk->queue;
+ struct nvme_command c = { };
+ struct nvme_id_ns_zns *id;
+ int status;
+
+ /* Driver requires zone append support */
+ if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) &
+ NVME_CMD_EFFECTS_CSUPP)) {
+ dev_warn(ns->ctrl->device,
+ "append not supported for zoned namespace:%d\n",
+ ns->head->ns_id);
+ return -EINVAL;
+ }
+
+ /* Lazily query controller append limit for the first zoned namespace */
+ if (!ns->ctrl->max_zone_append) {
+ status = nvme_set_max_append(ns->ctrl);
+ if (status)
+ return status;
+ }
+
+ id = kzalloc(sizeof(*id), GFP_KERNEL);
+ if (!id)
+ return -ENOMEM;
+
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.nsid = cpu_to_le32(ns->head->ns_id);
+ c.identify.cns = NVME_ID_CNS_CS_NS;
+ c.identify.csi = NVME_CSI_ZNS;
+
+ status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
+ if (status)
+ goto free_data;
+
+ /*
+ * We currently do not handle devices requiring any of the zoned
+ * operation characteristics.
+ */
+ if (id->zoc) {
+ dev_warn(ns->ctrl->device,
+ "zone operations:%x not supported for namespace:%u\n",
+ le16_to_cpu(id->zoc), ns->head->ns_id);
+ status = -EINVAL;
+ goto free_data;
+ }
+
+ ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
+ if (!is_power_of_2(ns->zsze)) {
+ dev_warn(ns->ctrl->device,
+ "invalid zone size:%llu for namespace:%u\n",
+ ns->zsze, ns->head->ns_id);
+ status = -EINVAL;
+ goto free_data;
+ }
+
+ q->limits.zoned = BLK_ZONED_HM;
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
+ blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
+ blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
+free_data:
+ kfree(id);
+ return status;
+}
+
+static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
+ unsigned int nr_zones, size_t *buflen)
+{
+ struct request_queue *q = ns->disk->queue;
+ size_t bufsize;
+ void *buf;
+
+ const size_t min_bufsize = sizeof(struct nvme_zone_report) +
+ sizeof(struct nvme_zone_descriptor);
+
+ nr_zones = min_t(unsigned int, nr_zones,
+ get_capacity(ns->disk) >> ilog2(ns->zsze));
+
+ bufsize = sizeof(struct nvme_zone_report) +
+ nr_zones * sizeof(struct nvme_zone_descriptor);
+ bufsize = min_t(size_t, bufsize,
+ queue_max_hw_sectors(q) << SECTOR_SHIFT);
+ bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+
+ while (bufsize >= min_bufsize) {
+ buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
+ if (buf) {
+ *buflen = bufsize;
+ return buf;
+ }
+ bufsize >>= 1;
+ }
+ return NULL;
+}
+
+static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
+ struct nvme_zone_report *report,
+ size_t buflen)
+{
+ struct nvme_command c = { };
+ int ret;
+
+ c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
+ c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
+ c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
+ c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
+ c.zmr.zra = NVME_ZRA_ZONE_REPORT;
+ c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
+ c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
+
+ ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
+ if (ret)
+ return ret;
+
+ return le64_to_cpu(report->nr_zones);
+}
+
+static int nvme_zone_parse_entry(struct nvme_ns *ns,
+ struct nvme_zone_descriptor *entry,
+ unsigned int idx, report_zones_cb cb,
+ void *data)
+{
+ struct blk_zone zone = { };
+
+ if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
+ dev_err(ns->ctrl->device, "invalid zone type %#x\n",
+ entry->zt);
+ return -EINVAL;
+ }
+
+ zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ zone.cond = entry->zs >> 4;
+ zone.len = ns->zsze;
+ zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
+ zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
+ zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
+
+ return cb(&zone, idx, data);
+}
+
+static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct nvme_zone_report *report;
+ int ret, zone_idx = 0;
+ unsigned int nz, i;
+ size_t buflen;
+
+ report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
+ if (!report)
+ return -ENOMEM;
+
+ sector &= ~(ns->zsze - 1);
+ while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
+ memset(report, 0, buflen);
+ ret = __nvme_ns_report_zones(ns, sector, report, buflen);
+ if (ret < 0)
+ goto out_free;
+
+ nz = min_t(unsigned int, ret, nr_zones);
+ if (!nz)
+ break;
+
+ for (i = 0; i < nz && zone_idx < nr_zones; i++) {
+ ret = nvme_zone_parse_entry(ns, &report->entries[i],
+ zone_idx, cb, data);
+ if (ret)
+ goto out_free;
+ zone_idx++;
+ }
+
+ sector += ns->zsze * nz;
+ }
+
+ if (zone_idx > 0)
+ ret = zone_idx;
+ else
+ ret = -EINVAL;
+out_free:
+ kvfree(report);
+ return ret;
+}
+
+int nvme_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct nvme_ns_head *head = NULL;
+ struct nvme_ns *ns;
+ int srcu_idx, ret;
+
+ ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
+ if (unlikely(!ns))
+ return -EWOULDBLOCK;
+
+ if (ns->head->ids.csi == NVME_CSI_ZNS)
+ ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
+ else
+ ret = -EINVAL;
+ nvme_put_ns_from_disk(head, srcu_idx);
+
+ return ret;
+}
+
+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
+ struct nvme_command *c, enum nvme_zone_mgmt_action action)
+{
+ c->zms.opcode = nvme_cmd_zone_mgmt_send;
+ c->zms.nsid = cpu_to_le32(ns->head->ns_id);
+ c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+ c->zms.zsa = action;
+
+ if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
+ c->zms.select_all = 1;
+
+ return BLK_STS_OK;
+}
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 4474952d64c6..8056955e652c 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -16,6 +16,18 @@ config NVME_TARGET
To configure the NVMe target you probably want to use the nvmetcli
tool from http://git.infradead.org/users/hch/nvmetcli.git.
+config NVME_TARGET_PASSTHRU
+ bool "NVMe Target Passthrough support"
+ depends on NVME_TARGET
+ depends on NVME_CORE=y || NVME_CORE=NVME_TARGET
+ help
+ This enables target side NVMe passthru controller support for the
+ NVMe Over Fabrics protocol. It allows for hosts to manage and
+ directly access an actual NVMe controller residing on the target
+ side, incuding executing Vendor Unique Commands.
+
+ If unsure, say N.
+
config NVME_TARGET_LOOP
tristate "NVMe loopback device support"
depends on NVME_TARGET
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 2b33836f3d3e..ebf91fc4c72e 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
discovery.o io-cmd-file.o io-cmd-bdev.o
+nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o
nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o
nvmet-fc-y += fc.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 1db8c0498668..e9fe91786bbb 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -113,11 +113,10 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
u64 data_units_read = 0, data_units_written = 0;
struct nvmet_ns *ns;
struct nvmet_ctrl *ctrl;
+ unsigned long idx;
ctrl = req->sq->ctrl;
-
- rcu_read_lock();
- list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+ xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
/* we don't have the right data for file backed ns */
if (!ns->bdev)
continue;
@@ -127,9 +126,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
data_units_written += DIV_ROUND_UP(
part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
-
}
- rcu_read_unlock();
put_unaligned_le64(host_reads, &slog->host_reads[0]);
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
@@ -230,14 +227,13 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_ns *ns;
+ unsigned long idx;
u32 count = 0;
if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
- rcu_read_lock();
- list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
+ xa_for_each(&ctrl->subsys->namespaces, idx, ns)
if (ns->anagrpid == grpid)
desc->nsids[count++] = cpu_to_le32(ns->nsid);
- rcu_read_unlock();
}
desc->grpid = cpu_to_le32(grpid);
@@ -427,7 +423,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->awupf = 0;
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
- if (ctrl->ops->has_keyed_sgls)
+ if (ctrl->ops->flags & NVMF_KEYED_SGLS)
id->sgls |= cpu_to_le32(1 << 2);
if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);
@@ -556,6 +552,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_ns *ns;
+ unsigned long idx;
u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid);
__le32 *list;
u16 status = 0;
@@ -567,15 +564,13 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
goto out;
}
- rcu_read_lock();
- list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+ xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
if (ns->nsid <= min_nsid)
continue;
list[i++] = cpu_to_le32(ns->nsid);
if (i == buf_size / sizeof(__le32))
break;
}
- rcu_read_unlock();
status = nvmet_copy_to_sgl(req, 0, list, buf_size);
@@ -754,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
return 0;
}
-static void nvmet_execute_set_features(struct nvmet_req *req)
+void nvmet_execute_set_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
@@ -829,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req)
nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
}
-static void nvmet_execute_get_features(struct nvmet_req *req)
+void nvmet_execute_get_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
@@ -945,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
if (unlikely(ret))
return ret;
+ if (nvmet_req_passthru_ctrl(req))
+ return nvmet_parse_passthru_admin_cmd(req);
+
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->execute = nvmet_execute_get_log_page;
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 419e0d4ce79b..74b2b61c773b 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -666,6 +666,103 @@ static const struct config_item_type nvmet_namespaces_type = {
.ct_owner = THIS_MODULE,
};
+#ifdef CONFIG_NVME_TARGET_PASSTHRU
+
+static ssize_t nvmet_passthru_device_path_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
+
+ return snprintf(page, PAGE_SIZE, "%s\n", subsys->passthru_ctrl_path);
+}
+
+static ssize_t nvmet_passthru_device_path_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
+ size_t len;
+ int ret;
+
+ mutex_lock(&subsys->lock);
+
+ ret = -EBUSY;
+ if (subsys->passthru_ctrl)
+ goto out_unlock;
+
+ ret = -EINVAL;
+ len = strcspn(page, "\n");
+ if (!len)
+ goto out_unlock;
+
+ kfree(subsys->passthru_ctrl_path);
+ ret = -ENOMEM;
+ subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL);
+ if (!subsys->passthru_ctrl_path)
+ goto out_unlock;
+
+ mutex_unlock(&subsys->lock);
+
+ return count;
+out_unlock:
+ mutex_unlock(&subsys->lock);
+ return ret;
+}
+CONFIGFS_ATTR(nvmet_passthru_, device_path);
+
+static ssize_t nvmet_passthru_enable_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
+
+ return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0);
+}
+
+static ssize_t nvmet_passthru_enable_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
+ bool enable;
+ int ret = 0;
+
+ if (strtobool(page, &enable))
+ return -EINVAL;
+
+ if (enable)
+ ret = nvmet_passthru_ctrl_enable(subsys);
+ else
+ nvmet_passthru_ctrl_disable(subsys);
+
+ return ret ? ret : count;
+}
+CONFIGFS_ATTR(nvmet_passthru_, enable);
+
+static struct configfs_attribute *nvmet_passthru_attrs[] = {
+ &nvmet_passthru_attr_device_path,
+ &nvmet_passthru_attr_enable,
+ NULL,
+};
+
+static const struct config_item_type nvmet_passthru_type = {
+ .ct_attrs = nvmet_passthru_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
+{
+ config_group_init_type_name(&subsys->passthru_group,
+ "passthru", &nvmet_passthru_type);
+ configfs_add_default_group(&subsys->passthru_group,
+ &subsys->group);
+}
+
+#else /* CONFIG_NVME_TARGET_PASSTHRU */
+
+static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
+{
+}
+
+#endif /* CONFIG_NVME_TARGET_PASSTHRU */
+
static int nvmet_port_subsys_allow_link(struct config_item *parent,
struct config_item *target)
{
@@ -862,14 +959,14 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item,
struct nvmet_subsys *subsys = to_subsys(item);
if (NVME_TERTIARY(subsys->ver))
- return snprintf(page, PAGE_SIZE, "%d.%d.%d\n",
- (int)NVME_MAJOR(subsys->ver),
- (int)NVME_MINOR(subsys->ver),
- (int)NVME_TERTIARY(subsys->ver));
+ return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n",
+ NVME_MAJOR(subsys->ver),
+ NVME_MINOR(subsys->ver),
+ NVME_TERTIARY(subsys->ver));
- return snprintf(page, PAGE_SIZE, "%d.%d\n",
- (int)NVME_MAJOR(subsys->ver),
- (int)NVME_MINOR(subsys->ver));
+ return snprintf(page, PAGE_SIZE, "%llu.%llu\n",
+ NVME_MAJOR(subsys->ver),
+ NVME_MINOR(subsys->ver));
}
static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
@@ -879,6 +976,10 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
int major, minor, tertiary = 0;
int ret;
+ /* passthru subsystems use the underlying controller's version */
+ if (nvmet_passthru_ctrl(subsys))
+ return -EINVAL;
+
ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary);
if (ret != 2 && ret != 3)
return -EINVAL;
@@ -1121,6 +1222,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group,
configfs_add_default_group(&subsys->allowed_hosts_group,
&subsys->group);
+ nvmet_add_passthru_group(subsys);
+
return &subsys->group;
}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 6e2f623e472e..b92f45f5cd5b 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -115,13 +115,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
{
- struct nvmet_ns *ns;
+ unsigned long nsid = 0;
+ struct nvmet_ns *cur;
+ unsigned long idx;
- if (list_empty(&subsys->namespaces))
- return 0;
+ xa_for_each(&subsys->namespaces, idx, cur)
+ nsid = cur->nsid;
- ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
- return ns->nsid;
+ return nsid;
}
static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
@@ -336,7 +337,7 @@ int nvmet_enable_port(struct nvmet_port *port)
* If the user requested PI support and the transport isn't pi capable,
* don't enable the port.
*/
- if (port->pi_enable && !ops->metadata_support) {
+ if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
pr_err("T10-PI is not supported by transport type %d\n",
port->disc_addr.trtype);
ret = -EINVAL;
@@ -410,28 +411,13 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
cancel_delayed_work_sync(&ctrl->ka_work);
}
-static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
- __le32 nsid)
-{
- struct nvmet_ns *ns;
-
- list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
- if (ns->nsid == le32_to_cpu(nsid))
- return ns;
- }
-
- return NULL;
-}
-
struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
{
struct nvmet_ns *ns;
- rcu_read_lock();
- ns = __nvmet_find_namespace(ctrl, nsid);
+ ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid));
if (ns)
percpu_ref_get(&ns->ref);
- rcu_read_unlock();
return ns;
}
@@ -467,7 +453,7 @@ static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
return -EINVAL;
}
- if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
+ if (!blk_queue_pci_p2pdma(ns->bdev->bd_disk->queue)) {
pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
ns->device_path);
return -EINVAL;
@@ -558,6 +544,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
mutex_lock(&subsys->lock);
ret = 0;
+
+ if (nvmet_passthru_ctrl(subsys)) {
+ pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
+ goto out_unlock;
+ }
+
if (ns->enabled)
goto out_unlock;
@@ -586,24 +578,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
if (ns->nsid > subsys->max_nsid)
subsys->max_nsid = ns->nsid;
- /*
- * The namespaces list needs to be sorted to simplify the implementation
- * of the Identify Namepace List subcommand.
- */
- if (list_empty(&subsys->namespaces)) {
- list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
- } else {
- struct nvmet_ns *old;
-
- list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
- lockdep_is_held(&subsys->lock)) {
- BUG_ON(ns->nsid == old->nsid);
- if (ns->nsid < old->nsid)
- break;
- }
+ ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
+ if (ret)
+ goto out_restore_subsys_maxnsid;
- list_add_tail_rcu(&ns->dev_link, &old->dev_link);
- }
subsys->nr_namespaces++;
nvmet_ns_changed(subsys, ns->nsid);
@@ -612,6 +590,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
+
+out_restore_subsys_maxnsid:
+ subsys->max_nsid = nvmet_max_nsid(subsys);
+ percpu_ref_exit(&ns->ref);
out_dev_put:
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
@@ -630,7 +612,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
goto out_unlock;
ns->enabled = false;
- list_del_rcu(&ns->dev_link);
+ xa_erase(&ns->subsys->namespaces, ns->nsid);
if (ns->nsid == subsys->max_nsid)
subsys->max_nsid = nvmet_max_nsid(subsys);
@@ -681,7 +663,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
if (!ns)
return NULL;
- INIT_LIST_HEAD(&ns->dev_link);
init_completion(&ns->disable_done);
ns->nsid = nsid;
@@ -874,6 +855,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
if (unlikely(ret))
return ret;
+ if (nvmet_req_passthru_ctrl(req))
+ return nvmet_parse_passthru_io_cmd(req);
+
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (unlikely(!req->ns)) {
req->error_loc = offsetof(struct nvme_common_command, nsid);
@@ -1263,14 +1247,14 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
struct nvmet_req *req)
{
struct nvmet_ns *ns;
+ unsigned long idx;
if (!req->p2p_client)
return;
ctrl->p2p_client = get_device(req->p2p_client);
- list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
- lockdep_is_held(&ctrl->subsys->lock))
+ xa_for_each(&ctrl->subsys->namespaces, idx, ns)
nvmet_p2pmem_ns_add_p2p(ctrl, ns);
}
@@ -1495,7 +1479,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
if (!subsys)
return ERR_PTR(-ENOMEM);
- subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
+ subsys->ver = NVMET_DEFAULT_VS;
/* generate a random serial number as our controllers are ephemeral: */
get_random_bytes(&subsys->serial, sizeof(subsys->serial));
@@ -1523,7 +1507,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
kref_init(&subsys->ref);
mutex_init(&subsys->lock);
- INIT_LIST_HEAD(&subsys->namespaces);
+ xa_init(&subsys->namespaces);
INIT_LIST_HEAD(&subsys->ctrls);
INIT_LIST_HEAD(&subsys->hosts);
@@ -1535,7 +1519,10 @@ static void nvmet_subsys_free(struct kref *ref)
struct nvmet_subsys *subsys =
container_of(ref, struct nvmet_subsys, ref);
- WARN_ON_ONCE(!list_empty(&subsys->namespaces));
+ WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
+
+ xa_destroy(&subsys->namespaces);
+ nvmet_passthru_subsys_free(subsys);
kfree(subsys->subsysnqn);
kfree_rcu(subsys->model, rcuhead);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 40cf0b6e6c9d..f40c05c33c3a 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -277,7 +277,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req)
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
- if (ctrl->ops->has_keyed_sgls)
+ if (ctrl->ops->flags & NVMF_KEYED_SGLS)
id->sgls |= cpu_to_le32(1 << 2);
if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 27fd3b5aa621..55bafd56166a 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -167,7 +167,6 @@ struct nvmet_fc_tgt_assoc {
struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1];
struct kref ref;
struct work_struct del_work;
- atomic_t del_work_active;
};
@@ -1090,7 +1089,6 @@ nvmet_fc_delete_assoc(struct work_struct *work)
container_of(work, struct nvmet_fc_tgt_assoc, del_work);
nvmet_fc_delete_target_assoc(assoc);
- atomic_set(&assoc->del_work_active, 0);
nvmet_fc_tgt_a_put(assoc);
}
@@ -1123,7 +1121,6 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle)
INIT_LIST_HEAD(&assoc->a_list);
kref_init(&assoc->ref);
INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc);
- atomic_set(&assoc->del_work_active, 0);
atomic_set(&assoc->terminating, 0);
while (needrandom) {
@@ -1243,7 +1240,8 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport,
list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
if (association_id == assoc->association_id) {
ret = assoc;
- nvmet_fc_tgt_a_get(assoc);
+ if (!nvmet_fc_tgt_a_get(assoc))
+ ret = NULL;
break;
}
}
@@ -1477,21 +1475,15 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport)
{
struct nvmet_fc_tgt_assoc *assoc, *next;
unsigned long flags;
- int ret;
spin_lock_irqsave(&tgtport->lock, flags);
list_for_each_entry_safe(assoc, next,
&tgtport->assoc_list, a_list) {
if (!nvmet_fc_tgt_a_get(assoc))
continue;
- ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
- if (ret == 0) {
- if (!schedule_work(&assoc->del_work))
- nvmet_fc_tgt_a_put(assoc);
- } else {
+ if (!schedule_work(&assoc->del_work))
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
- }
}
spin_unlock_irqrestore(&tgtport->lock, flags);
}
@@ -1533,7 +1525,6 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
struct nvmet_fc_tgt_assoc *assoc, *next;
unsigned long flags;
bool noassoc = true;
- int ret;
spin_lock_irqsave(&tgtport->lock, flags);
list_for_each_entry_safe(assoc, next,
@@ -1545,14 +1536,9 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
continue;
assoc->hostport->invalid = 1;
noassoc = false;
- ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
- if (ret == 0) {
- if (!schedule_work(&assoc->del_work))
- nvmet_fc_tgt_a_put(assoc);
- } else {
+ if (!schedule_work(&assoc->del_work))
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
- }
}
spin_unlock_irqrestore(&tgtport->lock, flags);
@@ -1573,7 +1559,6 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
struct nvmet_fc_tgt_queue *queue;
unsigned long flags;
bool found_ctrl = false;
- int ret;
/* this is a bit ugly, but don't want to make locks layered */
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
@@ -1597,14 +1582,9 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
nvmet_fc_tgtport_put(tgtport);
if (found_ctrl) {
- ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
- if (ret == 0) {
- if (!schedule_work(&assoc->del_work))
- nvmet_fc_tgt_a_put(assoc);
- } else {
+ if (!schedule_work(&assoc->del_work))
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
- }
return;
}
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 2ff1d1334a03..c97e60b71bbc 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -43,6 +43,17 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_ERR, NULL }
};
+static int fcloop_verify_addr(substring_t *s)
+{
+ size_t blen = s->to - s->from + 1;
+
+ if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 ||
+ strncmp(s->from, "0x", 2))
+ return -EINVAL;
+
+ return 0;
+}
+
static int
fcloop_parse_options(struct fcloop_ctrl_options *opts,
const char *buf)
@@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
opts->mask |= token;
switch (token) {
case NVMF_OPT_WWNN:
- if (match_u64(args, &token64)) {
+ if (fcloop_verify_addr(args) ||
+ match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
opts->wwnn = token64;
break;
case NVMF_OPT_WWPN:
- if (match_u64(args, &token64)) {
+ if (fcloop_verify_addr(args) ||
+ match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
@@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
opts->fcaddr = token;
break;
case NVMF_OPT_LPWWNN:
- if (match_u64(args, &token64)) {
+ if (fcloop_verify_addr(args) ||
+ match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
opts->lpwwnn = token64;
break;
case NVMF_OPT_LPWWPN:
- if (match_u64(args, &token64)) {
+ if (fcloop_verify_addr(args) ||
+ match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
@@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname,
token = match_token(p, opt_tokens, args);
switch (token) {
case NVMF_OPT_WWNN:
- if (match_u64(args, &token64)) {
+ if (fcloop_verify_addr(args) ||
+ match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
*nname = token64;
break;
case NVMF_OPT_WWPN:
- if (match_u64(args, &token64)) {
+ if (fcloop_verify_addr(args) ||
+ match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 6344e73c9354..4884ef1e46a2 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -36,7 +36,6 @@ struct nvme_loop_ctrl {
struct nvme_loop_iod async_event_iod;
struct nvme_ctrl ctrl;
- struct nvmet_ctrl *target_ctrl;
struct nvmet_port *port;
};
@@ -116,7 +115,8 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
return;
}
- nvme_end_request(rq, cqe->status, cqe->result);
+ if (!nvme_end_request(rq, cqe->status, cqe->result))
+ nvme_loop_complete_rq(rq);
}
}
@@ -444,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
{
struct nvme_loop_ctrl *ctrl =
container_of(work, struct nvme_loop_ctrl, ctrl.reset_work);
- bool changed;
int ret;
nvme_stop_ctrl(&ctrl->ctrl);
@@ -471,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
ctrl->ctrl.queue_count - 1);
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- WARN_ON_ONCE(!changed);
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
+ WARN_ON_ONCE(1);
nvme_start_ctrl(&ctrl->ctrl);
@@ -567,7 +566,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_loop_ctrl *ctrl;
- bool changed;
int ret;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
@@ -583,6 +581,9 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
if (ret)
goto out_put_ctrl;
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
+ WARN_ON_ONCE(1);
+
ret = -ENOMEM;
ctrl->ctrl.sqsize = opts->queue_size - 1;
@@ -617,8 +618,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
dev_info(ctrl->ctrl.device,
"new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- WARN_ON_ONCE(!changed);
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
+ WARN_ON_ONCE(1);
mutex_lock(&nvme_loop_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_loop_ctrl_list);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 809691291e73..47ee3fb193bd 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -21,6 +21,8 @@
#include <linux/radix-tree.h>
#include <linux/t10-pi.h>
+#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0)
+
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
#define NVMET_NO_ERROR_LOC ((u16)-1)
@@ -52,7 +54,6 @@
(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
struct nvmet_ns {
- struct list_head dev_link;
struct percpu_ref ref;
struct block_device *bdev;
struct file *file;
@@ -219,7 +220,7 @@ struct nvmet_subsys {
struct mutex lock;
struct kref ref;
- struct list_head namespaces;
+ struct xarray namespaces;
unsigned int nr_namespaces;
unsigned int max_nsid;
u16 cntlid_min;
@@ -243,6 +244,12 @@ struct nvmet_subsys {
struct config_group allowed_hosts_group;
struct nvmet_subsys_model __rcu *model;
+
+#ifdef CONFIG_NVME_TARGET_PASSTHRU
+ struct nvme_ctrl *passthru_ctrl;
+ char *passthru_ctrl_path;
+ struct config_group passthru_group;
+#endif /* CONFIG_NVME_TARGET_PASSTHRU */
};
static inline struct nvmet_subsys *to_subsys(struct config_item *item)
@@ -286,8 +293,9 @@ struct nvmet_fabrics_ops {
struct module *owner;
unsigned int type;
unsigned int msdbd;
- bool has_keyed_sgls : 1;
- bool metadata_support : 1;
+ unsigned int flags;
+#define NVMF_KEYED_SGLS (1 << 0)
+#define NVMF_METADATA_SUPPORTED (1 << 1)
void (*queue_response)(struct nvmet_req *req);
int (*add_port)(struct nvmet_port *port);
void (*remove_port)(struct nvmet_port *port);
@@ -321,6 +329,11 @@ struct nvmet_req {
struct bio_vec *bvec;
struct work_struct work;
} f;
+ struct {
+ struct request *rq;
+ struct work_struct work;
+ bool use_workqueue;
+ } p;
};
int sg_cnt;
int metadata_sg_cnt;
@@ -400,6 +413,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
int nvmet_req_alloc_sgls(struct nvmet_req *req);
void nvmet_req_free_sgls(struct nvmet_req *req);
+void nvmet_execute_set_features(struct nvmet_req *req);
+void nvmet_execute_get_features(struct nvmet_req *req);
void nvmet_execute_keep_alive(struct nvmet_req *req);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
@@ -532,6 +547,43 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req)
sizeof(struct nvme_dsm_range);
}
+#ifdef CONFIG_NVME_TARGET_PASSTHRU
+void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
+int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
+void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys);
+u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req);
+static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
+{
+ return subsys->passthru_ctrl;
+}
+#else /* CONFIG_NVME_TARGET_PASSTHRU */
+static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
+{
+}
+static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
+{
+}
+static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
+{
+ return 0;
+}
+static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
+{
+ return 0;
+}
+static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
+{
+ return NULL;
+}
+#endif /* CONFIG_NVME_TARGET_PASSTHRU */
+
+static inline struct nvme_ctrl *
+nvmet_req_passthru_ctrl(struct nvmet_req *req)
+{
+ return nvmet_passthru_ctrl(req->sq->ctrl->subsys);
+}
+
u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
/* Convert a 32-bit number to a 16-bit 0's based number */
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
new file mode 100644
index 000000000000..89d91dc999a6
--- /dev/null
+++ b/drivers/nvme/target/passthru.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe Over Fabrics Target Passthrough command implementation.
+ *
+ * Copyright (c) 2017-2018 Western Digital Corporation or its
+ * affiliates.
+ * Copyright (c) 2019-2020, Eideticom Inc.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+
+#include "../host/nvme.h"
+#include "nvmet.h"
+
+MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU);
+
+/*
+ * xarray to maintain one passthru subsystem per nvme controller.
+ */
+static DEFINE_XARRAY(passthru_subsystems);
+
+static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
+ u16 status = NVME_SC_SUCCESS;
+ struct nvme_id_ctrl *id;
+ u32 max_hw_sectors;
+ int page_shift;
+
+ id = kzalloc(sizeof(*id), GFP_KERNEL);
+ if (!id)
+ return NVME_SC_INTERNAL;
+
+ status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id));
+ if (status)
+ goto out_free;
+
+ id->cntlid = cpu_to_le16(ctrl->cntlid);
+ id->ver = cpu_to_le32(ctrl->subsys->ver);
+
+ /*
+ * The passthru NVMe driver may have a limit on the number of segments
+ * which depends on the host's memory fragementation. To solve this,
+ * ensure mdts is limited to the pages equal to the number of segments.
+ */
+ max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9),
+ pctrl->max_hw_sectors);
+
+ page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
+ id->mdts = ilog2(max_hw_sectors) + 9 - page_shift;
+
+ id->acl = 3;
+ /*
+ * We export aerl limit for the fabrics controller, update this when
+ * passthru based aerl support is added.
+ */
+ id->aerl = NVMET_ASYNC_EVENTS - 1;
+
+ /* emulate kas as most of the PCIe ctrl don't have a support for kas */
+ id->kas = cpu_to_le16(NVMET_KAS);
+
+ /* don't support host memory buffer */
+ id->hmpre = 0;
+ id->hmmin = 0;
+
+ id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes);
+ id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes);
+ id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
+
+ /* don't support fuse commands */
+ id->fuses = 0;
+
+ id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
+ if (ctrl->ops->flags & NVMF_KEYED_SGLS)
+ id->sgls |= cpu_to_le32(1 << 2);
+ if (req->port->inline_data_size)
+ id->sgls |= cpu_to_le32(1 << 20);
+
+ /*
+ * When passsthru controller is setup using nvme-loop transport it will
+ * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in
+ * the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl()
+ * code path with duplicate ctr subsynqn. In order to prevent that we
+ * mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn.
+ */
+ memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn));
+
+ /* use fabric id-ctrl values */
+ id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
+ req->port->inline_data_size) / 16);
+ id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
+
+ id->msdbd = ctrl->ops->msdbd;
+
+ /* Support multipath connections with fabrics */
+ id->cmic |= 1 << 1;
+
+ /* Disable reservations, see nvmet_parse_passthru_io_cmd() */
+ id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS);
+
+ status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl));
+
+out_free:
+ kfree(id);
+ return status;
+}
+
+static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req)
+{
+ u16 status = NVME_SC_SUCCESS;
+ struct nvme_id_ns *id;
+ int i;
+
+ id = kzalloc(sizeof(*id), GFP_KERNEL);
+ if (!id)
+ return NVME_SC_INTERNAL;
+
+ status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns));
+ if (status)
+ goto out_free;
+
+ for (i = 0; i < (id->nlbaf + 1); i++)
+ if (id->lbaf[i].ms)
+ memset(&id->lbaf[i], 0, sizeof(id->lbaf[i]));
+
+ id->flbas = id->flbas & ~(1 << 4);
+
+ /*
+ * Presently the NVMEof target code does not support sending
+ * metadata, so we must disable it here. This should be updated
+ * once target starts supporting metadata.
+ */
+ id->mc = 0;
+
+ status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+
+out_free:
+ kfree(id);
+ return status;
+}
+
+static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
+{
+ struct nvmet_req *req = container_of(w, struct nvmet_req, p.work);
+ struct request *rq = req->p.rq;
+ u16 status;
+
+ nvme_execute_passthru_rq(rq);
+
+ status = nvme_req(rq)->status;
+ if (status == NVME_SC_SUCCESS &&
+ req->cmd->common.opcode == nvme_admin_identify) {
+ switch (req->cmd->identify.cns) {
+ case NVME_ID_CNS_CTRL:
+ nvmet_passthru_override_id_ctrl(req);
+ break;
+ case NVME_ID_CNS_NS:
+ nvmet_passthru_override_id_ns(req);
+ break;
+ }
+ }
+
+ req->cqe->result = nvme_req(rq)->result;
+ nvmet_req_complete(req, status);
+ blk_put_request(rq);
+}
+
+static void nvmet_passthru_req_done(struct request *rq,
+ blk_status_t blk_status)
+{
+ struct nvmet_req *req = rq->end_io_data;
+
+ req->cqe->result = nvme_req(rq)->result;
+ nvmet_req_complete(req, nvme_req(rq)->status);
+ blk_put_request(rq);
+}
+
+static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
+{
+ int sg_cnt = req->sg_cnt;
+ struct scatterlist *sg;
+ int op_flags = 0;
+ struct bio *bio;
+ int i, ret;
+
+ if (req->cmd->common.opcode == nvme_cmd_flush)
+ op_flags = REQ_FUA;
+ else if (nvme_is_write(req->cmd))
+ op_flags = REQ_SYNC | REQ_IDLE;
+
+ bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+ bio->bi_end_io = bio_put;
+ bio->bi_opf = req_op(rq) | op_flags;
+
+ for_each_sg(req->sg, sg, req->sg_cnt, i) {
+ if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length,
+ sg->offset) < sg->length) {
+ bio_put(bio);
+ return -EINVAL;
+ }
+ sg_cnt--;
+ }
+
+ ret = blk_rq_append_bio(rq, &bio);
+ if (unlikely(ret)) {
+ bio_put(bio);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
+{
+ struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
+ struct request_queue *q = ctrl->admin_q;
+ struct nvme_ns *ns = NULL;
+ struct request *rq = NULL;
+ u32 effects;
+ u16 status;
+ int ret;
+
+ if (likely(req->sq->qid != 0)) {
+ u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+
+ ns = nvme_find_get_ns(ctrl, nsid);
+ if (unlikely(!ns)) {
+ pr_err("failed to get passthru ns nsid:%u\n", nsid);
+ status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+ goto fail_out;
+ }
+
+ q = ns->queue;
+ }
+
+ rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
+ if (IS_ERR(rq)) {
+ rq = NULL;
+ status = NVME_SC_INTERNAL;
+ goto fail_out;
+ }
+
+ if (req->sg_cnt) {
+ ret = nvmet_passthru_map_sg(req, rq);
+ if (unlikely(ret)) {
+ status = NVME_SC_INTERNAL;
+ goto fail_out;
+ }
+ }
+
+ /*
+ * If there are effects for the command we are about to execute, or
+ * an end_req function we need to use nvme_execute_passthru_rq()
+ * synchronously in a work item seeing the end_req function and
+ * nvme_passthru_end() can't be called in the request done callback
+ * which is typically in interrupt context.
+ */
+ effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode);
+ if (req->p.use_workqueue || effects) {
+ INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work);
+ req->p.rq = rq;
+ schedule_work(&req->p.work);
+ } else {
+ rq->end_io_data = req;
+ blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0,
+ nvmet_passthru_req_done);
+ }
+
+ if (ns)
+ nvme_put_ns(ns);
+
+ return;
+
+fail_out:
+ if (ns)
+ nvme_put_ns(ns);
+ nvmet_req_complete(req, status);
+ blk_put_request(rq);
+}
+
+/*
+ * We need to emulate set host behaviour to ensure that any requested
+ * behaviour of the target's host matches the requested behaviour
+ * of the device's host and fail otherwise.
+ */
+static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req)
+{
+ struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
+ struct nvme_feat_host_behavior *host;
+ u16 status = NVME_SC_INTERNAL;
+ int ret;
+
+ host = kzalloc(sizeof(*host) * 2, GFP_KERNEL);
+ if (!host)
+ goto out_complete_req;
+
+ ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+ host, sizeof(*host), NULL);
+ if (ret)
+ goto out_free_host;
+
+ status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host));
+ if (status)
+ goto out_free_host;
+
+ if (memcmp(&host[0], &host[1], sizeof(host[0]))) {
+ pr_warn("target host has requested different behaviour from the local host\n");
+ status = NVME_SC_INTERNAL;
+ }
+
+out_free_host:
+ kfree(host);
+out_complete_req:
+ nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_setup_passthru_command(struct nvmet_req *req)
+{
+ req->p.use_workqueue = false;
+ req->execute = nvmet_passthru_execute_cmd;
+ return NVME_SC_SUCCESS;
+}
+
+u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
+{
+ switch (req->cmd->common.opcode) {
+ case nvme_cmd_resv_register:
+ case nvme_cmd_resv_report:
+ case nvme_cmd_resv_acquire:
+ case nvme_cmd_resv_release:
+ /*
+ * Reservations cannot be supported properly because the
+ * underlying device has no way of differentiating different
+ * hosts that connect via fabrics. This could potentially be
+ * emulated in the future if regular targets grow support for
+ * this feature.
+ */
+ return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+ }
+
+ return nvmet_setup_passthru_command(req);
+}
+
+/*
+ * Only features that are emulated or specifically allowed in the list are
+ * passed down to the controller. This function implements the allow list for
+ * both get and set features.
+ */
+static u16 nvmet_passthru_get_set_features(struct nvmet_req *req)
+{
+ switch (le32_to_cpu(req->cmd->features.fid)) {
+ case NVME_FEAT_ARBITRATION:
+ case NVME_FEAT_POWER_MGMT:
+ case NVME_FEAT_LBA_RANGE:
+ case NVME_FEAT_TEMP_THRESH:
+ case NVME_FEAT_ERR_RECOVERY:
+ case NVME_FEAT_VOLATILE_WC:
+ case NVME_FEAT_WRITE_ATOMIC:
+ case NVME_FEAT_AUTO_PST:
+ case NVME_FEAT_TIMESTAMP:
+ case NVME_FEAT_HCTM:
+ case NVME_FEAT_NOPSC:
+ case NVME_FEAT_RRL:
+ case NVME_FEAT_PLM_CONFIG:
+ case NVME_FEAT_PLM_WINDOW:
+ case NVME_FEAT_HOST_BEHAVIOR:
+ case NVME_FEAT_SANITIZE:
+ case NVME_FEAT_VENDOR_START ... NVME_FEAT_VENDOR_END:
+ return nvmet_setup_passthru_command(req);
+
+ case NVME_FEAT_ASYNC_EVENT:
+ /* There is no support for forwarding ASYNC events */
+ case NVME_FEAT_IRQ_COALESCE:
+ case NVME_FEAT_IRQ_CONFIG:
+ /* The IRQ settings will not apply to the target controller */
+ case NVME_FEAT_HOST_MEM_BUF:
+ /*
+ * Any HMB that's set will not be passed through and will
+ * not work as expected
+ */
+ case NVME_FEAT_SW_PROGRESS:
+ /*
+ * The Pre-Boot Software Load Count doesn't make much
+ * sense for a target to export
+ */
+ case NVME_FEAT_RESV_MASK:
+ case NVME_FEAT_RESV_PERSIST:
+ /* No reservations, see nvmet_parse_passthru_io_cmd() */
+ default:
+ return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+ }
+}
+
+u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
+{
+ /*
+ * Passthru all vendor specific commands
+ */
+ if (req->cmd->common.opcode >= nvme_admin_vendor_start)
+ return nvmet_setup_passthru_command(req);
+
+ switch (req->cmd->common.opcode) {
+ case nvme_admin_async_event:
+ req->execute = nvmet_execute_async_event;
+ return NVME_SC_SUCCESS;
+ case nvme_admin_keep_alive:
+ /*
+ * Most PCIe ctrls don't support keep alive cmd, we route keep
+ * alive to the non-passthru mode. In future please change this
+ * code when PCIe ctrls with keep alive support available.
+ */
+ req->execute = nvmet_execute_keep_alive;
+ return NVME_SC_SUCCESS;
+ case nvme_admin_set_features:
+ switch (le32_to_cpu(req->cmd->features.fid)) {
+ case NVME_FEAT_ASYNC_EVENT:
+ case NVME_FEAT_KATO:
+ case NVME_FEAT_NUM_QUEUES:
+ case NVME_FEAT_HOST_ID:
+ req->execute = nvmet_execute_set_features;
+ return NVME_SC_SUCCESS;
+ case NVME_FEAT_HOST_BEHAVIOR:
+ req->execute = nvmet_passthru_set_host_behaviour;
+ return NVME_SC_SUCCESS;
+ default:
+ return nvmet_passthru_get_set_features(req);
+ }
+ break;
+ case nvme_admin_get_features:
+ switch (le32_to_cpu(req->cmd->features.fid)) {
+ case NVME_FEAT_ASYNC_EVENT:
+ case NVME_FEAT_KATO:
+ case NVME_FEAT_NUM_QUEUES:
+ case NVME_FEAT_HOST_ID:
+ req->execute = nvmet_execute_get_features;
+ return NVME_SC_SUCCESS;
+ default:
+ return nvmet_passthru_get_set_features(req);
+ }
+ break;
+ case nvme_admin_identify:
+ switch (req->cmd->identify.cns) {
+ case NVME_ID_CNS_CTRL:
+ req->execute = nvmet_passthru_execute_cmd;
+ req->p.use_workqueue = true;
+ return NVME_SC_SUCCESS;
+ case NVME_ID_CNS_NS:
+ req->execute = nvmet_passthru_execute_cmd;
+ req->p.use_workqueue = true;
+ return NVME_SC_SUCCESS;
+ default:
+ return nvmet_setup_passthru_command(req);
+ }
+ case nvme_admin_get_log_page:
+ return nvmet_setup_passthru_command(req);
+ default:
+ /* Reject commands not in the allowlist above */
+ return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+ }
+}
+
+int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
+{
+ struct nvme_ctrl *ctrl;
+ int ret = -EINVAL;
+ void *old;
+
+ mutex_lock(&subsys->lock);
+ if (!subsys->passthru_ctrl_path)
+ goto out_unlock;
+ if (subsys->passthru_ctrl)
+ goto out_unlock;
+
+ if (subsys->nr_namespaces) {
+ pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
+ goto out_unlock;
+ }
+
+ ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path);
+ if (IS_ERR(ctrl)) {
+ ret = PTR_ERR(ctrl);
+ pr_err("failed to open nvme controller %s\n",
+ subsys->passthru_ctrl_path);
+
+ goto out_unlock;
+ }
+
+ old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL,
+ subsys, GFP_KERNEL);
+ if (xa_is_err(old)) {
+ ret = xa_err(old);
+ goto out_put_ctrl;
+ }
+
+ if (old)
+ goto out_put_ctrl;
+
+ subsys->passthru_ctrl = ctrl;
+ subsys->ver = ctrl->vs;
+
+ if (subsys->ver < NVME_VS(1, 2, 1)) {
+ pr_warn("nvme controller version is too old: %llu.%llu.%llu, advertising 1.2.1\n",
+ NVME_MAJOR(subsys->ver), NVME_MINOR(subsys->ver),
+ NVME_TERTIARY(subsys->ver));
+ subsys->ver = NVME_VS(1, 2, 1);
+ }
+
+ mutex_unlock(&subsys->lock);
+ return 0;
+
+out_put_ctrl:
+ nvme_put_ctrl(ctrl);
+out_unlock:
+ mutex_unlock(&subsys->lock);
+ return ret;
+}
+
+static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
+{
+ if (subsys->passthru_ctrl) {
+ xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid);
+ nvme_put_ctrl(subsys->passthru_ctrl);
+ }
+ subsys->passthru_ctrl = NULL;
+ subsys->ver = NVMET_DEFAULT_VS;
+}
+
+void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
+{
+ mutex_lock(&subsys->lock);
+ __nvmet_passthru_ctrl_disable(subsys);
+ mutex_unlock(&subsys->lock);
+}
+
+void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
+{
+ mutex_lock(&subsys->lock);
+ __nvmet_passthru_ctrl_disable(subsys);
+ mutex_unlock(&subsys->lock);
+ kfree(subsys->passthru_ctrl_path);
+}
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 76ea23a2c2be..3ccb59260b4a 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -752,7 +752,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_rsp *rsp =
container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
- struct nvmet_rdma_queue *queue = cq->cq_context;
+ struct nvmet_rdma_queue *queue = wc->qp->qp_context;
u16 status = 0;
WARN_ON(rsp->n_rdma <= 0);
@@ -1008,7 +1008,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_cmd *cmd =
container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
- struct nvmet_rdma_queue *queue = cq->cq_context;
+ struct nvmet_rdma_queue *queue = wc->qp->qp_context;
struct nvmet_rdma_rsp *rsp;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
@@ -1258,9 +1258,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
*/
nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
- queue->cq = ib_alloc_cq(ndev->device, queue,
- nr_cqe + 1, queue->comp_vector,
- IB_POLL_WORKQUEUE);
+ queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1,
+ queue->comp_vector, IB_POLL_WORKQUEUE);
if (IS_ERR(queue->cq)) {
ret = PTR_ERR(queue->cq);
pr_err("failed to create CQ cqe= %d ret= %d\n",
@@ -1322,7 +1321,7 @@ out:
err_destroy_qp:
rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
- ib_free_cq(queue->cq);
+ ib_cq_pool_put(queue->cq, nr_cqe + 1);
goto out;
}
@@ -1332,7 +1331,8 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
if (queue->cm_id)
rdma_destroy_id(queue->cm_id);
ib_destroy_qp(queue->qp);
- ib_free_cq(queue->cq);
+ ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 *
+ queue->send_queue_size + 1);
}
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
@@ -1970,8 +1970,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
.owner = THIS_MODULE,
.type = NVMF_TRTYPE_RDMA,
.msdbd = 1,
- .has_keyed_sgls = 1,
- .metadata_support = 1,
+ .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED,
.add_port = nvmet_rdma_add_port,
.remove_port = nvmet_rdma_remove_port,
.queue_response = nvmet_rdma_queue_response,
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index de9217cfd22d..9eda91162fe4 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -459,17 +459,11 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
struct llist_node *node;
+ struct nvmet_tcp_cmd *cmd;
- node = llist_del_all(&queue->resp_list);
- if (!node)
- return;
-
- while (node) {
- struct nvmet_tcp_cmd *cmd = llist_entry(node,
- struct nvmet_tcp_cmd, lentry);
-
+ for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
+ cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
list_add(&cmd->entry, &queue->resp_send_list);
- node = node->next;
queue->send_list_len++;
}
}
@@ -1717,7 +1711,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
.owner = THIS_MODULE,
.type = NVMF_TRTYPE_TCP,
.msdbd = 1,
- .has_keyed_sgls = 0,
.add_port = nvmet_tcp_add_port,
.remove_port = nvmet_tcp_remove_port,
.queue_response = nvmet_tcp_queue_response,