From 8b8b0915ba8daef9b4320d6dc75a2ec14e1fe2df Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 8 Oct 2020 15:13:27 +0200 Subject: s390/cio: Export information about Endpoint-Security Capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new sysfs attribute 'esc' per chpid. This new attribute exports the Endpoint-Security-Capability byte of channel-path description block, which could be 0-None, 1-Authentication, 2 and 3-Encryption. For example: $ cat /sys/devices/css0/chp0.34/esc 0 [vneethv@linux.ibm.com: cleaned-up & modified description] Signed-off-by: Sebastian Ott Signed-off-by: Vineeth Vijayan Signed-off-by: Stefan Haberland Reviewed-by: Jan Höppner Reviewed-by: Peter Oberparleiter Reviewed-by: Cornelia Huck Acked-by: Vasily Gorbik Signed-off-by: Jens Axboe --- drivers/s390/cio/chp.c | 15 +++++++++++++++ drivers/s390/cio/chsc.h | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index dfcbe54591fb..8d0de6adcad0 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -384,6 +384,20 @@ static ssize_t chp_chid_external_show(struct device *dev, } static DEVICE_ATTR(chid_external, 0444, chp_chid_external_show, NULL); +static ssize_t chp_esc_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct channel_path *chp = to_channelpath(dev); + ssize_t rc; + + mutex_lock(&chp->lock); + rc = sprintf(buf, "%x\n", chp->desc_fmt1.esc); + mutex_unlock(&chp->lock); + + return rc; +} +static DEVICE_ATTR(esc, 0444, chp_esc_show, NULL); + static ssize_t util_string_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) @@ -414,6 +428,7 @@ static struct attribute *chp_attrs[] = { &dev_attr_shared.attr, &dev_attr_chid.attr, &dev_attr_chid_external.attr, + &dev_attr_esc.attr, NULL, }; static struct attribute_group chp_attr_group = { diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index c2b83b68bc57..32fa7faa5bf6 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -27,7 +27,8 @@ struct channel_path_desc_fmt1 { u8 lsn; u8 desc; u8 chpid; - u32:24; + u32:16; + u8 esc; u8 chpp; u32 unused[2]; u16 chid; -- cgit v1.2.3-59-g8ed1b From 4cd6094d9d609f73694783553df72572e302a5e9 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Thu, 8 Oct 2020 15:13:28 +0200 Subject: s390/cio: Provide Endpoint-Security Mode per CU Add an interface in the CIO layer to retrieve the information about the Endpoint-Security Mode (ESM) of the specified CU. The ESM values are defined as 0-None, 1-Authenticated or 2, 3-Encrypted. 
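To illustrate the intended use of the new interface, here is a minimal,
hypothetical caller sketch (not part of this patch; the helper name and
logging are made up, only chsc_scud() and its semantics are taken from
the patch below, which later dasd code uses in the same pattern in
dasd_eckd_read_fc_security()):

```
/* Hypothetical caller (illustrative only): query the ESM values of one
 * control unit and log the per-path results. The 8-path layout and the
 * 0x80 >> chp validity mask follow the chsc_scud() description below. */
static void example_log_esm(u16 cu)
{
	u8 esm[8];
	u8 esm_valid;
	int chp, rc;

	rc = chsc_scud(cu, (u64 *)esm, &esm_valid);
	if (rc)
		return;

	for (chp = 0; chp < 8; chp++) {
		if (esm_valid & (0x80 >> chp))
			pr_info("chp %d: ESM %x\n", chp, esm[chp]);
	}
}
```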
[vneethv@linux.ibm.com: cleaned-up and modified description] Signed-off-by: Sebastian Ott Signed-off-by: Vineeth Vijayan Signed-off-by: Stefan Haberland Reviewed-by: Peter Oberparleiter Acked-by: Vasily Gorbik Acked-by: Cornelia Huck Signed-off-by: Jens Axboe --- arch/s390/include/asm/cio.h | 1 + drivers/s390/cio/chsc.c | 83 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 5c58756d6476..e36cb67d2441 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -373,5 +373,6 @@ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); int chsc_stzi(void *page, void *result, size_t size); int chsc_sgib(u32 origin); +int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid); #endif diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index fc06a4002168..4ea466593fd6 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1428,3 +1428,86 @@ int chsc_sgib(u32 origin) return ret; } EXPORT_SYMBOL_GPL(chsc_sgib); + +#define SCUD_REQ_LEN 0x10 /* SCUD request block length */ +#define SCUD_REQ_CMD 0x4b /* SCUD Command Code */ + +struct chse_cudb { + u16 flags:8; + u16 chp_valid:8; + u16 cu; + u32 esm_valid:8; + u32:24; + u8 chpid[8]; + u32:32; + u32:32; + u8 esm[8]; + u32 efla[8]; +} __packed; + +struct chsc_scud { + struct chsc_header request; + u16:4; + u16 fmt:4; + u16 cssid:8; + u16 first_cu; + u16:16; + u16 last_cu; + u32:32; + struct chsc_header response; + u16:4; + u16 fmt_resp:4; + u32:24; + struct chse_cudb cudb[]; +} __packed; + +/** + * chsc_scud() - Store control-unit description. + * @cu: number of the control-unit + * @esm: 8 1-byte endpoint security mode values + * @esm_valid: validity mask for @esm + * + * Interface to retrieve information about the endpoint security + * modes for up to 8 paths of a control unit. + * + * Returns 0 on success. + */ +int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid) +{ + struct chsc_scud *scud = chsc_page; + int ret; + + spin_lock_irq(&chsc_page_lock); + memset(chsc_page, 0, PAGE_SIZE); + scud->request.length = SCUD_REQ_LEN; + scud->request.code = SCUD_REQ_CMD; + scud->fmt = 0; + scud->cssid = 0; + scud->first_cu = cu; + scud->last_cu = cu; + + ret = chsc(scud); + if (!ret) + ret = chsc_error_from_response(scud->response.code); + + if (!ret && (scud->response.length <= 8 || scud->fmt_resp != 0 + || !(scud->cudb[0].flags & 0x80) + || scud->cudb[0].cu != cu)) { + + CIO_MSG_EVENT(2, "chsc: scud failed rc=%04x, L2=%04x " + "FMT=%04x, cudb.flags=%02x, cudb.cu=%04x", + scud->response.code, scud->response.length, + scud->fmt_resp, scud->cudb[0].flags, scud->cudb[0].cu); + ret = -EINVAL; + } + + if (ret) + goto out; + + memcpy(esm, scud->cudb[0].esm, sizeof(*esm)); + *esm_valid = scud->cudb[0].esm_valid; +out: + spin_unlock_irq(&chsc_page_lock); + return ret; +} +EXPORT_SYMBOL_GPL(chsc_scud); -- cgit v1.2.3-59-g8ed1b From 32ef938815c1fb42d65212aac860ab153a64de1a Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Thu, 8 Oct 2020 15:13:29 +0200 Subject: s390/cio: Add support for FCES status notification Fibre Channel Endpoint-Security event is received as an sei:nt0 type in the CIO layer. This information needs to be shared with the CCW device drivers using the path_events callback. 
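As a rough sketch of the consumer side (hypothetical driver code; only
the PE_PATH_FCES_EVENT flag added by this patch and the path_event
callback shape are taken from the kernel, my_refresh_fc_security() is a
made-up helper standing in for driver-specific processing):

```
/* Hypothetical consumer: a CCW driver's path_event callback reacting
 * to the new FCES event flag on each of the up to 8 channel paths. */
static void my_path_event(struct ccw_device *cdev, int *path_event)
{
	int chp;

	for (chp = 0; chp < 8; chp++) {
		if (path_event[chp] & PE_PATH_FCES_EVENT)
			my_refresh_fc_security(cdev, chp);
	}
}
```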
Co-developed-by: Sebastian Ott Signed-off-by: Vineeth Vijayan Signed-off-by: Sebastian Ott Signed-off-by: Stefan Haberland Reviewed-by: Peter Oberparleiter Acked-by: Vasily Gorbik Signed-off-by: Jens Axboe --- arch/s390/include/asm/ccwdev.h | 2 ++ drivers/s390/cio/chp.h | 1 + drivers/s390/cio/chsc.c | 62 ++++++++++++++++++++++++++++++++++++------ drivers/s390/cio/device.c | 15 +++++++++- 4 files changed, 70 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index c0be5fe1ddba..bf605e1fcf6a 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -104,6 +104,8 @@ struct ccw_device { was successfully verified. */ #define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had to be established again. */ +#define PE_PATH_FCES_EVENT 0x8 /* The FCES Status of a path has + * changed. */ /* * Possible CIO actions triggered by the unit check handler. diff --git a/drivers/s390/cio/chp.h b/drivers/s390/cio/chp.h index 20259f3fbf45..7ee9eba0abcb 100644 --- a/drivers/s390/cio/chp.h +++ b/drivers/s390/cio/chp.h @@ -23,6 +23,7 @@ #define CHP_OFFLINE 1 #define CHP_VARY_ON 2 #define CHP_VARY_OFF 3 +#define CHP_FCES_EVENT 4 struct chp_link { struct chp_id chpid; diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 4ea466593fd6..c22d9ee27ba1 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -37,6 +37,9 @@ static void *sei_page; static void *chsc_page; static DEFINE_SPINLOCK(chsc_page_lock); +#define SEI_VF_FLA 0xc0 /* VF flag for Full Link Address */ +#define SEI_RS_CHPID 0x4 /* 4 in RS field indicates CHPID */ + /** * chsc_error_from_response() - convert a chsc response to an error * @response: chsc response code @@ -287,6 +290,15 @@ static void s390_process_res_acc(struct chp_link *link) css_schedule_reprobe(); } +static int process_fces_event(struct subchannel *sch, void *data) +{ + spin_lock_irq(sch->lock); + if (sch->driver && sch->driver->chp_event) + sch->driver->chp_event(sch, data, CHP_FCES_EVENT); + spin_unlock_irq(sch->lock); + return 0; +} + struct chsc_sei_nt0_area { u8 flags; u8 vf; /* validity flags */ @@ -364,6 +376,16 @@ static char *store_ebcdic(char *dest, const char *src, unsigned long len, return dest + len; } +static void chsc_link_from_sei(struct chp_link *link, + struct chsc_sei_nt0_area *sei_area) +{ + if ((sei_area->vf & SEI_VF_FLA) != 0) { + link->fla = sei_area->fla; + link->fla_mask = ((sei_area->vf & SEI_VF_FLA) == SEI_VF_FLA) ? + 0xffff : 0xff00; + } +} + /* Format node ID and parameters for output in LIR log message. 
*/ static void format_node_data(char *params, char *id, struct node_descriptor *nd) { @@ -453,15 +475,7 @@ static void chsc_process_sei_res_acc(struct chsc_sei_nt0_area *sei_area) } memset(&link, 0, sizeof(struct chp_link)); link.chpid = chpid; - if ((sei_area->vf & 0xc0) != 0) { - link.fla = sei_area->fla; - if ((sei_area->vf & 0xc0) == 0xc0) - /* full link address */ - link.fla_mask = 0xffff; - else - /* link address */ - link.fla_mask = 0xff00; - } + chsc_link_from_sei(&link, sei_area); s390_process_res_acc(&link); } @@ -570,6 +584,33 @@ static void chsc_process_sei_ap_cfg_chg(struct chsc_sei_nt0_area *sei_area) ap_bus_cfg_chg(); } +static void chsc_process_sei_fces_event(struct chsc_sei_nt0_area *sei_area) +{ + struct chp_link link; + struct chp_id chpid; + struct channel_path *chp; + + CIO_CRW_EVENT(4, + "chsc: FCES status notification (rs=%02x, rs_id=%04x, FCES-status=%x)\n", + sei_area->rs, sei_area->rsid, sei_area->ccdf[0]); + + if (sei_area->rs != SEI_RS_CHPID) + return; + chp_id_init(&chpid); + chpid.id = sei_area->rsid; + + /* Ignore the event on unknown/invalid chp */ + chp = chpid_to_chp(chpid); + if (!chp) + return; + + memset(&link, 0, sizeof(struct chp_link)); + link.chpid = chpid; + chsc_link_from_sei(&link, sei_area); + + for_each_subchannel_staged(process_fces_event, NULL, &link); +} + static void chsc_process_sei_nt2(struct chsc_sei_nt2_area *sei_area) { switch (sei_area->cc) { @@ -611,6 +652,9 @@ static void chsc_process_sei_nt0(struct chsc_sei_nt0_area *sei_area) case 14: /* scm available notification */ chsc_process_sei_scm_avail(sei_area); break; + case 15: /* FCES event notification */ + chsc_process_sei_fces_event(sei_area); + break; default: /* other stuff */ CIO_CRW_EVENT(2, "chsc: sei nt0 unhandled cc=%d\n", sei_area->cc); diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index b29fe8d50baf..aab13c78db9f 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1170,7 +1170,8 @@ static int io_subchannel_chp_event(struct subchannel *sch, struct chp_link *link, int event) { struct ccw_device *cdev = sch_get_cdev(sch); - int mask; + int mask, chpid, valid_bit; + int path_event[8]; mask = chp_ssd_get_mask(&sch->ssd_info, link); if (!mask) @@ -1205,6 +1206,18 @@ static int io_subchannel_chp_event(struct subchannel *sch, cdev->private->path_new_mask |= mask; io_subchannel_verify(sch); break; + case CHP_FCES_EVENT: + /* Forward Endpoint Security event */ + for (chpid = 0, valid_bit = 0x80; chpid < 8; chpid++, + valid_bit >>= 1) { + if (mask & valid_bit) + path_event[chpid] = PE_PATH_FCES_EVENT; + else + path_event[chpid] = PE_NONE; + } + if (cdev) + cdev->drv->path_event(cdev, path_event); + break; } return 0; } -- cgit v1.2.3-59-g8ed1b From e03c5941f904afcc0237295e84e756c36619e058 Mon Sep 17 00:00:00 2001 From: Jan Höppner Date: Thu, 8 Oct 2020 15:13:30 +0200 Subject: s390/dasd: Remove unused parameter from dasd_generic_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The discipline argument in dasd_generic_probe() isn't used and there is no history how it was used in the past. Remove it. 
Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 3 +-- drivers/s390/block/dasd_eckd.c | 2 +- drivers/s390/block/dasd_fba.c | 2 +- drivers/s390/block/dasd_int.h | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index db24e04ee978..8581b2d46e13 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3466,8 +3466,7 @@ static void dasd_generic_auto_online(void *data, async_cookie_t cookie) * Initial attempt at a probe function. this can be simplified once * the other detection code is gone. */ -int dasd_generic_probe(struct ccw_device *cdev, - struct dasd_discipline *discipline) +int dasd_generic_probe(struct ccw_device *cdev) { int ret; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index ad44d22e8859..2b39d2a5965f 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -143,7 +143,7 @@ dasd_eckd_probe (struct ccw_device *cdev) "ccw-device options"); return ret; } - ret = dasd_generic_probe(cdev, &dasd_eckd_discipline); + ret = dasd_generic_probe(cdev); return ret; } diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 1a44e321b54e..5b0ebf6bf20f 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -58,7 +58,7 @@ static struct ccw_driver dasd_fba_driver; /* see below */ static int dasd_fba_probe(struct ccw_device *cdev) { - return dasd_generic_probe(cdev, &dasd_fba_discipline); + return dasd_generic_probe(cdev); } static int diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index c59a0d63b506..97ee0997a33e 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -774,7 +774,7 @@ void dasd_block_set_timer(struct dasd_block *, int); void dasd_block_clear_timer(struct dasd_block *); int dasd_cancel_req(struct dasd_ccw_req *); int dasd_flush_device_queue(struct dasd_device *); -int dasd_generic_probe (struct ccw_device *, struct dasd_discipline *); +int dasd_generic_probe(struct ccw_device *); void dasd_generic_free_discipline(struct dasd_device *); void dasd_generic_remove (struct ccw_device *cdev); int dasd_generic_set_online(struct ccw_device *, struct dasd_discipline *); -- cgit v1.2.3-59-g8ed1b From d2a527580c0a0c83f1d98eff32804cde4280d721 Mon Sep 17 00:00:00 2001 From: Jan Höppner Date: Thu, 8 Oct 2020 15:13:31 +0200 Subject: s390/dasd: Move duplicate code to separate function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For storing retrieved path information both the if and else block in dasd_eckd_read_conf() use the same code. To avoid duplicate code this should be done after the if/else block. To further increase readability, move the code to a new function, dasd_eckd_store_conf_data(). 
Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 2b39d2a5965f..497aa81778b6 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1000,6 +1000,22 @@ static unsigned char dasd_eckd_path_access(void *conf_data, int conf_len) return 0; } +static void dasd_eckd_store_conf_data(struct dasd_device *device, + struct dasd_conf_data *conf_data, int chp) +{ + struct channel_path_desc_fmt0 *chp_desc; + struct subchannel_id sch_id; + + ccw_device_get_schid(device->cdev, &sch_id); + device->path[chp].conf_data = conf_data; + device->path[chp].cssid = sch_id.cssid; + device->path[chp].ssid = sch_id.ssid; + chp_desc = ccw_device_get_chp_desc(device->cdev, chp); + if (chp_desc) + device->path[chp].chpid = chp_desc->chpid; + kfree(chp_desc); +} + static void dasd_eckd_clear_conf_data(struct dasd_device *device) { struct dasd_eckd_private *private = device->private; @@ -1016,7 +1032,6 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device) } } - static int dasd_eckd_read_conf(struct dasd_device *device) { void *conf_data; @@ -1026,12 +1041,9 @@ static int dasd_eckd_read_conf(struct dasd_device *device) struct dasd_eckd_private *private, path_private; struct dasd_uid *uid; char print_path_uid[60], print_device_uid[60]; - struct channel_path_desc_fmt0 *chp_desc; - struct subchannel_id sch_id; private = device->private; opm = ccw_device_get_path_mask(device->cdev); - ccw_device_get_schid(device->cdev, &sch_id); conf_data_saved = 0; path_err = 0; /* get configuration data per operational path */ @@ -1066,15 +1078,6 @@ static int dasd_eckd_read_conf(struct dasd_device *device) kfree(conf_data); continue; } - pos = pathmask_to_pos(lpm); - /* store per path conf_data */ - device->path[pos].conf_data = conf_data; - device->path[pos].cssid = sch_id.cssid; - device->path[pos].ssid = sch_id.ssid; - chp_desc = ccw_device_get_chp_desc(device->cdev, pos); - if (chp_desc) - device->path[pos].chpid = chp_desc->chpid; - kfree(chp_desc); /* * build device UID that other path data * can be compared to it @@ -1132,18 +1135,13 @@ static int dasd_eckd_read_conf(struct dasd_device *device) dasd_path_add_cablepm(device, lpm); continue; } - pos = pathmask_to_pos(lpm); - /* store per path conf_data */ - device->path[pos].conf_data = conf_data; - device->path[pos].cssid = sch_id.cssid; - device->path[pos].ssid = sch_id.ssid; - chp_desc = ccw_device_get_chp_desc(device->cdev, pos); - if (chp_desc) - device->path[pos].chpid = chp_desc->chpid; - kfree(chp_desc); path_private.conf_data = NULL; path_private.conf_len = 0; } + + pos = pathmask_to_pos(lpm); + dasd_eckd_store_conf_data(device, conf_data, pos); + switch (dasd_eckd_path_access(conf_data, conf_len)) { case 0x02: dasd_path_add_nppm(device, lpm); -- cgit v1.2.3-59-g8ed1b From 460181217a2496defc6c279b0a7eb810b05b9145 Mon Sep 17 00:00:00 2001 From: Jan Höppner Date: Thu, 8 Oct 2020 15:13:32 +0200 Subject: s390/dasd: Store path configuration data during path handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the configuration data for a path is retrieved during a path verification and used only temporarily. 
If a path is newly added to the I/O setup after a boot, no configuration data will be stored for this particular path. However, this data is required for later use and should be present for a valid I/O path anyway. Store this data during the path verification so that newly added paths can provide all information necessary. [sth@linux.ibm.com: fix conf_data memleak] Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 497aa81778b6..3ff7b532a5bf 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1007,6 +1007,11 @@ static void dasd_eckd_store_conf_data(struct dasd_device *device, struct subchannel_id sch_id; ccw_device_get_schid(device->cdev, &sch_id); + /* + * path handling and read_conf allocate data + * free it before replacing the pointer + */ + kfree(device->path[chp].conf_data); device->path[chp].conf_data = conf_data; device->path[chp].cssid = sch_id.cssid; device->path[chp].ssid = sch_id.ssid; @@ -1263,9 +1268,10 @@ static void do_path_verification_work(struct work_struct *work) struct dasd_uid *uid; __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE]; __u8 lpm, opm, npm, ppm, epm, hpfpm, cablepm; + struct dasd_conf_data *conf_data; unsigned long flags; char print_uid[60]; - int rc; + int rc, pos; data = container_of(work, struct path_verification_work_data, worker); device = data->device; @@ -1395,6 +1401,14 @@ static void do_path_verification_work(struct work_struct *work) } } + conf_data = kzalloc(DASD_ECKD_RCD_DATA_SIZE, GFP_KERNEL); + if (conf_data) { + memcpy(conf_data, data->rcd_buffer, + DASD_ECKD_RCD_DATA_SIZE); + } + pos = pathmask_to_pos(lpm); + dasd_eckd_store_conf_data(device, conf_data, pos); + /* * There is a small chance that a path is lost again between * above path verification and the following modification of -- cgit v1.2.3-59-g8ed1b From 9e34c8ba91697cb7441805c36d92ab3e695df6e0 Mon Sep 17 00:00:00 2001 From: Jan Höppner Date: Thu, 8 Oct 2020 15:13:33 +0200 Subject: s390/dasd: Fix operational path inconsistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During online processing and setting up a DASD device, the configuration data for operational paths is read and validated two times (dasd_eckd_read_conf()). The first time to provide information that are necessary for the LCU setup. A second time after the LCU setup as a device might report different configuration data then. When the configuration setup for each operational path is being validated, an initial call to dasd_eckd_clear_conf_data() is issued. This call wipes all previously available configuration data and path information for each path. However, the operational path mask is not updated during this process. As a result, the stored operational path mask might no longer correspond to the operational paths mask reported by the CIO layer, as several paths might be gone between the two dasd_eckd_read_conf() calls. This inconsistency leads to more severe issues in later path handling changes. Fix this by removing the channel paths from the operational path mask during the dasd_eckd_clear_conf_data() call. 
Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 3ff7b532a5bf..3273b26b25b0 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1034,6 +1034,7 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device) device->path[i].cssid = 0; device->path[i].ssid = 0; device->path[i].chpid = 0; + dasd_path_notoper(device, i); } } -- cgit v1.2.3-59-g8ed1b From 19508b2047403cc88d2255118e2640ab1d3bf8a1 Mon Sep 17 00:00:00 2001 From: Jan Höppner Date: Thu, 8 Oct 2020 15:13:34 +0200 Subject: s390/dasd: Display FC Endpoint Security information via sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new sysfs attribute (fc_security) per device and per operational channel path. The information of the current FC Endpoint Security state is received through the CIO layer. The state of the FC Endpoint Security can be either "Unsupported", "Authentication", or "Encryption". For example: $ cat /sys/bus/ccw/devices/0.0.c600/fc_security Encryption If any of the operational paths is in a state different from all others, the device sysfs attribute will display the additional state "Inconsistent". The sysfs attributes per paths are organised in a new directory called "paths_info" with subdirectories for each path. /sys/bus/ccw/devices/0.0.c600/paths_info/ ├── 0.38 │   └── fc_security ├── 0.39 │   └── fc_security ├── 0.3a │   └── fc_security └── 0.3b └── fc_security Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_devmap.c | 109 +++++++++++++++++++++++++++++++++++++++ drivers/s390/block/dasd_eckd.c | 30 +++++++++++ drivers/s390/block/dasd_int.h | 68 ++++++++++++++++++++++++ 3 files changed, 207 insertions(+) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 32fc51341d99..16bb135c20aa 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -576,6 +576,11 @@ dasd_create_device(struct ccw_device *cdev) dev_set_drvdata(&cdev->dev, device); spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); + device->paths_info = kset_create_and_add("paths_info", NULL, + &device->cdev->dev.kobj); + if (!device->paths_info) + dev_warn(&cdev->dev, "Could not create paths_info kset\n"); + return device; } @@ -622,6 +627,9 @@ dasd_delete_device(struct dasd_device *device) wait_event(dasd_delete_wq, atomic_read(&device->ref_count) == 0); dasd_generic_free_discipline(device); + + kset_unregister(device->paths_info); + /* Disconnect dasd_device structure from ccw_device structure. 
*/ cdev = device->cdev; device->cdev = NULL; @@ -1641,6 +1649,39 @@ dasd_path_interval_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(path_interval, 0644, dasd_path_interval_show, dasd_path_interval_store); +static ssize_t +dasd_device_fcs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dasd_device *device; + int fc_sec; + int rc; + + device = dasd_device_from_cdev(to_ccwdev(dev)); + if (IS_ERR(device)) + return -ENODEV; + fc_sec = dasd_path_get_fcs_device(device); + if (fc_sec == -EINVAL) + rc = snprintf(buf, PAGE_SIZE, "Inconsistent\n"); + else + rc = snprintf(buf, PAGE_SIZE, "%s\n", dasd_path_get_fcs_str(fc_sec)); + dasd_put_device(device); + + return rc; +} +static DEVICE_ATTR(fc_security, 0444, dasd_device_fcs_show, NULL); + +static ssize_t +dasd_path_fcs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct dasd_path *path = to_dasd_path(kobj); + unsigned int fc_sec = path->fc_security; + + return snprintf(buf, PAGE_SIZE, "%s\n", dasd_path_get_fcs_str(fc_sec)); +} + +static struct kobj_attribute path_fcs_attribute = + __ATTR(fc_security, 0444, dasd_path_fcs_show, NULL); #define DASD_DEFINE_ATTR(_name, _func) \ static ssize_t dasd_##_name##_show(struct device *dev, \ @@ -1697,6 +1738,7 @@ static struct attribute * dasd_attrs[] = { &dev_attr_path_reset.attr, &dev_attr_hpf.attr, &dev_attr_ese.attr, + &dev_attr_fc_security.attr, NULL, }; @@ -1777,6 +1819,73 @@ dasd_set_feature(struct ccw_device *cdev, int feature, int flag) } EXPORT_SYMBOL(dasd_set_feature); +static struct attribute *paths_info_attrs[] = { + &path_fcs_attribute.attr, + NULL, +}; + +static struct kobj_type path_attr_type = { + .release = dasd_path_release, + .default_attrs = paths_info_attrs, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static void dasd_path_init_kobj(struct dasd_device *device, int chp) +{ + device->path[chp].kobj.kset = device->paths_info; + kobject_init(&device->path[chp].kobj, &path_attr_type); +} + +void dasd_path_create_kobj(struct dasd_device *device, int chp) +{ + int rc; + + if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) + return; + if (!device->paths_info) { + dev_warn(&device->cdev->dev, "Unable to create paths objects\n"); + return; + } + if (device->path[chp].in_sysfs) + return; + if (!device->path[chp].conf_data) + return; + + dasd_path_init_kobj(device, chp); + + rc = kobject_add(&device->path[chp].kobj, NULL, "%x.%02x", + device->path[chp].cssid, device->path[chp].chpid); + if (rc) + kobject_put(&device->path[chp].kobj); + device->path[chp].in_sysfs = true; +} +EXPORT_SYMBOL(dasd_path_create_kobj); + +void dasd_path_create_kobjects(struct dasd_device *device) +{ + u8 lpm, opm; + + opm = dasd_path_get_opm(device); + for (lpm = 0x80; lpm; lpm >>= 1) { + if (!(lpm & opm)) + continue; + dasd_path_create_kobj(device, pathmask_to_pos(lpm)); + } +} +EXPORT_SYMBOL(dasd_path_create_kobjects); + +/* + * As we keep kobjects for the lifetime of a device, this function must not be + * called anywhere but in the context of offlining a device. 
+ */ +void dasd_path_remove_kobj(struct dasd_device *device, int chp) +{ + if (device->path[chp].in_sysfs) { + kobject_put(&device->path[chp].kobj); + device->path[chp].in_sysfs = false; + } +} +EXPORT_SYMBOL(dasd_path_remove_kobj); int dasd_add_sysfs_files(struct ccw_device *cdev) { diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 3273b26b25b0..cfffab4c627b 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1035,6 +1035,30 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device) device->path[i].ssid = 0; device->path[i].chpid = 0; dasd_path_notoper(device, i); + dasd_path_remove_kobj(device, i); + } +} + +static void dasd_eckd_read_fc_security(struct dasd_device *device) +{ + struct dasd_eckd_private *private = device->private; + u8 esm_valid; + u8 esm[8]; + int chp; + int rc; + + rc = chsc_scud(private->uid.ssid, (u64 *)esm, &esm_valid); + if (rc) { + for (chp = 0; chp < 8; chp++) + device->path[chp].fc_security = 0; + return; + } + + for (chp = 0; chp < 8; chp++) { + if (esm_valid & (0x80 >> chp)) + device->path[chp].fc_security = esm[chp]; + else + device->path[chp].fc_security = 0; } } @@ -1164,6 +1188,8 @@ static int dasd_eckd_read_conf(struct dasd_device *device) } } + dasd_eckd_read_fc_security(device); + return path_err; } @@ -1430,6 +1456,8 @@ static void do_path_verification_work(struct work_struct *work) dasd_path_add_cablepm(device, cablepm); dasd_path_add_nohpfpm(device, hpfpm); spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); + + dasd_path_create_kobj(device, pos); } clear_bit(DASD_FLAG_PATH_VERIFY, &device->flags); dasd_put_device(device); @@ -2069,6 +2097,8 @@ dasd_eckd_check_characteristics(struct dasd_device *device) if (rc) goto out_err3; + dasd_path_create_kobjects(device); + /* Read Feature Codes */ dasd_eckd_read_features(device); diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 97ee0997a33e..e6823464acca 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -426,6 +426,35 @@ extern struct dasd_discipline *dasd_diag_discipline_pointer; #define DASD_THRHLD_MAX 4294967295U #define DASD_INTERVAL_MAX 4294967295U +/* FC Endpoint Security Capabilities */ +#define DASD_FC_SECURITY_UNSUP 0 +#define DASD_FC_SECURITY_AUTH 1 +#define DASD_FC_SECURITY_ENC_FCSP2 2 +#define DASD_FC_SECURITY_ENC_ERAS 3 + +#define DASD_FC_SECURITY_ENC_STR "Encryption" +static const struct { + u8 value; + char *name; +} dasd_path_fcs_mnemonics[] = { + { DASD_FC_SECURITY_UNSUP, "Unsupported" }, + { DASD_FC_SECURITY_AUTH, "Authentication" }, + { DASD_FC_SECURITY_ENC_FCSP2, DASD_FC_SECURITY_ENC_STR }, + { DASD_FC_SECURITY_ENC_ERAS, DASD_FC_SECURITY_ENC_STR }, +}; + +static inline char *dasd_path_get_fcs_str(int val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dasd_path_fcs_mnemonics); i++) { + if (dasd_path_fcs_mnemonics[i].value == val) + return dasd_path_fcs_mnemonics[i].name; + } + + return dasd_path_fcs_mnemonics[0].name; +} + struct dasd_path { unsigned long flags; u8 cssid; @@ -434,8 +463,18 @@ struct dasd_path { struct dasd_conf_data *conf_data; atomic_t error_count; unsigned long errorclk; + u8 fc_security; + struct kobject kobj; + bool in_sysfs; }; +#define to_dasd_path(path) container_of(path, struct dasd_path, kobj) + +static inline void dasd_path_release(struct kobject *kobj) +{ +/* Memory for the dasd_path kobject is freed when dasd_free_device() is called */ +} + struct dasd_profile_info { /* legacy part of profile data, as in 
dasd_profile_info_t */
@@ -547,6 +586,7 @@ struct dasd_device {
	struct dentry *hosts_dentry;
	struct dasd_profile profile;
	struct dasd_format_entry format_entry;
+	struct kset *paths_info;
 };

 struct dasd_block {
@@ -824,6 +864,9 @@ int dasd_set_feature(struct ccw_device *, int, int);

 int dasd_add_sysfs_files(struct ccw_device *);
 void dasd_remove_sysfs_files(struct ccw_device *);
+void dasd_path_create_kobj(struct dasd_device *, int);
+void dasd_path_create_kobjects(struct dasd_device *);
+void dasd_path_remove_kobj(struct dasd_device *, int);

 struct dasd_device *dasd_device_from_cdev(struct ccw_device *);
 struct dasd_device *dasd_device_from_cdev_locked(struct ccw_device *);
@@ -1114,6 +1157,31 @@ static inline __u8 dasd_path_get_hpfpm(struct dasd_device *device)
	return hpfpm;
 }

+static inline u8 dasd_path_get_fcs_path(struct dasd_device *device, int chp)
+{
+	return device->path[chp].fc_security;
+}
+
+static inline int dasd_path_get_fcs_device(struct dasd_device *device)
+{
+	u8 fc_sec = 0;
+	int chp;
+
+	for (chp = 0; chp < 8; chp++) {
+		if (device->opm & (0x80 >> chp)) {
+			fc_sec = device->path[chp].fc_security;
+			break;
+		}
+	}
+	for (; chp < 8; chp++) {
+		if (device->opm & (0x80 >> chp))
+			if (device->path[chp].fc_security != fc_sec)
+				return -EINVAL;
+	}
+
+	return fc_sec;
+}
+
 /*
  * add functions for path masks
  * the existing path mask will be extended by the given path mask
--
cgit v1.2.3-59-g8ed1b


From b72949328869dfd45f6452c2410647afd7db5f1a Mon Sep 17 00:00:00 2001
From: Jan Höppner
Date: Thu, 8 Oct 2020 15:13:35 +0200
Subject: s390/dasd: Prepare for additional path event handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As more path events need to be handled for ECKD, the current path
verification infrastructure can be reused. Rename all path verification
code to fit the more broadly based task of path event handling, and put
the path verification itself in a new, separate function.
Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 4 +-- drivers/s390/block/dasd_eckd.c | 78 ++++++++++++++++++++++++------------------ drivers/s390/block/dasd_int.h | 1 + 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 8581b2d46e13..8d7a53e98298 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2115,8 +2115,8 @@ static void __dasd_device_check_path_events(struct dasd_device *device) if (device->stopped & ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM)) return; - rc = device->discipline->verify_path(device, - dasd_path_get_tbvpm(device)); + rc = device->discipline->pe_handler(device, + dasd_path_get_tbvpm(device)); if (rc) dasd_device_set_timer(device, 50); else diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index cfffab4c627b..2e1cfacbf4d8 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -103,7 +103,7 @@ struct ext_pool_exhaust_work_data { }; /* definitions for the path verification worker */ -struct path_verification_work_data { +struct pe_handler_work_data { struct work_struct worker; struct dasd_device *device; struct dasd_ccw_req cqr; @@ -112,8 +112,8 @@ struct path_verification_work_data { int isglobal; __u8 tbvpm; }; -static struct path_verification_work_data *path_verification_worker; -static DEFINE_MUTEX(dasd_path_verification_mutex); +static struct pe_handler_work_data *pe_handler_worker; +static DEFINE_MUTEX(dasd_pe_handler_mutex); struct check_attention_work_data { struct work_struct worker; @@ -1249,7 +1249,7 @@ static int verify_fcx_max_data(struct dasd_device *device, __u8 lpm) } static int rebuild_device_uid(struct dasd_device *device, - struct path_verification_work_data *data) + struct pe_handler_work_data *data) { struct dasd_eckd_private *private = device->private; __u8 lpm, opm = dasd_path_get_opm(device); @@ -1287,10 +1287,9 @@ static int rebuild_device_uid(struct dasd_device *device, return rc; } -static void do_path_verification_work(struct work_struct *work) +static void dasd_eckd_path_available_action(struct dasd_device *device, + struct pe_handler_work_data *data) { - struct path_verification_work_data *data; - struct dasd_device *device; struct dasd_eckd_private path_private; struct dasd_uid *uid; __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE]; @@ -1300,19 +1299,6 @@ static void do_path_verification_work(struct work_struct *work) char print_uid[60]; int rc, pos; - data = container_of(work, struct path_verification_work_data, worker); - device = data->device; - - /* delay path verification until device was resumed */ - if (test_bit(DASD_FLAG_SUSPENDED, &device->flags)) { - schedule_work(work); - return; - } - /* check if path verification already running and delay if so */ - if (test_and_set_bit(DASD_FLAG_PATH_VERIFY, &device->flags)) { - schedule_work(work); - return; - } opm = 0; npm = 0; ppm = 0; @@ -1459,30 +1445,54 @@ static void do_path_verification_work(struct work_struct *work) dasd_path_create_kobj(device, pos); } +} + +static void do_pe_handler_work(struct work_struct *work) +{ + struct pe_handler_work_data *data; + struct dasd_device *device; + + data = container_of(work, struct pe_handler_work_data, worker); + device = data->device; + + /* delay path verification until device was resumed */ + if (test_bit(DASD_FLAG_SUSPENDED, &device->flags)) { + schedule_work(work); + 
return; + } + /* check if path verification already running and delay if so */ + if (test_and_set_bit(DASD_FLAG_PATH_VERIFY, &device->flags)) { + schedule_work(work); + return; + } + + dasd_eckd_path_available_action(device, data); + clear_bit(DASD_FLAG_PATH_VERIFY, &device->flags); dasd_put_device(device); if (data->isglobal) - mutex_unlock(&dasd_path_verification_mutex); + mutex_unlock(&dasd_pe_handler_mutex); else kfree(data); } -static int dasd_eckd_verify_path(struct dasd_device *device, __u8 lpm) +static int dasd_eckd_pe_handler(struct dasd_device *device, __u8 lpm) { - struct path_verification_work_data *data; + struct pe_handler_work_data *data; data = kmalloc(sizeof(*data), GFP_ATOMIC | GFP_DMA); if (!data) { - if (mutex_trylock(&dasd_path_verification_mutex)) { - data = path_verification_worker; + if (mutex_trylock(&dasd_pe_handler_mutex)) { + data = pe_handler_worker; data->isglobal = 1; - } else + } else { return -ENOMEM; + } } else { memset(data, 0, sizeof(*data)); data->isglobal = 0; } - INIT_WORK(&data->worker, do_path_verification_work); + INIT_WORK(&data->worker, do_pe_handler_work); dasd_get_device(device); data->device = device; data->tbvpm = lpm; @@ -6725,7 +6735,7 @@ static struct dasd_discipline dasd_eckd_discipline = { .check_device = dasd_eckd_check_characteristics, .uncheck_device = dasd_eckd_uncheck_device, .do_analysis = dasd_eckd_do_analysis, - .verify_path = dasd_eckd_verify_path, + .pe_handler = dasd_eckd_pe_handler, .basic_to_ready = dasd_eckd_basic_to_ready, .online_to_ready = dasd_eckd_online_to_ready, .basic_to_known = dasd_eckd_basic_to_known, @@ -6786,16 +6796,16 @@ dasd_eckd_init(void) GFP_KERNEL | GFP_DMA); if (!dasd_vol_info_req) return -ENOMEM; - path_verification_worker = kmalloc(sizeof(*path_verification_worker), - GFP_KERNEL | GFP_DMA); - if (!path_verification_worker) { + pe_handler_worker = kmalloc(sizeof(*pe_handler_worker), + GFP_KERNEL | GFP_DMA); + if (!pe_handler_worker) { kfree(dasd_reserve_req); kfree(dasd_vol_info_req); return -ENOMEM; } rawpadpage = (void *)__get_free_page(GFP_KERNEL); if (!rawpadpage) { - kfree(path_verification_worker); + kfree(pe_handler_worker); kfree(dasd_reserve_req); kfree(dasd_vol_info_req); return -ENOMEM; @@ -6804,7 +6814,7 @@ dasd_eckd_init(void) if (!ret) wait_for_device_probe(); else { - kfree(path_verification_worker); + kfree(pe_handler_worker); kfree(dasd_reserve_req); kfree(dasd_vol_info_req); free_page((unsigned long)rawpadpage); @@ -6816,7 +6826,7 @@ static void __exit dasd_eckd_cleanup(void) { ccw_driver_unregister(&dasd_eckd_driver); - kfree(path_verification_worker); + kfree(pe_handler_worker); kfree(dasd_reserve_req); free_page((unsigned long)rawpadpage); } diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index e6823464acca..4cfed3b6d9a6 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -298,6 +298,7 @@ struct dasd_discipline { * configuration. */ int (*verify_path)(struct dasd_device *, __u8); + int (*pe_handler)(struct dasd_device *, __u8); /* * Last things to do when a device is set online, and first things -- cgit v1.2.3-59-g8ed1b From 4d063e646b4bfe8e74c0b4b78bf11c3a7b5d962a Mon Sep 17 00:00:00 2001 From: Jan Höppner Date: Thu, 8 Oct 2020 15:13:36 +0200 Subject: s390/dasd: Process FCES path event notification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the Fibre Channel Endpoint-Security status of a path changes, a corresponding path event is received from the CIO layer. 
Process this event by re-reading the FCES information. As the information is retrieved for all paths on a single CU in one call, the internal status can also be updated for all paths and no processing per path is necessary. Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 19 ++++++++++++++----- drivers/s390/block/dasd_eckd.c | 12 +++++++++--- drivers/s390/block/dasd_int.h | 42 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 8d7a53e98298..874345e1138c 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2107,20 +2107,25 @@ static void __dasd_device_start_head(struct dasd_device *device) static void __dasd_device_check_path_events(struct dasd_device *device) { + __u8 tbvpm, fcsecpm; int rc; - if (!dasd_path_get_tbvpm(device)) + tbvpm = dasd_path_get_tbvpm(device); + fcsecpm = dasd_path_get_fcsecpm(device); + + if (!tbvpm && !fcsecpm) return; if (device->stopped & ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM)) return; - rc = device->discipline->pe_handler(device, - dasd_path_get_tbvpm(device)); - if (rc) + rc = device->discipline->pe_handler(device, tbvpm, fcsecpm); + if (rc) { dasd_device_set_timer(device, 50); - else + } else { dasd_path_clear_all_verify(device); + dasd_path_clear_all_fcsec(device); + } }; /* @@ -3869,6 +3874,10 @@ void dasd_generic_path_event(struct ccw_device *cdev, int *path_event) if (device->discipline->kick_validate) device->discipline->kick_validate(device); } + if (path_event[chp] & PE_PATH_FCES_EVENT) { + dasd_path_fcsec_update(device, chp); + dasd_schedule_device_bh(device); + } } hpfpm = dasd_path_get_hpfpm(device); ifccpm = dasd_path_get_ifccpm(device); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 2e1cfacbf4d8..0d319c21c287 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -111,6 +111,7 @@ struct pe_handler_work_data { __u8 rcd_buffer[DASD_ECKD_RCD_DATA_SIZE]; int isglobal; __u8 tbvpm; + __u8 fcsecpm; }; static struct pe_handler_work_data *pe_handler_worker; static DEFINE_MUTEX(dasd_pe_handler_mutex); @@ -1466,7 +1467,10 @@ static void do_pe_handler_work(struct work_struct *work) return; } - dasd_eckd_path_available_action(device, data); + if (data->tbvpm) + dasd_eckd_path_available_action(device, data); + if (data->fcsecpm) + dasd_eckd_read_fc_security(device); clear_bit(DASD_FLAG_PATH_VERIFY, &device->flags); dasd_put_device(device); @@ -1476,7 +1480,8 @@ static void do_pe_handler_work(struct work_struct *work) kfree(data); } -static int dasd_eckd_pe_handler(struct dasd_device *device, __u8 lpm) +static int dasd_eckd_pe_handler(struct dasd_device *device, + __u8 tbvpm, __u8 fcsecpm) { struct pe_handler_work_data *data; @@ -1495,7 +1500,8 @@ static int dasd_eckd_pe_handler(struct dasd_device *device, __u8 lpm) INIT_WORK(&data->worker, do_pe_handler_work); dasd_get_device(device); data->device = device; - data->tbvpm = lpm; + data->tbvpm = tbvpm; + data->fcsecpm = fcsecpm; schedule_work(&data->worker); return 0; } diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 4cfed3b6d9a6..10f411c9b3c0 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -298,7 +298,7 @@ struct dasd_discipline { * configuration. 
 */
	int (*verify_path)(struct dasd_device *, __u8);
-	int (*pe_handler)(struct dasd_device *, __u8);
+	int (*pe_handler)(struct dasd_device *, __u8, __u8);

	/*
	 * Last things to do when a device is set online, and first things
@@ -423,6 +423,7 @@ extern struct dasd_discipline *dasd_diag_discipline_pointer;
 #define DASD_PATH_NOHPF		6
 #define DASD_PATH_CUIR		7
 #define DASD_PATH_IFCC		8
+#define DASD_PATH_FCSEC		9

 #define DASD_THRHLD_MAX		4294967295U
 #define DASD_INTERVAL_MAX	4294967295U
@@ -966,6 +967,29 @@ static inline void dasd_path_clear_all_verify(struct dasd_device *device)
		dasd_path_clear_verify(device, chp);
 }

+static inline void dasd_path_fcsec(struct dasd_device *device, int chp)
+{
+	__set_bit(DASD_PATH_FCSEC, &device->path[chp].flags);
+}
+
+static inline void dasd_path_clear_fcsec(struct dasd_device *device, int chp)
+{
+	__clear_bit(DASD_PATH_FCSEC, &device->path[chp].flags);
+}
+
+static inline int dasd_path_need_fcsec(struct dasd_device *device, int chp)
+{
+	return test_bit(DASD_PATH_FCSEC, &device->path[chp].flags);
+}
+
+static inline void dasd_path_clear_all_fcsec(struct dasd_device *device)
+{
+	int chp;
+
+	for (chp = 0; chp < 8; chp++)
+		dasd_path_clear_fcsec(device, chp);
+}
+
 static inline void dasd_path_operational(struct dasd_device *device, int chp)
 {
	__set_bit(DASD_PATH_OPERATIONAL, &device->path[chp].flags);
@@ -1091,6 +1115,17 @@ static inline __u8 dasd_path_get_tbvpm(struct dasd_device *device)
	return tbvpm;
 }

+static inline int dasd_path_get_fcsecpm(struct dasd_device *device)
+{
+	int chp;
+
+	for (chp = 0; chp < 8; chp++)
+		if (dasd_path_need_fcsec(device, chp))
+			return 1;
+
+	return 0;
+}
+
 static inline __u8 dasd_path_get_nppm(struct dasd_device *device)
 {
	int chp;
@@ -1348,6 +1383,11 @@ static inline void dasd_path_notoper(struct dasd_device *device, int chp)
	dasd_path_clear_nonpreferred(device, chp);
 }

+static inline void dasd_path_fcsec_update(struct dasd_device *device, int chp)
+{
+	dasd_path_fcsec(device, chp);
+}
+
 /*
  * remove all paths from normal operation
  */
--
cgit v1.2.3-59-g8ed1b


From c731b84b51bf7fe83448bea8f56a6d55006b0615 Mon Sep 17 00:00:00 2001
From: "Dae R. Jeong"
Date: Thu, 22 Oct 2020 10:21:28 +0900
Subject: md: fix a warning caused by a race between concurrent md_ioctl()s

Syzkaller reports a warning as below.

WARNING: CPU: 0 PID: 9647 at drivers/md/md.c:7169
...
Call Trace:
...
RIP: 0010:md_ioctl+0x4017/0x5980 drivers/md/md.c:7169
RSP: 0018:ffff888096027950 EFLAGS: 00010293
RAX: ffff88809322c380 RBX: 0000000000000932 RCX: ffffffff84e266f2
RDX: 0000000000000000 RSI: ffffffff84e299f7 RDI: 0000000000000007
RBP: ffff888096027bc0 R08: ffff88809322c380 R09: ffffed101341a482
R10: ffff888096027940 R11: ffff88809a0d240f R12: 0000000000000932
R13: ffff8880a2c14100 R14: ffff88809a0d2268 R15: ffff88809a0d2408
 __blkdev_driver_ioctl block/ioctl.c:304 [inline]
 blkdev_ioctl+0xece/0x1c10 block/ioctl.c:606
 block_ioctl+0xee/0x130 fs/block_dev.c:1930
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:509 [inline]
 do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696
 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713
 __do_sys_ioctl fs/ioctl.c:720 [inline]
 __se_sys_ioctl fs/ioctl.c:718 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

This is caused by a race between two concurrent md_ioctl()s closing
the array.
CPU1 (md_ioctl())                     CPU2 (md_ioctl())
------                                ------
set_bit(MD_CLOSING, &mddev->flags);
did_set_md_closing = true;
                                      WARN_ON_ONCE(test_bit(MD_CLOSING,
                                                            &mddev->flags));
if (did_set_md_closing)
	clear_bit(MD_CLOSING, &mddev->flags);

Fix the warning by returning immediately if the MD_CLOSING bit is set
in &mddev->flags, which indicates that the array is being closed.

Fixes: 065e519e71b2 ("md: MD_CLOSING needs to be cleared after called md_set_readonly or do_md_stop")
Reported-by: syzbot+1e46a0864c1a6e9bd3d8@syzkaller.appspotmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Dae R. Jeong
Signed-off-by: Song Liu
---
 drivers/md/md.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3c79243e9d07..16a97dc385b4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7589,8 +7589,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
			err = -EBUSY;
			goto out;
		}
-		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
-		set_bit(MD_CLOSING, &mddev->flags);
+		if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
+			mutex_unlock(&mddev->open_mutex);
+			err = -EBUSY;
+			goto out;
+		}
		did_set_md_closing = true;
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
--
cgit v1.2.3-59-g8ed1b


From 93decc563637c4288380912eac0eb42fb246cc04 Mon Sep 17 00:00:00 2001
From: Kevin Vigor
Date: Fri, 6 Nov 2020 14:20:34 -0800
Subject: md/raid10: initialize r10_bio->read_slot before use.

In __make_request() a new r10bio is allocated and passed to
raid10_read_request(). The read_slot member of the bio is not
initialized, and raid10_read_request() uses it to index an array. This
leads to occasional panics.

Fix by initializing the field to an invalid value and checking for a
valid value in raid10_read_request().

Cc: stable@vger.kernel.org
Signed-off-by: Kevin Vigor
Signed-off-by: Song Liu
---
 drivers/md/raid10.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b7bca6703df8..3153183b7772 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1127,7 +1127,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
	struct md_rdev *err_rdev = NULL;
	gfp_t gfp = GFP_NOIO;

-	if (r10_bio->devs[slot].rdev) {
+	if (slot >= 0 && r10_bio->devs[slot].rdev) {
		/*
		 * This is an error retry, but we cannot
		 * safely dereference the rdev in the r10_bio,
@@ -1508,6 +1508,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_iter.bi_sector;
	r10_bio->state = 0;
+	r10_bio->read_slot = -1;
	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);

	if (bio_data_dir(bio) == READ)
--
cgit v1.2.3-59-g8ed1b


From 81ba3c24628c14eb869d81652dbaf50640d8cc24 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta
Date: Wed, 11 Nov 2020 06:16:56 +0100
Subject: md: improve variable names in md_flush_request()

This patch improves readability by using better variable names in the
flush request coalescing logic.
Signed-off-by: Pankaj Gupta
Reviewed-by: Paul Menzel
Signed-off-by: Song Liu
---
 drivers/md/md.c | 8 ++++----
 drivers/md/md.h | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 16a97dc385b4..3992e8c5c0d6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -639,7 +639,7 @@ static void md_submit_flush_data(struct work_struct *ws)
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
-	mddev->last_flush = mddev->start_flush;
+	mddev->prev_flush_start = mddev->start_flush;
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

@@ -660,13 +660,13 @@
  */
 bool md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	ktime_t start = ktime_get_boottime();
+	ktime_t req_start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
-			    ktime_after(mddev->last_flush, start),
+			    ktime_after(mddev->prev_flush_start, req_start),
			    mddev->lock);
-	if (!ktime_after(mddev->last_flush, start)) {
+	if (!ktime_after(mddev->prev_flush_start, req_start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ccfb69868c2e..2292c847f9dd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -495,9 +495,9 @@ struct mddev {
	 */
	struct bio *flush_bio;
	atomic_t flush_pending;
-	ktime_t start_flush, last_flush; /* last_flush is when the last completed
-					  * flush was started.
-					  */
+	ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
+						* flush was started.
+						*/
	struct work_struct flush_work;
	struct work_struct event_work;	/* used by dm to report failure event */
	mempool_t *serial_info_pool;
--
cgit v1.2.3-59-g8ed1b


From 204d1a6434158ac655fc4037f29742b9b6103f0e Mon Sep 17 00:00:00 2001
From: Pankaj Gupta
Date: Wed, 11 Nov 2020 06:16:57 +0100
Subject: md: add comments in md_flush_request()

Request coalescing logic depends on a flush time update done in another
context. This patch adds comments to make the code flow easier to
follow.

Signed-off-by: Pankaj Gupta
Signed-off-by: Song Liu
---
 drivers/md/md.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3992e8c5c0d6..a0998ad6388c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -662,10 +662,14 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
 {
	ktime_t req_start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
+	/* flush requests wait until ongoing flush completes,
+	 * hence coalescing all the pending requests.
+	 */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_after(mddev->prev_flush_start, req_start),
			    mddev->lock);
+	/* new request after previous flush is completed */
	if (!ktime_after(mddev->prev_flush_start, req_start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
--
cgit v1.2.3-59-g8ed1b


From a23f2aae8498d8c8bb6ff5301bda02db8093cb09 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta
Date: Wed, 11 Nov 2020 06:16:58 +0100
Subject: md: use current request time as base for ktime comparisons

Request coalescing logic uses 'prev_flush_start' as the base to compare
the current request start time. 'prev_flush_start' is updated in
another context. This patch changes the ktime comparison base to
'req_start' for better readability of the code.
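For context, a minimal sketch of the caller side (raid1-style and
illustrative only; the personality function name is made up, while the
md_flush_request() return semantics match the declaration above):

```
/* Sketch: a personality hands flush bios to md_flush_request() and is
 * done if the request was coalesced into, or queued behind, an
 * already-running flush. */
static bool my_make_request(struct mddev *mddev, struct bio *bio)
{
	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
	    && md_flush_request(mddev, bio))
		return true;	/* flush handled by the md core */

	/* ... normal I/O submission path ... */
	return true;
}
```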
Signed-off-by: Pankaj Gupta
Signed-off-by: Song Liu
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a0998ad6388c..1a3a6150123e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -667,10 +667,10 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
	 */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
-			    ktime_after(mddev->prev_flush_start, req_start),
+			    ktime_before(req_start, mddev->prev_flush_start),
			    mddev->lock);
	/* new request after previous flush is completed */
-	if (!ktime_after(mddev->prev_flush_start, req_start)) {
+	if (ktime_after(req_start, mddev->prev_flush_start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
--
cgit v1.2.3-59-g8ed1b


From a8da01f79c89755fad55ed0ea96e8d2103242a72 Mon Sep 17 00:00:00 2001
From: Zhao Heming
Date: Thu, 19 Nov 2020 19:41:33 +0800
Subject: md/cluster: block reshape with remote resync job

A reshape request should be blocked while a resync job is ongoing. In a
cluster environment, a node can start a resync job even if the resync
command wasn't executed on it; e.g., the user runs "mdadm --grow" on
node A, and sometimes node B will start the resync job. However, the
current update_raid_disks() only checks the local recovery status,
which is incomplete. As a result, the user can run "mdadm --grow"
successfully on the local node, while the remote node refuses to do the
reshape job because it is running a resync job. This inconsistent
handling leaves the array in an unexpected state. If the user doesn't
notice this and keeps issuing mdadm commands, the array eventually
stops working.

Fix this issue by blocking the reshape request: when a node executes
"--grow" and detects an ongoing resync, it stops and reports an error
to the user.

The following script reproduces the issue with ~100% probability.
(two nodes share 3 iSCSI luns: sdg/sdh/sdi. Each lun size is 1GB)

```
# on node1, node2 is the remote node.
ssh root@node2 "mdadm -S --scan"
mdadm -S --scan
for i in {g,h,i};do dd if=/dev/zero of=/dev/sd$i oflag=direct bs=1M \
 count=20; done

mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sdg /dev/sdh
ssh root@node2 "mdadm -A /dev/md0 /dev/sdg /dev/sdh"
sleep 5

mdadm --manage --add /dev/md0 /dev/sdi
mdadm --wait /dev/md0
mdadm --grow --raid-devices=3 /dev/md0

mdadm /dev/md0 --fail /dev/sdg
mdadm /dev/md0 --remove /dev/sdg
mdadm --grow --raid-devices=2 /dev/md0
```

Cc: stable@vger.kernel.org
Signed-off-by: Zhao Heming
Signed-off-by: Song Liu
---
 drivers/md/md.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1a3a6150123e..90faec048f2c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7283,6 +7283,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
		return -EINVAL;
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	    test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector)
		return -EBUSY;

@@ -9667,8 +9668,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
		}
	}

-	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
-		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
+		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+		if (ret)
+			pr_warn("md: updating array disks failed. 
%d\n", ret); + } /* * Since mddev->delta_disks has already updated in update_raid_disks, -- cgit v1.2.3-59-g8ed1b From bca5b0658020be90b6b504ca514fd80110204f71 Mon Sep 17 00:00:00 2001 From: Zhao Heming Date: Thu, 19 Nov 2020 19:41:34 +0800 Subject: md/cluster: fix deadlock when node is doing resync job md-cluster uses MD_CLUSTER_SEND_LOCK to make node can exclusively send msg. During sending msg, node can concurrently receive msg from another node. When node does resync job, grab token_lockres:EX may trigger a deadlock: ``` nodeA nodeB -------------------- -------------------- a. send METADATA_UPDATED held token_lockres:EX b. md_do_sync resync_info_update send RESYNCING + set MD_CLUSTER_SEND_LOCK + wait for holding token_lockres:EX c. mdadm /dev/md0 --remove /dev/sdg + held reconfig_mutex + send REMOVE + wait_event(MD_CLUSTER_SEND_LOCK) d. recv_daemon //METADATA_UPDATED from A process_metadata_update + (mddev_trylock(mddev) || MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD) //this time, both return false forever ``` Explaination: a. A send METADATA_UPDATED This will block another node to send msg b. B does sync jobs, which will send RESYNCING at intervals. This will be block for holding token_lockres:EX lock. c. B do "mdadm --remove", which will send REMOVE. This will be blocked by step : MD_CLUSTER_SEND_LOCK is 1. d. B recv METADATA_UPDATED msg, which send from A in step . This will be blocked by step : holding mddev lock, it makes wait_event can't hold mddev lock. (btw, MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD keep ZERO in this scenario.) There is a similar deadlock in commit 0ba959774e93 ("md-cluster: use sync way to handle METADATA_UPDATED msg") In that commit, step c is "update sb". This patch step c is "mdadm --remove". For fixing this issue, we can refer the solution of function: metadata_update_start. Which does the same grab lock_token action. lock_comm can use the same steps to avoid deadlock. By moving MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD from lock_token to lock_comm. It enlarge a little bit window of MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, but it is safe & can break deadlock. Repro steps (I only triggered 3 times with hundreds tests): two nodes share 3 iSCSI luns: sdg/sdh/sdi. Each lun size is 1GB. ``` ssh root@node2 "mdadm -S --scan" mdadm -S --scan for i in {g,h,i};do dd if=/dev/zero of=/dev/sd$i oflag=direct bs=1M \ count=20; done mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sdg /dev/sdh \ --bitmap-chunk=1M ssh root@node2 "mdadm -A /dev/md0 /dev/sdg /dev/sdh" sleep 5 mkfs.xfs /dev/md0 mdadm --manage --add /dev/md0 /dev/sdi mdadm --wait /dev/md0 mdadm --grow --raid-devices=3 /dev/md0 mdadm /dev/md0 --fail /dev/sdg mdadm /dev/md0 --remove /dev/sdg mdadm --grow --raid-devices=2 /dev/md0 ``` test script will hung when executing "mdadm --remove". ``` # dump stacks by "echo t > /proc/sysrq-trigger" md0_cluster_rec D 0 5329 2 0x80004000 Call Trace: __schedule+0x1f6/0x560 ? _cond_resched+0x2d/0x40 ? schedule+0x4a/0xb0 ? process_metadata_update.isra.0+0xdb/0x140 [md_cluster] ? wait_woken+0x80/0x80 ? process_recvd_msg+0x113/0x1d0 [md_cluster] ? recv_daemon+0x9e/0x120 [md_cluster] ? md_thread+0x94/0x160 [md_mod] ? wait_woken+0x80/0x80 ? md_congested+0x30/0x30 [md_mod] ? kthread+0x115/0x140 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x1f/0x40 mdadm D 0 5423 1 0x00004004 Call Trace: __schedule+0x1f6/0x560 ? __schedule+0x1fe/0x560 ? schedule+0x4a/0xb0 ? lock_comm.isra.0+0x7b/0xb0 [md_cluster] ? wait_woken+0x80/0x80 ? remove_disk+0x4f/0x90 [md_cluster] ? hot_remove_disk+0xb1/0x1b0 [md_mod] ? 
md_ioctl+0x50c/0xba0 [md_mod] ? wait_woken+0x80/0x80 ? blkdev_ioctl+0xa2/0x2a0 ? block_ioctl+0x39/0x40 ? ksys_ioctl+0x82/0xc0 ? __x64_sys_ioctl+0x16/0x20 ? do_syscall_64+0x5f/0x150 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 md0_resync D 0 5425 2 0x80004000 Call Trace: __schedule+0x1f6/0x560 ? schedule+0x4a/0xb0 ? dlm_lock_sync+0xa1/0xd0 [md_cluster] ? wait_woken+0x80/0x80 ? lock_token+0x2d/0x90 [md_cluster] ? resync_info_update+0x95/0x100 [md_cluster] ? raid1_sync_request+0x7d3/0xa40 [raid1] ? md_do_sync.cold+0x737/0xc8f [md_mod] ? md_thread+0x94/0x160 [md_mod] ? md_congested+0x30/0x30 [md_mod] ? kthread+0x115/0x140 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x1f/0x40 ``` At last, thanks for Xiao's solution. Cc: stable@vger.kernel.org Signed-off-by: Zhao Heming Suggested-by: Xiao Ni Reviewed-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 67 ++++++++++++++++++++++++++++--------------------- drivers/md/md.c | 6 +++-- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 4aaf4820b6f6..f0e64e76fd79 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -664,9 +664,27 @@ out: * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. */ -static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) +static int lock_token(struct md_cluster_info *cinfo) { - int error, set_bit = 0; + int error; + + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", + __func__, __LINE__, error); + } else { + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); + } + return error; +} + +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. + */ +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) +{ + int rv, set_bit = 0; struct mddev *mddev = cinfo->mddev; /* @@ -677,34 +695,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)) { - error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - WARN_ON_ONCE(error); + WARN_ON_ONCE(rv); md_wakeup_thread(mddev->thread); set_bit = 1; } - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); - if (set_bit) - clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - if (error) - pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", - __func__, __LINE__, error); - - /* Lock the receive sequence */ - mutex_lock(&cinfo->recv_mutex); - return error; -} - -/* lock_comm() - * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. 
- */ -static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) -{ wait_event(cinfo->wait, !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); - - return lock_token(cinfo, mddev_locked); + rv = lock_token(cinfo); + if (set_bit) + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return rv; } static void unlock_comm(struct md_cluster_info *cinfo) @@ -784,9 +787,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, { int ret; - lock_comm(cinfo, mddev_locked); - ret = __sendmsg(cinfo, cmsg); - unlock_comm(cinfo); + ret = lock_comm(cinfo, mddev_locked); + if (!ret) { + ret = __sendmsg(cinfo, cmsg); + unlock_comm(cinfo); + } return ret; } @@ -1061,7 +1066,7 @@ static int metadata_update_start(struct mddev *mddev) return 0; } - ret = lock_token(cinfo, 1); + ret = lock_token(cinfo); clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); return ret; } @@ -1255,7 +1260,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) int raid_slot = -1; md_update_sb(mddev, 1); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) { + pr_err("%s: lock_comm failed\n", __func__); + return; + } memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); @@ -1407,7 +1415,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) + return -EAGAIN; ret = __sendmsg(cinfo, &cmsg); if (ret) { unlock_comm(cinfo); diff --git a/drivers/md/md.c b/drivers/md/md.c index 90faec048f2c..c42af46d366a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6953,8 +6953,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); + if (mddev_is_clustered(mddev)) { + if (md_cluster_ops->remove_disk(mddev, rdev)) + goto busy; + } md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); -- cgit v1.2.3-59-g8ed1b From 03d99e5d63dabe2c0cea0d8fe1cb89bde33f7939 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 16 Oct 2020 14:28:38 -0700 Subject: nvme-fcloop: add sysfs attribute to inject command drop Add a sysfs attribute to specify parameters for dropping a command. The attribute takes a string of: <opcode>:<starting instance>:<number of times> The opcode field is formatted such that the lower 8 bits are the opcode; for a fabrics opcode, a bit above bits 7:0 will be set. Once set, each sqe is looked at. If the opcode matches, the running instance count is updated. If the instance count is in the range of where to drop (based on the starting instance and number of times), then the command is dropped by not passing it to the target layer. Signed-off-by: James Smart --- drivers/nvme/target/fcloop.c | 81 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 3da067a8311e..733d9363900e 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -564,6 +564,50 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, fcloop_tfcp_req_put(tfcp_req); } +static bool drop_fabric_opcode; +#define DROP_OPCODE_MASK 0x00FF +/* fabrics opcode will have a bit set above 1st byte */ +static int drop_opcode = -1; +static int drop_instance; +static int drop_amount; +static int drop_current_cnt; + +/* + * Routine to parse io and determine if the io is to be dropped. 
+ * Returns: + * 0 if io is not obstructed + * 1 if io was dropped + */ +static int check_for_drop(struct fcloop_fcpreq *tfcp_req) +{ + struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvme_fc_cmd_iu *cmdiu = fcpreq->cmdaddr; + struct nvme_command *sqe = &cmdiu->sqe; + + if (drop_opcode == -1) + return 0; + + pr_info("%s: seq opcd x%02x fctype x%02x: drop F %s op x%02x " + "inst %d start %d amt %d\n", + __func__, sqe->common.opcode, sqe->fabrics.fctype, + drop_fabric_opcode ? "y" : "n", + drop_opcode, drop_current_cnt, drop_instance, drop_amount); + + if ((drop_fabric_opcode && + (sqe->common.opcode != nvme_fabrics_command || + sqe->fabrics.fctype != drop_opcode)) || + (!drop_fabric_opcode && sqe->common.opcode != drop_opcode)) + return 0; + + if (++drop_current_cnt >= drop_instance) { + if (drop_current_cnt >= drop_instance + drop_amount) + drop_opcode = -1; + return 1; + } + + return 0; +} + static void fcloop_fcp_recv_work(struct work_struct *work) { @@ -590,10 +634,14 @@ fcloop_fcp_recv_work(struct work_struct *work) if (unlikely(aborted)) ret = -ECANCELED; - else - ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + else { + if (likely(!check_for_drop(tfcp_req))) + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, &tfcp_req->tgt_fcp_req, fcpreq->cmdaddr, fcpreq->cmdlen); + else + pr_info("%s: dropped command ********\n", __func__); + } if (ret) fcloop_call_host_done(fcpreq, tfcp_req, ret); @@ -1449,6 +1497,33 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, return ret ? ret : count; } +static ssize_t +fcloop_set_cmd_drop(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int opcode, starting, amount; + + if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) + return -EBADRQC; + + drop_current_cnt = 0; + drop_fabric_opcode = (opcode & ~DROP_OPCODE_MASK) ? true : false; + drop_opcode = (opcode & DROP_OPCODE_MASK); + drop_instance = starting; + /* the check to drop routine uses instance + count to know when + * to end. Thus, if dropping 1 instance, count should be 0. + * so subtract 1 from the count. + */ + drop_amount = amount - 1; + + pr_info("%s: DROP: Starting at instance %d of%s opcode x%x drop +%d " + "instances\n", + __func__, drop_instance, drop_fabric_opcode ? " fabric" : "", + drop_opcode, drop_amount); + + return count; +} + static DEVICE_ATTR(add_local_port, 0200, NULL, fcloop_create_local_port); static DEVICE_ATTR(del_local_port, 0200, NULL, fcloop_delete_local_port); @@ -1456,6 +1531,7 @@ static DEVICE_ATTR(add_remote_port, 0200, NULL, fcloop_create_remote_port); static DEVICE_ATTR(del_remote_port, 0200, NULL, fcloop_delete_remote_port); static DEVICE_ATTR(add_target_port, 0200, NULL, fcloop_create_target_port); static DEVICE_ATTR(del_target_port, 0200, NULL, fcloop_delete_target_port); +static DEVICE_ATTR(set_cmd_drop, 0200, NULL, fcloop_set_cmd_drop); static struct attribute *fcloop_dev_attrs[] = { &dev_attr_add_local_port.attr, @@ -1464,6 +1540,7 @@ static struct attribute *fcloop_dev_attrs[] = { &dev_attr_del_remote_port.attr, &dev_attr_add_target_port.attr, &dev_attr_del_target_port.attr, + &dev_attr_set_cmd_drop.attr, NULL }; -- cgit v1.2.3-59-g8ed1b From 84115d6d80c809d65c42f9383f22c10b91a4eb1c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 27 Oct 2020 16:15:16 +0800 Subject: nvme: simplify nvme_req_qid() Use the request's '->mq_hctx->queue_num' directly to simplify the nvme_req_qid() function. 
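For request queues with queuedata set, the two forms are equivalent; a minimal sketch of the mapping, with names taken from the diff below:

```
/* sketch: both expressions yield the 1-based hardware queue id */
qid = blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1;	/* old */
qid = req->mq_hctx->queue_num + 1;				/* new */
```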
Signed-off-by: Baolin Wang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bc330bf0d3bd..87867e93c7d3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -178,7 +178,8 @@ static inline u16 nvme_req_qid(struct request *req) { if (!req->q->queuedata) return 0; - return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; + + return req->mq_hctx->queue_num + 1; } /* The below value is the specific amount of delay needed before checking -- cgit v1.2.3-59-g8ed1b From 0d2e7c840b178bf9a47bd0de89d8f9182fa71d86 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:42 -0800 Subject: nvme: centralize setting the timeout in nvme_alloc_request The function nvme_alloc_request() is called from different contexts (I/O and admin queue), and callers from the I/O queue context do not set an I/O timeout on the request. Update nvme_alloc_request() to set the default I/O or admin timeout value based on whether the queuedata is set or not. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 11 +++++++++-- drivers/nvme/host/lightnvm.c | 3 ++- drivers/nvme/host/pci.c | 2 -- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9b01afcb7777..97348b1ecfd6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -533,6 +533,11 @@ struct request *nvme_alloc_request(struct request_queue *q, if (IS_ERR(req)) return req; + if (req->q->queuedata) + req->timeout = NVME_IO_TIMEOUT; + else /* no queuedata implies admin queue */ + req->timeout = ADMIN_TIMEOUT; + req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); nvme_req(req)->cmd = cmd; @@ -901,7 +906,8 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (timeout) + req->timeout = timeout; if (buffer && bufflen) { ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); @@ -1071,7 +1077,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (timeout) + req->timeout = timeout; nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 8e562d0f2c30..88a7c8eac455 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -774,7 +774,8 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, goto err_cmd; } - rq->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + if (timeout) + rq->timeout = timeout; if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0578ff253c47..76465d335924 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1310,7 +1310,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_RESET_TIMER; } - abort_req->timeout = ADMIN_TIMEOUT; abort_req->end_io_data = NULL; blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); @@ -2223,7 +2222,6 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = ADMIN_TIMEOUT; req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); -- cgit v1.2.3-59-g8ed1b From dc96f93874c63e126087e1adf1973c9fecfdaa0c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:45 -0800 Subject: nvme: use consistent macro name for timeout This is purely a cleanup patch; add the NVME prefix to ADMIN_TIMEOUT to make it consistent with NVME_IO_TIMEOUT. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++--- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 97348b1ecfd6..98bea150e5dc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -536,7 +536,7 @@ struct request *nvme_alloc_request(struct request_queue *q, if (req->q->queuedata) req->timeout = NVME_IO_TIMEOUT; else /* no queuedata implies admin queue */ - req->timeout = ADMIN_TIMEOUT; + req->timeout = NVME_ADMIN_TIMEOUT; req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); @@ -2268,8 +2268,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); cmd.common.cdw11 = cpu_to_le32(len); - return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, - ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, + NVME_QID_ANY, 1, 0, false); } EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4c246462658..38373a0e86ef 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3479,7 +3479,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->lport->ops->fcprqst_priv_sz); ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 87867e93c7d3..824776a8ba13 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -24,7 +24,7 @@ extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) extern unsigned int admin_timeout; -#define ADMIN_TIMEOUT (admin_timeout * HZ) +#define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 
76465d335924..6123040ff872 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1606,7 +1606,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.nr_hw_queues = 1; dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; - dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev->ctrl.numa_node; dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; @@ -2237,7 +2237,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) unsigned long timeout; retry: - timeout = ADMIN_TIMEOUT; + timeout = NVME_ADMIN_TIMEOUT; while (nr_queues > 0) { if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) break; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 65e3d0ef36e1..df9f6f4549f1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -797,7 +797,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, NVME_RDMA_DATA_SGL_SIZE; set->driver_data = ctrl; set->nr_hw_queues = 1; - set->timeout = ADMIN_TIMEOUT; + set->timeout = NVME_ADMIN_TIMEOUT; set->flags = BLK_MQ_F_NO_SCHED; } else { set = &ctrl->tag_set; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c0c33320fe65..1ba659927442 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1568,7 +1568,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->cmd_size = sizeof(struct nvme_tcp_request); set->driver_data = ctrl; set->nr_hw_queues = 1; - set->timeout = ADMIN_TIMEOUT; + set->timeout = NVME_ADMIN_TIMEOUT; } else { set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f6d81239be21..76d8c0a9a87d 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -345,7 +345,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) NVME_INLINE_SG_CNT * sizeof(struct scatterlist); ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ctrl->queues[0].ctrl = ctrl; -- cgit v1.2.3-59-g8ed1b From a2f6a2b8ce43db608357a490e028166f9e4bab0d Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:43 -0800 Subject: nvmet: add passthru admin timeout value attr An NVMeOF controller in passthru mode is capable of handling a wide set of admin commands, including vendor-specific passthru admin commands. The vendor-specific admin commands are used to read large drive logs and can take longer than default NVMe commands, i.e. for passthru requests the timeout value may differ from the passthru controller's default timeout values (nvme-core: admin_timeout). Add a configfs attribute so that the user can set the admin timeout value. If this configfs value is not set, nvme_alloc_request() will set the ADMIN_TIMEOUT value when the request queuedata is NULL. 
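For example, assuming a passthru subsystem is already configured under nvmet configfs (the subsystem name and value here are illustrative):

```
# sketch: raise the passthru admin-command timeout to 60 seconds
echo 60 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/admin_timeout
```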
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 20 ++++++++++++++++++++ drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 6 ++++++ 3 files changed, 27 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 37e1d7784e17..781157a654e9 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -736,9 +736,29 @@ static ssize_t nvmet_passthru_enable_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_passthru_, enable); +static ssize_t nvmet_passthru_admin_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->admin_timeout); +} + +static ssize_t nvmet_passthru_admin_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->admin_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, admin_timeout); + static struct configfs_attribute *nvmet_passthru_attrs[] = { &nvmet_passthru_attr_device_path, &nvmet_passthru_attr_enable, + &nvmet_passthru_attr_admin_timeout, NULL, }; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 559a15ccc322..a0c80e5179a2 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -249,6 +249,7 @@ struct nvmet_subsys { struct nvme_ctrl *passthru_ctrl; char *passthru_ctrl_path; struct config_group passthru_group; + unsigned int admin_timeout; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 8ee94f056898..b496682ccf85 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -227,6 +227,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; + unsigned int timeout = 0; u32 effects; u16 status; int ret; @@ -242,6 +243,8 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } q = ns->queue; + } else { + timeout = req->sq->ctrl->subsys->admin_timeout; } rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY); @@ -250,6 +253,9 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) goto out_put_ns; } + if (timeout) + rq->timeout = timeout; + if (req->sg_cnt) { ret = nvmet_passthru_map_sg(req, rq); if (unlikely(ret)) { -- cgit v1.2.3-59-g8ed1b From 47e9730c26a4a5d4eab2124d6bbeb94693e44b46 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:44 -0800 Subject: nvmet: add passthru io timeout value attr An NVMeOF controller in passthru mode is capable of handling a wide set of I/O commands, including vendor-specific passthru I/O commands. The vendor-specific I/O commands are used to read the large drive logs and can take longer than default NVMe commands, i.e. for passthru requests the timeout value may differ from the passthru controller's default timeout values (nvme-core: io_timeout). Add a configfs attribute so that the user can set the I/O timeout value. If this configfs value is not set, nvme_alloc_request() will set the NVME_IO_TIMEOUT value when the request queuedata is set. 
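As with the admin timeout above, the new attribute is set through configfs (path and value illustrative):

```
# sketch: raise the passthru I/O-command timeout to 30 seconds
echo 30 > /sys/kernel/config/nvmet/subsystems/testnqn/passthru/io_timeout
cat /sys/kernel/config/nvmet/subsystems/testnqn/passthru/io_timeout
```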
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 20 ++++++++++++++++++++ drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 3 ++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 781157a654e9..c61ffd767062 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -755,10 +755,30 @@ static ssize_t nvmet_passthru_admin_timeout_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_passthru_, admin_timeout); +static ssize_t nvmet_passthru_io_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->io_timeout); +} + +static ssize_t nvmet_passthru_io_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->io_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, io_timeout); + static struct configfs_attribute *nvmet_passthru_attrs[] = { &nvmet_passthru_attr_device_path, &nvmet_passthru_attr_enable, &nvmet_passthru_attr_admin_timeout, + &nvmet_passthru_attr_io_timeout, NULL, }; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index a0c80e5179a2..2f9635273629 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -250,6 +250,7 @@ struct nvmet_subsys { char *passthru_ctrl_path; struct config_group passthru_group; unsigned int admin_timeout; + unsigned int io_timeout; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b496682ccf85..a062398305a7 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -227,7 +227,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; - unsigned int timeout = 0; + unsigned int timeout; u32 effects; u16 status; int ret; @@ -243,6 +243,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } q = ns->queue; + timeout = req->sq->ctrl->subsys->io_timeout; } else { timeout = req->sq->ctrl->subsys->admin_timeout; } -- cgit v1.2.3-59-g8ed1b From 53ffabfd4ddb3a24c5603ae82eefb5537ecb5c20 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:03 -0800 Subject: block: move blk_rq_bio_prep() to linux/blk-mq.h This is a preparation patch to have minimal block layer request bio append functionality in the context of the NVMeOF Passthru driver which falls in the fast path and doesn't need calls from blk_rq_append_bio(). 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- block/blk.h | 12 ------------ include/linux/blk-mq.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk.h b/block/blk.h index dfab98465db9..e05507a8d1e3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -91,18 +91,6 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); - - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; -} - #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 794b2a33a2c3..e7482e6ad3ec 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -593,6 +593,18 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } +static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, + unsigned int nr_segs) +{ + rq->nr_phys_segments = nr_segs; + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + rq->ioprio = bio_prio(bio); + + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; +} + blk_qc_t blk_mq_submit_bio(struct bio *bio); #endif -- cgit v1.2.3-59-g8ed1b From 39dfe84451b4526a8054cc5a127337bca980dfa3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:00 -0800 Subject: nvme: split nvme_alloc_request() Right now nvme_alloc_request() allocates a request from the block layer based on the value of the qid: when qid is set to NVME_QID_ANY it uses blk_mq_alloc_request(), otherwise blk_mq_alloc_request_hctx(). The function is called from different contexts; the only place where it uses a non-NVME_QID_ANY value is for the fabrics connect commands :- nvme_submit_sync_cmd() NVME_QID_ANY nvme_features() NVME_QID_ANY nvme_sec_submit() NVME_QID_ANY nvmf_reg_read32() NVME_QID_ANY nvmf_reg_read64() NVME_QID_ANY nvmf_reg_write32() NVME_QID_ANY nvmf_connect_admin_queue() NVME_QID_ANY nvme_submit_user_cmd() NVME_QID_ANY nvme_alloc_request() nvme_keep_alive() NVME_QID_ANY nvme_alloc_request() nvme_timeout() NVME_QID_ANY nvme_alloc_request() nvme_delete_queue() NVME_QID_ANY nvme_alloc_request() nvmet_passthru_execute_cmd() NVME_QID_ANY nvme_alloc_request() nvmf_connect_io_queue() QID __nvme_submit_sync_cmd() nvme_alloc_request() With passthru, nvme_alloc_request() now falls into the I/O fast path, where blk_mq_alloc_request_hctx() is never called, so the qid check only adds an additional branch to the fast path. Split nvme_alloc_request() into nvme_alloc_request() and nvme_alloc_request_qid(). Replace each call of nvme_alloc_request() with the NVME_QID_ANY param by a call to the newly added nvme_alloc_request() without a qid param. Replace the call of nvme_alloc_request() with a QID param by a call to either the newly added nvme_alloc_request() or nvme_alloc_request_qid(), based on the qid value set in __nvme_submit_sync_cmd(), as sketched below. 
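A minimal sketch of the resulting call pattern (mirroring the __nvme_submit_sync_cmd() hunk in the diff below):

```
/* sketch: only fabrics I/O-queue connect pins the request to a hctx */
if (qid == NVME_QID_ANY)
	req = nvme_alloc_request(q, cmd, flags);
else
	req = nvme_alloc_request_qid(q, cmd, flags, qid);
```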
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 52 ++++++++++++++++++++++++++++-------------- drivers/nvme/host/lightnvm.c | 5 ++-- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/target/passthru.c | 2 +- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 98bea150e5dc..fff90200497c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -518,21 +518,14 @@ static inline void nvme_clear_nvme_request(struct request *req) } } -struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +static inline unsigned int nvme_req_op(struct nvme_command *cmd) { - unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; - struct request *req; - - if (qid == NVME_QID_ANY) { - req = blk_mq_alloc_request(q, op, flags); - } else { - req = blk_mq_alloc_request_hctx(q, op, flags, - qid ? qid - 1 : 0); - } - if (IS_ERR(req)) - return req; + return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; +} +static inline void nvme_init_request(struct request *req, + struct nvme_command *cmd) +{ if (req->q->queuedata) req->timeout = NVME_IO_TIMEOUT; else /* no queuedata implies admin queue */ @@ -541,11 +534,33 @@ struct request *nvme_alloc_request(struct request_queue *q, req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); nvme_req(req)->cmd = cmd; +} +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags) +{ + struct request *req; + + req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); return req; } EXPORT_SYMBOL_GPL(nvme_alloc_request); +struct request *nvme_alloc_request_qid(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +{ + struct request *req; + + req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, + qid ? 
qid - 1 : 0); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); + return req; +} +EXPORT_SYMBOL_GPL(nvme_alloc_request_qid); + static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { struct nvme_command c; @@ -902,7 +917,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct request *req; int ret; - req = nvme_alloc_request(q, cmd, flags, qid); + if (qid == NVME_QID_ANY) + req = nvme_alloc_request(q, cmd, flags); + else + req = nvme_alloc_request_qid(q, cmd, flags, qid); if (IS_ERR(req)) return PTR_ERR(req); @@ -1073,7 +1091,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, void *meta = NULL; int ret; - req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); + req = nvme_alloc_request(q, cmd, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1148,8 +1166,8 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl) { struct request *rq; - rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, - NVME_QID_ANY); + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, + BLK_MQ_REQ_RESERVED); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 88a7c8eac455..470cef3abec3 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -653,7 +653,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, nvme_nvm_rqtocmd(rqd, ns, cmd); - rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); + rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0); if (IS_ERR(rq)) return rq; @@ -767,8 +767,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, DECLARE_COMPLETION_ONSTACK(wait); int ret = 0; - rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, - NVME_QID_ANY); + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0); if (IS_ERR(rq)) { ret = -ENOMEM; goto err_cmd; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 824776a8ba13..83fb30e317e0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -611,6 +611,8 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags); +struct request *nvme_alloc_request_qid(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6123040ff872..5e6365dd0c8e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1304,7 +1304,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) req->tag, nvmeq->qid); abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, - BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; @@ -2218,7 +2218,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) cmd.delete_queue.opcode = opcode; cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index a062398305a7..be8ae59dcb71 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -248,7 +248,7 @@ static void 
nvmet_passthru_execute_cmd(struct nvmet_req *req) timeout = req->sq->ctrl->subsys->admin_timeout; } - rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY); + rq = nvme_alloc_request(q, req->cmd, 0); if (IS_ERR(rq)) { status = NVME_SC_INTERNAL; goto out_put_ns; -- cgit v1.2.3-59-g8ed1b From 06b3bec8204b4c6433ccb2f6ec60fedb77b34cb3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:02 -0800 Subject: nvmet: remove op_flags for passthru commands For passthru commands setting op_flags has no meaning. Remove the code that sets the op flags in nvmet_passthru_map_sg(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index be8ae59dcb71..1c84dadfb38f 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -188,21 +188,15 @@ static void nvmet_passthru_req_done(struct request *rq, static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { struct scatterlist *sg; - int op_flags = 0; struct bio *bio; int i, ret; if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; - if (req->cmd->common.opcode == nvme_cmd_flush) - op_flags = REQ_FUA; - else if (nvme_is_write(req->cmd)) - op_flags = REQ_SYNC | REQ_IDLE; - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); bio->bi_end_io = bio_put; - bio->bi_opf = req_op(rq) | op_flags; + bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, -- cgit v1.2.3-59-g8ed1b From a4fe2d3afe3ce77edeadb567c0d0a8d102c6b159 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:04 -0800 Subject: nvmet: use blk_rq_bio_prep instead of blk_rq_append_bio The function blk_rq_append_bio() is a generic API written for all types of drivers (including those with bounce buffers) and for different contexts (including where the request already has a bio, i.e. rq->bio != NULL). It mainly does three things: calculating the segments, handling the bounce queue, and either calling blk_rq_bio_prep() if req->bio == NULL or handling the low-level merge() case. The NVMe PCIe and fabrics transports currently do not use the queue bounce mechanism, yet blk_rq_append_bio() does this extra work in the fast path for every passthru request. When I ran I/Os with different block sizes on the passthru controller, I found that we can reuse req->sg_cnt instead of iterating over the bvecs to find out nr_segs in blk_rq_append_bio(). This calculation in blk_rq_append_bio() is a duplication of work, given that we already have the value in req->sg_cnt. (correct me here if I'm wrong). With the NVMe passthru request-based driver we allocate a fresh request each time, so on every call to blk_rq_append_bio() rq->bio will be NULL, i.e. we don't really need the second condition in blk_rq_append_bio() or the resulting error check in its caller. So for the NVMeOF passthru driver, recalculating the segments, the bounce check and the ll_back_merge code are not needed, and we can get away with a minimal version that also removes the error check from the fast path along with an extra variable in nvmet_passthru_map_sg(). This patch updates nvmet_passthru_map_sg() such that it only appends the bio to the request, as sketched below. 
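The resulting fast path boils down to a single call; a sketch (mirroring the diff below):

```
/* sketch: attach the freshly built bio directly, reusing the segment
 * count already known from req->sg_cnt instead of recounting bvecs */
blk_rq_bio_prep(rq, bio, req->sg_cnt);
```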
Following are perf numbers :- With current implementation (blk_rq_append_bio()) :- ---------------------------------------------------- + 5.80% 0.02% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.44% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.88% 0.00% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.44% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.86% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.17% 0.00% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd With this patch using blk_rq_bio_prep() :- ---------------------------------------------------- + 3.14% 0.02% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 3.26% 0.01% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 5.37% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.18% 0.02% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 4.84% 0.02% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.87% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 1c84dadfb38f..2b24205ee79d 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -189,7 +189,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { struct scatterlist *sg; struct bio *bio; - int i, ret; + int i; if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; @@ -206,11 +206,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) } } - ret = blk_rq_append_bio(rq, &bio); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + blk_rq_bio_prep(rq, bio, req->sg_cnt); return 0; } -- cgit v1.2.3-59-g8ed1b From dab3902b19a0dd1668d0cc3e8e4b976b1ee8638c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:05 -0800 Subject: nvmet: use inline bio for passthru fast path nvmet_passthru_execute_cmd() is a high-frequency function, and it uses bio_alloc(), which leads to a memory allocation from the fs pool for each I/O. For an NVMeoF nvmet_req we already have an inline_bvec allocated as part of the request allocation, which can be used with a preallocated bio since we already know the size of the request before allocating the bio. Introduce a bio member in the nvmet_req passthru anon union. In the fast path, check if we can get away with the inline bvec and bio from nvmet_req, set up with a bio_init() call, before actually allocating with bio_alloc(). This avoids new memory allocations under high memory pressure and gets rid of the extra cost of allocation (bio_alloc()) vs. initialization (bio_init()) when the transfer length does not exceed NVMET_MAX_INLINE_DATA_LEN, which the user can configure at compile time. 
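A distilled sketch of the selection logic this introduces (mirroring the diff below):

```
/* sketch: small transfers reuse the preallocated inline bio/bvecs,
 * larger ones still allocate from the bio pool */
if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) {
	bio = &req->p.inline_bio;
	bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
} else {
	bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES));
	bio->bi_end_io = bio_put;
}
```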
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 2f9635273629..e89ec280e91a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -332,6 +332,7 @@ struct nvmet_req { struct work_struct work; } f; struct { + struct bio inline_bio; struct request *rq; struct work_struct work; bool use_workqueue; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 2b24205ee79d..b9776fc8f08f 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -194,14 +194,20 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); - bio->bi_end_io = bio_put; + if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { + bio = &req->p.inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + } else { + bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES)); + bio->bi_end_io = bio_put; + } bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, sg->offset) < sg->length) { - bio_put(bio); + if (bio != &req->p.inline_bio) + bio_put(bio); return -EINVAL; } } -- cgit v1.2.3-59-g8ed1b From ff4e5fbad06f762b8551da56e8fd64ad14c8aa3e Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 12 Nov 2020 09:23:01 +0100 Subject: nvme-pci: drop min() from nr_io_queues assignment in nvme_setup_io_queues() the number of I/O queues is set to either 1 in case of a quirky Apple device or to the min of nvme_max_io_queues() or dev->nr_allocated_queues - 1. This is unnecessarily complicated as dev->nr_allocated_queues is only assigned once and is nvme_max_io_queues() + 1. Signed-off-by: Niklas Schnelle Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5e6365dd0c8e..90b338435021 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2113,8 +2113,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) nr_io_queues = 1; else - nr_io_queues = min(nvme_max_io_queues(dev), - dev->nr_allocated_queues - 1); + nr_io_queues = dev->nr_allocated_queues - 1; result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) -- cgit v1.2.3-59-g8ed1b From e3aef0950a30ecbf475be52509ca178907410709 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 12 Nov 2020 09:23:02 +0100 Subject: nvme-pci: don't allocate unused I/O queues currently the NVME_QUIRK_SHARED_TAGS quirk for Apple devices is handled during the assignment of nr_io_queues in nvme_setup_io_queues(). This however means that for these devices nvme_max_io_queues() will actually not return the supported maximum which is confusing and unexpected and also means that in nvme_probe() we are allocating for I/O queues that will never be used. Fix this by moving the quirk handling into nvme_max_io_queues(). 
Signed-off-by: Niklas Schnelle Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 90b338435021..2c072f33a577 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2088,6 +2088,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) static unsigned int nvme_max_io_queues(struct nvme_dev *dev) { + /* + * If tags are shared with admin queue (Apple bug), then + * make sure we only use one IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + return 1; return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; } @@ -2106,15 +2112,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; - /* - * If tags are shared with admin queue (Apple bug), then - * make sure we only use one IO queue. - */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - nr_io_queues = 1; - else - nr_io_queues = dev->nr_allocated_queues - 1; - + nr_io_queues = dev->nr_allocated_queues - 1; result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; -- cgit v1.2.3-59-g8ed1b From 6d65aeab7bf6e83e75f53cfdbdb84603e52e1182 Mon Sep 17 00:00:00 2001 From: Amit Date: Sun, 15 Nov 2020 14:19:51 +0200 Subject: nvmet: remove unused ctrl->cqs Remove the unused cqs array from the nvmet_ctrl struct; this reduces the allocated memory. Signed-off-by: Amit Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 15 ++------------- drivers/nvme/target/nvmet.h | 1 - 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 957b39a82431..8ce4d59cc9e7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -757,8 +757,6 @@ void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, { cq->qid = qid; cq->size = size; - - ctrl->cqs[qid] = cq; } void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, @@ -1344,20 +1342,14 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (!ctrl->changed_ns_list) goto out_free_ctrl; - ctrl->cqs = kcalloc(subsys->max_qid + 1, - sizeof(struct nvmet_cq *), - GFP_KERNEL); - if (!ctrl->cqs) - goto out_free_changed_ns_list; - ctrl->sqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_sq *), GFP_KERNEL); if (!ctrl->sqs) - goto out_free_cqs; + goto out_free_changed_ns_list; if (subsys->cntlid_min > subsys->cntlid_max) - goto out_free_cqs; + goto out_free_changed_ns_list; ret = ida_simple_get(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, @@ -1395,8 +1387,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, out_free_sqs: kfree(ctrl->sqs); -out_free_cqs: - kfree(ctrl->cqs); out_free_changed_ns_list: kfree(ctrl->changed_ns_list); out_free_ctrl: @@ -1426,7 +1416,6 @@ static void nvmet_ctrl_free(struct kref *ref) nvmet_async_events_free(ctrl); kfree(ctrl->sqs); - kfree(ctrl->cqs); kfree(ctrl->changed_ns_list); kfree(ctrl); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e89ec280e91a..592763732065 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -164,7 +164,6 @@ static inline struct nvmet_port *ana_groups_to_port( struct nvmet_ctrl { struct nvmet_subsys *subsys; - struct nvmet_cq **cqs; struct nvmet_sq **sqs; bool cmd_seen; -- cgit v1.2.3-59-g8ed1b From 0068a7b010533872b6e71a376771dc310d90fa1c Mon Sep 17 00:00:00 2001 From: Max 
Gurtovoy Date: Wed, 25 Nov 2020 12:27:36 +0000 Subject: nvmet: make sure discovery change log event is protected Generation counter is protected by nvmet_config_sem. Make sure the callers that call functions that might change it, are calling it properly. Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index f40c05c33c3a..682854e0e079 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -69,6 +69,7 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, struct nvmet_port *port; struct nvmet_subsys_link *s; + lockdep_assert_held(&nvmet_config_sem); nvmet_genctr++; list_for_each_entry(port, nvmet_ports, global_entry) -- cgit v1.2.3-59-g8ed1b From 9f20599c4821d1f7281a3efb3ef94ff3cfdd5e10 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Nov 2020 22:40:29 +0000 Subject: nvmet: fix a spelling mistake "incuding" -> "including" in Kconfig There is a spelling mistake in the Kconfig help text. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 8056955e652c..4be2ececbc45 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -24,7 +24,7 @@ config NVME_TARGET_PASSTHRU This enables target side NVMe passthru controller support for the NVMe Over Fabrics protocol. It allows for hosts to manage and directly access an actual NVMe controller residing on the target - side, incuding executing Vendor Unique Commands. + side, including executing Vendor Unique Commands. If unsure, say N. -- cgit v1.2.3-59-g8ed1b From 8c4dfea97f15b80097b3f882ca428fb2751ec30c Mon Sep 17 00:00:00 2001 From: Victor Gladkov Date: Tue, 24 Nov 2020 18:34:59 +0000 Subject: nvme-fabrics: reject I/O to offline device Commands get stuck while Host NVMe-oF controller is in reconnect state. The controller enters into reconnect state when it loses connection with the target. It tries to reconnect every 10 seconds (default) until a successful reconnect or until the reconnect time-out is reached. The default reconnect time out is 10 minutes. Applications are expecting commands to complete with success or error within a certain timeout (30 seconds by default). The NVMe host is enforcing that timeout while it is connected, but during reconnect the timeout is not enforced and commands may get stuck for a long period or even forever. To fix this long delay due to the default timeout, introduce new "fast_io_fail_tmo" session parameter. The timeout is measured in seconds from the controller reconnect and any command beyond that timeout is rejected. The new parameter value may be passed during 'connect'. The default value of -1 means no timeout (similar to current behavior). 
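For example, the new option can be passed at connect time through the fabrics device (transport address and value illustrative):

```
# sketch: fail pending I/O 30 seconds into a reconnect instead of
# waiting out the full ctrl_loss_tmo
echo "transport=tcp,traddr=192.168.1.10,trsvcid=4420,nqn=testnqn,fast_io_fail_tmo=30" \
	> /dev/nvme-fabrics
```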
Signed-off-by: Victor Gladkov Signed-off-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++- drivers/nvme/host/fabrics.c | 25 ++++++++++++++++++++--- drivers/nvme/host/fabrics.h | 5 +++++ drivers/nvme/host/multipath.c | 2 ++ drivers/nvme/host/nvme.h | 3 +++ 5 files changed, 77 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fff90200497c..9c1645f28a7a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -148,6 +148,38 @@ int nvme_try_sched_reset(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_try_sched_reset); +static void nvme_failfast_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, failfast_work); + + if (ctrl->state != NVME_CTRL_CONNECTING) + return; + + set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); + dev_info(ctrl->device, "failfast expired\n"); + nvme_kick_requeue_lists(ctrl); +} + +static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1) + return; + + schedule_delayed_work(&ctrl->failfast_work, + ctrl->opts->fast_io_fail_tmo * HZ); +} + +static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts) + return; + + cancel_delayed_work_sync(&ctrl->failfast_work); + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); +} + + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -433,8 +465,17 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } spin_unlock_irqrestore(&ctrl->lock, flags); - if (changed && ctrl->state == NVME_CTRL_LIVE) + if (!changed) + return false; + + if (ctrl->state == NVME_CTRL_LIVE) { + if (old_state == NVME_CTRL_CONNECTING) + nvme_stop_failfast_work(ctrl); nvme_kick_requeue_lists(ctrl); + } else if (ctrl->state == NVME_CTRL_CONNECTING && + old_state == NVME_CTRL_RESETTING) { + nvme_start_failfast_work(ctrl); + } return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -4372,6 +4413,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); + nvme_stop_failfast_work(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); } @@ -4437,6 +4479,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, int ret; ctrl->state = NVME_CTRL_NEW; + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); @@ -4453,6 +4496,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, init_waitqueue_head(&ctrl->state_wq); INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work); memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8575724734e0..72ac00173500 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -549,6 +549,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, { if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && + !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; @@ -615,6 +616,7 @@ static const 
match_table_t opt_tokens = { { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_TOS, "tos=%d" }, + { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -634,6 +636,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; opts->duplicate_connect = false; + opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO; opts->hdr_digest = false; opts->data_digest = false; opts->tos = -1; /* < 0 == use transport default */ @@ -754,6 +757,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n"); ctrl_loss_tmo = token; break; + case NVMF_OPT_FAIL_FAST_TMO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token >= 0) + pr_warn("I/O fail on reconnect controller after %d sec\n", + token); + opts->fast_io_fail_tmo = token; + break; case NVMF_OPT_HOSTNQN: if (opts->host) { pr_err("hostnqn already user-assigned: %s\n", @@ -884,11 +898,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->nr_poll_queues = 0; opts->duplicate_connect = true; } - if (ctrl_loss_tmo < 0) + if (ctrl_loss_tmo < 0) { opts->max_reconnects = -1; - else + } else { opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, opts->reconnect_delay); + if (ctrl_loss_tmo < opts->fast_io_fail_tmo) + pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", + opts->fast_io_fail_tmo, ctrl_loss_tmo); + } if (!opts->host) { kref_get(&nvmf_default_host->ref); @@ -988,7 +1006,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ - NVMF_OPT_DISABLE_SQFLOW) + NVMF_OPT_DISABLE_SQFLOW |\ + NVMF_OPT_FAIL_FAST_TMO) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a9c1e3b4585e..733010d2eafd 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -15,6 +15,8 @@ #define NVMF_DEF_RECONNECT_DELAY 10 /* default to 600 seconds of reconnect attempts before giving up */ #define NVMF_DEF_CTRL_LOSS_TMO 600 +/* default is -1: the fail fast mechanism is disabled */ +#define NVMF_DEF_FAIL_FAST_TMO -1 /* * Define a host as seen by the target. 
We allocate one at boot, but also @@ -56,6 +58,7 @@ enum { NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, NVMF_OPT_NR_POLL_QUEUES = 1 << 18, NVMF_OPT_TOS = 1 << 19, + NVMF_OPT_FAIL_FAST_TMO = 1 << 20, }; /** @@ -89,6 +92,7 @@ enum { * @nr_write_queues: number of queues for write I/O * @nr_poll_queues: number of queues for polling I/O * @tos: type of service + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds */ struct nvmf_ctrl_options { unsigned mask; @@ -111,6 +115,7 @@ struct nvmf_ctrl_options { unsigned int nr_write_queues; unsigned int nr_poll_queues; int tos; + int fast_io_fail_tmo; }; /* diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74896be40c17..71696819c228 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -279,6 +279,8 @@ static bool nvme_available_path(struct nvme_ns_head *head) struct nvme_ns *ns; list_for_each_entry_rcu(ns, &head->list, siblings) { + if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) + continue; switch (ns->ctrl->state) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 83fb30e317e0..ae017f727798 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -305,6 +305,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct delayed_work failfast_work; struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; @@ -338,6 +339,8 @@ struct nvme_ctrl { u16 icdoff; u16 maxcmd; int nr_reconnects; + unsigned long flags; +#define NVME_CTRL_FAILFAST_EXPIRED 0 struct nvmf_ctrl_options *opts; struct page *discard_page; -- cgit v1.2.3-59-g8ed1b From aa9d729592316e121110daa81604f71f82663167 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 30 Nov 2020 21:47:46 +0900 Subject: nvme: improve an error message on Identify failure Add the namespace ID to the error message when the Identify command used to retrieve the Namespace Identification Descriptor list fails. This avoids rather useless and duplicative messages like the following: [ 1.321031] nvme nvme0: Identify Descriptors failed (16386) [ 1.321948] nvme nvme0: Identify Descriptors failed (16386) [ 1.322872] nvme nvme0: Identify Descriptors failed (16386) [ 1.323775] nvme nvme0: Identify Descriptors failed (16386) [ 1.324687] nvme nvme0: Identify Descriptors failed (16386) ... Also, print the nvme status code in hexadecimal rather than decimal format for better readability. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9c1645f28a7a..73c6684aaee9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1368,7 +1368,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, NVME_IDENTIFY_DATA_SIZE); if (status) { dev_warn(ctrl->device, - "Identify Descriptors failed (%d)\n", status); + "Identify Descriptors failed (nsid=%u, status=0x%x)\n", + nsid, status); goto free_data; } -- cgit v1.2.3-59-g8ed1b From f781f3dd6a165d860c29eeb092af8584284e50f3 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 30 Nov 2020 21:47:47 +0900 Subject: nvme: print a warning for when listing active namespaces fails During the scan_work, an Identify command is issued to figure out which namespaces are active. If this command fails, the nvme driver falls back to scanning namespaces sequentially. 
In this situation, we don't see any warnings and can't easily tell whether the list-ns command has failed. Print a warning when the Identify command execution fails: [ 1.108399] nvme nvme0: Identify NS List failed (status=0x400b) [ 1.109583] nvme0n1: detected capacity change from 0 to 1048576 [ 1.112186] nvme nvme0: Identify Descriptors failed (nsid=2, status=0x4002) [ 1.113929] nvme nvme0: Identify Descriptors failed (nsid=3, status=0x4002) [ 1.116537] nvme nvme0: Identify Descriptors failed (nsid=4, status=0x4002) ... Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 73c6684aaee9..279f4aea861b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4110,8 +4110,11 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, NVME_IDENTIFY_DATA_SIZE); - if (ret) + if (ret) { + dev_warn(ctrl->device, + "Identify NS List failed (status=0x%x)\n", ret); goto free; + } for (i = 0; i < nr_entries; i++) { u32 nsid = le32_to_cpu(ns_list[i]); -- cgit v1.2.3-59-g8ed1b From e1aaf5cacba9d994d825a87a33bdd33343477f16 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 1 Dec 2020 13:56:07 +0100 Subject: nvme: remove unnecessary return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up unnecessary ret values that are not checked or used in nvme_alloc_ns(). Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 279f4aea861b..6d0c902034af 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3872,7 +3872,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; if (nvme_identify_ns(ctrl, nsid, ids, &id)) return; @@ -3896,8 +3896,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, ns->ctrl = ctrl; kref_init(&ns->kref); - ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED); - if (ret) + if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) goto out_free_queue; nvme_set_disk_name(disk_name, ns, ctrl, &flags); @@ -3916,8 +3915,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - ret = nvme_nvm_register(ns, disk_name, node); - if (ret) { + if (nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } -- cgit v1.2.3-59-g8ed1b From f68abd9cc00cce58c5dbe5953ac190d25f1e4f8e Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 1 Dec 2020 13:56:08 +0100 Subject: nvme: rename controller base dev_t char device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the controller base dev_t char device in preparation for adding a namespace char device.
Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6d0c902034af..5aaaae7884e5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -85,7 +85,7 @@ static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); static DEFINE_IDA(nvme_instance_ida); -static dev_t nvme_chr_devt; +static dev_t nvme_ctrl_base_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; @@ -4517,7 +4517,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, device_initialize(&ctrl->ctrl_device); ctrl->device = &ctrl->ctrl_device; - ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); + ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), + ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; ctrl->device->groups = nvme_dev_attr_groups; @@ -4726,7 +4727,8 @@ static int __init nvme_core_init(void) if (!nvme_delete_wq) goto destroy_reset_wq; - result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); + result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0, + NVME_MINORS, "nvme"); if (result < 0) goto destroy_delete_wq; @@ -4747,7 +4749,7 @@ static int __init nvme_core_init(void) destroy_class: class_destroy(nvme_class); unregister_chrdev: - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_delete_wq: destroy_workqueue(nvme_delete_wq); destroy_reset_wq: @@ -4762,7 +4764,7 @@ static void __exit nvme_core_exit(void) { class_destroy(nvme_subsys_class); class_destroy(nvme_class); - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); -- cgit v1.2.3-59-g8ed1b From ba4fb3205680ade6c29c80102e86b88641709561 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 1 Dec 2020 13:56:09 +0100 Subject: nvme: rename bdev operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the block device operations in preparation for adding char device file operations.
Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5aaaae7884e5..1520773d545f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2334,7 +2334,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ -static const struct block_device_operations nvme_fops = { +static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, @@ -3342,7 +3342,7 @@ static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); - if (disk->fops == &nvme_fops) + if (disk->fops == &nvme_bdev_ops) return nvme_get_ns_from_dev(dev)->head; else return disk->private_data; @@ -3451,7 +3451,7 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, } #ifdef CONFIG_NVME_MULTIPATH if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { - if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ + if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ return 0; if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; @@ -3904,7 +3904,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!disk) goto out_unlink_ns; - disk->fops = &nvme_fops; + disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; disk->flags = flags; -- cgit v1.2.3-59-g8ed1b From 2f4c9ba23b887e7a69a474e9d53f38b5833a2119 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 1 Dec 2020 13:02:21 +0100 Subject: nvme: export zoned namespaces without Zone Append support read-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow ZNS NVMe SSDs to present a read-only namespace when append is not supported, instead of rejecting the namespace directly. This allows (i) the namespace to be used in read-only mode, which is not a problem as the append command only affects the write path, and (ii) to use standard management tools such as nvme-cli to choose a different format or firmware slot that is compatible with the Linux zoned block device. 
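The resulting policy is easiest to see condensed into one place. A sketch follows (identifiers are taken from the hunks below; the control flow is illustrative, not the exact upstream code):

	/* during zone info update: flag the namespace instead of failing */
	if (le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & NVME_CMD_EFFECTS_CSUPP)
		clear_bit(NVME_NS_FORCE_RO, &ns->flags);
	else
		set_bit(NVME_NS_FORCE_RO, &ns->flags);

	/* during disk info update: honor both the controller-reported
	 * read-only attribute and the driver-forced read-only state */
	if ((id->nsattr & NVME_NS_ATTR_RO) ||
	    test_bit(NVME_NS_FORCE_RO, &ns->flags))
		set_disk_ro(disk, true);

Forcing read-only is safe here because Zone Append only matters on the write path; reads never depend on it.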
Signed-off-by: Javier González Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/zns.c | 13 +++++++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1520773d545f..99f91efe3824 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2125,7 +2125,8 @@ static void nvme_update_disk_info(struct gendisk *disk, nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); - if (id->nsattr & NVME_NS_ATTR_RO) + if ((id->nsattr & NVME_NS_ATTR_RO) || + test_bit(NVME_NS_FORCE_RO, &ns->flags)) set_disk_ro(disk, true); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae017f727798..bfcedfa4b057 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -452,6 +452,7 @@ struct nvme_ns { #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 +#define NVME_NS_FORCE_RO 3 struct nvme_fault_inject fault_inject; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 67e87e9f306f..1dfe9a3500e3 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -55,12 +55,17 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) int status; /* Driver requires zone append support */ - if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + if ((le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & NVME_CMD_EFFECTS_CSUPP)) { + if (test_and_clear_bit(NVME_NS_FORCE_RO, &ns->flags)) + dev_warn(ns->ctrl->device, + "Zone Append supported for zoned namespace:%d. Remove read-only mode\n", + ns->head->ns_id); + } else { + set_bit(NVME_NS_FORCE_RO, &ns->flags); dev_warn(ns->ctrl->device, - "append not supported for zoned namespace:%d\n", - ns->head->ns_id); - return -EINVAL; + "Zone Append not supported for zoned namespace:%d. Forcing to read-only mode\n", + ns->head->ns_id); } /* Lazily query controller append limit for the first zoned namespace */ -- cgit v1.2.3-59-g8ed1b From ce9fe18abb7c86a71b545e1cdd60fe333bf462a3 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 26 Nov 2020 11:47:16 +0100 Subject: block/rnbd-clt: Make path parameter optional for map_device During map_device, if the given session exists, the path parameter is not used and is therefore redundant. This commit makes the path parameter optional for map_device. When the path parameter is not given and the session exists, that session is used to establish the rtrs connection. If the session does not exist and the path parameter is also missing, map_device fails.
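A minimal sketch of the resulting control flow in find_and_get_or_create_sess() (the helpers find_sess() and create_sess() are hypothetical placeholders standing in for the existing lookup and creation paths):

	struct rnbd_clt_session *sess;

	sess = find_sess(sessname);	/* hypothetical lookup helper */
	if (sess)
		return sess;		/* existing session: paths unused */
	if (!path_cnt)
		return ERR_PTR(-ENXIO);	/* new session needs at least one path */
	return create_sess(sessname, paths, path_cnt);	/* hypothetical */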
Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 1 - drivers/block/rnbd/rnbd-clt.c | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 4f4474eecadb..e7b41ec7cd6a 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -37,7 +37,6 @@ enum { }; static const unsigned int rnbd_opt_mandatory[] = { - RNBD_OPT_PATH, RNBD_OPT_DEV_PATH, RNBD_OPT_SESSNAME, }; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 8b2411ccbda9..edefa0761a81 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1193,6 +1193,12 @@ find_and_get_or_create_sess(const char *sessname, else if (!first) return sess; + if (!path_cnt) { + pr_err("Session %s not found, and path parameter not given", sessname); + err = -ENXIO; + goto put_sess; + } + rtrs_ops = (struct rtrs_clt_ops) { .priv = sess, .link_ev = rnbd_clt_link_ev, -- cgit v1.2.3-59-g8ed1b From 91f4acb2801ce4985483b0fa174bbe995d105417 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 26 Nov 2020 11:47:17 +0100 Subject: block/rnbd-clt: support mapping two devices with the same name from different servers Previously, we couldn't map the same device name from different sessions due to the limitations of the sysfs naming mechanism. root@clt2:~# ls -l /sys/class/rnbd-client/ctl/devices/ total 0 lrwxrwxrwx 1 root 0 Sep 2 16:31 !dev!nullb1 -> ../../../block/rnbd0 Only the device name is used in the above, which meant a device with the same name couldn't be mapped from another server. To address the issue, the sessname is appended to the node to differentiate where the device comes from. Also, we need to check whether the pathname exists in a specific session instead of searching for it in the global sess_list.
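One caution on the rnbd_clt_get_path_name() hunk that follows: snprintf(buf, len, "%s@%s", buf, ...) passes the destination buffer as a source argument, and the C standard leaves copying between overlapping objects undefined for the sprintf family. A safer sketch that appends in place without aliasing (same variables as the function in the hunk):

	/* append "@<sessname>" after the existing contents of buf,
	 * writing only past the current end of the string */
	int off = strlen(buf);

	ret = snprintf(buf + off, len - off, "@%s", dev->sess->sessname);
	if (ret >= len - off)
		return -ENAMETOOLONG;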
Signed-off-by: Guoqing Jiang Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 4 ++++ drivers/block/rnbd/rnbd-clt.c | 13 ++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index e7b41ec7cd6a..5d3c3c80dab4 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -480,6 +480,10 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, if (ret >= len) return -ENAMETOOLONG; + ret = snprintf(buf, len, "%s@%s", buf, dev->sess->sessname); + if (ret >= len) + return -ENAMETOOLONG; + return 0; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index edefa0761a81..1bb495e50931 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1410,13 +1410,16 @@ out_alloc: return ERR_PTR(ret); } -static bool __exists_dev(const char *pathname) +static bool __exists_dev(const char *pathname, const char *sessname) { struct rnbd_clt_session *sess; struct rnbd_clt_dev *dev; bool found = false; list_for_each_entry(sess, &sess_list, list) { + if (sessname && strncmp(sess->sessname, sessname, + sizeof(sess->sessname))) + continue; mutex_lock(&sess->lock); list_for_each_entry(dev, &sess->devs_list, list) { if (!strncmp(dev->pathname, pathname, @@ -1433,12 +1436,12 @@ static bool __exists_dev(const char *pathname) return found; } -static bool exists_devpath(const char *pathname) +static bool exists_devpath(const char *pathname, const char *sessname) { bool found; mutex_lock(&sess_lock); - found = __exists_dev(pathname); + found = __exists_dev(pathname, sessname); mutex_unlock(&sess_lock); return found; } @@ -1451,7 +1454,7 @@ static bool insert_dev_if_not_exists_devpath(const char *pathname, bool found; mutex_lock(&sess_lock); - found = __exists_dev(pathname); + found = __exists_dev(pathname, sess->sessname); if (!found) { mutex_lock(&sess->lock); list_add_tail(&dev->list, &sess->devs_list); @@ -1481,7 +1484,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, struct rnbd_clt_dev *dev; int ret; - if (exists_devpath(pathname)) + if (unlikely(exists_devpath(pathname, sessname))) return ERR_PTR(-EEXIST); sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr); -- cgit v1.2.3-59-g8ed1b From 47479b795490f146ff045ec3ee5a724bbce294f0 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 26 Nov 2020 11:47:18 +0100 Subject: Documentation/ABI/rnbd-clt: fix typo in sysfs-class-rnbd-client /sys/block/rnbd<N> is created, not /sys/block/rnbd_client/rnbd<N> Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-class-rnbd-client | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-client b/Documentation/ABI/testing/sysfs-class-rnbd-client index 00c0286733d4..ca3267b81886 100644 --- a/Documentation/ABI/testing/sysfs-class-rnbd-client +++ b/Documentation/ABI/testing/sysfs-class-rnbd-client @@ -66,7 +66,7 @@ Description: Expected format is the following:: The rnbd_server prepends the <device_path> received from client with <dev_search_path> and tries to open the <dev_search_path>/<device_path> block device. On success, - a /dev/rnbd<N> device file, a /sys/block/rnbd_client/rnbd<N>/ + a /dev/rnbd<N> device file, a /sys/block/rnbd<N>/ directory and an entry in /sys/class/rnbd-client/ctl/devices will be created.
-- cgit v1.2.3-59-g8ed1b From 7578d5cd1e0fe71736970372fcf96341d69f2234 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 26 Nov 2020 11:47:19 +0100 Subject: Documentation/ABI/rnbd-clt: session name is appended to the device path When mapping a device, /sys/devices/virtual/rnbd-client/ctl/devices/<device_id> was created. But we found out that it had a problem when mapping the same file on different servers. So we append the session name after the device_id as below. /sys/devices/virtual/rnbd-client/ctl/devices/<device_id>@<session_name> Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-class-rnbd-client | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-client b/Documentation/ABI/testing/sysfs-class-rnbd-client index ca3267b81886..2aa05b3e348e 100644 --- a/Documentation/ABI/testing/sysfs-class-rnbd-client +++ b/Documentation/ABI/testing/sysfs-class-rnbd-client @@ -95,12 +95,12 @@ Description: Expected format is the following:: --------------------------------- After mapping, the device file can be found by: - o The symlink /sys/class/rnbd-client/ctl/devices/<device_id> + o The symlink /sys/class/rnbd-client/ctl/devices/<device_id>@<session_name> points to /sys/block/<device name>. The last part of the symlink destination is the same as the device name. By extracting the last part of the path, the path to the device /dev/<device name> can be built. - * /dev/block/$(cat /sys/class/rnbd-client/ctl/devices/<device_id>/dev) + * /dev/block/$(cat /sys/class/rnbd-client/ctl/devices/<device_id>@<session_name>/dev) How to find the <device_id> of the device is described in the next section. @@ -110,7 +110,7 @@ Date: Feb 2020 KernelVersion: 5.7 Contact: Jack Wang Danil Kipnis Description: For each device mapped on the client a new symbolic link is created as - /sys/class/rnbd-client/ctl/devices/<device_id>, which points + /sys/class/rnbd-client/ctl/devices/<device_id>@<session_name>, which points to the block device created by rnbd (/sys/block/rnbd<N>/). The <device_id> of each device is created as follows: -- cgit v1.2.3-59-g8ed1b From 786998050cbc8ead32e6e9fcda2facb3bf3d198d Mon Sep 17 00:00:00 2001 From: Lutz Pogrell Date: Thu, 26 Nov 2020 11:47:20 +0100 Subject: block/rnbd-srv: close a mapped device from server side. The forceful close of an exported device is required for the use case when the client side hangs, has crashed, or is not accessible. There have been cases observed where only some of the devices are to be cleaned up while the session shall remain. When the device is to be exported to a different client host, server-side cleanup is required.
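The implementation that follows relies on the self-removal idiom for sysfs attributes: a ->store() handler cannot tear down its own kobject while sysfs still holds an active reference on the attribute being written, so the attribute removes itself first. A generic sketch of the idiom (struct example and example_teardown() are hypothetical names; the API calls are the real kernel ones):

	static ssize_t force_close_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
	{
		struct example *e = container_of(kobj, struct example, kobj);

		if (!sysfs_streq(buf, "1"))
			return -EINVAL;

		/* drop our own sysfs node first to avoid deadlocking on
		 * the active reference held while this handler runs */
		sysfs_remove_file_self(kobj, &attr->attr);
		example_teardown(e);	/* hypothetical cleanup */
		return count;
	}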
Signed-off-by: Lutz Pogrell Signed-off-by: Jack Wang Reviewed-by: Gioh Kim Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-sysfs.c | 38 ++++++++++++++++++++++++++++++++++++- drivers/block/rnbd/rnbd-srv.c | 19 +++++++++++++++++-- drivers/block/rnbd/rnbd-srv.h | 4 +++- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 106775c074d1..08ffb492ebfa 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -120,10 +120,46 @@ static ssize_t mapping_path_show(struct kobject *kobj, static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = __ATTR_RO(mapping_path); +static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + if (!sysfs_streq(buf, "1")) { + rnbd_srv_err(sess_dev, "%s: invalid value: '%s'\n", + attr->attr.name, buf); + return -EINVAL; + } + + rnbd_srv_info(sess_dev, "force close requested\n"); + + /* first remove sysfs itself to avoid deadlock */ + sysfs_remove_file_self(&sess_dev->kobj, &attr->attr); + rnbd_srv_sess_dev_force_close(sess_dev); + + return count; +} + +static struct kobj_attribute rnbd_srv_dev_session_force_close_attr = + __ATTR(force_close, 0644, + rnbd_srv_dev_session_force_close_show, + rnbd_srv_dev_session_force_close_store); + static struct attribute *rnbd_srv_default_dev_sessions_attrs[] = { &rnbd_srv_dev_session_access_mode_attr.attr, &rnbd_srv_dev_session_ro_attr.attr, &rnbd_srv_dev_session_mapping_path_attr.attr, + &rnbd_srv_dev_session_force_close_attr.attr, NULL, }; @@ -145,7 +181,7 @@ static void rnbd_srv_sess_dev_release(struct kobject *kobj) struct rnbd_srv_sess_dev *sess_dev; sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - rnbd_destroy_sess_dev(sess_dev); + rnbd_destroy_sess_dev(sess_dev, sess_dev->keep_id); } static struct kobj_type rnbd_srv_sess_dev_ktype = { diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index e1bc8b4cd592..d1ee72ed8384 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -212,12 +212,20 @@ static void rnbd_put_srv_dev(struct rnbd_srv_dev *dev) kref_put(&dev->kref, destroy_device_cb); } -void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev) +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) { DECLARE_COMPLETION_ONSTACK(dc); - xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); + if (keep_id) + /* free the resources for the id but don't */ + /* allow to re-use the id itself because it */ + /* is still used by the client */ + xa_cmpxchg(&sess_dev->sess->index_idr, sess_dev->device_id, + sess_dev, NULL, 0); + else + xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); synchronize_rcu(); + sess_dev->destroy_comp = &dc; rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ @@ -328,6 +336,13 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, } } +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) +{ + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + sess_dev->keep_id = true; + +} + static int process_msg_close(struct 
rtrs_srv *rtrs, struct rnbd_srv_session *srv_sess, void *data, size_t datalen, const void *usr, diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 5a8544b5e74f..b157371c25ed 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -56,6 +56,7 @@ struct rnbd_srv_sess_dev { struct rnbd_srv_dev *dev; struct kobject kobj; u32 device_id; + bool keep_id; fmode_t open_flags; struct kref kref; struct completion *destroy_comp; @@ -63,6 +64,7 @@ struct rnbd_srv_sess_dev { enum rnbd_access_mode access_mode; }; +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev); /* rnbd-srv-sysfs.c */ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, @@ -73,6 +75,6 @@ int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); int rnbd_srv_create_sysfs_files(void); void rnbd_srv_destroy_sysfs_files(void); -void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev); +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id); #endif /* RNBD_SRV_H */ -- cgit v1.2.3-59-g8ed1b From 765c5c56ffde0a555ce69559ab275395fb1a12a9 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Thu, 26 Nov 2020 11:47:21 +0100 Subject: Documentation/ABI/rnbd-srv: add document for force_close Describe the force_close attribute of a device. Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-class-rnbd-server | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-server b/Documentation/ABI/testing/sysfs-class-rnbd-server index ba60a90c0e45..6c5996cd7cfb 100644 --- a/Documentation/ABI/testing/sysfs-class-rnbd-server +++ b/Documentation/ABI/testing/sysfs-class-rnbd-server @@ -48,3 +48,11 @@ Date: Feb 2020 KernelVersion: 5.7 Contact: Jack Wang Danil Kipnis Description: Contains the device access mode: ro, rw or migration. + +What: /sys/class/rnbd-server/ctl/devices/<device_name>/sessions/<session_name>/force_close +Date: Nov 2020 +KernelVersion: 5.10 +Contact: Jack Wang Danil Kipnis +Description: Write "1" to the file to close the device on server side. Please + note that the client side device will not be closed; reads or + writes to the device will return -ENOTCONN. -- cgit v1.2.3-59-g8ed1b From d3a95ccaaf4df94743a958c90ab85f4703e3a687 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 26 Nov 2020 11:47:22 +0100 Subject: block/rnbd: call kobject_put in the failure path Per the comment of kobject_init_and_add, we need to clean up the memory by calling kobject_put. We also need to call kobject_del in the other failure paths where kobject_init_and_add did not fail.
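For reference, the complete error-handling shape the patch below converges on, in generic form (example_ktype and example_attr_group are hypothetical names; the rule itself is from the kobject API documentation): kobject_init_and_add() takes a reference even when it fails, so the failure path needs kobject_put(), and once the kobject has been registered, later failures additionally need kobject_del() before the final put.

	ret = kobject_init_and_add(&e->kobj, &example_ktype, parent, "example");
	if (ret) {
		kobject_put(&e->kobj);	/* frees the object via ->release() */
		return ret;
	}

	ret = sysfs_create_group(&e->kobj, &example_attr_group);
	if (ret) {
		kobject_del(&e->kobj);	/* unregister from sysfs first */
		kobject_put(&e->kobj);	/* then drop the reference */
	}
	return ret;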
Signed-off-by: Guoqing Jiang Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 4 +++- drivers/block/rnbd/rnbd-srv-sysfs.c | 28 ++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 5d3c3c80dab4..e3c3270b0cee 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -450,9 +450,11 @@ static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) ret = kobject_init_and_add(&dev->kobj, &rnbd_dev_ktype, gd_kobj, "%s", "rnbd"); - if (ret) + if (ret) { rnbd_clt_err(dev, "Failed to create device sysfs dir, err: %d\n", ret); + kobject_put(&dev->kobj); + } return ret; } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 08ffb492ebfa..05ffe488ddc6 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -47,13 +47,17 @@ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype, rnbd_devs_kobj, dev_name); - if (ret) + if (ret) { + kobject_put(&dev->dev_kobj); return ret; + } dev->dev_sessions_kobj = kobject_create_and_add("sessions", &dev->dev_kobj); - if (!dev->dev_sessions_kobj) - goto put_dev_kobj; + if (!dev->dev_sessions_kobj) { + ret = -ENOMEM; + goto free_dev_kobj; + } bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj; ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev"); @@ -64,7 +68,8 @@ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, put_sess_kobj: kobject_put(dev->dev_sessions_kobj); -put_dev_kobj: +free_dev_kobj: + kobject_del(&dev->dev_kobj); kobject_put(&dev->dev_kobj); return ret; } @@ -196,18 +201,17 @@ int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev) ret = kobject_init_and_add(&sess_dev->kobj, &rnbd_srv_sess_dev_ktype, sess_dev->dev->dev_sessions_kobj, "%s", sess_dev->sess->sessname); - if (ret) + if (ret) { + kobject_put(&sess_dev->kobj); return ret; + } ret = sysfs_create_group(&sess_dev->kobj, &rnbd_srv_default_dev_session_attr_group); - if (ret) - goto err; - - return 0; - -err: - kobject_put(&sess_dev->kobj); + if (ret) { + kobject_del(&sess_dev->kobj); + kobject_put(&sess_dev->kobj); + } return ret; } -- cgit v1.2.3-59-g8ed1b From 64e8a6ece1a5b1fa21316918053d068baeac84af Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 26 Nov 2020 11:47:23 +0100 Subject: block/rnbd-clt: Dynamically alloc buffer for pathname & blk_symlink_name For every rnbd_clt_dev, we allocate the pathname and blk_symlink_name statically at NAME_MAX, which is 255 bytes. In most cases we need fewer than 10 bytes, so roughly 500 bytes per block device are wasted. This commit dynamically allocates the memory buffers for pathname and blk_symlink_name.
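A side note on the allocation in the rnbd-clt.c hunk below: the kzalloc(strlen(pathname) + 1) + strlcpy() pair is equivalent to kstrdup(), which does the allocate-and-copy in one call and may read more clearly. A sketch of the alternative, under the same error-handling labels as the surrounding function:

	/* alternative to kzalloc() + strlcpy(): duplicate in one call */
	dev->pathname = kstrdup(pathname, GFP_KERNEL);
	if (!dev->pathname) {
		ret = -ENOMEM;
		goto out_queues;
	}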
Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Reviewed-by: Lutz Pogrell Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 12 ++++++++++-- drivers/block/rnbd/rnbd-clt.c | 14 +++++++++++--- drivers/block/rnbd/rnbd-clt.h | 4 ++-- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index e3c3270b0cee..c3c96a567568 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -434,6 +434,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) */ if (strlen(dev->blk_symlink_name) && try_module_get(THIS_MODULE)) { sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); + kfree(dev->blk_symlink_name); module_put(THIS_MODULE); } } @@ -492,10 +493,17 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) { struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; - int ret; + int ret, len; + + len = strlen(dev->pathname) + strlen(dev->sess->sessname) + 2; + dev->blk_symlink_name = kzalloc(len, GFP_KERNEL); + if (!dev->blk_symlink_name) { + rnbd_clt_err(dev, "Failed to allocate memory for blk_symlink_name\n"); + goto out_err; + } ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, - sizeof(dev->blk_symlink_name)); + len); if (ret) { rnbd_clt_err(dev, "Failed to get /sys/block symlink path, err: %d\n", ret); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 1bb495e50931..34bc6083b58d 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -59,6 +59,7 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) ida_simple_remove(&index_ida, dev->clt_device_id); mutex_unlock(&ida_lock); kfree(dev->hw_queues); + kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); mutex_destroy(&dev->lock); kfree(dev); @@ -1387,10 +1388,17 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, pathname, sess->sessname, ret); goto out_queues; } + + dev->pathname = kzalloc(strlen(pathname) + 1, GFP_KERNEL); + if (!dev->pathname) { + ret = -ENOMEM; + goto out_queues; + } + strlcpy(dev->pathname, pathname, strlen(pathname) + 1); + dev->clt_device_id = ret; dev->sess = sess; dev->access_mode = access_mode; - strlcpy(dev->pathname, pathname, sizeof(dev->pathname)); mutex_init(&dev->lock); refcount_set(&dev->refcount, 1); dev->dev_state = DEV_STATE_INIT; @@ -1422,8 +1430,8 @@ static bool __exists_dev(const char *pathname, const char *sessname) continue; mutex_lock(&sess->lock); list_for_each_entry(dev, &sess->devs_list, list) { - if (!strncmp(dev->pathname, pathname, - sizeof(dev->pathname))) { + if (strlen(dev->pathname) == strlen(pathname) && + !strcmp(dev->pathname, pathname)) { found = true; break; } diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index ed33654aa486..b193d5904050 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -108,7 +108,7 @@ struct rnbd_clt_dev { u32 clt_device_id; struct mutex lock; enum rnbd_clt_dev_state dev_state; - char pathname[NAME_MAX]; + char *pathname; enum rnbd_access_mode access_mode; bool read_only; bool rotational; @@ -126,7 +126,7 @@ struct rnbd_clt_dev { struct list_head list; struct gendisk *gd; struct kobject kobj; - char blk_symlink_name[NAME_MAX]; + char *blk_symlink_name; refcount_t refcount; struct work_struct unmap_on_rmmod_work; }; -- cgit v1.2.3-59-g8ed1b From 733c15bd3a944b8eeaacdddf061759b6a83dd3f4 Mon Sep 17 00:00:00 
2001 From: Colin Ian King Date: Mon, 7 Dec 2020 14:54:46 +0000 Subject: block/rnbd: fix a null pointer dereference on dev->blk_symlink_name Currently, in the case where dev->blk_symlink_name fails to be allocated, the error return path attempts to write an end-of-string character to the unallocated dev->blk_symlink_name, causing a null pointer dereference. Fix this by returning an explicit -ENOMEM error (which was also missing in the original code, as ret was not initialized). Fixes: 1eb54f8f5dd8 ("block/rnbd: client: sysfs interface functions") Signed-off-by: Colin Ian King Addresses-Coverity: ("Dereference after null check") Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index c3c96a567568..a7caeedeb198 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -499,7 +499,7 @@ static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) dev->blk_symlink_name = kzalloc(len, GFP_KERNEL); if (!dev->blk_symlink_name) { rnbd_clt_err(dev, "Failed to allocate memory for blk_symlink_name\n"); - goto out_err; + return -ENOMEM; } ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, -- cgit v1.2.3-59-g8ed1b From df4ad53242158f9f1f97daf4feddbb4f8b77f080 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 8 Dec 2020 00:39:15 +0800 Subject: bcache: fix race between setting bdev state to none and new write request direct to backing There is a race condition in detaching as below: A. detaching B. Write request (1) writing back (2) write back done, set bdev state to clean. (3) cached_dev_put() and schedule_work(&dc->detach); (4) write data [0 - 4K] directly into backing and ack to user. (5) power-failure... When we restart this bcache device, this bdev is clean but not detached, and a read of [0 - 4K] will return unexpected old data from the cache device. To fix this problem, set the bdev state to none when writeback is done during detaching. Then, if a power failure happens as above, the data in the cache will not be used on the next bcache device start; since the device is detached, we will read the correct data from the backing device directly.
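The fix below moves a synchronous super-block write into the writeback thread. For readers unfamiliar with bcache's closure idiom used there, the shape is as follows (a sketch mirroring the hunk that follows; dc is the cached_dev from the surrounding code):

	struct closure cl;

	closure_init_stack(&cl);
	/* queue the super-block write; cl is signalled on completion */
	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);	/* block here until the write has finished */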
Signed-off-by: Dongsheng Yang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 9 --------- drivers/md/bcache/writeback.c | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46a00134a36a..b1a6ba9a5adb 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1114,9 +1114,6 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); - struct closure cl; - - closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count)); @@ -1130,12 +1127,6 @@ static void cached_dev_detach_finish(struct work_struct *w) dc->writeback_thread = NULL; } - memset(&dc->sb.set_uuid, 0, 16); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); - - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - mutex_lock(&bch_register_lock); calc_cached_dev_sectors(dc->disk.c); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3c74996978da..a129e4d2707c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -705,6 +705,15 @@ static int bch_writeback_thread(void *arg) * bch_cached_dev_detach(). */ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + struct closure cl; + + closure_init_stack(&cl); + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + up_write(&dc->writeback_lock); break; } -- cgit v1.2.3-59-g8ed1b From 0ebcdd702f49aeb0ad2e2d894f8c124a0acc6e23 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:11 +0900 Subject: null_blk: Fix zone size initialization A null_blk device with zoned mode enabled is currently initialized with a number of zones equal to the device capacity divided by the zone size, without considering whether the device capacity is a multiple of the zone size. If the zone size is not a divisor of the capacity, the zones end up not covering the entire capacity, potentially resulting in out of bounds accesses to the zone array. Fix this by adding one last smaller zone with a size equal to the remainder of the disk capacity divided by the zone size if the capacity is not a multiple of the zone size. For such a smaller last zone, the zone capacity is also checked so that it does not exceed the smaller zone size. Reported-by: Naohiro Aota Fixes: ca4b2a011948 ("null_blk: add zone support") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index beb34b4f76b0..1d0370d91fe7 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -6,8 +6,7 @@ #define CREATE_TRACE_POINTS #include "null_blk_trace.h" -/* zone_size in MBs to sectors.
*/ -#define ZONE_SIZE_SHIFT 11 +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) { @@ -16,7 +15,7 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) { - sector_t dev_size = (sector_t)dev->size * 1024 * 1024; + sector_t dev_capacity_sects, zone_capacity_sects; sector_t sector = 0; unsigned int i; @@ -38,9 +37,13 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) return -EINVAL; } - dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; - dev->nr_zones = dev_size >> - (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); + zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); + dev_capacity_sects = MB_TO_SECTS(dev->size); + dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); + dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); + if (dev_capacity_sects & (dev->zone_size_sects - 1)) + dev->nr_zones++; + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), GFP_KERNEL | __GFP_ZERO); if (!dev->zones) @@ -101,8 +104,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) struct blk_zone *zone = &dev->zones[i]; zone->start = zone->wp = sector; - zone->len = dev->zone_size_sects; - zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT; + if (zone->start + dev->zone_size_sects > dev_capacity_sects) + zone->len = dev_capacity_sects - zone->start; + else + zone->len = dev->zone_size_sects; + zone->capacity = + min_t(sector_t, zone->len, zone_capacity_sects); zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; -- cgit v1.2.3-59-g8ed1b From 2e896d89510f23927ec393bee1e0570db3d5a6c6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:12 +0900 Subject: null_blk: Fail zone append to conventional zones Conventional zones do not have a write pointer and so cannot accept zone append writes. Make sure to fail any zone append write command issued to a conventional zone. Reported-by: Naohiro Aota Fixes: e0489ed5daeb ("null_blk: Support REQ_OP_ZONE_APPEND") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 1d0370d91fe7..172f720b8d63 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -339,8 +339,11 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, trace_nullb_zone_op(cmd, zno, zone->cond); - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (append) + return BLK_STS_IOERR; return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + } null_lock_zone(dev, zno); -- cgit v1.2.3-59-g8ed1b From 817046ecddbc5f3cdd93fb84dd58c58ced987dee Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:13 +0900 Subject: block: Align max_hw_sectors to logical blocksize Block device drivers do not have to call blk_queue_max_hw_sectors() to set a limit on request size if the default limit BLK_SAFE_MAX_SECTORS is acceptable. However, this limit (255 sectors) may not be aligned to the device logical block size, in which case it cannot be used as-is as a request maximum size. This is the case for the null_blk device driver.
Modify blk_queue_max_hw_sectors() to make sure that the request size limits specified by the max_hw_sectors and max_sectors queue limits are always aligned to the device logical block size. Additionally, to avoid introducing a dependence on the execution order of this function with blk_queue_logical_block_size(), also modify blk_queue_logical_block_size() to perform the same alignment when the logical block size is set after max_hw_sectors. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-settings.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 9741d1d83e98..dde5c2e9a728 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,10 +157,16 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } + max_hw_sectors = round_down(max_hw_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_hw_sectors = max_hw_sectors; + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + max_sectors = round_down(max_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -321,13 +327,20 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); **/ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { - q->limits.logical_block_size = size; + struct queue_limits *limits = &q->limits; - if (q->limits.physical_block_size < size) - q->limits.physical_block_size = size; + limits->logical_block_size = size; - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; + if (limits->physical_block_size < size) + limits->physical_block_size = size; + + if (limits->io_min < limits->physical_block_size) + limits->io_min = limits->physical_block_size; + + limits->max_hw_sectors = + round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); + limits->max_sectors = + round_down(limits->max_sectors, size >> SECTOR_SHIFT); } EXPORT_SYMBOL(blk_queue_logical_block_size); -- cgit v1.2.3-59-g8ed1b From 2b8b7ed7f3fc2b1536a0add3941ae159529d23bd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:14 +0900 Subject: null_blk: improve zone locking With memory backing disabled, using a single spinlock for protecting zone information and zone resource management prevents IO requests to different zones from executing in parallel on multiple queues. Furthermore, regardless of the use of memory backing, if a null_blk device is created without limits on the number of open and active zones, accounting for zone resource management is not necessary. From these observations, zone locking is changed as follows to improve performance: 1) the zone_lock spinlock is renamed zone_res_lock and used only if zone resource management is necessary, that is, if either zone_max_open or zone_max_active are not 0. This is indicated using the new boolean need_zone_res_mgmt in the nullb_device structure. null_zone_write() is modified to reduce the amount of code executed with the zone_res_lock spinlock held. 2) With memory backing disabled, per zone locking is changed to a spinlock per zone. 3) Introduce the structure nullb_zone to replace the use of struct blk_zone for zone information.
This new structure includes a union of a spinlock and a mutex for zone locking. The spinlock is used when memory backing is disabled and the mutex is used with memory backing. With these changes, fio performance with zonemode=zbd for 4K random read and random write on a dual socket (24 cores per socket) machine using the none scheduler is as follows: before patch: write (psync x 96 jobs) = 465 KIOPS read (libaio@qd=8 x 96 jobs) = 1361 KIOPS after patch: write (psync x 96 jobs) = 456 KIOPS read (libaio@qd=8 x 96 jobs) = 4096 KIOPS Write performance remains mostly unchanged but read performance is three times higher. Performance when using the mq-deadline scheduler is not changed by this patch as mq-deadline becomes the bottleneck for a multi-queue device. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 28 ++++- drivers/block/null_blk_zoned.c | 280 ++++++++++++++++++++++++----------------- 2 files changed, 188 insertions(+), 120 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index c24d9b5ad81a..14546ead1d66 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -12,6 +12,8 @@ #include #include #include +#include <linux/spinlock.h> +#include <linux/mutex.h> struct nullb_cmd { struct request *rq; @@ -32,6 +34,26 @@ struct nullb_queue { struct nullb_cmd *cmds; }; +struct nullb_zone { + /* + * Zone lock to prevent concurrent modification of a zone write + * pointer position and condition: with memory backing, a write + * command execution may sleep on memory allocation. For this case, + * use mutex as the zone lock. Otherwise, use the spinlock for + * locking the zone. + */ + union { + spinlock_t spinlock; + struct mutex mutex; + }; + enum blk_zone_type type; + enum blk_zone_cond cond; + sector_t start; + sector_t wp; + unsigned int len; + unsigned int capacity; +}; + struct nullb_device { struct nullb *nullb; struct config_item item; @@ -45,10 +67,10 @@ struct nullb_device { unsigned int nr_zones_imp_open; unsigned int nr_zones_exp_open; unsigned int nr_zones_closed; - struct blk_zone *zones; + struct nullb_zone *zones; sector_t zone_size_sects; - spinlock_t zone_lock; - unsigned long *zone_locks; + bool need_zone_res_mgmt; + spinlock_t zone_res_lock; unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 172f720b8d63..4d5c0b938618 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -13,9 +13,49 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } +static inline void null_lock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_lock_irq(&dev->zone_res_lock); +} + +static inline void null_unlock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_unlock_irq(&dev->zone_res_lock); +} + +static inline void null_init_zone_lock(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_init(&zone->spinlock); + else + mutex_init(&zone->mutex); +} + +static inline void null_lock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_irq(&zone->spinlock); + else + mutex_lock(&zone->mutex); +} + +static inline void null_unlock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) +
spin_unlock_irq(&zone->spinlock); + else + mutex_unlock(&zone->mutex); +} + int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) { sector_t dev_capacity_sects, zone_capacity_sects; + struct nullb_zone *zone; sector_t sector = 0; unsigned int i; @@ -44,26 +84,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) if (dev_capacity_sects & (dev->zone_size_sects - 1)) dev->nr_zones++; - dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), - GFP_KERNEL | __GFP_ZERO); + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone), + GFP_KERNEL | __GFP_ZERO); if (!dev->zones) return -ENOMEM; - /* - * With memory backing, the zone_lock spinlock needs to be temporarily - * released to avoid scheduling in atomic context. To guarantee zone - * information protection, use a bitmap to lock zones with - * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing - * implies that the queue is marked with BLK_MQ_F_BLOCKING. - */ - spin_lock_init(&dev->zone_lock); - if (dev->memory_backed) { - dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); - if (!dev->zone_locks) { - kvfree(dev->zones); - return -ENOMEM; - } - } + spin_lock_init(&dev->zone_res_lock); if (dev->zone_nr_conv >= dev->nr_zones) { dev->zone_nr_conv = dev->nr_zones - 1; @@ -86,10 +112,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); } + dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; for (i = 0; i < dev->zone_nr_conv; i++) { - struct blk_zone *zone = &dev->zones[i]; + zone = &dev->zones[i]; + null_init_zone_lock(dev, zone); zone->start = sector; zone->len = dev->zone_size_sects; zone->capacity = zone->len; @@ -101,8 +129,9 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) } for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - struct blk_zone *zone = &dev->zones[i]; + zone = &dev->zones[i]; + null_init_zone_lock(dev, zone); zone->start = zone->wp = sector; if (zone->start + dev->zone_size_sects > dev_capacity_sects) zone->len = dev_capacity_sects - zone->start; @@ -147,32 +176,17 @@ int null_register_zoned_dev(struct nullb *nullb) void null_free_zoned_dev(struct nullb_device *dev) { - bitmap_free(dev->zone_locks); kvfree(dev->zones); } -static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) -{ - if (dev->memory_backed) - wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&dev->zone_lock); -} - -static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) -{ - spin_unlock_irq(&dev->zone_lock); - - if (dev->memory_backed) - clear_and_wake_up_bit(zno, dev->zone_locks); -} - int null_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; - unsigned int first_zone, i, zno; - struct blk_zone zone; + unsigned int first_zone, i; + struct nullb_zone *zone; + struct blk_zone blkz; int error; first_zone = null_zone_no(dev, sector); @@ -182,19 +196,25 @@ int null_report_zones(struct gendisk *disk, sector_t sector, nr_zones = min(nr_zones, dev->nr_zones - first_zone); trace_nullb_report_zones(nullb, nr_zones); - zno = first_zone; - for (i = 0; i < nr_zones; i++, zno++) { + memset(&blkz, 0, sizeof(struct blk_zone)); + zone = &dev->zones[first_zone]; + for (i = 0; i < nr_zones; i++, zone++) { /* * 
Stacked DM target drivers will remap the zone information by * modifying the zone information passed to the report callback. * So use a local copy to avoid corruption of the device zone * array. */ - null_lock_zone(dev, zno); - memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); - null_unlock_zone(dev, zno); - - error = cb(&zone, i, data); + null_lock_zone(dev, zone); + blkz.start = zone->start; + blkz.len = zone->len; + blkz.wp = zone->wp; + blkz.type = zone->type; + blkz.cond = zone->cond; + blkz.capacity = zone->capacity; + null_unlock_zone(dev, zone); + + error = cb(&blkz, i, data); if (error) return error; } @@ -210,7 +230,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len) { struct nullb_device *dev = nullb->dev; - struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; unsigned int nr_sectors = len >> SECTOR_SHIFT; /* Read must be below the write pointer position */ @@ -224,11 +244,9 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } -static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t __null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) { - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - switch (zone->cond) { case BLK_ZONE_COND_CLOSED: /* close operation on closed is not an error */ @@ -261,7 +279,7 @@ static void null_close_first_imp_zone(struct nullb_device *dev) for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { - null_close_zone(dev, &dev->zones[i]); + __null_close_zone(dev, &dev->zones[i]); return; } } @@ -310,7 +328,8 @@ static blk_status_t null_check_open(struct nullb_device *dev) * it is not certain that closing an implicit open zone will allow a new zone * to be opened, since we might already be at the active limit capacity. 
*/ -static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_check_zone_resources(struct nullb_device *dev, + struct nullb_zone *zone) { blk_status_t ret; @@ -334,7 +353,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, { struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); - struct blk_zone *zone = &dev->zones[zno]; + struct nullb_zone *zone = &dev->zones[zno]; blk_status_t ret; trace_nullb_zone_op(cmd, zno, zone->cond); @@ -345,26 +364,12 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); } - null_lock_zone(dev, zno); + null_lock_zone(dev, zone); - switch (zone->cond) { - case BLK_ZONE_COND_FULL: + if (zone->cond == BLK_ZONE_COND_FULL) { /* Cannot write to a full zone */ ret = BLK_STS_IOERR; goto unlock; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - break; - default: - /* Invalid zone condition */ - ret = BLK_STS_IOERR; - goto unlock; } /* @@ -389,60 +394,69 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, goto unlock; } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) { + null_lock_zone_res(dev); + + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + null_unlock_zone_res(dev); + goto unlock; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } + + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + null_unlock_zone_res(dev); } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - /* - * Memory backing allocation may sleep: release the zone_lock spinlock - * to avoid scheduling in atomic context. Zone operation atomicity is - * still guaranteed through the zone_locks bitmap. 
- */ - if (dev->memory_backed) - spin_unlock_irq(&dev->zone_lock); ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (dev->memory_backed) - spin_lock_irq(&dev->zone_lock); - if (ret != BLK_STS_OK) goto unlock; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { + null_lock_zone_res(dev); if (zone->cond == BLK_ZONE_COND_EXP_OPEN) dev->nr_zones_exp_open--; else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) dev->nr_zones_imp_open--; zone->cond = BLK_ZONE_COND_FULL; + null_unlock_zone_res(dev); } + ret = BLK_STS_OK; unlock: - null_unlock_zone(dev, zno); + null_unlock_zone(dev, zone); return ret; } -static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_open_zone(struct nullb_device *dev, + struct nullb_zone *zone) { - blk_status_t ret; + blk_status_t ret = BLK_STS_OK; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: /* open operation on exp open is not an error */ - return BLK_STS_OK; + goto unlock; case BLK_ZONE_COND_EMPTY: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -450,35 +464,57 @@ static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zo case BLK_ZONE_COND_CLOSED: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: default: - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } zone->cond = BLK_ZONE_COND_EXP_OPEN; dev->nr_zones_exp_open++; - return BLK_STS_OK; +unlock: + null_unlock_zone_res(dev); + + return ret; } -static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) { blk_status_t ret; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + ret = __null_close_zone(dev, zone); + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_finish_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret = BLK_STS_OK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_FULL: /* finish operation on full is not an error */ - return BLK_STS_OK; + goto unlock; case BLK_ZONE_COND_EMPTY: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -489,27 +525,35 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone * case BLK_ZONE_COND_CLOSED: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; dev->nr_zones_closed--; break; default: - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zone->len; - return BLK_STS_OK; +unlock: + null_unlock_zone_res(dev); + + return ret; } -static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_reset_zone(struct nullb_device *dev, + struct nullb_zone *zone) { if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_EMPTY: /* reset operation on empty is not an error */ + 
null_unlock_zone_res(dev); return BLK_STS_OK; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -523,12 +567,15 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *z case BLK_ZONE_COND_FULL: break; default: + null_unlock_zone_res(dev); return BLK_STS_IOERR; } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; + null_unlock_zone_res(dev); + return BLK_STS_OK; } @@ -537,19 +584,19 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, { struct nullb_device *dev = cmd->nq->dev; unsigned int zone_no; - struct blk_zone *zone; + struct nullb_zone *zone; blk_status_t ret; size_t i; if (op == REQ_OP_ZONE_RESET_ALL) { for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - null_lock_zone(dev, i); zone = &dev->zones[i]; + null_lock_zone(dev, zone); if (zone->cond != BLK_ZONE_COND_EMPTY) { null_reset_zone(dev, zone); trace_nullb_zone_op(cmd, i, zone->cond); } - null_unlock_zone(dev, i); + null_unlock_zone(dev, zone); } return BLK_STS_OK; } @@ -557,7 +604,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, zone_no = null_zone_no(dev, sector); zone = &dev->zones[zone_no]; - null_lock_zone(dev, zone_no); + null_lock_zone(dev, zone); switch (op) { case REQ_OP_ZONE_RESET: @@ -580,7 +627,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, if (ret == BLK_STS_OK) trace_nullb_zone_op(cmd, zone_no, zone->cond); - null_unlock_zone(dev, zone_no); + null_unlock_zone(dev, zone); return ret; } @@ -588,29 +635,28 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, sector_t nr_sectors) { - struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); + struct nullb_device *dev; + struct nullb_zone *zone; blk_status_t sts; switch (op) { case REQ_OP_WRITE: - sts = null_zone_write(cmd, sector, nr_sectors, false); - break; + return null_zone_write(cmd, sector, nr_sectors, false); case REQ_OP_ZONE_APPEND: - sts = null_zone_write(cmd, sector, nr_sectors, true); - break; + return null_zone_write(cmd, sector, nr_sectors, true); case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - sts = null_zone_mgmt(cmd, op, sector); - break; + return null_zone_mgmt(cmd, op, sector); default: - null_lock_zone(dev, zno); + dev = cmd->nq->dev; + zone = &dev->zones[null_zone_no(dev, sector)]; + + null_lock_zone(dev, zone); sts = null_process_cmd(cmd, op, sector, nr_sectors); - null_unlock_zone(dev, zno); + null_unlock_zone(dev, zone); + return sts; } - - return sts; } -- cgit v1.2.3-59-g8ed1b From 2e8c6e0e1d2d65562c637940747cfa30559f976a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:15 +0900 Subject: null_blk: Improve implicit zone close When open zone resource management is enabled, that is, when a null_blk zoned device is created with zone_max_open different from 0, implicitly or explicitly opening a zone may require implicitly closing a zone that is already implicitly open. This operation is done using the function null_close_first_imp_zone(), which searches for an implicitly open zone to close starting from the first sequential zone. This implementation is simple but may result in the same zone being constantly implicitly closed and then implicitly reopened on write, namely the lowest-numbered zone that is being written.
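As an illustration (this scenario is ours, not part of the original patch description): with zone_max_open=2 and writes cycling over zones 1, 2 and 3, the victim search always restarts at the first sequential zone, so zones 1 and 2 get implicitly closed and reopened over and over while zone 3, once open, is never picked.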
Avoid this by starting the search for an implicitly open zone at the zone following the last zone that was implicitly closed. The function null_close_first_imp_zone() is renamed null_close_imp_open_zone(). Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_zoned.c | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 14546ead1d66..29a8817fadfc 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -67,6 +67,7 @@ struct nullb_device { unsigned int nr_zones_imp_open; unsigned int nr_zones_exp_open; unsigned int nr_zones_closed; + unsigned int imp_close_zone_no; struct nullb_zone *zones; sector_t zone_size_sects; bool need_zone_res_mgmt; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 4d5c0b938618..4dad8748a61d 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -113,6 +113,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) pr_info("zone_max_open limit disabled, limit >= zone count\n"); } dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; + dev->imp_close_zone_no = dev->zone_nr_conv; for (i = 0; i < dev->zone_nr_conv; i++) { zone = &dev->zones[i]; @@ -273,13 +274,24 @@ static blk_status_t __null_close_zone(struct nullb_device *dev, return BLK_STS_OK; } -static void null_close_first_imp_zone(struct nullb_device *dev) +static void null_close_imp_open_zone(struct nullb_device *dev) { - unsigned int i; + struct nullb_zone *zone; + unsigned int zno, i; + + zno = dev->imp_close_zone_no; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, &dev->zones[i]); + zone = &dev->zones[zno]; + zno++; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; + + if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { + __null_close_zone(dev, zone); + dev->imp_close_zone_no = zno; return; } } @@ -307,7 +319,7 @@ static blk_status_t null_check_open(struct nullb_device *dev) if (dev->nr_zones_imp_open) { if (null_check_active(dev) == BLK_STS_OK) { - null_close_first_imp_zone(dev); + null_close_imp_open_zone(dev); return BLK_STS_OK; } } -- cgit v1.2.3-59-g8ed1b From 49c7089f3ded981fcea387f853fa394788e60fb2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:16 +0900 Subject: null_blk: cleanup discard handling null_handle_discard() is called from both null_handle_rq() and null_handle_bio(). As these functions are only passed a nullb_cmd structure, this forces pointer dereferences to identify the discard operation code and to access the sector range to be discarded. Simplify all this by changing the interface of the functions null_handle_discard() and null_handle_memory_backed() to pass along the operation code, operation start sector and number of sectors. With this change, null_handle_discard() can be called directly from null_handle_memory_backed(). Also add a message warning that the discard configuration attribute has no effect when memory backing is disabled. No functional change is introduced by this patch.
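For example (device name and configfs paths illustrative; the message text is taken from the null_config_discard() hunk below), enabling discard on a device without memory backing is now reported when the device is powered on:

$ echo 1 > /sys/kernel/config/nullb/nullb0/discard
$ echo 1 > /sys/kernel/config/nullb/nullb0/power
$ dmesg | tail -1
null_blk: discard option is ignored without memory backing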
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 4685ea401d5b..a223bee24e76 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1076,13 +1076,16 @@ static void nullb_fill_pattern(struct nullb *nullb, struct page *page, kunmap_atomic(dst); } -static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) +static blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) { + struct nullb *nullb = dev->nullb; + size_t n = nr_sectors << SECTOR_SHIFT; size_t temp; spin_lock_irq(&nullb->lock); while (n > 0) { - temp = min_t(size_t, n, nullb->dev->blocksize); + temp = min_t(size_t, n, dev->blocksize); null_free_sector(nullb, sector, false); if (null_cache_active(nullb)) null_free_sector(nullb, sector, true); @@ -1090,6 +1093,8 @@ static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) n -= temp; } spin_unlock_irq(&nullb->lock); + + return BLK_STS_OK; } static int null_handle_flush(struct nullb *nullb) @@ -1149,17 +1154,10 @@ static int null_handle_rq(struct nullb_cmd *cmd) struct nullb *nullb = cmd->nq->dev->nullb; int err; unsigned int len; - sector_t sector; + sector_t sector = blk_rq_pos(rq); struct req_iterator iter; struct bio_vec bvec; - sector = blk_rq_pos(rq); - - if (req_op(rq) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, blk_rq_bytes(rq)); - return 0; - } - spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; @@ -1183,18 +1181,10 @@ static int null_handle_bio(struct nullb_cmd *cmd) struct nullb *nullb = cmd->nq->dev->nullb; int err; unsigned int len; - sector_t sector; + sector_t sector = bio->bi_iter.bi_sector; struct bio_vec bvec; struct bvec_iter iter; - sector = bio->bi_iter.bi_sector; - - if (bio_op(bio) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, - bio_sectors(bio) << SECTOR_SHIFT); - return 0; - } - spin_lock_irq(&nullb->lock); bio_for_each_segment(bvec, bio, iter) { len = bvec.bv_len; @@ -1263,11 +1253,16 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, } static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op) + enum req_opf op, + sector_t sector, + sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; int err; + if (op == REQ_OP_DISCARD) + return null_handle_discard(dev, sector, nr_sectors); + if (dev->queue_mode == NULL_Q_BIO) err = null_handle_bio(cmd); else @@ -1343,7 +1338,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, } if (dev->memory_backed) - return null_handle_memory_backed(cmd, op); + return null_handle_memory_backed(cmd, op, sector, nr_sectors); return BLK_STS_OK; } @@ -1589,6 +1584,12 @@ static void null_config_discard(struct nullb *nullb) if (nullb->dev->discard == false) return; + if (!nullb->dev->memory_backed) { + nullb->dev->discard = false; + pr_info("discard option is ignored without memory backing\n"); + return; + } + if (nullb->dev->zoned) { nullb->dev->discard = false; pr_info("discard option is ignored in zoned mode\n"); -- cgit v1.2.3-59-g8ed1b From 0ec4d913ac69ec86757eec117fc2733018552aa7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:17 +0900 Subject: null_blk: discard zones on reset When 
memory backing is enabled, use null_handle_discard() to free the backing memory used by a zone when the zone is being reset. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 2 ++ drivers/block/null_blk_main.c | 4 ++-- drivers/block/null_blk_zoned.c | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 29a8817fadfc..63000aeeb2f3 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -116,6 +116,8 @@ struct nullb { char disk_name[DISK_NAME_LEN]; }; +blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, + sector_t nr_sectors); blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, unsigned int nr_sectors); diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index a223bee24e76..b758b9366630 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1076,8 +1076,8 @@ static void nullb_fill_pattern(struct nullb *nullb, struct page *page, kunmap_atomic(dst); } -static blk_status_t null_handle_discard(struct nullb_device *dev, - sector_t sector, sector_t nr_sectors) +blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) { struct nullb *nullb = dev->nullb; size_t n = nr_sectors << SECTOR_SHIFT; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 4dad8748a61d..65464f7559e0 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -588,6 +588,9 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, null_unlock_zone_res(dev); + if (dev->memory_backed) + return null_handle_discard(dev, zone->start, zone->len); + return BLK_STS_OK; } -- cgit v1.2.3-59-g8ed1b From ea17fd354ca8afd3e8962a77236b1a9a59262fdd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:18 +0900 Subject: null_blk: Allow controlling max_hw_sectors limit Add the module option and configfs attribute max_sectors to allow configuring the maximum size of a command issued to a null_blk device. This allows exercising the block layer BIO splitting with different limits than the default BLK_SAFE_MAX_SECTORS. This is also useful for testing the zone append write path of file systems as the max_hw_sectors limit value is also used for the max_zone_append_sectors limit. 
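For example (illustrative invocation; max_sectors is given in 512-byte sectors, so a limit of 64 sectors should be reported as 32 KB by the queue limits):

$ modprobe null_blk max_sectors=64
$ cat /sys/block/nullb0/queue/max_hw_sectors_kb
32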
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_main.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 63000aeeb2f3..83504f3cc9d6 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -85,6 +85,7 @@ struct nullb_device { unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ unsigned int blocksize; /* block size */ + unsigned int max_sectors; /* Max sectors per command */ unsigned int irqmode; /* IRQ completion handler */ unsigned int hw_queue_depth; /* queue depth */ unsigned int index; /* index of the disk, only valid with a disk */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index b758b9366630..5357c3a4a36f 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -152,6 +152,10 @@ static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); +static int g_max_sectors; +module_param_named(max_sectors, g_max_sectors, int, 0444); +MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); + static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); @@ -346,6 +350,7 @@ NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(max_sectors, uint, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); @@ -463,6 +468,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_home_node, &nullb_device_attr_queue_mode, &nullb_device_attr_blocksize, + &nullb_device_attr_max_sectors, &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, @@ -533,7 +539,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -588,6 +594,7 @@ static struct nullb_device *null_alloc_dev(void) dev->home_node = g_home_node; dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; + dev->max_sectors = g_max_sectors; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; @@ -1867,6 +1874,11 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_logical_block_size(nullb->q, dev->blocksize); blk_queue_physical_block_size(nullb->q, dev->blocksize); + if (!dev->max_sectors) + dev->max_sectors = queue_max_hw_sectors(nullb->q); + dev->max_sectors = min_t(unsigned int, dev->max_sectors, + BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); null_config_discard(nullb); @@ -1910,6 +1922,12 @@ static int __init null_init(void) g_bs = PAGE_SIZE; } + if (g_max_sectors > BLK_DEF_MAX_SECTORS) { + pr_warn("invalid max sectors\n"); + 
pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS); + g_max_sectors = BLK_DEF_MAX_SECTORS; + } + if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; -- cgit v1.2.3-59-g8ed1b From eebf34a85c8c724676eba502d15202854f199b05 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:19 +0900 Subject: null_blk: Move driver into its own directory Move null_blk driver code into the new sub-directory drivers/block/null_blk. Suggested-by: Bart Van Assche Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 8 +- drivers/block/Makefile | 7 +- drivers/block/null_blk.h | 162 --- drivers/block/null_blk/Kconfig | 12 + drivers/block/null_blk/Makefile | 11 + drivers/block/null_blk/main.c | 2031 +++++++++++++++++++++++++++++++++++++ drivers/block/null_blk/null_blk.h | 162 +++ drivers/block/null_blk/trace.c | 21 + drivers/block/null_blk/trace.h | 79 ++ drivers/block/null_blk/zoned.c | 677 +++++++++++++ drivers/block/null_blk_main.c | 2031 ------------------------------------- drivers/block/null_blk_trace.c | 21 - drivers/block/null_blk_trace.h | 79 -- drivers/block/null_blk_zoned.c | 677 ------------- 14 files changed, 2995 insertions(+), 2983 deletions(-) delete mode 100644 drivers/block/null_blk.h create mode 100644 drivers/block/null_blk/Kconfig create mode 100644 drivers/block/null_blk/Makefile create mode 100644 drivers/block/null_blk/main.c create mode 100644 drivers/block/null_blk/null_blk.h create mode 100644 drivers/block/null_blk/trace.c create mode 100644 drivers/block/null_blk/trace.h create mode 100644 drivers/block/null_blk/zoned.c delete mode 100644 drivers/block/null_blk_main.c delete mode 100644 drivers/block/null_blk_trace.c delete mode 100644 drivers/block/null_blk_trace.h delete mode 100644 drivers/block/null_blk_zoned.c diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ecceaaa1a66f..262326973ee0 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -16,13 +16,7 @@ menuconfig BLK_DEV if BLK_DEV -config BLK_DEV_NULL_BLK - tristate "Null test block driver" - select CONFIGFS_FS - -config BLK_DEV_NULL_BLK_FAULT_INJECTION - bool "Support fault injection for Null test block driver" - depends on BLK_DEV_NULL_BLK && FAULT_INJECTION +source "drivers/block/null_blk/Kconfig" config BLK_DEV_FD tristate "Normal floppy disk support" diff --git a/drivers/block/Makefile b/drivers/block/Makefile index e1f63117ee94..a3170859e01d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -41,12 +41,7 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ -obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o -null_blk-objs := null_blk_main.o -ifeq ($(CONFIG_BLK_DEV_ZONED), y) -null_blk-$(CONFIG_TRACING) += null_blk_trace.o -endif -null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h deleted file mode 100644 index 83504f3cc9d6..000000000000 --- a/drivers/block/null_blk.h +++ /dev/null @@ -1,162 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BLK_NULL_BLK_H -#define __BLK_NULL_BLK_H - -#undef pr_fmt -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct nullb_cmd { - struct request *rq; - struct bio *bio; - 
unsigned int tag; - blk_status_t error; - struct nullb_queue *nq; - struct hrtimer timer; -}; - -struct nullb_queue { - unsigned long *tag_map; - wait_queue_head_t wait; - unsigned int queue_depth; - struct nullb_device *dev; - unsigned int requeue_selection; - - struct nullb_cmd *cmds; -}; - -struct nullb_zone { - /* - * Zone lock to prevent concurrent modification of a zone write - * pointer position and condition: with memory backing, a write - * command execution may sleep on memory allocation. For this case, - * use mutex as the zone lock. Otherwise, use the spinlock for - * locking the zone. - */ - union { - spinlock_t spinlock; - struct mutex mutex; - }; - enum blk_zone_type type; - enum blk_zone_cond cond; - sector_t start; - sector_t wp; - unsigned int len; - unsigned int capacity; -}; - -struct nullb_device { - struct nullb *nullb; - struct config_item item; - struct radix_tree_root data; /* data stored in the disk */ - struct radix_tree_root cache; /* disk cache data */ - unsigned long flags; /* device flags */ - unsigned int curr_cache; - struct badblocks badblocks; - - unsigned int nr_zones; - unsigned int nr_zones_imp_open; - unsigned int nr_zones_exp_open; - unsigned int nr_zones_closed; - unsigned int imp_close_zone_no; - struct nullb_zone *zones; - sector_t zone_size_sects; - bool need_zone_res_mgmt; - spinlock_t zone_res_lock; - - unsigned long size; /* device size in MB */ - unsigned long completion_nsec; /* time in ns to complete a request */ - unsigned long cache_size; /* disk cache size in MB */ - unsigned long zone_size; /* zone size in MB if device is zoned */ - unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ - unsigned int zone_nr_conv; /* number of conventional zones */ - unsigned int zone_max_open; /* max number of open zones */ - unsigned int zone_max_active; /* max number of active zones */ - unsigned int submit_queues; /* number of submission queues */ - unsigned int home_node; /* home node for the device */ - unsigned int queue_mode; /* block interface */ - unsigned int blocksize; /* block size */ - unsigned int max_sectors; /* Max sectors per command */ - unsigned int irqmode; /* IRQ completion handler */ - unsigned int hw_queue_depth; /* queue depth */ - unsigned int index; /* index of the disk, only valid with a disk */ - unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ - bool blocking; /* blocking blk-mq device */ - bool use_per_node_hctx; /* use per-node allocation for hardware context */ - bool power; /* power on/off the device */ - bool memory_backed; /* if data is stored in memory */ - bool discard; /* if support discard */ - bool zoned; /* if device is zoned */ -}; - -struct nullb { - struct nullb_device *dev; - struct list_head list; - unsigned int index; - struct request_queue *q; - struct gendisk *disk; - struct blk_mq_tag_set *tag_set; - struct blk_mq_tag_set __tag_set; - unsigned int queue_depth; - atomic_long_t cur_bytes; - struct hrtimer bw_timer; - unsigned long cache_flush_pos; - spinlock_t lock; - - struct nullb_queue *queues; - unsigned int nr_queues; - char disk_name[DISK_NAME_LEN]; -}; - -blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, - sector_t nr_sectors); -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors); - -#ifdef CONFIG_BLK_DEV_ZONED -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); -int null_register_zoned_dev(struct nullb *nullb); -void null_free_zoned_dev(struct nullb_device 
*dev); -int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - sector_t nr_sectors); -size_t null_zone_valid_read_len(struct nullb *nullb, - sector_t sector, unsigned int len); -#else -static inline int null_init_zoned_dev(struct nullb_device *dev, - struct request_queue *q) -{ - pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); - return -EINVAL; -} -static inline int null_register_zoned_dev(struct nullb *nullb) -{ - return -ENODEV; -} -static inline void null_free_zoned_dev(struct nullb_device *dev) {} -static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, sector_t nr_sectors) -{ - return BLK_STS_NOTSUPP; -} -static inline size_t null_zone_valid_read_len(struct nullb *nullb, - sector_t sector, - unsigned int len) -{ - return len; -} -#define null_report_zones NULL -#endif /* CONFIG_BLK_DEV_ZONED */ -#endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk/Kconfig b/drivers/block/null_blk/Kconfig new file mode 100644 index 000000000000..6bf1f8ca20a2 --- /dev/null +++ b/drivers/block/null_blk/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Null block device driver configuration +# + +config BLK_DEV_NULL_BLK + tristate "Null test block driver" + select CONFIGFS_FS + +config BLK_DEV_NULL_BLK_FAULT_INJECTION + bool "Support fault injection for Null test block driver" + depends on BLK_DEV_NULL_BLK && FAULT_INJECTION diff --git a/drivers/block/null_blk/Makefile b/drivers/block/null_blk/Makefile new file mode 100644 index 000000000000..84c36e512ab8 --- /dev/null +++ b/drivers/block/null_blk/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +# needed for trace events +ccflags-y += -I$(src) + +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o +null_blk-objs := main.o +ifeq ($(CONFIG_BLK_DEV_ZONED), y) +null_blk-$(CONFIG_TRACING) += trace.o +endif +null_blk-$(CONFIG_BLK_DEV_ZONED) += zoned.o diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c new file mode 100644 index 000000000000..5357c3a4a36f --- /dev/null +++ b/drivers/block/null_blk/main.c @@ -0,0 +1,2031 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Add configfs and memory store: Kyungchan Koh and + * Shaohua Li + */ +#include + +#include +#include +#include +#include +#include "null_blk.h" + +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + +#define FREE_BATCH 16 + +#define TICKS_PER_SEC 50ULL +#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +static DECLARE_FAULT_ATTR(null_timeout_attr); +static DECLARE_FAULT_ATTR(null_requeue_attr); +static DECLARE_FAULT_ATTR(null_init_hctx_attr); +#endif + +static inline u64 mb_per_tick(int mbps) +{ + return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); +} + +/* + * Status flags for nullb_device. + * + * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. + * UP: Device is currently on and visible in userspace. + * THROTTLED: Device is being throttled. + * CACHE: Device is using a write-back cache. + */ +enum nullb_device_flags { + NULLB_DEV_FL_CONFIGURED = 0, + NULLB_DEV_FL_UP = 1, + NULLB_DEV_FL_THROTTLED = 2, + NULLB_DEV_FL_CACHE = 3, +}; + +#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) +/* + * nullb_page is a page in memory for nullb devices. 
+ * + * @page: The page holding the data. + * @bitmap: The bitmap represents which sector in the page has data. + * Each bit represents one block size. For example, sector 8 + * will use the 7th bit + * The highest 2 bits of bitmap are for special purpose. LOCK means the cache + * page is being flushing to storage. FREE means the cache page is freed and + * should be skipped from flushing to storage. Please see + * null_make_cache_space + */ +struct nullb_page { + struct page *page; + DECLARE_BITMAP(bitmap, MAP_SZ); +}; +#define NULLB_PAGE_LOCK (MAP_SZ - 1) +#define NULLB_PAGE_FREE (MAP_SZ - 2) + +static LIST_HEAD(nullb_list); +static struct mutex lock; +static int null_major; +static DEFINE_IDA(nullb_indexes); +static struct blk_mq_tag_set tag_set; + +enum { + NULL_IRQ_NONE = 0, + NULL_IRQ_SOFTIRQ = 1, + NULL_IRQ_TIMER = 2, +}; + +enum { + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + +static int g_no_sched; +module_param_named(no_sched, g_no_sched, int, 0444); +MODULE_PARM_DESC(no_sched, "No io scheduler"); + +static int g_submit_queues = 1; +module_param_named(submit_queues, g_submit_queues, int, 0444); +MODULE_PARM_DESC(submit_queues, "Number of submission queues"); + +static int g_home_node = NUMA_NO_NODE; +module_param_named(home_node, g_home_node, int, 0444); +MODULE_PARM_DESC(home_node, "Home node for the device"); + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +/* + * For more details about fault injection, please refer to + * Documentation/fault-injection/fault-injection.rst. + */ +static char g_timeout_str[80]; +module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); +MODULE_PARM_DESC(timeout, "Fault injection. timeout=,,,"); + +static char g_requeue_str[80]; +module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); +MODULE_PARM_DESC(requeue, "Fault injection. requeue=,,,"); + +static char g_init_hctx_str[80]; +module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); +MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. 
init_hctx=,,,"); +#endif + +static int g_queue_mode = NULL_Q_MQ; + +static int null_param_store_val(const char *str, int *val, int min, int max) +{ + int ret, new_val; + + ret = kstrtoint(str, 10, &new_val); + if (ret) + return -EINVAL; + + if (new_val < min || new_val > max) + return -EINVAL; + + *val = new_val; + return 0; +} + +static int null_set_queue_mode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); +} + +static const struct kernel_param_ops null_queue_mode_param_ops = { + .set = null_set_queue_mode, + .get = param_get_int, +}; + +device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); +MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); + +static int g_gb = 250; +module_param_named(gb, g_gb, int, 0444); +MODULE_PARM_DESC(gb, "Size in GB"); + +static int g_bs = 512; +module_param_named(bs, g_bs, int, 0444); +MODULE_PARM_DESC(bs, "Block size (in bytes)"); + +static int g_max_sectors; +module_param_named(max_sectors, g_max_sectors, int, 0444); +MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); + +static unsigned int nr_devices = 1; +module_param(nr_devices, uint, 0444); +MODULE_PARM_DESC(nr_devices, "Number of devices to register"); + +static bool g_blocking; +module_param_named(blocking, g_blocking, bool, 0444); +MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); + +static bool shared_tags; +module_param(shared_tags, bool, 0444); +MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); + +static bool g_shared_tag_bitmap; +module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); +MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); + +static int g_irqmode = NULL_IRQ_SOFTIRQ; + +static int null_set_irqmode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, + NULL_IRQ_TIMER); +} + +static const struct kernel_param_ops null_irqmode_param_ops = { + .set = null_set_irqmode, + .get = param_get_int, +}; + +device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); +MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); + +static unsigned long g_completion_nsec = 10000; +module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); +MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); + +static int g_hw_queue_depth = 64; +module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); + +static bool g_use_per_node_hctx; +module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); +MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); + +static bool g_zoned; +module_param_named(zoned, g_zoned, bool, S_IRUGO); +MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); + +static unsigned long g_zone_size = 256; +module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); + +static unsigned long g_zone_capacity; +module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); +MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. 
Can be less than or equal to zone size. Default: Zone size"); + +static unsigned int g_zone_nr_conv; +module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); +MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); + +static unsigned int g_zone_max_open; +module_param_named(zone_max_open, g_zone_max_open, uint, 0444); +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); + +static unsigned int g_zone_max_active; +module_param_named(zone_max_active, g_zone_max_active, uint, 0444); +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); + +static struct nullb_device *null_alloc_dev(void); +static void null_free_dev(struct nullb_device *dev); +static void null_del_dev(struct nullb *nullb); +static int null_add_dev(struct nullb_device *dev); +static void null_free_device_storage(struct nullb_device *dev, bool is_cache); + +static inline struct nullb_device *to_nullb_device(struct config_item *item) +{ + return item ? container_of(item, struct nullb_device, item) : NULL; +} + +static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%lu\n", val); +} + +static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static ssize_t nullb_device_uint_attr_store(unsigned int *val, + const char *page, size_t count) +{ + unsigned int tmp; + int result; + + result = kstrtouint(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_ulong_attr_store(unsigned long *val, + const char *page, size_t count) +{ + int result; + unsigned long tmp; + + result = kstrtoul(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, + size_t count) +{ + bool tmp; + int result; + + result = kstrtobool(page, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +/* The following macro should only be used with TYPE = {uint, ulong, bool}. */ +#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ +static ssize_t \ +nullb_device_##NAME##_show(struct config_item *item, char *page) \ +{ \ + return nullb_device_##TYPE##_attr_show( \ + to_nullb_device(item)->NAME, page); \ +} \ +static ssize_t \ +nullb_device_##NAME##_store(struct config_item *item, const char *page, \ + size_t count) \ +{ \ + int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ + struct nullb_device *dev = to_nullb_device(item); \ + TYPE new_value = 0; \ + int ret; \ + \ + ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ + if (ret < 0) \ + return ret; \ + if (apply_fn) \ + ret = apply_fn(dev, new_value); \ + else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ + ret = -EBUSY; \ + if (ret < 0) \ + return ret; \ + dev->NAME = new_value; \ + return count; \ +} \ +CONFIGFS_ATTR(nullb_device_, NAME); + +static int nullb_apply_submit_queues(struct nullb_device *dev, + unsigned int submit_queues) +{ + struct nullb *nullb = dev->nullb; + struct blk_mq_tag_set *set; + + if (!nullb) + return 0; + + /* + * Make sure that null_init_hctx() does not access nullb->queues[] past + * the end of that array. 
+ */ + if (submit_queues > nr_cpu_ids) + return -EINVAL; + set = nullb->tag_set; + blk_mq_update_nr_hw_queues(set, submit_queues); + return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM; +} + +NULLB_DEVICE_ATTR(size, ulong, NULL); +NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); +NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); +NULLB_DEVICE_ATTR(home_node, uint, NULL); +NULLB_DEVICE_ATTR(queue_mode, uint, NULL); +NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(max_sectors, uint, NULL); +NULLB_DEVICE_ATTR(irqmode, uint, NULL); +NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); +NULLB_DEVICE_ATTR(index, uint, NULL); +NULLB_DEVICE_ATTR(blocking, bool, NULL); +NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); +NULLB_DEVICE_ATTR(memory_backed, bool, NULL); +NULLB_DEVICE_ATTR(discard, bool, NULL); +NULLB_DEVICE_ATTR(mbps, uint, NULL); +NULLB_DEVICE_ATTR(cache_size, ulong, NULL); +NULLB_DEVICE_ATTR(zoned, bool, NULL); +NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); +NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); + +static ssize_t nullb_device_power_show(struct config_item *item, char *page) +{ + return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); +} + +static ssize_t nullb_device_power_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *dev = to_nullb_device(item); + bool newp = false; + ssize_t ret; + + ret = nullb_device_bool_attr_store(&newp, page, count); + if (ret < 0) + return ret; + + if (!dev->power && newp) { + if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) + return count; + if (null_add_dev(dev)) { + clear_bit(NULLB_DEV_FL_UP, &dev->flags); + return -ENOMEM; + } + + set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); + dev->power = newp; + } else if (dev->power && !newp) { + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = newp; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } + clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); + } + + return count; +} + +CONFIGFS_ATTR(nullb_device_, power); + +static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) +{ + struct nullb_device *t_dev = to_nullb_device(item); + + return badblocks_show(&t_dev->badblocks, page, 0); +} + +static ssize_t nullb_device_badblocks_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *t_dev = to_nullb_device(item); + char *orig, *buf, *tmp; + u64 start, end; + int ret; + + orig = kstrndup(page, count, GFP_KERNEL); + if (!orig) + return -ENOMEM; + + buf = strstrip(orig); + + ret = -EINVAL; + if (buf[0] != '+' && buf[0] != '-') + goto out; + tmp = strchr(&buf[1], '-'); + if (!tmp) + goto out; + *tmp = '\0'; + ret = kstrtoull(buf + 1, 0, &start); + if (ret) + goto out; + ret = kstrtoull(tmp + 1, 0, &end); + if (ret) + goto out; + ret = -EINVAL; + if (start > end) + goto out; + /* enable badblocks */ + cmpxchg(&t_dev->badblocks.shift, -1, 0); + if (buf[0] == '+') + ret = badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1); + else + ret = badblocks_clear(&t_dev->badblocks, start, + end - start + 1); + if (ret == 0) + ret = count; +out: + kfree(orig); + return ret; +} +CONFIGFS_ATTR(nullb_device_, badblocks); + +static struct configfs_attribute *nullb_device_attrs[] = { + &nullb_device_attr_size, + &nullb_device_attr_completion_nsec, + &nullb_device_attr_submit_queues, + 
&nullb_device_attr_home_node, + &nullb_device_attr_queue_mode, + &nullb_device_attr_blocksize, + &nullb_device_attr_max_sectors, + &nullb_device_attr_irqmode, + &nullb_device_attr_hw_queue_depth, + &nullb_device_attr_index, + &nullb_device_attr_blocking, + &nullb_device_attr_use_per_node_hctx, + &nullb_device_attr_power, + &nullb_device_attr_memory_backed, + &nullb_device_attr_discard, + &nullb_device_attr_mbps, + &nullb_device_attr_cache_size, + &nullb_device_attr_badblocks, + &nullb_device_attr_zoned, + &nullb_device_attr_zone_size, + &nullb_device_attr_zone_capacity, + &nullb_device_attr_zone_nr_conv, + &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_max_active, + NULL, +}; + +static void nullb_device_release(struct config_item *item) +{ + struct nullb_device *dev = to_nullb_device(item); + + null_free_device_storage(dev, false); + null_free_dev(dev); +} + +static struct configfs_item_operations nullb_device_ops = { + .release = nullb_device_release, +}; + +static const struct config_item_type nullb_device_type = { + .ct_item_ops = &nullb_device_ops, + .ct_attrs = nullb_device_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct +config_item *nullb_group_make_item(struct config_group *group, const char *name) +{ + struct nullb_device *dev; + + dev = null_alloc_dev(); + if (!dev) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&dev->item, name, &nullb_device_type); + + return &dev->item; +} + +static void +nullb_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct nullb_device *dev = to_nullb_device(item); + + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = false; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } + + config_item_put(item); +} + +static ssize_t memb_group_features_show(struct config_item *item, char *page) +{ + return snprintf(page, PAGE_SIZE, + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); +} + +CONFIGFS_ATTR_RO(memb_group_, features); + +static struct configfs_attribute *nullb_group_attrs[] = { + &memb_group_attr_features, + NULL, +}; + +static struct configfs_group_operations nullb_group_ops = { + .make_item = nullb_group_make_item, + .drop_item = nullb_group_drop_item, +}; + +static const struct config_item_type nullb_group_type = { + .ct_group_ops = &nullb_group_ops, + .ct_attrs = nullb_group_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nullb_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "nullb", + .ci_type = &nullb_group_type, + }, + }, +}; + +static inline int null_cache_active(struct nullb *nullb) +{ + return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); +} + +static struct nullb_device *null_alloc_dev(void) +{ + struct nullb_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); + INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); + if (badblocks_init(&dev->badblocks, 0)) { + kfree(dev); + return NULL; + } + + dev->size = g_gb * 1024; + dev->completion_nsec = g_completion_nsec; + dev->submit_queues = g_submit_queues; + dev->home_node = g_home_node; + dev->queue_mode = g_queue_mode; + dev->blocksize = g_bs; + dev->max_sectors = g_max_sectors; + dev->irqmode = g_irqmode; + dev->hw_queue_depth = g_hw_queue_depth; + dev->blocking = g_blocking; + dev->use_per_node_hctx = g_use_per_node_hctx; + dev->zoned = g_zoned; + dev->zone_size = g_zone_size; + 
dev->zone_capacity = g_zone_capacity; + dev->zone_nr_conv = g_zone_nr_conv; + dev->zone_max_open = g_zone_max_open; + dev->zone_max_active = g_zone_max_active; + return dev; +} + +static void null_free_dev(struct nullb_device *dev) +{ + if (!dev) + return; + + null_free_zoned_dev(dev); + badblocks_exit(&dev->badblocks); + kfree(dev); +} + +static void put_tag(struct nullb_queue *nq, unsigned int tag) +{ + clear_bit_unlock(tag, nq->tag_map); + + if (waitqueue_active(&nq->wait)) + wake_up(&nq->wait); +} + +static unsigned int get_tag(struct nullb_queue *nq) +{ + unsigned int tag; + + do { + tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); + if (tag >= nq->queue_depth) + return -1U; + } while (test_and_set_bit_lock(tag, nq->tag_map)); + + return tag; +} + +static void free_cmd(struct nullb_cmd *cmd) +{ + put_tag(cmd->nq, cmd->tag); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); + +static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + unsigned int tag; + + tag = get_tag(nq); + if (tag != -1U) { + cmd = &nq->cmds[tag]; + cmd->tag = tag; + cmd->error = BLK_STS_OK; + cmd->nq = nq; + if (nq->dev->irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } + return cmd; + } + + return NULL; +} + +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +{ + struct nullb_cmd *cmd; + DEFINE_WAIT(wait); + + cmd = __alloc_cmd(nq); + if (cmd || !can_wait) + return cmd; + + do { + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + cmd = __alloc_cmd(nq); + if (cmd) + break; + + io_schedule(); + } while (1); + + finish_wait(&nq->wait, &wait); + return cmd; +} + +static void end_cmd(struct nullb_cmd *cmd) +{ + int queue_mode = cmd->nq->dev->queue_mode; + + switch (queue_mode) { + case NULL_Q_MQ: + blk_mq_end_request(cmd->rq, cmd->error); + return; + case NULL_Q_BIO: + cmd->bio->bi_status = cmd->error; + bio_endio(cmd->bio); + break; + } + + free_cmd(cmd); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) +{ + end_cmd(container_of(timer, struct nullb_cmd, timer)); + + return HRTIMER_NORESTART; +} + +static void null_cmd_end_timer(struct nullb_cmd *cmd) +{ + ktime_t kt = cmd->nq->dev->completion_nsec; + + hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); +} + +static void null_complete_rq(struct request *rq) +{ + end_cmd(blk_mq_rq_to_pdu(rq)); +} + +static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +{ + struct nullb_page *t_page; + + t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); + if (!t_page) + goto out; + + t_page->page = alloc_pages(gfp_flags, 0); + if (!t_page->page) + goto out_freepage; + + memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); + return t_page; +out_freepage: + kfree(t_page); +out: + return NULL; +} + +static void null_free_page(struct nullb_page *t_page) +{ + __set_bit(NULLB_PAGE_FREE, t_page->bitmap); + if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) + return; + __free_page(t_page->page); + kfree(t_page); +} + +static bool null_page_empty(struct nullb_page *page) +{ + int size = MAP_SZ - 2; + + return find_first_bit(page->bitmap, size) == size; +} + +static void null_free_sector(struct nullb *nullb, sector_t sector, + bool is_cache) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page, *ret; + struct radix_tree_root *root; + + root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + t_page = radix_tree_lookup(root, idx); + if (t_page) { + __clear_bit(sector_bit, t_page->bitmap); + + if (null_page_empty(t_page)) { + ret = radix_tree_delete_item(root, idx, t_page); + WARN_ON(ret != t_page); + null_free_page(ret); + if (is_cache) + nullb->dev->curr_cache -= PAGE_SIZE; + } + } +} + +static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, + struct nullb_page *t_page, bool is_cache) +{ + struct radix_tree_root *root; + + root = is_cache ? &nullb->dev->cache : &nullb->dev->data; + + if (radix_tree_insert(root, idx, t_page)) { + null_free_page(t_page); + t_page = radix_tree_lookup(root, idx); + WARN_ON(!t_page || t_page->page->index != idx); + } else if (is_cache) + nullb->dev->curr_cache += PAGE_SIZE; + + return t_page; +} + +static void null_free_device_storage(struct nullb_device *dev, bool is_cache) +{ + unsigned long pos = 0; + int nr_pages; + struct nullb_page *ret, *t_pages[FREE_BATCH]; + struct radix_tree_root *root; + + root = is_cache ? &dev->cache : &dev->data; + + do { + int i; + + nr_pages = radix_tree_gang_lookup(root, + (void **)t_pages, pos, FREE_BATCH); + + for (i = 0; i < nr_pages; i++) { + pos = t_pages[i]->page->index; + ret = radix_tree_delete_item(root, pos, t_pages[i]); + WARN_ON(ret != t_pages[i]); + null_free_page(ret); + } + + pos++; + } while (nr_pages == FREE_BATCH); + + if (is_cache) + dev->curr_cache = 0; +} + +static struct nullb_page *__null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write, bool is_cache) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page; + struct radix_tree_root *root; + + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; + t_page = radix_tree_lookup(root, idx); + WARN_ON(t_page && t_page->page->index != idx); + + if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) + return t_page; + + return NULL; +} + +static struct nullb_page *null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write, bool ignore_cache) +{ + struct nullb_page *page = NULL; + + if (!ignore_cache) + page = __null_lookup_page(nullb, sector, for_write, true); + if (page) + return page; + return __null_lookup_page(nullb, sector, for_write, false); +} + +static struct nullb_page *null_insert_page(struct nullb *nullb, + sector_t sector, bool ignore_cache) + __releases(&nullb->lock) + __acquires(&nullb->lock) +{ + u64 idx; + struct nullb_page *t_page; + + t_page = null_lookup_page(nullb, sector, true, ignore_cache); + if (t_page) + return t_page; + + spin_unlock_irq(&nullb->lock); + + t_page = null_alloc_page(GFP_NOIO); + if (!t_page) + goto out_lock; + + if (radix_tree_preload(GFP_NOIO)) + goto out_freepage; + + spin_lock_irq(&nullb->lock); + idx = sector >> PAGE_SECTORS_SHIFT; + t_page->page->index = idx; + t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); + radix_tree_preload_end(); + + return t_page; +out_freepage: + null_free_page(t_page); +out_lock: + spin_lock_irq(&nullb->lock); + return null_lookup_page(nullb, sector, true, ignore_cache); +} + +static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) +{ + int i; + unsigned int offset; + u64 idx; + struct nullb_page *t_page, *ret; + void *dst, *src; + + idx = c_page->page->index; + + t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); + + __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); + if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { + null_free_page(c_page); + if (t_page && null_page_empty(t_page)) { + ret = radix_tree_delete_item(&nullb->dev->data, + idx, t_page); + null_free_page(t_page); + } + return 0; + } + + if (!t_page) + return -ENOMEM; + + src = kmap_atomic(c_page->page); + dst = kmap_atomic(t_page->page); + + for (i = 0; i < PAGE_SECTORS; + i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { + if (test_bit(i, c_page->bitmap)) { + offset = (i << SECTOR_SHIFT); + memcpy(dst + offset, src + offset, + nullb->dev->blocksize); + __set_bit(i, t_page->bitmap); + } + } + + kunmap_atomic(dst); + kunmap_atomic(src); + + ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); + null_free_page(ret); + nullb->dev->curr_cache -= PAGE_SIZE; + + return 0; +} + +static int null_make_cache_space(struct nullb *nullb, unsigned long n) +{ + int i, err, nr_pages; + struct nullb_page *c_pages[FREE_BATCH]; + unsigned long flushed = 0, one_round; + +again: + if ((nullb->dev->cache_size * 1024 * 1024) > + nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) + return 0; + + nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, + (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); + /* + * nullb_flush_cache_page could unlock before using the c_pages. 
To + * avoid race, we don't allow page free + */ + for (i = 0; i < nr_pages; i++) { + nullb->cache_flush_pos = c_pages[i]->page->index; + /* + * We found the page which is being flushed to disk by other + * threads + */ + if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) + c_pages[i] = NULL; + else + __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); + } + + one_round = 0; + for (i = 0; i < nr_pages; i++) { + if (c_pages[i] == NULL) + continue; + err = null_flush_cache_page(nullb, c_pages[i]); + if (err) + return err; + one_round++; + } + flushed += one_round << PAGE_SHIFT; + + if (n > flushed) { + if (nr_pages == 0) + nullb->cache_flush_pos = 0; + if (one_round == 0) { + /* give other threads a chance */ + spin_unlock_irq(&nullb->lock); + spin_lock_irq(&nullb->lock); + } + goto again; + } + return 0; +} + +static int copy_to_nullb(struct nullb *nullb, struct page *source, + unsigned int off, sector_t sector, size_t n, bool is_fua) +{ + size_t temp, count = 0; + unsigned int offset; + struct nullb_page *t_page; + void *dst, *src; + + while (count < n) { + temp = min_t(size_t, nullb->dev->blocksize, n - count); + + if (null_cache_active(nullb) && !is_fua) + null_make_cache_space(nullb, PAGE_SIZE); + + offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; + t_page = null_insert_page(nullb, sector, + !null_cache_active(nullb) || is_fua); + if (!t_page) + return -ENOSPC; + + src = kmap_atomic(source); + dst = kmap_atomic(t_page->page); + memcpy(dst + offset, src + off + count, temp); + kunmap_atomic(dst); + kunmap_atomic(src); + + __set_bit(sector & SECTOR_MASK, t_page->bitmap); + + if (is_fua) + null_free_sector(nullb, sector, true); + + count += temp; + sector += temp >> SECTOR_SHIFT; + } + return 0; +} + +static int copy_from_nullb(struct nullb *nullb, struct page *dest, + unsigned int off, sector_t sector, size_t n) +{ + size_t temp, count = 0; + unsigned int offset; + struct nullb_page *t_page; + void *dst, *src; + + while (count < n) { + temp = min_t(size_t, nullb->dev->blocksize, n - count); + + offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; + t_page = null_lookup_page(nullb, sector, false, + !null_cache_active(nullb)); + + dst = kmap_atomic(dest); + if (!t_page) { + memset(dst + off + count, 0, temp); + goto next; + } + src = kmap_atomic(t_page->page); + memcpy(dst + off + count, src + offset, temp); + kunmap_atomic(src); +next: + kunmap_atomic(dst); + + count += temp; + sector += temp >> SECTOR_SHIFT; + } + return 0; +} + +static void nullb_fill_pattern(struct nullb *nullb, struct page *page, + unsigned int len, unsigned int off) +{ + void *dst; + + dst = kmap_atomic(page); + memset(dst + off, 0xFF, len); + kunmap_atomic(dst); +} + +blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) +{ + struct nullb *nullb = dev->nullb; + size_t n = nr_sectors << SECTOR_SHIFT; + size_t temp; + + spin_lock_irq(&nullb->lock); + while (n > 0) { + temp = min_t(size_t, n, dev->blocksize); + null_free_sector(nullb, sector, false); + if (null_cache_active(nullb)) + null_free_sector(nullb, sector, true); + sector += temp >> SECTOR_SHIFT; + n -= temp; + } + spin_unlock_irq(&nullb->lock); + + return BLK_STS_OK; +} + +static int null_handle_flush(struct nullb *nullb) +{ + int err; + + if (!null_cache_active(nullb)) + return 0; + + spin_lock_irq(&nullb->lock); + while (true) { + err = null_make_cache_space(nullb, + nullb->dev->cache_size * 1024 * 1024); + if (err || nullb->dev->curr_cache == 0) + break; + } + + WARN_ON(!radix_tree_empty(&nullb->dev->cache)); + 
spin_unlock_irq(&nullb->lock); + return err; +} + +static int null_transfer(struct nullb *nullb, struct page *page, + unsigned int len, unsigned int off, bool is_write, sector_t sector, + bool is_fua) +{ + struct nullb_device *dev = nullb->dev; + unsigned int valid_len = len; + int err = 0; + + if (!is_write) { + if (dev->zoned) + valid_len = null_zone_valid_read_len(nullb, + sector, len); + + if (valid_len) { + err = copy_from_nullb(nullb, page, off, + sector, valid_len); + off += valid_len; + len -= valid_len; + } + + if (len) + nullb_fill_pattern(nullb, page, len, off); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + err = copy_to_nullb(nullb, page, off, sector, len, is_fua); + } + + return err; +} + +static int null_handle_rq(struct nullb_cmd *cmd) +{ + struct request *rq = cmd->rq; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int len; + sector_t sector = blk_rq_pos(rq); + struct req_iterator iter; + struct bio_vec bvec; + + spin_lock_irq(&nullb->lock); + rq_for_each_segment(bvec, rq, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(req_op(rq)), sector, + rq->cmd_flags & REQ_FUA); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + + return 0; +} + +static int null_handle_bio(struct nullb_cmd *cmd) +{ + struct bio *bio = cmd->bio; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int len; + sector_t sector = bio->bi_iter.bi_sector; + struct bio_vec bvec; + struct bvec_iter iter; + + spin_lock_irq(&nullb->lock); + bio_for_each_segment(bvec, bio, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(bio_op(bio)), sector, + bio->bi_opf & REQ_FUA); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + return 0; +} + +static void null_stop_queue(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_stop_hw_queues(q); +} + +static void null_restart_queue_async(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_start_stopped_hw_queues(q, true); +} + +static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts = BLK_STS_OK; + struct request *rq = cmd->rq; + + if (!hrtimer_active(&nullb->bw_timer)) + hrtimer_restart(&nullb->bw_timer); + + if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { + null_stop_queue(nullb); + /* race with timer */ + if (atomic_long_read(&nullb->cur_bytes) > 0) + null_restart_queue_async(nullb); + /* requeue request */ + sts = BLK_STS_DEV_RESOURCE; + } + return sts; +} + +static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, + sector_t sector, + sector_t nr_sectors) +{ + struct badblocks *bb = &cmd->nq->dev->badblocks; + sector_t first_bad; + int bad_sectors; + + if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_IOERR; + + return BLK_STS_OK; +} + +static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, + enum req_opf op, + sector_t sector, + sector_t nr_sectors) +{ + struct nullb_device *dev = cmd->nq->dev; + int err; + + if (op == REQ_OP_DISCARD) + return null_handle_discard(dev, sector, nr_sectors); + + if 
(dev->queue_mode == NULL_Q_BIO) + err = null_handle_bio(cmd); + else + err = null_handle_rq(cmd); + + return errno_to_blk_status(err); +} + +static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct bio *bio; + + if (dev->memory_backed) + return; + + if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { + zero_fill_bio(cmd->bio); + } else if (req_op(cmd->rq) == REQ_OP_READ) { + __rq_for_each_bio(bio, cmd->rq) + zero_fill_bio(bio); + } +} + +static inline void nullb_complete_cmd(struct nullb_cmd *cmd) +{ + /* + * Since root privileges are required to configure the null_blk + * driver, it is fine that this driver does not initialize the + * data buffers of read commands. Zero-initialize these buffers + * anyway if KMSAN is enabled to prevent that KMSAN complains + * about null_blk not initializing read data buffers. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + nullb_zero_read_cmd_buffer(cmd); + + /* Complete IO by inline, softirq or timer */ + switch (cmd->nq->dev->irqmode) { + case NULL_IRQ_SOFTIRQ: + switch (cmd->nq->dev->queue_mode) { + case NULL_Q_MQ: + if (likely(!blk_should_fake_timeout(cmd->rq->q))) + blk_mq_complete_request(cmd->rq); + break; + case NULL_Q_BIO: + /* + * XXX: no proper submitting cpu information available. + */ + end_cmd(cmd); + break; + } + break; + case NULL_IRQ_NONE: + end_cmd(cmd); + break; + case NULL_IRQ_TIMER: + null_cmd_end_timer(cmd); + break; + } +} + +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors) +{ + struct nullb_device *dev = cmd->nq->dev; + blk_status_t ret; + + if (dev->badblocks.shift != -1) { + ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (ret != BLK_STS_OK) + return ret; + } + + if (dev->memory_backed) + return null_handle_memory_backed(cmd, op, sector, nr_sectors); + + return BLK_STS_OK; +} + +static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors, enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts; + + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { + sts = null_handle_throttled(cmd); + if (sts != BLK_STS_OK) + return sts; + } + + if (op == REQ_OP_FLUSH) { + cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + goto out; + } + + if (dev->zoned) + cmd->error = null_process_zoned_cmd(cmd, op, + sector, nr_sectors); + else + cmd->error = null_process_cmd(cmd, op, sector, nr_sectors); + +out: + nullb_complete_cmd(cmd); + return BLK_STS_OK; +} + +static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) +{ + struct nullb *nullb = container_of(timer, struct nullb, bw_timer); + ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); + unsigned int mbps = nullb->dev->mbps; + + if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) + return HRTIMER_NORESTART; + + atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); + null_restart_queue_async(nullb); + + hrtimer_forward_now(&nullb->bw_timer, timer_interval); + + return HRTIMER_RESTART; +} + +static void nullb_setup_bwtimer(struct nullb *nullb) +{ + ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); + + hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + nullb->bw_timer.function = nullb_bwtimer_fn; + atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); + hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); +} + +static struct nullb_queue *nullb_to_queue(struct nullb 
*nullb) +{ + int index = 0; + + if (nullb->nr_queues != 1) + index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); + + return &nullb->queues[index]; +} + +static blk_qc_t null_submit_bio(struct bio *bio) +{ + sector_t sector = bio->bi_iter.bi_sector; + sector_t nr_sectors = bio_sectors(bio); + struct nullb *nullb = bio->bi_disk->private_data; + struct nullb_queue *nq = nullb_to_queue(nullb); + struct nullb_cmd *cmd; + + cmd = alloc_cmd(nq, 1); + cmd->bio = bio; + + null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); + return BLK_QC_T_NONE; +} + +static bool should_timeout_request(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_timeout_str[0]) + return should_fail(&null_timeout_attr, 1); +#endif + return false; +} + +static bool should_requeue_request(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_requeue_str[0]) + return should_fail(&null_requeue_attr, 1); +#endif + return false; +} + +static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) +{ + pr_info("rq %p timed out\n", rq); + blk_mq_complete_request(rq); + return BLK_EH_DONE; +} + +static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + struct nullb_queue *nq = hctx->driver_data; + sector_t nr_sectors = blk_rq_sectors(bd->rq); + sector_t sector = blk_rq_pos(bd->rq); + + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + + if (nq->dev->irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } + cmd->rq = bd->rq; + cmd->error = BLK_STS_OK; + cmd->nq = nq; + + blk_mq_start_request(bd->rq); + + if (should_requeue_request(bd->rq)) { + /* + * Alternate between hitting the core BUSY path, and the + * driver driven requeue path + */ + nq->requeue_selection++; + if (nq->requeue_selection & 1) + return BLK_STS_RESOURCE; + else { + blk_mq_requeue_request(bd->rq, true); + return BLK_STS_OK; + } + } + if (should_timeout_request(bd->rq)) + return BLK_STS_OK; + + return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); +} + +static void cleanup_queue(struct nullb_queue *nq) +{ + kfree(nq->tag_map); + kfree(nq->cmds); +} + +static void cleanup_queues(struct nullb *nullb) +{ + int i; + + for (i = 0; i < nullb->nr_queues; i++) + cleanup_queue(&nullb->queues[i]); + + kfree(nullb->queues); +} + +static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nullb_queue *nq = hctx->driver_data; + struct nullb *nullb = nq->dev->nullb; + + nullb->nr_queues--; +} + +static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) +{ + init_waitqueue_head(&nq->wait); + nq->queue_depth = nullb->queue_depth; + nq->dev = nullb->dev; +} + +static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, + unsigned int hctx_idx) +{ + struct nullb *nullb = hctx->queue->queuedata; + struct nullb_queue *nq; + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) + return -EFAULT; +#endif + + nq = &nullb->queues[hctx_idx]; + hctx->driver_data = nq; + null_init_queue(nullb, nq); + nullb->nr_queues++; + + return 0; +} + +static const struct blk_mq_ops null_mq_ops = { + .queue_rq = null_queue_rq, + .complete = null_complete_rq, + .timeout = null_timeout_rq, + .init_hctx = null_init_hctx, + .exit_hctx = null_exit_hctx, +}; + +static void 
null_del_dev(struct nullb *nullb) +{ + struct nullb_device *dev; + + if (!nullb) + return; + + dev = nullb->dev; + + ida_simple_remove(&nullb_indexes, nullb->index); + + list_del_init(&nullb->list); + + del_gendisk(nullb->disk); + + if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { + hrtimer_cancel(&nullb->bw_timer); + atomic_long_set(&nullb->cur_bytes, LONG_MAX); + null_restart_queue_async(nullb); + } + + blk_cleanup_queue(nullb->q); + if (dev->queue_mode == NULL_Q_MQ && + nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); + put_disk(nullb->disk); + cleanup_queues(nullb); + if (null_cache_active(nullb)) + null_free_device_storage(nullb->dev, true); + kfree(nullb); + dev->nullb = NULL; +} + +static void null_config_discard(struct nullb *nullb) +{ + if (nullb->dev->discard == false) + return; + + if (!nullb->dev->memory_backed) { + nullb->dev->discard = false; + pr_info("discard option is ignored without memory backing\n"); + return; + } + + if (nullb->dev->zoned) { + nullb->dev->discard = false; + pr_info("discard option is ignored in zoned mode\n"); + return; + } + + nullb->q->limits.discard_granularity = nullb->dev->blocksize; + nullb->q->limits.discard_alignment = nullb->dev->blocksize; + blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); +} + +static const struct block_device_operations null_bio_ops = { + .owner = THIS_MODULE, + .submit_bio = null_submit_bio, + .report_zones = null_report_zones, +}; + +static const struct block_device_operations null_rq_ops = { + .owner = THIS_MODULE, + .report_zones = null_report_zones, +}; + +static int setup_commands(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + int i, tag_size; + + nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); + if (!nq->cmds) + return -ENOMEM; + + tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; + nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); + if (!nq->tag_map) { + kfree(nq->cmds); + return -ENOMEM; + } + + for (i = 0; i < nq->queue_depth; i++) { + cmd = &nq->cmds[i]; + cmd->tag = -1U; + } + + return 0; +} + +static int setup_queues(struct nullb *nullb) +{ + nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), + GFP_KERNEL); + if (!nullb->queues) + return -ENOMEM; + + nullb->queue_depth = nullb->dev->hw_queue_depth; + + return 0; +} + +static int init_driver_queues(struct nullb *nullb) +{ + struct nullb_queue *nq; + int i, ret = 0; + + for (i = 0; i < nullb->dev->submit_queues; i++) { + nq = &nullb->queues[i]; + + null_init_queue(nullb, nq); + + ret = setup_commands(nq); + if (ret) + return ret; + nullb->nr_queues++; + } + return 0; +} + +static int null_gendisk_register(struct nullb *nullb) +{ + sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; + struct gendisk *disk; + + disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); + if (!disk) + return -ENOMEM; + set_capacity(disk, size); + + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->major = null_major; + disk->first_minor = nullb->index; + if (queue_is_mq(nullb->q)) + disk->fops = &null_rq_ops; + else + disk->fops = &null_bio_ops; + disk->private_data = nullb; + disk->queue = nullb->q; + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + if (nullb->dev->zoned) { + int ret = null_register_zoned_dev(nullb); + + if (ret) + return ret; + } + + add_disk(disk); + return 0; +} + +static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set 
*set) +{ + set->ops = &null_mq_ops; + set->nr_hw_queues = nullb ? nullb->dev->submit_queues : + g_submit_queues; + set->queue_depth = nullb ? nullb->dev->hw_queue_depth : + g_hw_queue_depth; + set->numa_node = nullb ? nullb->dev->home_node : g_home_node; + set->cmd_size = sizeof(struct nullb_cmd); + set->flags = BLK_MQ_F_SHOULD_MERGE; + if (g_no_sched) + set->flags |= BLK_MQ_F_NO_SCHED; + if (g_shared_tag_bitmap) + set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; + set->driver_data = NULL; + + if ((nullb && nullb->dev->blocking) || g_blocking) + set->flags |= BLK_MQ_F_BLOCKING; + + return blk_mq_alloc_tag_set(set); +} + +static int null_validate_conf(struct nullb_device *dev) +{ + dev->blocksize = round_down(dev->blocksize, 512); + dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); + + if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { + if (dev->submit_queues != nr_online_nodes) + dev->submit_queues = nr_online_nodes; + } else if (dev->submit_queues > nr_cpu_ids) + dev->submit_queues = nr_cpu_ids; + else if (dev->submit_queues == 0) + dev->submit_queues = 1; + + dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); + dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); + + /* Do memory allocation, so set blocking */ + if (dev->memory_backed) + dev->blocking = true; + else /* cache is meaningless */ + dev->cache_size = 0; + dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, + dev->cache_size); + dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); + /* can not stop a queue */ + if (dev->queue_mode == NULL_Q_BIO) + dev->mbps = 0; + + if (dev->zoned && + (!dev->zone_size || !is_power_of_2(dev->zone_size))) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + + return 0; +} + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +static bool __null_setup_fault(struct fault_attr *attr, char *str) +{ + if (!str[0]) + return true; + + if (!setup_fault_attr(attr, str)) + return false; + + attr->verbose = 0; + return true; +} +#endif + +static bool null_setup_fault(void) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) + return false; + if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) + return false; + if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) + return false; +#endif + return true; +} + +static int null_add_dev(struct nullb_device *dev) +{ + struct nullb *nullb; + int rv; + + rv = null_validate_conf(dev); + if (rv) + return rv; + + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); + if (!nullb) { + rv = -ENOMEM; + goto out; + } + nullb->dev = dev; + dev->nullb = nullb; + + spin_lock_init(&nullb->lock); + + rv = setup_queues(nullb); + if (rv) + goto out_free_nullb; + + if (dev->queue_mode == NULL_Q_MQ) { + if (shared_tags) { + nullb->tag_set = &tag_set; + rv = 0; + } else { + nullb->tag_set = &nullb->__tag_set; + rv = null_init_tag_set(nullb, nullb->tag_set); + } + + if (rv) + goto out_cleanup_queues; + + if (!null_setup_fault()) + goto out_cleanup_queues; + + nullb->tag_set->timeout = 5 * HZ; + nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); + if (IS_ERR(nullb->q)) { + rv = -ENOMEM; + goto out_cleanup_tags; + } + } else if (dev->queue_mode == NULL_Q_BIO) { + nullb->q = blk_alloc_queue(dev->home_node); + if (!nullb->q) { + rv = -ENOMEM; + goto out_cleanup_queues; + } + rv = init_driver_queues(nullb); + if (rv) + goto out_cleanup_blk_queue; + } + + if (dev->mbps) { + set_bit(NULLB_DEV_FL_THROTTLED, 
&dev->flags); + nullb_setup_bwtimer(nullb); + } + + if (dev->cache_size > 0) { + set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); + blk_queue_write_cache(nullb->q, true, true); + } + + if (dev->zoned) { + rv = null_init_zoned_dev(dev, nullb->q); + if (rv) + goto out_cleanup_blk_queue; + } + + nullb->q->queuedata = nullb; + blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); + + mutex_lock(&lock); + nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); + dev->index = nullb->index; + mutex_unlock(&lock); + + blk_queue_logical_block_size(nullb->q, dev->blocksize); + blk_queue_physical_block_size(nullb->q, dev->blocksize); + if (!dev->max_sectors) + dev->max_sectors = queue_max_hw_sectors(nullb->q); + dev->max_sectors = min_t(unsigned int, dev->max_sectors, + BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); + + null_config_discard(nullb); + + sprintf(nullb->disk_name, "nullb%d", nullb->index); + + rv = null_gendisk_register(nullb); + if (rv) + goto out_cleanup_zone; + + mutex_lock(&lock); + list_add_tail(&nullb->list, &nullb_list); + mutex_unlock(&lock); + + return 0; +out_cleanup_zone: + null_free_zoned_dev(dev); +out_cleanup_blk_queue: + blk_cleanup_queue(nullb->q); +out_cleanup_tags: + if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); +out_cleanup_queues: + cleanup_queues(nullb); +out_free_nullb: + kfree(nullb); + dev->nullb = NULL; +out: + return rv; +} + +static int __init null_init(void) +{ + int ret = 0; + unsigned int i; + struct nullb *nullb; + struct nullb_device *dev; + + if (g_bs > PAGE_SIZE) { + pr_warn("invalid block size\n"); + pr_warn("defaults block size to %lu\n", PAGE_SIZE); + g_bs = PAGE_SIZE; + } + + if (g_max_sectors > BLK_DEF_MAX_SECTORS) { + pr_warn("invalid max sectors\n"); + pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS); + g_max_sectors = BLK_DEF_MAX_SECTORS; + } + + if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { + pr_err("invalid home_node value\n"); + g_home_node = NUMA_NO_NODE; + } + + if (g_queue_mode == NULL_Q_RQ) { + pr_err("legacy IO path no longer available\n"); + return -EINVAL; + } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { + if (g_submit_queues != nr_online_nodes) { + pr_warn("submit_queues param is set to %u.\n", + nr_online_nodes); + g_submit_queues = nr_online_nodes; + } + } else if (g_submit_queues > nr_cpu_ids) + g_submit_queues = nr_cpu_ids; + else if (g_submit_queues <= 0) + g_submit_queues = 1; + + if (g_queue_mode == NULL_Q_MQ && shared_tags) { + ret = null_init_tag_set(NULL, &tag_set); + if (ret) + return ret; + } + + config_group_init(&nullb_subsys.su_group); + mutex_init(&nullb_subsys.su_mutex); + + ret = configfs_register_subsystem(&nullb_subsys); + if (ret) + goto err_tagset; + + mutex_init(&lock); + + null_major = register_blkdev(0, "nullb"); + if (null_major < 0) { + ret = null_major; + goto err_conf; + } + + for (i = 0; i < nr_devices; i++) { + dev = null_alloc_dev(); + if (!dev) { + ret = -ENOMEM; + goto err_dev; + } + ret = null_add_dev(dev); + if (ret) { + null_free_dev(dev); + goto err_dev; + } + } + + pr_info("module loaded\n"); + return 0; + +err_dev: + while (!list_empty(&nullb_list)) { + nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; + null_del_dev(nullb); + null_free_dev(dev); + } + unregister_blkdev(null_major, "nullb"); +err_conf: + configfs_unregister_subsystem(&nullb_subsys); +err_tagset: + if 
(g_queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); + return ret; +} + +static void __exit null_exit(void) +{ + struct nullb *nullb; + + configfs_unregister_subsystem(&nullb_subsys); + + unregister_blkdev(null_major, "nullb"); + + mutex_lock(&lock); + while (!list_empty(&nullb_list)) { + struct nullb_device *dev; + + nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; + null_del_dev(nullb); + null_free_dev(dev); + } + mutex_unlock(&lock); + + if (g_queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); +} + +module_init(null_init); +module_exit(null_exit); + +MODULE_AUTHOR("Jens Axboe "); +MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h new file mode 100644 index 000000000000..83504f3cc9d6 --- /dev/null +++ b/drivers/block/null_blk/null_blk.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BLK_NULL_BLK_H +#define __BLK_NULL_BLK_H + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nullb_cmd { + struct request *rq; + struct bio *bio; + unsigned int tag; + blk_status_t error; + struct nullb_queue *nq; + struct hrtimer timer; +}; + +struct nullb_queue { + unsigned long *tag_map; + wait_queue_head_t wait; + unsigned int queue_depth; + struct nullb_device *dev; + unsigned int requeue_selection; + + struct nullb_cmd *cmds; +}; + +struct nullb_zone { + /* + * Zone lock to prevent concurrent modification of a zone write + * pointer position and condition: with memory backing, a write + * command execution may sleep on memory allocation. For this case, + * use mutex as the zone lock. Otherwise, use the spinlock for + * locking the zone. 
+ */ + union { + spinlock_t spinlock; + struct mutex mutex; + }; + enum blk_zone_type type; + enum blk_zone_cond cond; + sector_t start; + sector_t wp; + unsigned int len; + unsigned int capacity; +}; + +struct nullb_device { + struct nullb *nullb; + struct config_item item; + struct radix_tree_root data; /* data stored in the disk */ + struct radix_tree_root cache; /* disk cache data */ + unsigned long flags; /* device flags */ + unsigned int curr_cache; + struct badblocks badblocks; + + unsigned int nr_zones; + unsigned int nr_zones_imp_open; + unsigned int nr_zones_exp_open; + unsigned int nr_zones_closed; + unsigned int imp_close_zone_no; + struct nullb_zone *zones; + sector_t zone_size_sects; + bool need_zone_res_mgmt; + spinlock_t zone_res_lock; + + unsigned long size; /* device size in MB */ + unsigned long completion_nsec; /* time in ns to complete a request */ + unsigned long cache_size; /* disk cache size in MB */ + unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ + unsigned int zone_nr_conv; /* number of conventional zones */ + unsigned int zone_max_open; /* max number of open zones */ + unsigned int zone_max_active; /* max number of active zones */ + unsigned int submit_queues; /* number of submission queues */ + unsigned int home_node; /* home node for the device */ + unsigned int queue_mode; /* block interface */ + unsigned int blocksize; /* block size */ + unsigned int max_sectors; /* Max sectors per command */ + unsigned int irqmode; /* IRQ completion handler */ + unsigned int hw_queue_depth; /* queue depth */ + unsigned int index; /* index of the disk, only valid with a disk */ + unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ + bool blocking; /* blocking blk-mq device */ + bool use_per_node_hctx; /* use per-node allocation for hardware context */ + bool power; /* power on/off the device */ + bool memory_backed; /* if data is stored in memory */ + bool discard; /* if support discard */ + bool zoned; /* if device is zoned */ +}; + +struct nullb { + struct nullb_device *dev; + struct list_head list; + unsigned int index; + struct request_queue *q; + struct gendisk *disk; + struct blk_mq_tag_set *tag_set; + struct blk_mq_tag_set __tag_set; + unsigned int queue_depth; + atomic_long_t cur_bytes; + struct hrtimer bw_timer; + unsigned long cache_flush_pos; + spinlock_t lock; + + struct nullb_queue *queues; + unsigned int nr_queues; + char disk_name[DISK_NAME_LEN]; +}; + +blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, + sector_t nr_sectors); +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors); + +#ifdef CONFIG_BLK_DEV_ZONED +int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); +int null_register_zoned_dev(struct nullb *nullb); +void null_free_zoned_dev(struct nullb_device *dev); +int null_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors); +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len); +#else +static inline int null_init_zoned_dev(struct nullb_device *dev, + struct request_queue *q) +{ + pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); + return -EINVAL; +} +static inline int null_register_zoned_dev(struct nullb *nullb) +{ + return -ENODEV; +} 
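+/*
+ * These !CONFIG_BLK_DEV_ZONED stubs let the core driver call the zoned
+ * helpers unconditionally: setting up a zoned null_blk device fails
+ * early with -EINVAL, registration reports -ENODEV, and any zoned
+ * command that still arrives is rejected with BLK_STS_NOTSUPP.
+ */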
+static inline void null_free_zoned_dev(struct nullb_device *dev) {} +static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, sector_t nr_sectors) +{ + return BLK_STS_NOTSUPP; +} +static inline size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, + unsigned int len) +{ + return len; +} +#define null_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ +#endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk/trace.c b/drivers/block/null_blk/trace.c new file mode 100644 index 000000000000..3711cba16071 --- /dev/null +++ b/drivers/block/null_blk/trace.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * null_blk trace related helpers. + * + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#include "trace.h" + +/* + * Helper to use for all null_blk traces to extract disk name. + */ +const char *nullb_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (name && *name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h new file mode 100644 index 000000000000..ce3b430e88c5 --- /dev/null +++ b/drivers/block/null_blk/trace.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * null_blk device driver tracepoints. + * + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nullb + +#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NULLB_H + +#include +#include + +#include "null_blk.h" + +const char *nullb_trace_disk_name(struct trace_seq *p, char *name); + +#define __print_disk_name(name) nullb_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline void __assign_disk_name(char *name, struct gendisk *disk) +{ + if (disk) + memcpy(name, disk->disk_name, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); +} +#endif + +TRACE_EVENT(nullb_zone_op, + TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, + unsigned int zone_cond), + TP_ARGS(cmd, zone_no, zone_cond), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(enum req_opf, op) + __field(unsigned int, zone_no) + __field(unsigned int, zone_cond) + ), + TP_fast_assign( + __entry->op = req_op(cmd->rq); + __entry->zone_no = zone_no; + __entry->zone_cond = zone_cond; + __assign_disk_name(__entry->disk, cmd->rq->rq_disk); + ), + TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", + __print_disk_name(__entry->disk), + blk_op_str(__entry->op), + __entry->zone_no, + blk_zone_cond_str(__entry->zone_cond)) +); + +TRACE_EVENT(nullb_report_zones, + TP_PROTO(struct nullb *nullb, unsigned int nr_zones), + TP_ARGS(nullb, nr_zones), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(unsigned int, nr_zones) + ), + TP_fast_assign( + __entry->nr_zones = nr_zones; + __assign_disk_name(__entry->disk, nullb->disk); + ), + TP_printk("%s nr_zones=%u", + __print_disk_name(__entry->disk), __entry->nr_zones) +); + +#endif /* _TRACE_NULLB_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
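+/*
+ * TRACE_INCLUDE_PATH ("." = this directory) and TRACE_INCLUDE_FILE
+ * ("trace" = trace.h) tell include/trace/define_trace.h where to
+ * re-read this header when it expands the TRACE_EVENT() definitions
+ * above into the actual tracepoint code.
+ */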
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c new file mode 100644 index 000000000000..148b871f263b --- /dev/null +++ b/drivers/block/null_blk/zoned.c @@ -0,0 +1,677 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "null_blk.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) + +static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) +{ + return sect >> ilog2(dev->zone_size_sects); +} + +static inline void null_lock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_lock_irq(&dev->zone_res_lock); +} + +static inline void null_unlock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_unlock_irq(&dev->zone_res_lock); +} + +static inline void null_init_zone_lock(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_init(&zone->spinlock); + else + mutex_init(&zone->mutex); +} + +static inline void null_lock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_irq(&zone->spinlock); + else + mutex_lock(&zone->mutex); +} + +static inline void null_unlock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_unlock_irq(&zone->spinlock); + else + mutex_unlock(&zone->mutex); +} + +int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) +{ + sector_t dev_capacity_sects, zone_capacity_sects; + struct nullb_zone *zone; + sector_t sector = 0; + unsigned int i; + + if (!is_power_of_2(dev->zone_size)) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + if (dev->zone_size > dev->size) { + pr_err("Zone size larger than device capacity\n"); + return -EINVAL; + } + + if (!dev->zone_capacity) + dev->zone_capacity = dev->zone_size; + + if (dev->zone_capacity > dev->zone_size) { + pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); + return -EINVAL; + } + + zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); + dev_capacity_sects = MB_TO_SECTS(dev->size); + dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); + dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); + if (dev_capacity_sects & (dev->zone_size_sects - 1)) + dev->nr_zones++; + + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone), + GFP_KERNEL | __GFP_ZERO); + if (!dev->zones) + return -ENOMEM; + + spin_lock_init(&dev->zone_res_lock); + + if (dev->zone_nr_conv >= dev->nr_zones) { + dev->zone_nr_conv = dev->nr_zones - 1; + pr_info("changed the number of conventional zones to %u", + dev->zone_nr_conv); + } + + /* Max active zones has to be < nbr of seq zones in order to be enforceable */ + if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_active = 0; + pr_info("zone_max_active limit disabled, limit >= zone count\n"); + } + + /* Max open zones has to be <= max active zones */ + if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { + dev->zone_max_open = dev->zone_max_active; + pr_info("changed the maximum number of open zones to %u\n", + dev->nr_zones); + } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_open = 0; + pr_info("zone_max_open limit disabled, limit >= zone count\n"); + } + 
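+	/*
+	 * After the clamping above, a non-zero zone_max_open can never
+	 * exceed zone_max_active, and a limit that would cover all
+	 * sequential zones has been dropped to 0 (unlimited) since it
+	 * could not be enforced anyway; resource bookkeeping is only
+	 * needed while at least one limit remains set.
+	 */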
dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; + dev->imp_close_zone_no = dev->zone_nr_conv; + + for (i = 0; i < dev->zone_nr_conv; i++) { + zone = &dev->zones[i]; + + null_init_zone_lock(dev, zone); + zone->start = sector; + zone->len = dev->zone_size_sects; + zone->capacity = zone->len; + zone->wp = zone->start + zone->len; + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->cond = BLK_ZONE_COND_NOT_WP; + + sector += dev->zone_size_sects; + } + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + zone = &dev->zones[i]; + + null_init_zone_lock(dev, zone); + zone->start = zone->wp = sector; + if (zone->start + dev->zone_size_sects > dev_capacity_sects) + zone->len = dev_capacity_sects - zone->start; + else + zone->len = dev->zone_size_sects; + zone->capacity = + min_t(sector_t, zone->len, zone_capacity_sects); + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone->cond = BLK_ZONE_COND_EMPTY; + + sector += dev->zone_size_sects; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + + return 0; +} + +int null_register_zoned_dev(struct nullb *nullb) +{ + struct nullb_device *dev = nullb->dev; + struct request_queue *q = nullb->q; + + if (queue_is_mq(q)) { + int ret = blk_revalidate_disk_zones(nullb->disk, NULL); + + if (ret) + return ret; + } else { + blk_queue_chunk_sectors(q, dev->zone_size_sects); + q->nr_zones = blkdev_nr_zones(nullb->disk); + } + + blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); + blk_queue_max_open_zones(q, dev->zone_max_open); + blk_queue_max_active_zones(q, dev->zone_max_active); + + return 0; +} + +void null_free_zoned_dev(struct nullb_device *dev) +{ + kvfree(dev->zones); +} + +int null_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nullb *nullb = disk->private_data; + struct nullb_device *dev = nullb->dev; + unsigned int first_zone, i; + struct nullb_zone *zone; + struct blk_zone blkz; + int error; + + first_zone = null_zone_no(dev, sector); + if (first_zone >= dev->nr_zones) + return 0; + + nr_zones = min(nr_zones, dev->nr_zones - first_zone); + trace_nullb_report_zones(nullb, nr_zones); + + memset(&blkz, 0, sizeof(struct blk_zone)); + zone = &dev->zones[first_zone]; + for (i = 0; i < nr_zones; i++, zone++) { + /* + * Stacked DM target drivers will remap the zone information by + * modifying the zone information passed to the report callback. + * So use a local copy to avoid corruption of the device zone + * array. + */ + null_lock_zone(dev, zone); + blkz.start = zone->start; + blkz.len = zone->len; + blkz.wp = zone->wp; + blkz.type = zone->type; + blkz.cond = zone->cond; + blkz.capacity = zone->capacity; + null_unlock_zone(dev, zone); + + error = cb(&blkz, i, data); + if (error) + return error; + } + + return nr_zones; +} + +/* + * This is called in the case of memory backing from null_process_cmd() + * with the target zone already locked. 
+ */ +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len) +{ + struct nullb_device *dev = nullb->dev; + struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + unsigned int nr_sectors = len >> SECTOR_SHIFT; + + /* Read must be below the write pointer position */ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || + sector + nr_sectors <= zone->wp) + return len; + + if (sector > zone->wp) + return 0; + + return (zone->wp - sector) << SECTOR_SHIFT; +} + +static blk_status_t __null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } + + return BLK_STS_OK; +} + +static void null_close_imp_open_zone(struct nullb_device *dev) +{ + struct nullb_zone *zone; + unsigned int zno, i; + + zno = dev->imp_close_zone_no; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + zone = &dev->zones[zno]; + zno++; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; + + if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { + __null_close_zone(dev, zone); + dev->imp_close_zone_no = zno; + return; + } + } +} + +static blk_status_t null_check_active(struct nullb_device *dev) +{ + if (!dev->zone_max_active) + return BLK_STS_OK; + + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + + dev->nr_zones_closed < dev->zone_max_active) + return BLK_STS_OK; + + return BLK_STS_ZONE_ACTIVE_RESOURCE; +} + +static blk_status_t null_check_open(struct nullb_device *dev) +{ + if (!dev->zone_max_open) + return BLK_STS_OK; + + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) + return BLK_STS_OK; + + if (dev->nr_zones_imp_open) { + if (null_check_active(dev) == BLK_STS_OK) { + null_close_imp_open_zone(dev); + return BLK_STS_OK; + } + } + + return BLK_STS_ZONE_OPEN_RESOURCE; +} + +/* + * This function matches the manage open zone resources function in the ZBC standard, + * with the addition of max active zones support (added in the ZNS standard). + * + * The function determines if a zone can transition to implicit open or explicit open, + * while maintaining the max open zone (and max active zone) limit(s). It may close an + * implicit open zone in order to make additional zone resources available. + * + * ZBC states that an implicit open zone shall be closed only if there is not + * room within the open limit. However, with the addition of an active limit, + * it is not certain that closing an implicit open zone will allow a new zone + * to be opened, since we might already be at the active limit capacity. 
+ */ +static blk_status_t null_check_zone_resources(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret; + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + ret = null_check_active(dev); + if (ret != BLK_STS_OK) + return ret; + fallthrough; + case BLK_ZONE_COND_CLOSED: + return null_check_open(dev); + default: + /* Should never be called for other states */ + WARN_ON(1); + return BLK_STS_IOERR; + } +} + +static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, + unsigned int nr_sectors, bool append) +{ + struct nullb_device *dev = cmd->nq->dev; + unsigned int zno = null_zone_no(dev, sector); + struct nullb_zone *zone = &dev->zones[zno]; + blk_status_t ret; + + trace_nullb_zone_op(cmd, zno, zone->cond); + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (append) + return BLK_STS_IOERR; + return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + } + + null_lock_zone(dev, zone); + + if (zone->cond == BLK_ZONE_COND_FULL) { + /* Cannot write to a full zone */ + ret = BLK_STS_IOERR; + goto unlock; + } + + /* + * Regular writes must be at the write pointer position. + * Zone append writes are automatically issued at the write + * pointer and the position returned using the request or BIO + * sector. + */ + if (append) { + sector = zone->wp; + if (cmd->bio) + cmd->bio->bi_iter.bi_sector = sector; + else + cmd->rq->__sector = sector; + } else if (sector != zone->wp) { + ret = BLK_STS_IOERR; + goto unlock; + } + + if (zone->wp + nr_sectors > zone->start + zone->capacity) { + ret = BLK_STS_IOERR; + goto unlock; + } + + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) { + null_lock_zone_res(dev); + + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + null_unlock_zone_res(dev); + goto unlock; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } + + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + null_unlock_zone_res(dev); + } + + ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + if (ret != BLK_STS_OK) + goto unlock; + + zone->wp += nr_sectors; + if (zone->wp == zone->start + zone->capacity) { + null_lock_zone_res(dev); + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + zone->cond = BLK_ZONE_COND_FULL; + null_unlock_zone_res(dev); + } + + ret = BLK_STS_OK; + +unlock: + null_unlock_zone(dev, zone); + + return ret; +} + +static blk_status_t null_open_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret = BLK_STS_OK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + /* open operation on exp open is not an error */ + goto unlock; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + default: + ret = BLK_STS_IOERR; + goto unlock; + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + dev->nr_zones_exp_open++; + +unlock: + null_unlock_zone_res(dev); + + return ret; +} + +static 
blk_status_t null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + ret = __null_close_zone(dev, zone); + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_finish_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret = BLK_STS_OK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* finish operation on full is not an error */ + goto unlock; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + dev->nr_zones_closed--; + break; + default: + ret = BLK_STS_IOERR; + goto unlock; + } + + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->len; + +unlock: + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_reset_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + /* reset operation on empty is not an error */ + null_unlock_zone_res(dev); + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + break; + default: + null_unlock_zone_res(dev); + return BLK_STS_IOERR; + } + + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + + null_unlock_zone_res(dev); + + if (dev->memory_backed) + return null_handle_discard(dev, zone->start, zone->len); + + return BLK_STS_OK; +} + +static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector) +{ + struct nullb_device *dev = cmd->nq->dev; + unsigned int zone_no; + struct nullb_zone *zone; + blk_status_t ret; + size_t i; + + if (op == REQ_OP_ZONE_RESET_ALL) { + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + zone = &dev->zones[i]; + null_lock_zone(dev, zone); + if (zone->cond != BLK_ZONE_COND_EMPTY) { + null_reset_zone(dev, zone); + trace_nullb_zone_op(cmd, i, zone->cond); + } + null_unlock_zone(dev, zone); + } + return BLK_STS_OK; + } + + zone_no = null_zone_no(dev, sector); + zone = &dev->zones[zone_no]; + + null_lock_zone(dev, zone); + + switch (op) { + case REQ_OP_ZONE_RESET: + ret = null_reset_zone(dev, zone); + break; + case REQ_OP_ZONE_OPEN: + ret = null_open_zone(dev, zone); + break; + case REQ_OP_ZONE_CLOSE: + ret = null_close_zone(dev, zone); + break; + case REQ_OP_ZONE_FINISH: + ret = null_finish_zone(dev, zone); + break; + default: + ret = BLK_STS_NOTSUPP; + break; + } + + if (ret == BLK_STS_OK) + trace_nullb_zone_op(cmd, zone_no, zone->cond); + + null_unlock_zone(dev, zone); + + return ret; +} + +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector, sector_t nr_sectors) +{ + struct nullb_device *dev; + struct nullb_zone *zone; + blk_status_t sts; + + switch (op) { + case REQ_OP_WRITE: + return null_zone_write(cmd, sector, nr_sectors, false); + case 
REQ_OP_ZONE_APPEND: + return null_zone_write(cmd, sector, nr_sectors, true); + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return null_zone_mgmt(cmd, op, sector); + default: + dev = cmd->nq->dev; + zone = &dev->zones[null_zone_no(dev, sector)]; + + null_lock_zone(dev, zone); + sts = null_process_cmd(cmd, op, sector, nr_sectors); + null_unlock_zone(dev, zone); + return sts; + } +} diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c deleted file mode 100644 index 5357c3a4a36f..000000000000 --- a/drivers/block/null_blk_main.c +++ /dev/null @@ -1,2031 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Add configfs and memory store: Kyungchan Koh and - * Shaohua Li - */ -#include - -#include -#include -#include -#include -#include "null_blk.h" - -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - -#define FREE_BATCH 16 - -#define TICKS_PER_SEC 50ULL -#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -static DECLARE_FAULT_ATTR(null_timeout_attr); -static DECLARE_FAULT_ATTR(null_requeue_attr); -static DECLARE_FAULT_ATTR(null_init_hctx_attr); -#endif - -static inline u64 mb_per_tick(int mbps) -{ - return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); -} - -/* - * Status flags for nullb_device. - * - * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. - * UP: Device is currently on and visible in userspace. - * THROTTLED: Device is being throttled. - * CACHE: Device is using a write-back cache. - */ -enum nullb_device_flags { - NULLB_DEV_FL_CONFIGURED = 0, - NULLB_DEV_FL_UP = 1, - NULLB_DEV_FL_THROTTLED = 2, - NULLB_DEV_FL_CACHE = 3, -}; - -#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) -/* - * nullb_page is a page in memory for nullb devices. - * - * @page: The page holding the data. - * @bitmap: The bitmap represents which sector in the page has data. - * Each bit represents one block size. For example, sector 8 - * will use the 7th bit - * The highest 2 bits of bitmap are for special purpose. LOCK means the cache - * page is being flushing to storage. FREE means the cache page is freed and - * should be skipped from flushing to storage. Please see - * null_make_cache_space - */ -struct nullb_page { - struct page *page; - DECLARE_BITMAP(bitmap, MAP_SZ); -}; -#define NULLB_PAGE_LOCK (MAP_SZ - 1) -#define NULLB_PAGE_FREE (MAP_SZ - 2) - -static LIST_HEAD(nullb_list); -static struct mutex lock; -static int null_major; -static DEFINE_IDA(nullb_indexes); -static struct blk_mq_tag_set tag_set; - -enum { - NULL_IRQ_NONE = 0, - NULL_IRQ_SOFTIRQ = 1, - NULL_IRQ_TIMER = 2, -}; - -enum { - NULL_Q_BIO = 0, - NULL_Q_RQ = 1, - NULL_Q_MQ = 2, -}; - -static int g_no_sched; -module_param_named(no_sched, g_no_sched, int, 0444); -MODULE_PARM_DESC(no_sched, "No io scheduler"); - -static int g_submit_queues = 1; -module_param_named(submit_queues, g_submit_queues, int, 0444); -MODULE_PARM_DESC(submit_queues, "Number of submission queues"); - -static int g_home_node = NUMA_NO_NODE; -module_param_named(home_node, g_home_node, int, 0444); -MODULE_PARM_DESC(home_node, "Home node for the device"); - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -/* - * For more details about fault injection, please refer to - * Documentation/fault-injection/fault-injection.rst. 
- */ -static char g_timeout_str[80]; -module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); -MODULE_PARM_DESC(timeout, "Fault injection. timeout=,,,"); - -static char g_requeue_str[80]; -module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); -MODULE_PARM_DESC(requeue, "Fault injection. requeue=,,,"); - -static char g_init_hctx_str[80]; -module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); -MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=,,,"); -#endif - -static int g_queue_mode = NULL_Q_MQ; - -static int null_param_store_val(const char *str, int *val, int min, int max) -{ - int ret, new_val; - - ret = kstrtoint(str, 10, &new_val); - if (ret) - return -EINVAL; - - if (new_val < min || new_val > max) - return -EINVAL; - - *val = new_val; - return 0; -} - -static int null_set_queue_mode(const char *str, const struct kernel_param *kp) -{ - return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); -} - -static const struct kernel_param_ops null_queue_mode_param_ops = { - .set = null_set_queue_mode, - .get = param_get_int, -}; - -device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); -MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); - -static int g_gb = 250; -module_param_named(gb, g_gb, int, 0444); -MODULE_PARM_DESC(gb, "Size in GB"); - -static int g_bs = 512; -module_param_named(bs, g_bs, int, 0444); -MODULE_PARM_DESC(bs, "Block size (in bytes)"); - -static int g_max_sectors; -module_param_named(max_sectors, g_max_sectors, int, 0444); -MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); - -static unsigned int nr_devices = 1; -module_param(nr_devices, uint, 0444); -MODULE_PARM_DESC(nr_devices, "Number of devices to register"); - -static bool g_blocking; -module_param_named(blocking, g_blocking, bool, 0444); -MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); - -static bool shared_tags; -module_param(shared_tags, bool, 0444); -MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); - -static bool g_shared_tag_bitmap; -module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); -MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); - -static int g_irqmode = NULL_IRQ_SOFTIRQ; - -static int null_set_irqmode(const char *str, const struct kernel_param *kp) -{ - return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, - NULL_IRQ_TIMER); -} - -static const struct kernel_param_ops null_irqmode_param_ops = { - .set = null_set_irqmode, - .get = param_get_int, -}; - -device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); -MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); - -static unsigned long g_completion_nsec = 10000; -module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); -MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); - -static int g_hw_queue_depth = 64; -module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); -MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); - -static bool g_use_per_node_hctx; -module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); -MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. 
Default: false"); - -static bool g_zoned; -module_param_named(zoned, g_zoned, bool, S_IRUGO); -MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); - -static unsigned long g_zone_size = 256; -module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); -MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); - -static unsigned long g_zone_capacity; -module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); -MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); - -static unsigned int g_zone_nr_conv; -module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); -MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); - -static unsigned int g_zone_max_open; -module_param_named(zone_max_open, g_zone_max_open, uint, 0444); -MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); - -static unsigned int g_zone_max_active; -module_param_named(zone_max_active, g_zone_max_active, uint, 0444); -MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); - -static struct nullb_device *null_alloc_dev(void); -static void null_free_dev(struct nullb_device *dev); -static void null_del_dev(struct nullb *nullb); -static int null_add_dev(struct nullb_device *dev); -static void null_free_device_storage(struct nullb_device *dev, bool is_cache); - -static inline struct nullb_device *to_nullb_device(struct config_item *item) -{ - return item ? container_of(item, struct nullb_device, item) : NULL; -} - -static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) -{ - return snprintf(page, PAGE_SIZE, "%u\n", val); -} - -static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, - char *page) -{ - return snprintf(page, PAGE_SIZE, "%lu\n", val); -} - -static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) -{ - return snprintf(page, PAGE_SIZE, "%u\n", val); -} - -static ssize_t nullb_device_uint_attr_store(unsigned int *val, - const char *page, size_t count) -{ - unsigned int tmp; - int result; - - result = kstrtouint(page, 0, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -static ssize_t nullb_device_ulong_attr_store(unsigned long *val, - const char *page, size_t count) -{ - int result; - unsigned long tmp; - - result = kstrtoul(page, 0, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, - size_t count) -{ - bool tmp; - int result; - - result = kstrtobool(page, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -/* The following macro should only be used with TYPE = {uint, ulong, bool}. 
*/ -#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ -static ssize_t \ -nullb_device_##NAME##_show(struct config_item *item, char *page) \ -{ \ - return nullb_device_##TYPE##_attr_show( \ - to_nullb_device(item)->NAME, page); \ -} \ -static ssize_t \ -nullb_device_##NAME##_store(struct config_item *item, const char *page, \ - size_t count) \ -{ \ - int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ - struct nullb_device *dev = to_nullb_device(item); \ - TYPE new_value = 0; \ - int ret; \ - \ - ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ - if (ret < 0) \ - return ret; \ - if (apply_fn) \ - ret = apply_fn(dev, new_value); \ - else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ - ret = -EBUSY; \ - if (ret < 0) \ - return ret; \ - dev->NAME = new_value; \ - return count; \ -} \ -CONFIGFS_ATTR(nullb_device_, NAME); - -static int nullb_apply_submit_queues(struct nullb_device *dev, - unsigned int submit_queues) -{ - struct nullb *nullb = dev->nullb; - struct blk_mq_tag_set *set; - - if (!nullb) - return 0; - - /* - * Make sure that null_init_hctx() does not access nullb->queues[] past - * the end of that array. - */ - if (submit_queues > nr_cpu_ids) - return -EINVAL; - set = nullb->tag_set; - blk_mq_update_nr_hw_queues(set, submit_queues); - return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM; -} - -NULLB_DEVICE_ATTR(size, ulong, NULL); -NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); -NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); -NULLB_DEVICE_ATTR(home_node, uint, NULL); -NULLB_DEVICE_ATTR(queue_mode, uint, NULL); -NULLB_DEVICE_ATTR(blocksize, uint, NULL); -NULLB_DEVICE_ATTR(max_sectors, uint, NULL); -NULLB_DEVICE_ATTR(irqmode, uint, NULL); -NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); -NULLB_DEVICE_ATTR(index, uint, NULL); -NULLB_DEVICE_ATTR(blocking, bool, NULL); -NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); -NULLB_DEVICE_ATTR(memory_backed, bool, NULL); -NULLB_DEVICE_ATTR(discard, bool, NULL); -NULLB_DEVICE_ATTR(mbps, uint, NULL); -NULLB_DEVICE_ATTR(cache_size, ulong, NULL); -NULLB_DEVICE_ATTR(zoned, bool, NULL); -NULLB_DEVICE_ATTR(zone_size, ulong, NULL); -NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); -NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); -NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); -NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); - -static ssize_t nullb_device_power_show(struct config_item *item, char *page) -{ - return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); -} - -static ssize_t nullb_device_power_store(struct config_item *item, - const char *page, size_t count) -{ - struct nullb_device *dev = to_nullb_device(item); - bool newp = false; - ssize_t ret; - - ret = nullb_device_bool_attr_store(&newp, page, count); - if (ret < 0) - return ret; - - if (!dev->power && newp) { - if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) - return count; - if (null_add_dev(dev)) { - clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return -ENOMEM; - } - - set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); - dev->power = newp; - } else if (dev->power && !newp) { - if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); - dev->power = newp; - null_del_dev(dev->nullb); - mutex_unlock(&lock); - } - clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); - } - - return count; -} - -CONFIGFS_ATTR(nullb_device_, power); - -static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) -{ - struct nullb_device *t_dev = to_nullb_device(item); - - return 
badblocks_show(&t_dev->badblocks, page, 0); -} - -static ssize_t nullb_device_badblocks_store(struct config_item *item, - const char *page, size_t count) -{ - struct nullb_device *t_dev = to_nullb_device(item); - char *orig, *buf, *tmp; - u64 start, end; - int ret; - - orig = kstrndup(page, count, GFP_KERNEL); - if (!orig) - return -ENOMEM; - - buf = strstrip(orig); - - ret = -EINVAL; - if (buf[0] != '+' && buf[0] != '-') - goto out; - tmp = strchr(&buf[1], '-'); - if (!tmp) - goto out; - *tmp = '\0'; - ret = kstrtoull(buf + 1, 0, &start); - if (ret) - goto out; - ret = kstrtoull(tmp + 1, 0, &end); - if (ret) - goto out; - ret = -EINVAL; - if (start > end) - goto out; - /* enable badblocks */ - cmpxchg(&t_dev->badblocks.shift, -1, 0); - if (buf[0] == '+') - ret = badblocks_set(&t_dev->badblocks, start, - end - start + 1, 1); - else - ret = badblocks_clear(&t_dev->badblocks, start, - end - start + 1); - if (ret == 0) - ret = count; -out: - kfree(orig); - return ret; -} -CONFIGFS_ATTR(nullb_device_, badblocks); - -static struct configfs_attribute *nullb_device_attrs[] = { - &nullb_device_attr_size, - &nullb_device_attr_completion_nsec, - &nullb_device_attr_submit_queues, - &nullb_device_attr_home_node, - &nullb_device_attr_queue_mode, - &nullb_device_attr_blocksize, - &nullb_device_attr_max_sectors, - &nullb_device_attr_irqmode, - &nullb_device_attr_hw_queue_depth, - &nullb_device_attr_index, - &nullb_device_attr_blocking, - &nullb_device_attr_use_per_node_hctx, - &nullb_device_attr_power, - &nullb_device_attr_memory_backed, - &nullb_device_attr_discard, - &nullb_device_attr_mbps, - &nullb_device_attr_cache_size, - &nullb_device_attr_badblocks, - &nullb_device_attr_zoned, - &nullb_device_attr_zone_size, - &nullb_device_attr_zone_capacity, - &nullb_device_attr_zone_nr_conv, - &nullb_device_attr_zone_max_open, - &nullb_device_attr_zone_max_active, - NULL, -}; - -static void nullb_device_release(struct config_item *item) -{ - struct nullb_device *dev = to_nullb_device(item); - - null_free_device_storage(dev, false); - null_free_dev(dev); -} - -static struct configfs_item_operations nullb_device_ops = { - .release = nullb_device_release, -}; - -static const struct config_item_type nullb_device_type = { - .ct_item_ops = &nullb_device_ops, - .ct_attrs = nullb_device_attrs, - .ct_owner = THIS_MODULE, -}; - -static struct -config_item *nullb_group_make_item(struct config_group *group, const char *name) -{ - struct nullb_device *dev; - - dev = null_alloc_dev(); - if (!dev) - return ERR_PTR(-ENOMEM); - - config_item_init_type_name(&dev->item, name, &nullb_device_type); - - return &dev->item; -} - -static void -nullb_group_drop_item(struct config_group *group, struct config_item *item) -{ - struct nullb_device *dev = to_nullb_device(item); - - if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); - dev->power = false; - null_del_dev(dev->nullb); - mutex_unlock(&lock); - } - - config_item_put(item); -} - -static ssize_t memb_group_features_show(struct config_item *item, char *page) -{ - return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); -} - -CONFIGFS_ATTR_RO(memb_group_, features); - -static struct configfs_attribute *nullb_group_attrs[] = { - &memb_group_attr_features, - NULL, -}; - -static struct configfs_group_operations nullb_group_ops = { - .make_item = nullb_group_make_item, - .drop_item = nullb_group_drop_item, -}; - -static const struct 
config_item_type nullb_group_type = { - .ct_group_ops = &nullb_group_ops, - .ct_attrs = nullb_group_attrs, - .ct_owner = THIS_MODULE, -}; - -static struct configfs_subsystem nullb_subsys = { - .su_group = { - .cg_item = { - .ci_namebuf = "nullb", - .ci_type = &nullb_group_type, - }, - }, -}; - -static inline int null_cache_active(struct nullb *nullb) -{ - return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); -} - -static struct nullb_device *null_alloc_dev(void) -{ - struct nullb_device *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return NULL; - INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); - INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); - if (badblocks_init(&dev->badblocks, 0)) { - kfree(dev); - return NULL; - } - - dev->size = g_gb * 1024; - dev->completion_nsec = g_completion_nsec; - dev->submit_queues = g_submit_queues; - dev->home_node = g_home_node; - dev->queue_mode = g_queue_mode; - dev->blocksize = g_bs; - dev->max_sectors = g_max_sectors; - dev->irqmode = g_irqmode; - dev->hw_queue_depth = g_hw_queue_depth; - dev->blocking = g_blocking; - dev->use_per_node_hctx = g_use_per_node_hctx; - dev->zoned = g_zoned; - dev->zone_size = g_zone_size; - dev->zone_capacity = g_zone_capacity; - dev->zone_nr_conv = g_zone_nr_conv; - dev->zone_max_open = g_zone_max_open; - dev->zone_max_active = g_zone_max_active; - return dev; -} - -static void null_free_dev(struct nullb_device *dev) -{ - if (!dev) - return; - - null_free_zoned_dev(dev); - badblocks_exit(&dev->badblocks); - kfree(dev); -} - -static void put_tag(struct nullb_queue *nq, unsigned int tag) -{ - clear_bit_unlock(tag, nq->tag_map); - - if (waitqueue_active(&nq->wait)) - wake_up(&nq->wait); -} - -static unsigned int get_tag(struct nullb_queue *nq) -{ - unsigned int tag; - - do { - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); - if (tag >= nq->queue_depth) - return -1U; - } while (test_and_set_bit_lock(tag, nq->tag_map)); - - return tag; -} - -static void free_cmd(struct nullb_cmd *cmd) -{ - put_tag(cmd->nq, cmd->tag); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); - -static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - unsigned int tag; - - tag = get_tag(nq); - if (tag != -1U) { - cmd = &nq->cmds[tag]; - cmd->tag = tag; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - return cmd; - } - - return NULL; -} - -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) -{ - struct nullb_cmd *cmd; - DEFINE_WAIT(wait); - - cmd = __alloc_cmd(nq); - if (cmd || !can_wait) - return cmd; - - do { - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); - cmd = __alloc_cmd(nq); - if (cmd) - break; - - io_schedule(); - } while (1); - - finish_wait(&nq->wait, &wait); - return cmd; -} - -static void end_cmd(struct nullb_cmd *cmd) -{ - int queue_mode = cmd->nq->dev->queue_mode; - - switch (queue_mode) { - case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, cmd->error); - return; - case NULL_Q_BIO: - cmd->bio->bi_status = cmd->error; - bio_endio(cmd->bio); - break; - } - - free_cmd(cmd); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) -{ - end_cmd(container_of(timer, struct nullb_cmd, timer)); - - return HRTIMER_NORESTART; -} - -static void null_cmd_end_timer(struct nullb_cmd *cmd) -{ - ktime_t kt = cmd->nq->dev->completion_nsec; - - hrtimer_start(&cmd->timer, 
kt, HRTIMER_MODE_REL); -} - -static void null_complete_rq(struct request *rq) -{ - end_cmd(blk_mq_rq_to_pdu(rq)); -} - -static struct nullb_page *null_alloc_page(gfp_t gfp_flags) -{ - struct nullb_page *t_page; - - t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); - if (!t_page) - goto out; - - t_page->page = alloc_pages(gfp_flags, 0); - if (!t_page->page) - goto out_freepage; - - memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); - return t_page; -out_freepage: - kfree(t_page); -out: - return NULL; -} - -static void null_free_page(struct nullb_page *t_page) -{ - __set_bit(NULLB_PAGE_FREE, t_page->bitmap); - if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) - return; - __free_page(t_page->page); - kfree(t_page); -} - -static bool null_page_empty(struct nullb_page *page) -{ - int size = MAP_SZ - 2; - - return find_first_bit(page->bitmap, size) == size; -} - -static void null_free_sector(struct nullb *nullb, sector_t sector, - bool is_cache) -{ - unsigned int sector_bit; - u64 idx; - struct nullb_page *t_page, *ret; - struct radix_tree_root *root; - - root = is_cache ? &nullb->dev->cache : &nullb->dev->data; - idx = sector >> PAGE_SECTORS_SHIFT; - sector_bit = (sector & SECTOR_MASK); - - t_page = radix_tree_lookup(root, idx); - if (t_page) { - __clear_bit(sector_bit, t_page->bitmap); - - if (null_page_empty(t_page)) { - ret = radix_tree_delete_item(root, idx, t_page); - WARN_ON(ret != t_page); - null_free_page(ret); - if (is_cache) - nullb->dev->curr_cache -= PAGE_SIZE; - } - } -} - -static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, - struct nullb_page *t_page, bool is_cache) -{ - struct radix_tree_root *root; - - root = is_cache ? &nullb->dev->cache : &nullb->dev->data; - - if (radix_tree_insert(root, idx, t_page)) { - null_free_page(t_page); - t_page = radix_tree_lookup(root, idx); - WARN_ON(!t_page || t_page->page->index != idx); - } else if (is_cache) - nullb->dev->curr_cache += PAGE_SIZE; - - return t_page; -} - -static void null_free_device_storage(struct nullb_device *dev, bool is_cache) -{ - unsigned long pos = 0; - int nr_pages; - struct nullb_page *ret, *t_pages[FREE_BATCH]; - struct radix_tree_root *root; - - root = is_cache ? &dev->cache : &dev->data; - - do { - int i; - - nr_pages = radix_tree_gang_lookup(root, - (void **)t_pages, pos, FREE_BATCH); - - for (i = 0; i < nr_pages; i++) { - pos = t_pages[i]->page->index; - ret = radix_tree_delete_item(root, pos, t_pages[i]); - WARN_ON(ret != t_pages[i]); - null_free_page(ret); - } - - pos++; - } while (nr_pages == FREE_BATCH); - - if (is_cache) - dev->curr_cache = 0; -} - -static struct nullb_page *__null_lookup_page(struct nullb *nullb, - sector_t sector, bool for_write, bool is_cache) -{ - unsigned int sector_bit; - u64 idx; - struct nullb_page *t_page; - struct radix_tree_root *root; - - idx = sector >> PAGE_SECTORS_SHIFT; - sector_bit = (sector & SECTOR_MASK); - - root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; - t_page = radix_tree_lookup(root, idx); - WARN_ON(t_page && t_page->page->index != idx); - - if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) - return t_page; - - return NULL; -} - -static struct nullb_page *null_lookup_page(struct nullb *nullb, - sector_t sector, bool for_write, bool ignore_cache) -{ - struct nullb_page *page = NULL; - - if (!ignore_cache) - page = __null_lookup_page(nullb, sector, for_write, true); - if (page) - return page; - return __null_lookup_page(nullb, sector, for_write, false); -} - -static struct nullb_page *null_insert_page(struct nullb *nullb, - sector_t sector, bool ignore_cache) - __releases(&nullb->lock) - __acquires(&nullb->lock) -{ - u64 idx; - struct nullb_page *t_page; - - t_page = null_lookup_page(nullb, sector, true, ignore_cache); - if (t_page) - return t_page; - - spin_unlock_irq(&nullb->lock); - - t_page = null_alloc_page(GFP_NOIO); - if (!t_page) - goto out_lock; - - if (radix_tree_preload(GFP_NOIO)) - goto out_freepage; - - spin_lock_irq(&nullb->lock); - idx = sector >> PAGE_SECTORS_SHIFT; - t_page->page->index = idx; - t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); - radix_tree_preload_end(); - - return t_page; -out_freepage: - null_free_page(t_page); -out_lock: - spin_lock_irq(&nullb->lock); - return null_lookup_page(nullb, sector, true, ignore_cache); -} - -static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) -{ - int i; - unsigned int offset; - u64 idx; - struct nullb_page *t_page, *ret; - void *dst, *src; - - idx = c_page->page->index; - - t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); - - __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); - if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { - null_free_page(c_page); - if (t_page && null_page_empty(t_page)) { - ret = radix_tree_delete_item(&nullb->dev->data, - idx, t_page); - null_free_page(t_page); - } - return 0; - } - - if (!t_page) - return -ENOMEM; - - src = kmap_atomic(c_page->page); - dst = kmap_atomic(t_page->page); - - for (i = 0; i < PAGE_SECTORS; - i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { - if (test_bit(i, c_page->bitmap)) { - offset = (i << SECTOR_SHIFT); - memcpy(dst + offset, src + offset, - nullb->dev->blocksize); - __set_bit(i, t_page->bitmap); - } - } - - kunmap_atomic(dst); - kunmap_atomic(src); - - ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); - null_free_page(ret); - nullb->dev->curr_cache -= PAGE_SIZE; - - return 0; -} - -static int null_make_cache_space(struct nullb *nullb, unsigned long n) -{ - int i, err, nr_pages; - struct nullb_page *c_pages[FREE_BATCH]; - unsigned long flushed = 0, one_round; - -again: - if ((nullb->dev->cache_size * 1024 * 1024) > - nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) - return 0; - - nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, - (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); - /* - * nullb_flush_cache_page could unlock before using the c_pages. 
To - * avoid race, we don't allow page free - */ - for (i = 0; i < nr_pages; i++) { - nullb->cache_flush_pos = c_pages[i]->page->index; - /* - * We found the page which is being flushed to disk by other - * threads - */ - if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) - c_pages[i] = NULL; - else - __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); - } - - one_round = 0; - for (i = 0; i < nr_pages; i++) { - if (c_pages[i] == NULL) - continue; - err = null_flush_cache_page(nullb, c_pages[i]); - if (err) - return err; - one_round++; - } - flushed += one_round << PAGE_SHIFT; - - if (n > flushed) { - if (nr_pages == 0) - nullb->cache_flush_pos = 0; - if (one_round == 0) { - /* give other threads a chance */ - spin_unlock_irq(&nullb->lock); - spin_lock_irq(&nullb->lock); - } - goto again; - } - return 0; -} - -static int copy_to_nullb(struct nullb *nullb, struct page *source, - unsigned int off, sector_t sector, size_t n, bool is_fua) -{ - size_t temp, count = 0; - unsigned int offset; - struct nullb_page *t_page; - void *dst, *src; - - while (count < n) { - temp = min_t(size_t, nullb->dev->blocksize, n - count); - - if (null_cache_active(nullb) && !is_fua) - null_make_cache_space(nullb, PAGE_SIZE); - - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_insert_page(nullb, sector, - !null_cache_active(nullb) || is_fua); - if (!t_page) - return -ENOSPC; - - src = kmap_atomic(source); - dst = kmap_atomic(t_page->page); - memcpy(dst + offset, src + off + count, temp); - kunmap_atomic(dst); - kunmap_atomic(src); - - __set_bit(sector & SECTOR_MASK, t_page->bitmap); - - if (is_fua) - null_free_sector(nullb, sector, true); - - count += temp; - sector += temp >> SECTOR_SHIFT; - } - return 0; -} - -static int copy_from_nullb(struct nullb *nullb, struct page *dest, - unsigned int off, sector_t sector, size_t n) -{ - size_t temp, count = 0; - unsigned int offset; - struct nullb_page *t_page; - void *dst, *src; - - while (count < n) { - temp = min_t(size_t, nullb->dev->blocksize, n - count); - - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_lookup_page(nullb, sector, false, - !null_cache_active(nullb)); - - dst = kmap_atomic(dest); - if (!t_page) { - memset(dst + off + count, 0, temp); - goto next; - } - src = kmap_atomic(t_page->page); - memcpy(dst + off + count, src + offset, temp); - kunmap_atomic(src); -next: - kunmap_atomic(dst); - - count += temp; - sector += temp >> SECTOR_SHIFT; - } - return 0; -} - -static void nullb_fill_pattern(struct nullb *nullb, struct page *page, - unsigned int len, unsigned int off) -{ - void *dst; - - dst = kmap_atomic(page); - memset(dst + off, 0xFF, len); - kunmap_atomic(dst); -} - -blk_status_t null_handle_discard(struct nullb_device *dev, - sector_t sector, sector_t nr_sectors) -{ - struct nullb *nullb = dev->nullb; - size_t n = nr_sectors << SECTOR_SHIFT; - size_t temp; - - spin_lock_irq(&nullb->lock); - while (n > 0) { - temp = min_t(size_t, n, dev->blocksize); - null_free_sector(nullb, sector, false); - if (null_cache_active(nullb)) - null_free_sector(nullb, sector, true); - sector += temp >> SECTOR_SHIFT; - n -= temp; - } - spin_unlock_irq(&nullb->lock); - - return BLK_STS_OK; -} - -static int null_handle_flush(struct nullb *nullb) -{ - int err; - - if (!null_cache_active(nullb)) - return 0; - - spin_lock_irq(&nullb->lock); - while (true) { - err = null_make_cache_space(nullb, - nullb->dev->cache_size * 1024 * 1024); - if (err || nullb->dev->curr_cache == 0) - break; - } - - WARN_ON(!radix_tree_empty(&nullb->dev->cache)); - 
spin_unlock_irq(&nullb->lock); - return err; -} - -static int null_transfer(struct nullb *nullb, struct page *page, - unsigned int len, unsigned int off, bool is_write, sector_t sector, - bool is_fua) -{ - struct nullb_device *dev = nullb->dev; - unsigned int valid_len = len; - int err = 0; - - if (!is_write) { - if (dev->zoned) - valid_len = null_zone_valid_read_len(nullb, - sector, len); - - if (valid_len) { - err = copy_from_nullb(nullb, page, off, - sector, valid_len); - off += valid_len; - len -= valid_len; - } - - if (len) - nullb_fill_pattern(nullb, page, len, off); - flush_dcache_page(page); - } else { - flush_dcache_page(page); - err = copy_to_nullb(nullb, page, off, sector, len, is_fua); - } - - return err; -} - -static int null_handle_rq(struct nullb_cmd *cmd) -{ - struct request *rq = cmd->rq; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector = blk_rq_pos(rq); - struct req_iterator iter; - struct bio_vec bvec; - - spin_lock_irq(&nullb->lock); - rq_for_each_segment(bvec, rq, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(req_op(rq)), sector, - rq->cmd_flags & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - - return 0; -} - -static int null_handle_bio(struct nullb_cmd *cmd) -{ - struct bio *bio = cmd->bio; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector = bio->bi_iter.bi_sector; - struct bio_vec bvec; - struct bvec_iter iter; - - spin_lock_irq(&nullb->lock); - bio_for_each_segment(bvec, bio, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), sector, - bio->bi_opf & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - return 0; -} - -static void null_stop_queue(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_stop_hw_queues(q); -} - -static void null_restart_queue_async(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_start_stopped_hw_queues(q, true); -} - -static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) -{ - struct nullb_device *dev = cmd->nq->dev; - struct nullb *nullb = dev->nullb; - blk_status_t sts = BLK_STS_OK; - struct request *rq = cmd->rq; - - if (!hrtimer_active(&nullb->bw_timer)) - hrtimer_restart(&nullb->bw_timer); - - if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { - null_stop_queue(nullb); - /* race with timer */ - if (atomic_long_read(&nullb->cur_bytes) > 0) - null_restart_queue_async(nullb); - /* requeue request */ - sts = BLK_STS_DEV_RESOURCE; - } - return sts; -} - -static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, - sector_t sector, - sector_t nr_sectors) -{ - struct badblocks *bb = &cmd->nq->dev->badblocks; - sector_t first_bad; - int bad_sectors; - - if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) - return BLK_STS_IOERR; - - return BLK_STS_OK; -} - -static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op, - sector_t sector, - sector_t nr_sectors) -{ - struct nullb_device *dev = cmd->nq->dev; - int err; - - if (op == REQ_OP_DISCARD) - return null_handle_discard(dev, sector, nr_sectors); - - if 
(dev->queue_mode == NULL_Q_BIO) - err = null_handle_bio(cmd); - else - err = null_handle_rq(cmd); - - return errno_to_blk_status(err); -} - -static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) -{ - struct nullb_device *dev = cmd->nq->dev; - struct bio *bio; - - if (dev->memory_backed) - return; - - if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { - zero_fill_bio(cmd->bio); - } else if (req_op(cmd->rq) == REQ_OP_READ) { - __rq_for_each_bio(bio, cmd->rq) - zero_fill_bio(bio); - } -} - -static inline void nullb_complete_cmd(struct nullb_cmd *cmd) -{ - /* - * Since root privileges are required to configure the null_blk - * driver, it is fine that this driver does not initialize the - * data buffers of read commands. Zero-initialize these buffers - * anyway if KMSAN is enabled to prevent that KMSAN complains - * about null_blk not initializing read data buffers. - */ - if (IS_ENABLED(CONFIG_KMSAN)) - nullb_zero_read_cmd_buffer(cmd); - - /* Complete IO by inline, softirq or timer */ - switch (cmd->nq->dev->irqmode) { - case NULL_IRQ_SOFTIRQ: - switch (cmd->nq->dev->queue_mode) { - case NULL_Q_MQ: - if (likely(!blk_should_fake_timeout(cmd->rq->q))) - blk_mq_complete_request(cmd->rq); - break; - case NULL_Q_BIO: - /* - * XXX: no proper submitting cpu information available. - */ - end_cmd(cmd); - break; - } - break; - case NULL_IRQ_NONE: - end_cmd(cmd); - break; - case NULL_IRQ_TIMER: - null_cmd_end_timer(cmd); - break; - } -} - -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors) -{ - struct nullb_device *dev = cmd->nq->dev; - blk_status_t ret; - - if (dev->badblocks.shift != -1) { - ret = null_handle_badblocks(cmd, sector, nr_sectors); - if (ret != BLK_STS_OK) - return ret; - } - - if (dev->memory_backed) - return null_handle_memory_backed(cmd, op, sector, nr_sectors); - - return BLK_STS_OK; -} - -static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors, enum req_opf op) -{ - struct nullb_device *dev = cmd->nq->dev; - struct nullb *nullb = dev->nullb; - blk_status_t sts; - - if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { - sts = null_handle_throttled(cmd); - if (sts != BLK_STS_OK) - return sts; - } - - if (op == REQ_OP_FLUSH) { - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); - goto out; - } - - if (dev->zoned) - cmd->error = null_process_zoned_cmd(cmd, op, - sector, nr_sectors); - else - cmd->error = null_process_cmd(cmd, op, sector, nr_sectors); - -out: - nullb_complete_cmd(cmd); - return BLK_STS_OK; -} - -static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) -{ - struct nullb *nullb = container_of(timer, struct nullb, bw_timer); - ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - unsigned int mbps = nullb->dev->mbps; - - if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) - return HRTIMER_NORESTART; - - atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); - null_restart_queue_async(nullb); - - hrtimer_forward_now(&nullb->bw_timer, timer_interval); - - return HRTIMER_RESTART; -} - -static void nullb_setup_bwtimer(struct nullb *nullb) -{ - ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - - hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - nullb->bw_timer.function = nullb_bwtimer_fn; - atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); - hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); -} - -static struct nullb_queue *nullb_to_queue(struct nullb 
*nullb) -{ - int index = 0; - - if (nullb->nr_queues != 1) - index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); - - return &nullb->queues[index]; -} - -static blk_qc_t null_submit_bio(struct bio *bio) -{ - sector_t sector = bio->bi_iter.bi_sector; - sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_disk->private_data; - struct nullb_queue *nq = nullb_to_queue(nullb); - struct nullb_cmd *cmd; - - cmd = alloc_cmd(nq, 1); - cmd->bio = bio; - - null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); - return BLK_QC_T_NONE; -} - -static bool should_timeout_request(struct request *rq) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_timeout_str[0]) - return should_fail(&null_timeout_attr, 1); -#endif - return false; -} - -static bool should_requeue_request(struct request *rq) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_requeue_str[0]) - return should_fail(&null_requeue_attr, 1); -#endif - return false; -} - -static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) -{ - pr_info("rq %p timed out\n", rq); - blk_mq_complete_request(rq); - return BLK_EH_DONE; -} - -static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - struct nullb_queue *nq = hctx->driver_data; - sector_t nr_sectors = blk_rq_sectors(bd->rq); - sector_t sector = blk_rq_pos(bd->rq); - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - cmd->rq = bd->rq; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - - blk_mq_start_request(bd->rq); - - if (should_requeue_request(bd->rq)) { - /* - * Alternate between hitting the core BUSY path, and the - * driver driven requeue path - */ - nq->requeue_selection++; - if (nq->requeue_selection & 1) - return BLK_STS_RESOURCE; - else { - blk_mq_requeue_request(bd->rq, true); - return BLK_STS_OK; - } - } - if (should_timeout_request(bd->rq)) - return BLK_STS_OK; - - return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); -} - -static void cleanup_queue(struct nullb_queue *nq) -{ - kfree(nq->tag_map); - kfree(nq->cmds); -} - -static void cleanup_queues(struct nullb *nullb) -{ - int i; - - for (i = 0; i < nullb->nr_queues; i++) - cleanup_queue(&nullb->queues[i]); - - kfree(nullb->queues); -} - -static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nullb_queue *nq = hctx->driver_data; - struct nullb *nullb = nq->dev->nullb; - - nullb->nr_queues--; -} - -static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) -{ - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; - nq->dev = nullb->dev; -} - -static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, - unsigned int hctx_idx) -{ - struct nullb *nullb = hctx->queue->queuedata; - struct nullb_queue *nq; - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) - return -EFAULT; -#endif - - nq = &nullb->queues[hctx_idx]; - hctx->driver_data = nq; - null_init_queue(nullb, nq); - nullb->nr_queues++; - - return 0; -} - -static const struct blk_mq_ops null_mq_ops = { - .queue_rq = null_queue_rq, - .complete = null_complete_rq, - .timeout = null_timeout_rq, - .init_hctx = null_init_hctx, - .exit_hctx = null_exit_hctx, -}; - -static void 
null_del_dev(struct nullb *nullb) -{ - struct nullb_device *dev; - - if (!nullb) - return; - - dev = nullb->dev; - - ida_simple_remove(&nullb_indexes, nullb->index); - - list_del_init(&nullb->list); - - del_gendisk(nullb->disk); - - if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { - hrtimer_cancel(&nullb->bw_timer); - atomic_long_set(&nullb->cur_bytes, LONG_MAX); - null_restart_queue_async(nullb); - } - - blk_cleanup_queue(nullb->q); - if (dev->queue_mode == NULL_Q_MQ && - nullb->tag_set == &nullb->__tag_set) - blk_mq_free_tag_set(nullb->tag_set); - put_disk(nullb->disk); - cleanup_queues(nullb); - if (null_cache_active(nullb)) - null_free_device_storage(nullb->dev, true); - kfree(nullb); - dev->nullb = NULL; -} - -static void null_config_discard(struct nullb *nullb) -{ - if (nullb->dev->discard == false) - return; - - if (!nullb->dev->memory_backed) { - nullb->dev->discard = false; - pr_info("discard option is ignored without memory backing\n"); - return; - } - - if (nullb->dev->zoned) { - nullb->dev->discard = false; - pr_info("discard option is ignored in zoned mode\n"); - return; - } - - nullb->q->limits.discard_granularity = nullb->dev->blocksize; - nullb->q->limits.discard_alignment = nullb->dev->blocksize; - blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); -} - -static const struct block_device_operations null_bio_ops = { - .owner = THIS_MODULE, - .submit_bio = null_submit_bio, - .report_zones = null_report_zones, -}; - -static const struct block_device_operations null_rq_ops = { - .owner = THIS_MODULE, - .report_zones = null_report_zones, -}; - -static int setup_commands(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - int i, tag_size; - - nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); - if (!nq->cmds) - return -ENOMEM; - - tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; - nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); - if (!nq->tag_map) { - kfree(nq->cmds); - return -ENOMEM; - } - - for (i = 0; i < nq->queue_depth; i++) { - cmd = &nq->cmds[i]; - cmd->tag = -1U; - } - - return 0; -} - -static int setup_queues(struct nullb *nullb) -{ - nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), - GFP_KERNEL); - if (!nullb->queues) - return -ENOMEM; - - nullb->queue_depth = nullb->dev->hw_queue_depth; - - return 0; -} - -static int init_driver_queues(struct nullb *nullb) -{ - struct nullb_queue *nq; - int i, ret = 0; - - for (i = 0; i < nullb->dev->submit_queues; i++) { - nq = &nullb->queues[i]; - - null_init_queue(nullb, nq); - - ret = setup_commands(nq); - if (ret) - return ret; - nullb->nr_queues++; - } - return 0; -} - -static int null_gendisk_register(struct nullb *nullb) -{ - sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; - struct gendisk *disk; - - disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); - if (!disk) - return -ENOMEM; - set_capacity(disk, size); - - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->major = null_major; - disk->first_minor = nullb->index; - if (queue_is_mq(nullb->q)) - disk->fops = &null_rq_ops; - else - disk->fops = &null_bio_ops; - disk->private_data = nullb; - disk->queue = nullb->q; - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); - - if (nullb->dev->zoned) { - int ret = null_register_zoned_dev(nullb); - - if (ret) - return ret; - } - - add_disk(disk); - return 0; -} - -static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set 
*set) -{ - set->ops = &null_mq_ops; - set->nr_hw_queues = nullb ? nullb->dev->submit_queues : - g_submit_queues; - set->queue_depth = nullb ? nullb->dev->hw_queue_depth : - g_hw_queue_depth; - set->numa_node = nullb ? nullb->dev->home_node : g_home_node; - set->cmd_size = sizeof(struct nullb_cmd); - set->flags = BLK_MQ_F_SHOULD_MERGE; - if (g_no_sched) - set->flags |= BLK_MQ_F_NO_SCHED; - if (g_shared_tag_bitmap) - set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; - set->driver_data = NULL; - - if ((nullb && nullb->dev->blocking) || g_blocking) - set->flags |= BLK_MQ_F_BLOCKING; - - return blk_mq_alloc_tag_set(set); -} - -static int null_validate_conf(struct nullb_device *dev) -{ - dev->blocksize = round_down(dev->blocksize, 512); - dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); - - if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { - if (dev->submit_queues != nr_online_nodes) - dev->submit_queues = nr_online_nodes; - } else if (dev->submit_queues > nr_cpu_ids) - dev->submit_queues = nr_cpu_ids; - else if (dev->submit_queues == 0) - dev->submit_queues = 1; - - dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); - dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); - - /* Do memory allocation, so set blocking */ - if (dev->memory_backed) - dev->blocking = true; - else /* cache is meaningless */ - dev->cache_size = 0; - dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, - dev->cache_size); - dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); - /* can not stop a queue */ - if (dev->queue_mode == NULL_Q_BIO) - dev->mbps = 0; - - if (dev->zoned && - (!dev->zone_size || !is_power_of_2(dev->zone_size))) { - pr_err("zone_size must be power-of-two\n"); - return -EINVAL; - } - - return 0; -} - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -static bool __null_setup_fault(struct fault_attr *attr, char *str) -{ - if (!str[0]) - return true; - - if (!setup_fault_attr(attr, str)) - return false; - - attr->verbose = 0; - return true; -} -#endif - -static bool null_setup_fault(void) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) - return false; - if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) - return false; - if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) - return false; -#endif - return true; -} - -static int null_add_dev(struct nullb_device *dev) -{ - struct nullb *nullb; - int rv; - - rv = null_validate_conf(dev); - if (rv) - return rv; - - nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); - if (!nullb) { - rv = -ENOMEM; - goto out; - } - nullb->dev = dev; - dev->nullb = nullb; - - spin_lock_init(&nullb->lock); - - rv = setup_queues(nullb); - if (rv) - goto out_free_nullb; - - if (dev->queue_mode == NULL_Q_MQ) { - if (shared_tags) { - nullb->tag_set = &tag_set; - rv = 0; - } else { - nullb->tag_set = &nullb->__tag_set; - rv = null_init_tag_set(nullb, nullb->tag_set); - } - - if (rv) - goto out_cleanup_queues; - - if (!null_setup_fault()) - goto out_cleanup_queues; - - nullb->tag_set->timeout = 5 * HZ; - nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); - if (IS_ERR(nullb->q)) { - rv = -ENOMEM; - goto out_cleanup_tags; - } - } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue(dev->home_node); - if (!nullb->q) { - rv = -ENOMEM; - goto out_cleanup_queues; - } - rv = init_driver_queues(nullb); - if (rv) - goto out_cleanup_blk_queue; - } - - if (dev->mbps) { - set_bit(NULLB_DEV_FL_THROTTLED, 
&dev->flags); - nullb_setup_bwtimer(nullb); - } - - if (dev->cache_size > 0) { - set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, true); - } - - if (dev->zoned) { - rv = null_init_zoned_dev(dev, nullb->q); - if (rv) - goto out_cleanup_blk_queue; - } - - nullb->q->queuedata = nullb; - blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); - - mutex_lock(&lock); - nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); - dev->index = nullb->index; - mutex_unlock(&lock); - - blk_queue_logical_block_size(nullb->q, dev->blocksize); - blk_queue_physical_block_size(nullb->q, dev->blocksize); - if (!dev->max_sectors) - dev->max_sectors = queue_max_hw_sectors(nullb->q); - dev->max_sectors = min_t(unsigned int, dev->max_sectors, - BLK_DEF_MAX_SECTORS); - blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); - - null_config_discard(nullb); - - sprintf(nullb->disk_name, "nullb%d", nullb->index); - - rv = null_gendisk_register(nullb); - if (rv) - goto out_cleanup_zone; - - mutex_lock(&lock); - list_add_tail(&nullb->list, &nullb_list); - mutex_unlock(&lock); - - return 0; -out_cleanup_zone: - null_free_zoned_dev(dev); -out_cleanup_blk_queue: - blk_cleanup_queue(nullb->q); -out_cleanup_tags: - if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) - blk_mq_free_tag_set(nullb->tag_set); -out_cleanup_queues: - cleanup_queues(nullb); -out_free_nullb: - kfree(nullb); - dev->nullb = NULL; -out: - return rv; -} - -static int __init null_init(void) -{ - int ret = 0; - unsigned int i; - struct nullb *nullb; - struct nullb_device *dev; - - if (g_bs > PAGE_SIZE) { - pr_warn("invalid block size\n"); - pr_warn("defaults block size to %lu\n", PAGE_SIZE); - g_bs = PAGE_SIZE; - } - - if (g_max_sectors > BLK_DEF_MAX_SECTORS) { - pr_warn("invalid max sectors\n"); - pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS); - g_max_sectors = BLK_DEF_MAX_SECTORS; - } - - if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { - pr_err("invalid home_node value\n"); - g_home_node = NUMA_NO_NODE; - } - - if (g_queue_mode == NULL_Q_RQ) { - pr_err("legacy IO path no longer available\n"); - return -EINVAL; - } - if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { - if (g_submit_queues != nr_online_nodes) { - pr_warn("submit_queues param is set to %u.\n", - nr_online_nodes); - g_submit_queues = nr_online_nodes; - } - } else if (g_submit_queues > nr_cpu_ids) - g_submit_queues = nr_cpu_ids; - else if (g_submit_queues <= 0) - g_submit_queues = 1; - - if (g_queue_mode == NULL_Q_MQ && shared_tags) { - ret = null_init_tag_set(NULL, &tag_set); - if (ret) - return ret; - } - - config_group_init(&nullb_subsys.su_group); - mutex_init(&nullb_subsys.su_mutex); - - ret = configfs_register_subsystem(&nullb_subsys); - if (ret) - goto err_tagset; - - mutex_init(&lock); - - null_major = register_blkdev(0, "nullb"); - if (null_major < 0) { - ret = null_major; - goto err_conf; - } - - for (i = 0; i < nr_devices; i++) { - dev = null_alloc_dev(); - if (!dev) { - ret = -ENOMEM; - goto err_dev; - } - ret = null_add_dev(dev); - if (ret) { - null_free_dev(dev); - goto err_dev; - } - } - - pr_info("module loaded\n"); - return 0; - -err_dev: - while (!list_empty(&nullb_list)) { - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); - } - unregister_blkdev(null_major, "nullb"); -err_conf: - configfs_unregister_subsystem(&nullb_subsys); -err_tagset: - if 
(g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); - return ret; -} - -static void __exit null_exit(void) -{ - struct nullb *nullb; - - configfs_unregister_subsystem(&nullb_subsys); - - unregister_blkdev(null_major, "nullb"); - - mutex_lock(&lock); - while (!list_empty(&nullb_list)) { - struct nullb_device *dev; - - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); - } - mutex_unlock(&lock); - - if (g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); -} - -module_init(null_init); -module_exit(null_exit); - -MODULE_AUTHOR("Jens Axboe "); -MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk_trace.c b/drivers/block/null_blk_trace.c deleted file mode 100644 index f246e7bff698..000000000000 --- a/drivers/block/null_blk_trace.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * null_blk trace related helpers. - * - * Copyright (C) 2020 Western Digital Corporation or its affiliates. - */ -#include "null_blk_trace.h" - -/* - * Helper to use for all null_blk traces to extract disk name. - */ -const char *nullb_trace_disk_name(struct trace_seq *p, char *name) -{ - const char *ret = trace_seq_buffer_ptr(p); - - if (name && *name) - trace_seq_printf(p, "disk=%s, ", name); - trace_seq_putc(p, 0); - - return ret; -} diff --git a/drivers/block/null_blk_trace.h b/drivers/block/null_blk_trace.h deleted file mode 100644 index 4f83032eb544..000000000000 --- a/drivers/block/null_blk_trace.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * null_blk device driver tracepoints. - * - * Copyright (C) 2020 Western Digital Corporation or its affiliates. - */ - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM nullb - -#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_NULLB_H - -#include -#include - -#include "null_blk.h" - -const char *nullb_trace_disk_name(struct trace_seq *p, char *name); - -#define __print_disk_name(name) nullb_trace_disk_name(p, name) - -#ifndef TRACE_HEADER_MULTI_READ -static inline void __assign_disk_name(char *name, struct gendisk *disk) -{ - if (disk) - memcpy(name, disk->disk_name, DISK_NAME_LEN); - else - memset(name, 0, DISK_NAME_LEN); -} -#endif - -TRACE_EVENT(nullb_zone_op, - TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, - unsigned int zone_cond), - TP_ARGS(cmd, zone_no, zone_cond), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(enum req_opf, op) - __field(unsigned int, zone_no) - __field(unsigned int, zone_cond) - ), - TP_fast_assign( - __entry->op = req_op(cmd->rq); - __entry->zone_no = zone_no; - __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->rq_disk); - ), - TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", - __print_disk_name(__entry->disk), - blk_op_str(__entry->op), - __entry->zone_no, - blk_zone_cond_str(__entry->zone_cond)) -); - -TRACE_EVENT(nullb_report_zones, - TP_PROTO(struct nullb *nullb, unsigned int nr_zones), - TP_ARGS(nullb, nr_zones), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(unsigned int, nr_zones) - ), - TP_fast_assign( - __entry->nr_zones = nr_zones; - __assign_disk_name(__entry->disk, nullb->disk); - ), - TP_printk("%s nr_zones=%u", - __print_disk_name(__entry->disk), __entry->nr_zones) -); - -#endif /* _TRACE_NULLB_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
-#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE null_blk_trace - -/* This part must be outside protection */ -#include diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c deleted file mode 100644 index 65464f7559e0..000000000000 --- a/drivers/block/null_blk_zoned.c +++ /dev/null @@ -1,677 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "null_blk.h" - -#define CREATE_TRACE_POINTS -#include "null_blk_trace.h" - -#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) - -static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) -{ - return sect >> ilog2(dev->zone_size_sects); -} - -static inline void null_lock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_lock_irq(&dev->zone_res_lock); -} - -static inline void null_unlock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_unlock_irq(&dev->zone_res_lock); -} - -static inline void null_init_zone_lock(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if (!dev->memory_backed) - spin_lock_init(&zone->spinlock); - else - mutex_init(&zone->mutex); -} - -static inline void null_lock_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if (!dev->memory_backed) - spin_lock_irq(&zone->spinlock); - else - mutex_lock(&zone->mutex); -} - -static inline void null_unlock_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if (!dev->memory_backed) - spin_unlock_irq(&zone->spinlock); - else - mutex_unlock(&zone->mutex); -} - -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) -{ - sector_t dev_capacity_sects, zone_capacity_sects; - struct nullb_zone *zone; - sector_t sector = 0; - unsigned int i; - - if (!is_power_of_2(dev->zone_size)) { - pr_err("zone_size must be power-of-two\n"); - return -EINVAL; - } - if (dev->zone_size > dev->size) { - pr_err("Zone size larger than device capacity\n"); - return -EINVAL; - } - - if (!dev->zone_capacity) - dev->zone_capacity = dev->zone_size; - - if (dev->zone_capacity > dev->zone_size) { - pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", - dev->zone_capacity, dev->zone_size); - return -EINVAL; - } - - zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); - dev_capacity_sects = MB_TO_SECTS(dev->size); - dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); - dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); - if (dev_capacity_sects & (dev->zone_size_sects - 1)) - dev->nr_zones++; - - dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone), - GFP_KERNEL | __GFP_ZERO); - if (!dev->zones) - return -ENOMEM; - - spin_lock_init(&dev->zone_res_lock); - - if (dev->zone_nr_conv >= dev->nr_zones) { - dev->zone_nr_conv = dev->nr_zones - 1; - pr_info("changed the number of conventional zones to %u", - dev->zone_nr_conv); - } - - /* Max active zones has to be < nbr of seq zones in order to be enforceable */ - if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { - dev->zone_max_active = 0; - pr_info("zone_max_active limit disabled, limit >= zone count\n"); - } - - /* Max open zones has to be <= max active zones */ - if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { - dev->zone_max_open = dev->zone_max_active; - pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); - } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { - dev->zone_max_open = 0; - pr_info("zone_max_open limit disabled, limit >= zone count\n"); - } 
- dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; - dev->imp_close_zone_no = dev->zone_nr_conv; - - for (i = 0; i < dev->zone_nr_conv; i++) { - zone = &dev->zones[i]; - - null_init_zone_lock(dev, zone); - zone->start = sector; - zone->len = dev->zone_size_sects; - zone->capacity = zone->len; - zone->wp = zone->start + zone->len; - zone->type = BLK_ZONE_TYPE_CONVENTIONAL; - zone->cond = BLK_ZONE_COND_NOT_WP; - - sector += dev->zone_size_sects; - } - - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - zone = &dev->zones[i]; - - null_init_zone_lock(dev, zone); - zone->start = zone->wp = sector; - if (zone->start + dev->zone_size_sects > dev_capacity_sects) - zone->len = dev_capacity_sects - zone->start; - else - zone->len = dev->zone_size_sects; - zone->capacity = - min_t(sector_t, zone->len, zone_capacity_sects); - zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; - zone->cond = BLK_ZONE_COND_EMPTY; - - sector += dev->zone_size_sects; - } - - q->limits.zoned = BLK_ZONED_HM; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - - return 0; -} - -int null_register_zoned_dev(struct nullb *nullb) -{ - struct nullb_device *dev = nullb->dev; - struct request_queue *q = nullb->q; - - if (queue_is_mq(q)) { - int ret = blk_revalidate_disk_zones(nullb->disk, NULL); - - if (ret) - return ret; - } else { - blk_queue_chunk_sectors(q, dev->zone_size_sects); - q->nr_zones = blkdev_nr_zones(nullb->disk); - } - - blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); - blk_queue_max_open_zones(q, dev->zone_max_open); - blk_queue_max_active_zones(q, dev->zone_max_active); - - return 0; -} - -void null_free_zoned_dev(struct nullb_device *dev) -{ - kvfree(dev->zones); -} - -int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) -{ - struct nullb *nullb = disk->private_data; - struct nullb_device *dev = nullb->dev; - unsigned int first_zone, i; - struct nullb_zone *zone; - struct blk_zone blkz; - int error; - - first_zone = null_zone_no(dev, sector); - if (first_zone >= dev->nr_zones) - return 0; - - nr_zones = min(nr_zones, dev->nr_zones - first_zone); - trace_nullb_report_zones(nullb, nr_zones); - - memset(&blkz, 0, sizeof(struct blk_zone)); - zone = &dev->zones[first_zone]; - for (i = 0; i < nr_zones; i++, zone++) { - /* - * Stacked DM target drivers will remap the zone information by - * modifying the zone information passed to the report callback. - * So use a local copy to avoid corruption of the device zone - * array. - */ - null_lock_zone(dev, zone); - blkz.start = zone->start; - blkz.len = zone->len; - blkz.wp = zone->wp; - blkz.type = zone->type; - blkz.cond = zone->cond; - blkz.capacity = zone->capacity; - null_unlock_zone(dev, zone); - - error = cb(&blkz, i, data); - if (error) - return error; - } - - return nr_zones; -} - -/* - * This is called in the case of memory backing from null_process_cmd() - * with the target zone already locked. 
- */ -size_t null_zone_valid_read_len(struct nullb *nullb, - sector_t sector, unsigned int len) -{ - struct nullb_device *dev = nullb->dev; - struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; - unsigned int nr_sectors = len >> SECTOR_SHIFT; - - /* Read must be below the write pointer position */ - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || - sector + nr_sectors <= zone->wp) - return len; - - if (sector > zone->wp) - return 0; - - return (zone->wp - sector) << SECTOR_SHIFT; -} - -static blk_status_t __null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_CLOSED: - /* close operation on closed is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - if (zone->wp == zone->start) { - zone->cond = BLK_ZONE_COND_EMPTY; - } else { - zone->cond = BLK_ZONE_COND_CLOSED; - dev->nr_zones_closed++; - } - - return BLK_STS_OK; -} - -static void null_close_imp_open_zone(struct nullb_device *dev) -{ - struct nullb_zone *zone; - unsigned int zno, i; - - zno = dev->imp_close_zone_no; - if (zno >= dev->nr_zones) - zno = dev->zone_nr_conv; - - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - zone = &dev->zones[zno]; - zno++; - if (zno >= dev->nr_zones) - zno = dev->zone_nr_conv; - - if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, zone); - dev->imp_close_zone_no = zno; - return; - } - } -} - -static blk_status_t null_check_active(struct nullb_device *dev) -{ - if (!dev->zone_max_active) - return BLK_STS_OK; - - if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + - dev->nr_zones_closed < dev->zone_max_active) - return BLK_STS_OK; - - return BLK_STS_ZONE_ACTIVE_RESOURCE; -} - -static blk_status_t null_check_open(struct nullb_device *dev) -{ - if (!dev->zone_max_open) - return BLK_STS_OK; - - if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) - return BLK_STS_OK; - - if (dev->nr_zones_imp_open) { - if (null_check_active(dev) == BLK_STS_OK) { - null_close_imp_open_zone(dev); - return BLK_STS_OK; - } - } - - return BLK_STS_ZONE_OPEN_RESOURCE; -} - -/* - * This function matches the manage open zone resources function in the ZBC standard, - * with the addition of max active zones support (added in the ZNS standard). - * - * The function determines if a zone can transition to implicit open or explicit open, - * while maintaining the max open zone (and max active zone) limit(s). It may close an - * implicit open zone in order to make additional zone resources available. - * - * ZBC states that an implicit open zone shall be closed only if there is not - * room within the open limit. However, with the addition of an active limit, - * it is not certain that closing an implicit open zone will allow a new zone - * to be opened, since we might already be at the active limit capacity. 
- */ -static blk_status_t null_check_zone_resources(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret; - - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - ret = null_check_active(dev); - if (ret != BLK_STS_OK) - return ret; - fallthrough; - case BLK_ZONE_COND_CLOSED: - return null_check_open(dev); - default: - /* Should never be called for other states */ - WARN_ON(1); - return BLK_STS_IOERR; - } -} - -static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors, bool append) -{ - struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); - struct nullb_zone *zone = &dev->zones[zno]; - blk_status_t ret; - - trace_nullb_zone_op(cmd, zno, zone->cond); - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { - if (append) - return BLK_STS_IOERR; - return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - } - - null_lock_zone(dev, zone); - - if (zone->cond == BLK_ZONE_COND_FULL) { - /* Cannot write to a full zone */ - ret = BLK_STS_IOERR; - goto unlock; - } - - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. - */ - if (append) { - sector = zone->wp; - if (cmd->bio) - cmd->bio->bi_iter.bi_sector = sector; - else - cmd->rq->__sector = sector; - } else if (sector != zone->wp) { - ret = BLK_STS_IOERR; - goto unlock; - } - - if (zone->wp + nr_sectors > zone->start + zone->capacity) { - ret = BLK_STS_IOERR; - goto unlock; - } - - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) { - null_lock_zone_res(dev); - - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) { - null_unlock_zone_res(dev); - goto unlock; - } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; - } - - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - null_unlock_zone_res(dev); - } - - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (ret != BLK_STS_OK) - goto unlock; - - zone->wp += nr_sectors; - if (zone->wp == zone->start + zone->capacity) { - null_lock_zone_res(dev); - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) - dev->nr_zones_exp_open--; - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) - dev->nr_zones_imp_open--; - zone->cond = BLK_ZONE_COND_FULL; - null_unlock_zone_res(dev); - } - - ret = BLK_STS_OK; - -unlock: - null_unlock_zone(dev, zone); - - return ret; -} - -static blk_status_t null_open_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret = BLK_STS_OK; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - - switch (zone->cond) { - case BLK_ZONE_COND_EXP_OPEN: - /* open operation on exp open is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - default: - ret = BLK_STS_IOERR; - goto unlock; - } - - zone->cond = BLK_ZONE_COND_EXP_OPEN; - dev->nr_zones_exp_open++; - -unlock: - null_unlock_zone_res(dev); - - return ret; -} - -static 
blk_status_t null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - ret = __null_close_zone(dev, zone); - null_unlock_zone_res(dev); - - return ret; -} - -static blk_status_t null_finish_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret = BLK_STS_OK; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* finish operation on full is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - default: - ret = BLK_STS_IOERR; - goto unlock; - } - - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = zone->start + zone->len; - -unlock: - null_unlock_zone_res(dev); - - return ret; -} - -static blk_status_t null_reset_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - /* reset operation on empty is not an error */ - null_unlock_zone_res(dev); - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - break; - default: - null_unlock_zone_res(dev); - return BLK_STS_IOERR; - } - - zone->cond = BLK_ZONE_COND_EMPTY; - zone->wp = zone->start; - - null_unlock_zone_res(dev); - - if (dev->memory_backed) - return null_handle_discard(dev, zone->start, zone->len); - - return BLK_STS_OK; -} - -static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, - sector_t sector) -{ - struct nullb_device *dev = cmd->nq->dev; - unsigned int zone_no; - struct nullb_zone *zone; - blk_status_t ret; - size_t i; - - if (op == REQ_OP_ZONE_RESET_ALL) { - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - zone = &dev->zones[i]; - null_lock_zone(dev, zone); - if (zone->cond != BLK_ZONE_COND_EMPTY) { - null_reset_zone(dev, zone); - trace_nullb_zone_op(cmd, i, zone->cond); - } - null_unlock_zone(dev, zone); - } - return BLK_STS_OK; - } - - zone_no = null_zone_no(dev, sector); - zone = &dev->zones[zone_no]; - - null_lock_zone(dev, zone); - - switch (op) { - case REQ_OP_ZONE_RESET: - ret = null_reset_zone(dev, zone); - break; - case REQ_OP_ZONE_OPEN: - ret = null_open_zone(dev, zone); - break; - case REQ_OP_ZONE_CLOSE: - ret = null_close_zone(dev, zone); - break; - case REQ_OP_ZONE_FINISH: - ret = null_finish_zone(dev, zone); - break; - default: - ret = BLK_STS_NOTSUPP; - break; - } - - if (ret == BLK_STS_OK) - trace_nullb_zone_op(cmd, zone_no, zone->cond); - - null_unlock_zone(dev, zone); - - return ret; -} - -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, - sector_t sector, sector_t nr_sectors) -{ - struct nullb_device *dev; - struct nullb_zone *zone; - blk_status_t sts; - - switch (op) { - case REQ_OP_WRITE: - return null_zone_write(cmd, sector, nr_sectors, false); - case 
REQ_OP_ZONE_APPEND:
-		return null_zone_write(cmd, sector, nr_sectors, true);
-	case REQ_OP_ZONE_RESET:
-	case REQ_OP_ZONE_RESET_ALL:
-	case REQ_OP_ZONE_OPEN:
-	case REQ_OP_ZONE_CLOSE:
-	case REQ_OP_ZONE_FINISH:
-		return null_zone_mgmt(cmd, op, sector);
-	default:
-		dev = cmd->nq->dev;
-		zone = &dev->zones[null_zone_no(dev, sector)];
-
-		null_lock_zone(dev, zone);
-		sts = null_process_cmd(cmd, op, sector, nr_sectors);
-		null_unlock_zone(dev, zone);
-		return sts;
-	}
-}
--
cgit v1.2.3-59-g8ed1b

From f87905660ed01d85e45eac22d479f31f380b2f50 Mon Sep 17 00:00:00 2001
From: tangzhenhao
Date: Sun, 29 Nov 2020 23:23:56 -0800
Subject: drivers/lightnvm: fix a null-ptr-deref bug in pblk-core.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At line 294 in drivers/lightnvm/pblk-write.c, pblk_gen_run_ws() is
called with GFP_ATOMIC as the actual parameter. pblk_gen_run_ws()
passes this GFP_ATOMIC flag on to mempool_alloc(), which can therefore
return NULL. Check the return value of mempool_alloc() to avoid a
null-ptr-deref bug.

Signed-off-by: tangzhenhao
Reviewed-by: Matias Bjørling
Signed-off-by: Jens Axboe
---
 drivers/lightnvm/pblk-core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 97c68731406b..1dddba11e721 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1869,6 +1869,10 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
 	struct pblk_line_ws *line_ws;
 
 	line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask);
+	if (!line_ws) {
+		pblk_err(pblk, "pblk: could not allocate memory\n");
+		return;
+	}
 
 	line_ws->pblk = pblk;
 	line_ws->line = line;
--
cgit v1.2.3-59-g8ed1b

From b5f32555567cfe0a5d5dbe7c1e85ebe37b3f545a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 4 Dec 2020 17:48:48 +0100
Subject: cdrom: Reset sector_size back if it is not 2048.

In v2.4.0-test2pre2 mmc_ioctl_cdrom_read_data() was extended by issuing
a MODE_SELECT opcode to change the sector size and READ_10 to perform
the actual read if the READ_CD opcode is not supported. The sector size
is never changed back to the previous value of 2048 bytes, although the
comment for version 3.09 of the cdrom.c file notes that it should be.

Use cdrom_switch_blocksize() to change the sector size only if the
requested size deviates from 2048. Change it back to 2048 after the
read operation if a change was made.

Link: https://lkml.kernel.org/r/20201204164803.ovwurzs3257em2rp@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Jens Axboe
---
 drivers/cdrom/cdrom.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 0c271b9e3c5b..8f0e52a71493 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2996,13 +2996,15 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 		 * SCSI-II devices are not required to support
 		 * READ_CD, so let's try switching block size
 		 */
-		/* FIXME: switch back again... */
From b5f32555567cfe0a5d5dbe7c1e85ebe37b3f545a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 4 Dec 2020 17:48:48 +0100
Subject: cdrom: Reset sector_size back if it is not 2048.

In v2.4.0-test2pre2 mmc_ioctl_cdrom_read_data() was extended by issuing a
MODE_SELECT opcode to change the sector size and READ_10 to perform the
actual read if the READ_CD opcode is not supported. The sector size is
never changed back to the previous value of 2048 bytes, even though the
comment for version 3.09 of the cdrom.c file says it should be.

Use cdrom_switch_blocksize() to change the sector size only if the
requested size deviates from 2048. Change it back to 2048 after the read
operation if a change was made.

Link: https://lkml.kernel.org/r/20201204164803.ovwurzs3257em2rp@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Jens Axboe
---
 drivers/cdrom/cdrom.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 0c271b9e3c5b..8f0e52a71493 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2996,13 +2996,15 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 		 * SCSI-II devices are not required to support
 		 * READ_CD, so let's try switching block size
 		 */
-		/* FIXME: switch back again... */
-		ret = cdrom_switch_blocksize(cdi, blocksize);
-		if (ret)
-			goto out;
+		if (blocksize != CD_FRAMESIZE) {
+			ret = cdrom_switch_blocksize(cdi, blocksize);
+			if (ret)
+				goto out;
+		}
 		cgc->sshdr = NULL;
 		ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
-		ret |= cdrom_switch_blocksize(cdi, blocksize);
+		if (blocksize != CD_FRAMESIZE)
+			ret |= cdrom_switch_blocksize(cdi, CD_FRAMESIZE);
 	}
 	if (!ret && copy_to_user(arg, cgc->buffer, blocksize))
 		ret = -EFAULT;
--
cgit v1.2.3-59-g8ed1b

From 8d2ac857a81d5a44b9643038291ea958bbf05c7f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 4 Dec 2020 17:48:49 +0100
Subject: sr: Switch the sector size back to 2048 if sr_read_sector() changed it.

sr_read_sector() is hardly used since v2.3.16. Its only purpose is to
check if it is an XA medium via sr_is_xa(). This check is only performed
if the module parameter `xa_test' is enabled.

Change the sector size back to 2048 if it was changed. With this change,
there is no lazy sector size changing left.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Jens Axboe
---
 drivers/scsi/sr_ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c
index ffcf902da390..5703f8400b73 100644
--- a/drivers/scsi/sr_ioctl.c
+++ b/drivers/scsi/sr_ioctl.c
@@ -549,6 +549,8 @@ static int sr_read_sector(Scsi_CD *cd, int lba, int blksize, unsigned char *dest
 	cgc.timeout = IOCTL_TIMEOUT;
 	rc = sr_do_ioctl(cd, &cgc);
 
+	if (blksize != CD_FRAMESIZE)
+		rc |= sr_set_blocklength(cd, CD_FRAMESIZE);
 	return rc;
 }
--
cgit v1.2.3-59-g8ed1b
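Both sector-size fixes above share one shape: switch the device's block
size only when the requested size differs from the 2048-byte default, and
unconditionally restore the default afterwards. A condensed sketch of that
pattern; the demo_* helpers and struct are hypothetical stand-ins for
cdrom_switch_blocksize() and sr_set_blocklength(), while CD_FRAMESIZE is
the real 2048-byte constant from <linux/cdrom.h>:

#include <linux/cdrom.h>	/* CD_FRAMESIZE == 2048 */

struct demo_cd;

/* Hypothetical helpers; the real drivers issue MODE SELECT / READ(10). */
int demo_set_blocksize(struct demo_cd *cd, int size);
int demo_read_lba(struct demo_cd *cd, int lba, int size, void *buf);

static int demo_read_sector(struct demo_cd *cd, int lba, int blksize,
			    void *buf)
{
	int ret = 0;

	/* Only touch device state when a non-default size is requested. */
	if (blksize != CD_FRAMESIZE) {
		ret = demo_set_blocksize(cd, blksize);
		if (ret)
			return ret;
	}

	ret = demo_read_lba(cd, lba, blksize, buf);

	/* Always restore the 2048-byte default before returning. */
	if (blksize != CD_FRAMESIZE)
		ret |= demo_set_blocksize(cd, CD_FRAMESIZE);

	return ret;
}

With no lazy switch left pending after any call, the in_interrupt() based
cleanup removed by the next patch has nothing to do.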
From 31cc07761ccb389c7c01f904f6a6479544abbd11 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 4 Dec 2020 17:48:50 +0100
Subject: sr: Remove in_interrupt() usage in sr_init_command().

The in_interrupt() check in sr_init_command() is a leftover from the
past, the pre-v2.3.16 era to be exact. Back then, the ioctl() was served
by `sr' itself, and sector-size changes by CDROMREADMODE2 (as noted in
the comment) were accounted for within sr's data structures. This allowed
a "lazy" reset: it could be skipped on the next request, and the size was
reset back to the default value once the device node was closed or before
a command from the block queue was issued.

It does not work like that anymore: CDROMREADMODE2 is served by cdrom's
mmc_ioctl() function, which may change the sector size, but the `sr'
driver does not learn about it, so its ->sector_size is not updated. The
ioctl() resets the changed sector size back to 2048. sr_read_sector()
also resets the sector size back to the default once it is done.

Remove the conditional sector size update from sr_init_command() and
sr_release() because it is not needed.

Link: https://lkml.kernel.org/r/20201204164803.ovwurzs3257em2rp@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Jens Axboe
---
 drivers/scsi/sr.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index fd4b582110b2..e4633b84c556 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -416,19 +416,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt)
 		goto out;
 	}
 
-	/*
-	 * we do lazy blocksize switching (when reading XA sectors,
-	 * see CDROMREADMODE2 ioctl)
-	 */
 	s_size = cd->device->sector_size;
-	if (s_size > 2048) {
-		if (!in_interrupt())
-			sr_set_blocklength(cd, 2048);
-		else
-			scmd_printk(KERN_INFO, SCpnt,
-				    "can't switch blocksize: in interrupt\n");
-	}
-
 	if (s_size != 512 && s_size != 1024 && s_size != 2048) {
 		scmd_printk(KERN_ERR, SCpnt, "bad sector size %d\n", s_size);
 		goto out;
@@ -701,11 +689,6 @@ error_out:
 
 static void sr_release(struct cdrom_device_info *cdi)
 {
-	struct scsi_cd *cd = cdi->handle;
-
-	if (cd->device->sector_size > 2048)
-		sr_set_blocklength(cd, 2048);
-
 }
 
 static int sr_probe(struct device *dev)
--
cgit v1.2.3-59-g8ed1b

From aeb2b0b1a3da5791d3b216e71ec72db7570f3571 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn
Date: Sat, 12 Dec 2020 06:13:02 +0100
Subject: block: drop dead assignments in loop_init()

Commit 8410d38c2552 ("loop: use __register_blkdev to allocate devices on
demand") simplified loop_init(), so computing the range of the block
region is no longer required and can be dropped.

Drop the dead assignments in loop_init(). As compilers will detect these
unneeded assignments and optimize them away, the resulting object code is
identical before and after this change.

No functional change. No change in object code.

Signed-off-by: Lukas Bulwahn
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 22e59410b971..a45248c6e319 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2316,7 +2316,6 @@ MODULE_ALIAS("devname:loop-control");
 
 static int __init loop_init(void)
 {
 	int i, nr;
-	unsigned long range;
 	struct loop_device *lo;
 	int err;
 
@@ -2353,13 +2352,10 @@ static int __init loop_init(void)
 	 * /dev/loop-control interface, or be instantiated by accessing
 	 * a 'dead' device node.
 	 */
-	if (max_loop) {
+	if (max_loop)
 		nr = max_loop;
-		range = max_loop << part_shift;
-	} else {
+	else
 		nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
-		range = 1UL << MINORBITS;
-	}
 
 	err = misc_register(&loop_misc);
 	if (err < 0)
--
cgit v1.2.3-59-g8ed1b
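For reference, the reason `range' became dead: since commit 8410d38c2552
the loop driver passes a probe callback to __register_blkdev(), so loop
devices are created when their minor is first accessed instead of a minor
range being reserved up front. A rough sketch of that mechanism,
reconstructed from the referenced commit; loop_add(), part_shift and
max_loop are loop-internal symbols, and the exact code may differ:

#include <linux/blkdev.h>
#include <linux/major.h>

/* Probe hook: called by the block layer when an unknown loop minor
 * (e.g. /dev/loop7) is opened; it creates the device on demand. */
static void loop_probe(dev_t dev)
{
	int idx = MINOR(dev) >> part_shift;
	struct loop_device *lo;

	if (max_loop && idx >= max_loop)
		return;		/* max_loop stays an upper bound */
	loop_add(&lo, idx);
}

static int __init demo_loop_init(void)
{
	/* On-demand creation: no minor range left to compute, which is
	 * why the 'range' assignments above had no remaining users. */
	return __register_blkdev(LOOP_MAJOR, "loop", loop_probe);
}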