From aff34e192e4eeacfb8b5ffc68e10a240f2c0c6d7 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:27 -0400 Subject: block: Move integrity kobject to struct gendisk The integrity kobject purely exists to support the integrity subdirectory in sysfs and doesn't really have anything to do with the blk_integrity data structure. Move the kobject to struct gendisk where it belongs. Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 19c2e947d4d1..830f9c07d4bb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1472,8 +1472,6 @@ struct blk_integrity { unsigned short tag_size; const char *name; - - struct kobject kobj; }; extern bool blk_integrity_is_initialized(struct gendisk *); -- cgit v1.3-6-gb490 From 0f8087ecdeac921fc4920f1328f55c15080bc6aa Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:33 -0400 Subject: block: Consolidate static integrity profile properties We previously made a complete copy of a device's data integrity profile even though several of the fields inside the blk_integrity struct are pointers to fixed template entries in t10-pi.c. Split the static and per-device portions so that we can reference the template directly. Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Cc: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- block/bio-integrity.c | 8 ++++---- block/blk-integrity.c | 17 ++++++++--------- block/t10-pi.c | 16 ++++------------ drivers/nvdimm/core.c | 11 +++++++---- drivers/nvme/host/pci.c | 8 ++++---- drivers/scsi/sd_dif.c | 31 ++++++++++++++++++------------- drivers/target/target_core_iblock.c | 10 +++++----- include/linux/blkdev.h | 20 +++++++++++--------- include/linux/t10-pi.h | 8 ++++---- 9 files changed, 65 insertions(+), 64 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 14b8faf8b09d..a10ffe19a8dd 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -177,11 +177,11 @@ bool bio_integrity_enabled(struct bio *bio) if (bi == NULL) return false; - if (bio_data_dir(bio) == READ && bi->verify_fn != NULL && + if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL && (bi->flags & BLK_INTEGRITY_VERIFY)) return true; - if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL && + if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL && (bi->flags & BLK_INTEGRITY_GENERATE)) return true; @@ -340,7 +340,7 @@ int bio_integrity_prep(struct bio *bio) /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) - bio_integrity_process(bio, bi->generate_fn); + bio_integrity_process(bio, bi->profile->generate_fn); return 0; } @@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - bio->bi_error = bio_integrity_process(bio, bi->verify_fn); + bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 182bfd2383ea..daf590ab3b46 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -176,10 +176,10 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) return -1; } - if (strcmp(b1->name, b2->name)) { + if (b1->profile != b2->profile) { printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, gd1->disk_name, gd2->disk_name, - b1->name, b2->name); + b1->profile->name, b2->profile->name); return -1; } @@ -275,8 +275,8 @@ static ssize_t integrity_attr_store(struct kobject *kobj, static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) { - if (bi != NULL && bi->name != NULL) - return sprintf(page, "%s\n", bi->name); + if (bi != NULL && bi->profile->name != NULL) + return sprintf(page, "%s\n", bi->profile->name); else return sprintf(page, "none\n"); } @@ -401,7 +401,8 @@ bool blk_integrity_is_initialized(struct gendisk *disk) { struct blk_integrity *bi = blk_get_integrity(disk); - return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); + return (bi && bi->profile->name && strcmp(bi->profile->name, + bi_unsupported_name) != 0); } EXPORT_SYMBOL(blk_integrity_is_initialized); @@ -446,14 +447,12 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) /* Use the provided profile as template */ if (template != NULL) { - bi->name = template->name; - bi->generate_fn = template->generate_fn; - bi->verify_fn = template->verify_fn; + bi->profile = template->profile; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; bi->flags |= template->flags; } else - bi->name = bi_unsupported_name; + bi->profile->name = bi_unsupported_name; disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; diff --git a/block/t10-pi.c b/block/t10-pi.c index 24d6e9715318..2c97912335a9 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -160,38 +160,30 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, 3); } -struct blk_integrity t10_pi_type1_crc = { +struct blk_integrity_profile t10_pi_type1_crc = { .name = "T10-DIF-TYPE1-CRC", .generate_fn = t10_pi_type1_generate_crc, .verify_fn = t10_pi_type1_verify_crc, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type1_crc); -struct blk_integrity t10_pi_type1_ip = { +struct blk_integrity_profile t10_pi_type1_ip = { .name = "T10-DIF-TYPE1-IP", .generate_fn = t10_pi_type1_generate_ip, .verify_fn = t10_pi_type1_verify_ip, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type1_ip); -struct blk_integrity t10_pi_type3_crc = { +struct blk_integrity_profile t10_pi_type3_crc = { .name = "T10-DIF-TYPE3-CRC", .generate_fn = t10_pi_type3_generate_crc, .verify_fn = t10_pi_type3_verify_crc, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type3_crc); -struct blk_integrity t10_pi_type3_ip = { +struct blk_integrity_profile t10_pi_type3_ip = { .name = "T10-DIF-TYPE3-IP", .generate_fn = t10_pi_type3_generate_ip, .verify_fn = t10_pi_type3_verify_ip, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type3_ip); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index cb62ec6a12d0..7df89b547ae1 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -399,19 +399,22 @@ static int nd_pi_nop_generate_verify(struct blk_integrity_iter *iter) int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) { - struct blk_integrity integrity = { + struct blk_integrity bi; + static struct blk_integrity_profile profile = { .name = "ND-PI-NOP", .generate_fn = nd_pi_nop_generate_verify, .verify_fn = nd_pi_nop_generate_verify, - .tuple_size = meta_size, - .tag_size = meta_size, }; int ret; if (meta_size == 0) return 0; - ret = blk_integrity_register(disk, &integrity); + bi.profile = &profile; + bi.tuple_size = meta_size; + bi.tag_size = meta_size; + + ret = blk_integrity_register(disk, &bi); if (ret) return ret; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 22d83752ae87..04e3d60a1e45 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -558,7 +558,7 @@ static int nvme_noop_generate(struct blk_integrity_iter *iter) return 0; } -struct blk_integrity nvme_meta_noop = { +struct blk_integrity_profile nvme_meta_noop = { .name = "NVME_META_NOOP", .generate_fn = nvme_noop_generate, .verify_fn = nvme_noop_verify, @@ -570,14 +570,14 @@ static void nvme_init_integrity(struct nvme_ns *ns) switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: - integrity = t10_pi_type3_crc; + integrity.profile = &t10_pi_type3_crc; break; case NVME_NS_DPS_PI_TYPE1: case NVME_NS_DPS_PI_TYPE2: - integrity = t10_pi_type1_crc; + integrity.profile = &t10_pi_type1_crc; break; default: - integrity = nvme_meta_noop; + integrity.profile = &nvme_meta_noop; break; } integrity.tuple_size = ns->ms; diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 5c06d292b94c..987bf392c336 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -43,6 +43,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp) struct scsi_device *sdp = sdkp->device; struct gendisk *disk = sdkp->disk; u8 type = sdkp->protection_type; + struct blk_integrity bi; int dif, dix; dif = scsi_host_dif_capable(sdp->host, type); @@ -55,39 +56,43 @@ void sd_dif_config_host(struct scsi_disk *sdkp) if (!dix) return; + memset(&bi, 0, sizeof(bi)); + /* Enable DMA of protection information */ if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) { if (type == SD_DIF_TYPE3_PROTECTION) - blk_integrity_register(disk, &t10_pi_type3_ip); + bi.profile = &t10_pi_type3_ip; else - blk_integrity_register(disk, &t10_pi_type1_ip); + bi.profile = &t10_pi_type1_ip; - disk->integrity->flags |= BLK_INTEGRITY_IP_CHECKSUM; + bi.flags |= BLK_INTEGRITY_IP_CHECKSUM; } else if (type == SD_DIF_TYPE3_PROTECTION) - blk_integrity_register(disk, &t10_pi_type3_crc); + bi.profile = &t10_pi_type3_crc; else - blk_integrity_register(disk, &t10_pi_type1_crc); + bi.profile = &t10_pi_type1_crc; + bi.tuple_size = sizeof(struct t10_pi_tuple); sd_printk(KERN_NOTICE, sdkp, - "Enabling DIX %s protection\n", disk->integrity->name); + "Enabling DIX %s protection\n", bi.profile->name); - /* Signal to block layer that we support sector tagging */ if (dif && type) { - - disk->integrity->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; if (!sdkp->ATO) - return; + goto out; if (type == SD_DIF_TYPE3_PROTECTION) - disk->integrity->tag_size = sizeof(u16) + sizeof(u32); + bi.tag_size = sizeof(u16) + sizeof(u32); else - disk->integrity->tag_size = sizeof(u16); + bi.tag_size = sizeof(u16); sd_printk(KERN_NOTICE, sdkp, "DIF application tag size %u\n", - disk->integrity->tag_size); + bi.tag_size); } + +out: + blk_integrity_register(disk, &bi); } /* diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 0f19e11acac2..f29c69120054 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -155,17 +155,17 @@ static int iblock_configure_device(struct se_device *dev) if (bi) { struct bio_set *bs = ib_dev->ibd_bio_set; - if (!strcmp(bi->name, "T10-DIF-TYPE3-IP") || - !strcmp(bi->name, "T10-DIF-TYPE1-IP")) { + if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-IP") || + !strcmp(bi->profile->name, "T10-DIF-TYPE1-IP")) { pr_err("IBLOCK export of blk_integrity: %s not" - " supported\n", bi->name); + " supported\n", bi->profile->name); ret = -ENOSYS; goto out_blkdev_put; } - if (!strcmp(bi->name, "T10-DIF-TYPE3-CRC")) { + if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-CRC")) { dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE3_PROT; - } else if (!strcmp(bi->name, "T10-DIF-TYPE1-CRC")) { + } else if (!strcmp(bi->profile->name, "T10-DIF-TYPE1-CRC")) { dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE1_PROT; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 830f9c07d4bb..f36c6476f1c7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,16 +1462,18 @@ struct blk_integrity_iter { typedef int (integrity_processing_fn) (struct blk_integrity_iter *); -struct blk_integrity { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - - unsigned short flags; - unsigned short tuple_size; - unsigned short interval; - unsigned short tag_size; +struct blk_integrity_profile { + integrity_processing_fn *generate_fn; + integrity_processing_fn *verify_fn; + const char *name; +}; - const char *name; +struct blk_integrity { + struct blk_integrity_profile *profile; + unsigned short flags; + unsigned short tuple_size; + unsigned short interval; + unsigned short tag_size; }; extern bool blk_integrity_is_initialized(struct gendisk *); diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 6a8b9942632d..dd8de82cf5b5 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -14,9 +14,9 @@ struct t10_pi_tuple { }; -extern struct blk_integrity t10_pi_type1_crc; -extern struct blk_integrity t10_pi_type1_ip; -extern struct blk_integrity t10_pi_type3_crc; -extern struct blk_integrity t10_pi_type3_ip; +extern struct blk_integrity_profile t10_pi_type1_crc; +extern struct blk_integrity_profile t10_pi_type1_ip; +extern struct blk_integrity_profile t10_pi_type3_crc; +extern struct blk_integrity_profile t10_pi_type3_ip; #endif -- cgit v1.3-6-gb490 From a48f041d91bf1aee599fa2adb53b780ed20c2ee5 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:38 -0400 Subject: block: Reduce the size of struct blk_integrity The per-device properties in the blk_integrity structure were previously unsigned short. However, most of the values fit inside a char. The only exception is the data interval size and we can work around that by storing it as a power of two. This cuts the size of the dynamic portion of blk_integrity in half. Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- block/blk-integrity.c | 6 +++--- include/linux/blkdev.h | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index a10ffe19a8dd..6a90eca9cea1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -202,7 +202,7 @@ EXPORT_SYMBOL(bio_integrity_enabled); static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { - return sectors >> (ilog2(bi->interval) - 9); + return sectors >> (bi->interval_exp - 9); } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, @@ -229,7 +229,7 @@ static int bio_integrity_process(struct bio *bio, bip->bip_vec->bv_offset; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = bi->interval; + iter.interval = 1 << bi->interval_exp; iter.seed = bip_get_seed(bip); iter.prot_buf = prot_buf; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index daf590ab3b46..c7508654faff 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -155,10 +155,10 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) if (!b1 || !b2) return -1; - if (b1->interval != b2->interval) { + if (b1->interval_exp != b2->interval_exp) { pr_err("%s: %s/%s protection interval %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, - b1->interval, b2->interval); + 1 << b1->interval_exp, 1 << b2->interval_exp); return -1; } @@ -440,7 +440,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE; - bi->interval = queue_logical_block_size(disk->queue); + bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); disk->integrity = bi; } else bi = disk->integrity; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f36c6476f1c7..4f1968f15e30 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1470,10 +1470,10 @@ struct blk_integrity_profile { struct blk_integrity { struct blk_integrity_profile *profile; - unsigned short flags; - unsigned short tuple_size; - unsigned short interval; - unsigned short tag_size; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; extern bool blk_integrity_is_initialized(struct gendisk *); -- cgit v1.3-6-gb490 From 25520d55cdb6ee289abc68f553d364d22478ff54 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:49 -0400 Subject: block: Inline blk_integrity in struct gendisk Up until now the_integrity profile has been dynamically allocated and attached to struct gendisk after the disk has been made active. This causes problems because NVMe devices need to register the profile prior to the partition table being read due to a mandatory metadata buffer requirement. In addition, DM goes through hoops to deal with preallocating, but not initializing integrity profiles. Since the integrity profile is small (4 bytes + a pointer), Christoph suggested moving it to struct gendisk proper. This requires several changes: - Moving the blk_integrity definition to genhd.h. - Inlining blk_integrity in struct gendisk. - Removing the dynamic allocation code. - Adding helper functions which allow gendisk to set up and tear down the integrity sysfs dir when a disk is added/deleted. - Adding a blk_integrity_revalidate() callback for updating the stable pages bdi setting. - The calls that depend on whether a device has an integrity profile or not now key off of the bi->profile pointer. - Simplifying the integrity support routines in DM (Mike Snitzer). Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Mike Snitzer Cc: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- block/blk-integrity.c | 160 +++++++++++++++++----------------------------- block/genhd.c | 2 + block/partition-generic.c | 1 + drivers/md/dm-table.c | 88 +++++++++++++------------ drivers/md/md.c | 9 +-- drivers/nvdimm/core.c | 6 +- drivers/nvme/host/pci.c | 5 +- fs/block_dev.c | 2 +- include/linux/blkdev.h | 34 ++++------ include/linux/genhd.h | 26 +++++++- 10 files changed, 152 insertions(+), 181 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 7a96f57ed195..4615a3386798 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -30,10 +30,6 @@ #include "blk.h" -static struct kmem_cache *integrity_cachep; - -static const char *bi_unsupported_name = "unsupported"; - /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue @@ -146,13 +142,13 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); */ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) { - struct blk_integrity *b1 = gd1->integrity; - struct blk_integrity *b2 = gd2->integrity; + struct blk_integrity *b1 = &gd1->integrity; + struct blk_integrity *b2 = &gd2->integrity; - if (!b1 && !b2) + if (!b1->profile && !b2->profile) return 0; - if (!b1 || !b2) + if (!b1->profile || !b2->profile) return -1; if (b1->interval_exp != b2->interval_exp) { @@ -163,21 +159,21 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } if (b1->tuple_size != b2->tuple_size) { - printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tuple_size, b2->tuple_size); return -1; } if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + pr_err("%s: %s/%s tag sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tag_size, b2->tag_size); return -1; } if (b1->profile != b2->profile) { - printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + pr_err("%s: %s/%s type %s != %s\n", __func__, gd1->disk_name, gd2->disk_name, b1->profile->name, b2->profile->name); return -1; @@ -250,7 +246,7 @@ static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = blk_get_integrity(disk); + struct blk_integrity *bi = &disk->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); @@ -262,7 +258,7 @@ static ssize_t integrity_attr_store(struct kobject *kobj, size_t count) { struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = blk_get_integrity(disk); + struct blk_integrity *bi = &disk->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); ssize_t ret = 0; @@ -275,7 +271,7 @@ static ssize_t integrity_attr_store(struct kobject *kobj, static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) { - if (bi != NULL && bi->profile->name != NULL) + if (bi->profile && bi->profile->name) return sprintf(page, "%s\n", bi->profile->name); else return sprintf(page, "none\n"); @@ -283,18 +279,13 @@ static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) { - if (bi != NULL) - return sprintf(page, "%u\n", bi->tag_size); - else - return sprintf(page, "0\n"); + return sprintf(page, "%u\n", bi->tag_size); } static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) { - if (bi != NULL) - return sprintf(page, "%u\n", 1 << bi->interval_exp); - else - return sprintf(page, "0\n"); + return sprintf(page, "%u\n", + bi->interval_exp ? 1 << bi->interval_exp : 0); } static ssize_t integrity_verify_store(struct blk_integrity *bi, @@ -388,113 +379,78 @@ static const struct sysfs_ops integrity_ops = { .store = &integrity_attr_store, }; -static int __init blk_dev_integrity_init(void) -{ - integrity_cachep = kmem_cache_create("blkdev_integrity", - sizeof(struct blk_integrity), - 0, SLAB_PANIC, NULL); - return 0; -} -subsys_initcall(blk_dev_integrity_init); - -static void blk_integrity_release(struct kobject *kobj) -{ - struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = blk_get_integrity(disk); - - kmem_cache_free(integrity_cachep, bi); -} - static struct kobj_type integrity_ktype = { .default_attrs = integrity_attrs, .sysfs_ops = &integrity_ops, - .release = blk_integrity_release, }; -bool blk_integrity_is_initialized(struct gendisk *disk) -{ - struct blk_integrity *bi = blk_get_integrity(disk); - - return (bi && bi->profile->name && strcmp(bi->profile->name, - bi_unsupported_name) != 0); -} -EXPORT_SYMBOL(blk_integrity_is_initialized); - /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware - * @template: optional integrity profile to register + * @template: block integrity profile to register * - * Description: When a device needs to advertise itself as being able - * to send/receive integrity metadata it must use this function to - * register the capability with the block layer. The template is a - * blk_integrity struct with values appropriate for the underlying - * hardware. If template is NULL the new profile is allocated but - * not filled out. See Documentation/block/data-integrity.txt. + * Description: When a device needs to advertise itself as being able to + * send/receive integrity metadata it must use this function to register + * the capability with the block layer. The template is a blk_integrity + * struct with values appropriate for the underlying hardware. See + * Documentation/block/data-integrity.txt. */ -int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { - struct blk_integrity *bi; - - BUG_ON(disk == NULL); + struct blk_integrity *bi = &disk->integrity; - if (disk->integrity == NULL) { - bi = kmem_cache_alloc(integrity_cachep, - GFP_KERNEL | __GFP_ZERO); - if (!bi) - return -1; + bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | + template->flags; + bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); + bi->profile = template->profile; + bi->tuple_size = template->tuple_size; + bi->tag_size = template->tag_size; - if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, - "%s", "integrity")) { - kmem_cache_free(integrity_cachep, bi); - return -1; - } - - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); - - bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE; - bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); - disk->integrity = bi; - } else - bi = disk->integrity; - - /* Use the provided profile as template */ - if (template != NULL) { - bi->profile = template->profile; - bi->tuple_size = template->tuple_size; - bi->tag_size = template->tag_size; - bi->flags |= template->flags; - } else - bi->profile->name = bi_unsupported_name; - - disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; - - return 0; + blk_integrity_revalidate(disk); } EXPORT_SYMBOL(blk_integrity_register); /** - * blk_integrity_unregister - Remove block integrity profile - * @disk: disk whose integrity profile to deallocate + * blk_integrity_unregister - Unregister block integrity profile + * @disk: disk whose integrity profile to unregister * - * Description: This function frees all memory used by the block - * integrity profile. To be called at device teardown. + * Description: This function unregisters the integrity capability from + * a block device. */ void blk_integrity_unregister(struct gendisk *disk) { - struct blk_integrity *bi; + blk_integrity_revalidate(disk); + memset(&disk->integrity, 0, sizeof(struct blk_integrity)); +} +EXPORT_SYMBOL(blk_integrity_unregister); - if (!disk || !disk->integrity) +void blk_integrity_revalidate(struct gendisk *disk) +{ + struct blk_integrity *bi = &disk->integrity; + + if (!(disk->flags & GENHD_FL_UP)) return; - disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES; + if (bi->profile) + disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + disk->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; +} - bi = disk->integrity; +void blk_integrity_add(struct gendisk *disk) +{ + if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity")) + return; + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); +} + +void blk_integrity_del(struct gendisk *disk) +{ kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE); kobject_del(&disk->integrity_kobj); kobject_put(&disk->integrity_kobj); - disk->integrity = NULL; } -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/genhd.c b/block/genhd.c index 0c706f33a599..e5cafa51567c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -630,6 +630,7 @@ void add_disk(struct gendisk *disk) WARN_ON(retval); disk_add_events(disk); + blk_integrity_add(disk); } EXPORT_SYMBOL(add_disk); @@ -638,6 +639,7 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + blk_integrity_del(disk); disk_del_events(disk); /* invalidate stuff */ diff --git a/block/partition-generic.c b/block/partition-generic.c index e7711133284e..3b030157ec85 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -428,6 +428,7 @@ rescan: if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); + blk_integrity_revalidate(disk); check_disk_size_change(disk, bdev); bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e76ed003769e..061152a43730 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1014,15 +1014,16 @@ static int dm_table_build_index(struct dm_table *t) return r; } +static bool integrity_profile_exists(struct gendisk *disk) +{ + return !!blk_get_integrity(disk); +} + /* * Get a disk whose integrity profile reflects the table's profile. - * If %match_all is true, all devices' profiles must match. - * If %match_all is false, all devices must at least have an - * allocated integrity profile; but uninitialized is ok. * Returns NULL if integrity support was inconsistent or unavailable. */ -static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, - bool match_all) +static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) { struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; @@ -1030,10 +1031,8 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, list_for_each_entry(dd, devices, list) { template_disk = dd->dm_dev->bdev->bd_disk; - if (!blk_get_integrity(template_disk)) + if (!integrity_profile_exists(template_disk)) goto no_integrity; - if (!match_all && !blk_integrity_is_initialized(template_disk)) - continue; /* skip uninitialized profiles */ else if (prev_disk && blk_integrity_compare(prev_disk, template_disk) < 0) goto no_integrity; @@ -1052,34 +1051,40 @@ no_integrity: } /* - * Register the mapped device for blk_integrity support if - * the underlying devices have an integrity profile. But all devices - * may not have matching profiles (checking all devices isn't reliable + * Register the mapped device for blk_integrity support if the + * underlying devices have an integrity profile. But all devices may + * not have matching profiles (checking all devices isn't reliable * during table load because this table may use other DM device(s) which - * must be resumed before they will have an initialized integity profile). - * Stacked DM devices force a 2 stage integrity profile validation: - * 1 - during load, validate all initialized integrity profiles match - * 2 - during resume, validate all integrity profiles match + * must be resumed before they will have an initialized integity + * profile). Consequently, stacked DM devices force a 2 stage integrity + * profile validation: First pass during table load, final pass during + * resume. */ -static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) +static int dm_table_register_integrity(struct dm_table *t) { + struct mapped_device *md = t->md; struct gendisk *template_disk = NULL; - template_disk = dm_table_get_integrity_disk(t, false); + template_disk = dm_table_get_integrity_disk(t); if (!template_disk) return 0; - if (!blk_integrity_is_initialized(dm_disk(md))) { + if (!integrity_profile_exists(dm_disk(md))) { t->integrity_supported = 1; - return blk_integrity_register(dm_disk(md), NULL); + /* + * Register integrity profile during table load; we can do + * this because the final profile must match during resume. + */ + blk_integrity_register(dm_disk(md), + blk_get_integrity(template_disk)); + return 0; } /* - * If DM device already has an initalized integrity + * If DM device already has an initialized integrity * profile the new profile should not conflict. */ - if (blk_integrity_is_initialized(template_disk) && - blk_integrity_compare(dm_disk(md), template_disk) < 0) { + if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { DMWARN("%s: conflict with existing integrity profile: " "%s profile mismatch", dm_device_name(t->md), @@ -1087,7 +1092,7 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device return 1; } - /* Preserve existing initialized integrity profile */ + /* Preserve existing integrity profile */ t->integrity_supported = 1; return 0; } @@ -1112,7 +1117,7 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_prealloc_integrity(t, t->md); + r = dm_table_register_integrity(t); if (r) { DMERR("could not register integrity profile."); return r; @@ -1278,29 +1283,30 @@ combine_limits: } /* - * Set the integrity profile for this device if all devices used have - * matching profiles. We're quite deep in the resume path but still - * don't know if all devices (particularly DM devices this device - * may be stacked on) have matching profiles. Even if the profiles - * don't match we have no way to fail (to resume) at this point. + * Verify that all devices have an integrity profile that matches the + * DM device's registered integrity profile. If the profiles don't + * match then unregister the DM device's integrity profile. */ -static void dm_table_set_integrity(struct dm_table *t) +static void dm_table_verify_integrity(struct dm_table *t) { struct gendisk *template_disk = NULL; - if (!blk_get_integrity(dm_disk(t->md))) - return; + if (t->integrity_supported) { + /* + * Verify that the original integrity profile + * matches all the devices in this table. + */ + template_disk = dm_table_get_integrity_disk(t); + if (template_disk && + blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) + return; + } - template_disk = dm_table_get_integrity_disk(t, true); - if (template_disk) - blk_integrity_register(dm_disk(t->md), - blk_get_integrity(template_disk)); - else if (blk_integrity_is_initialized(dm_disk(t->md))) - DMWARN("%s: device no longer has a valid integrity profile", - dm_device_name(t->md)); - else + if (integrity_profile_exists(dm_disk(t->md))) { DMWARN("%s: unable to establish an integrity profile", dm_device_name(t->md)); + blk_integrity_unregister(dm_disk(t->md)); + } } static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, @@ -1500,7 +1506,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); - dm_table_set_integrity(t); + dm_table_verify_integrity(t); /* * Determine whether or not this queue's I/O timings contribute diff --git a/drivers/md/md.c b/drivers/md/md.c index c702de18207a..2af9d590e1a0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1962,12 +1962,9 @@ int md_integrity_register(struct mddev *mddev) * All component devices are integrity capable and have matching * profiles, register the common profile for the md device. */ - if (blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)) != 0) { - printk(KERN_ERR "md: failed to register integrity for %s\n", - mdname(mddev)); - return -EINVAL; - } + blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)); + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { printk(KERN_ERR "md: failed to create integrity pool for %s\n", diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 7df89b547ae1..e85848caf8d2 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -405,7 +405,6 @@ int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) .generate_fn = nd_pi_nop_generate_verify, .verify_fn = nd_pi_nop_generate_verify, }; - int ret; if (meta_size == 0) return 0; @@ -414,10 +413,7 @@ int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) bi.tuple_size = meta_size; bi.tag_size = meta_size; - ret = blk_integrity_register(disk, &bi); - if (ret) - return ret; - + blk_integrity_register(disk, &bi); blk_queue_max_integrity_segments(disk->queue, 1); return 0; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 04e3d60a1e45..8d2aeaaa3895 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -538,7 +538,7 @@ static void nvme_dif_remap(struct request *req, virt = bip_get_seed(bip); phys = nvme_block_nr(ns, blk_rq_pos(req)); nlb = (blk_rq_bytes(req) >> ns->lba_shift); - ts = ns->disk->integrity->tuple_size; + ts = ns->disk->integrity.tuple_size; for (i = 0; i < nlb; i++, virt++, phys++) { pi = (struct t10_pi_tuple *)p; @@ -2044,8 +2044,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) ns->pi_type = pi_type; blk_queue_logical_block_size(ns->queue, bs); - if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) && - !ns->ext) + if (ns->ms && !ns->ext) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 073bb57adab1..0a793c7930eb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1075,7 +1075,7 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - + blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4f1968f15e30..60669c20190f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1468,16 +1468,7 @@ struct blk_integrity_profile { const char *name; }; -struct blk_integrity { - struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; -}; - -extern bool blk_integrity_is_initialized(struct gendisk *); -extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); +extern void blk_integrity_register(struct gendisk *, struct blk_integrity *); extern void blk_integrity_unregister(struct gendisk *); extern int blk_integrity_compare(struct gendisk *, struct gendisk *); extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, @@ -1488,15 +1479,20 @@ extern bool blk_integrity_merge_rq(struct request_queue *, struct request *, extern bool blk_integrity_merge_bio(struct request_queue *, struct request *, struct bio *); -static inline -struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { - return bdev->bd_disk->integrity; + struct blk_integrity *bi = &disk->integrity; + + if (!bi->profile) + return NULL; + + return bi; } -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +static inline +struct blk_integrity *bdev_get_integrity(struct block_device *bdev) { - return disk->integrity; + return blk_get_integrity(bdev->bd_disk); } static inline bool blk_integrity_rq(struct request *rq) @@ -1570,10 +1566,9 @@ static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) { return 0; } -static inline int blk_integrity_register(struct gendisk *d, +static inline void blk_integrity_register(struct gendisk *d, struct blk_integrity *b) { - return 0; } static inline void blk_integrity_unregister(struct gendisk *d) { @@ -1598,10 +1593,7 @@ static inline bool blk_integrity_merge_bio(struct request_queue *rq, { return true; } -static inline bool blk_integrity_is_initialized(struct gendisk *g) -{ - return 0; -} + static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9e6e0dfa97ad..82f4911e0ad8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -163,6 +163,18 @@ struct disk_part_tbl { struct disk_events; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +struct blk_integrity { + struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; +}; + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + struct gendisk { /* major, first_minor and minors are input parameters only, * don't use directly. Use disk_devt() and disk_max_parts(). @@ -198,9 +210,9 @@ struct gendisk { atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_INTEGRITY - struct blk_integrity *integrity; + struct blk_integrity integrity; struct kobject integrity_kobj; -#endif +#endif /* CONFIG_BLK_DEV_INTEGRITY */ int node_id; }; @@ -728,6 +740,16 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #endif } +#if defined(CONFIG_BLK_DEV_INTEGRITY) +extern void blk_integrity_add(struct gendisk *); +extern void blk_integrity_del(struct gendisk *); +extern void blk_integrity_revalidate(struct gendisk *); +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline void blk_integrity_add(struct gendisk *disk) { } +static inline void blk_integrity_del(struct gendisk *disk) { } +static inline void blk_integrity_revalidate(struct gendisk *disk) { } +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -- cgit v1.3-6-gb490 From 3ef28e83ab15799742e55fd13243a5f678b04242 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Oct 2015 13:20:12 -0400 Subject: block: generic request_queue reference counting Allow pmem, and other synchronous/bio-based block drivers, to fallback on a per-cpu reference count managed by the core for tracking queue live/dead state. The existing per-cpu reference count for the blk_mq case is promoted to be used in all block i/o scenarios. This involves initializing it by default, waiting for it to drop to zero at exit, and holding a live reference over the invocation of q->make_request_fn() in generic_make_request(). The blk_mq code continues to take its own reference per blk_mq request and retains the ability to freeze the queue, but the check that the queue is frozen is moved to generic_make_request(). This fixes crash signatures like the following: BUG: unable to handle kernel paging request at ffff880140000000 [..] Call Trace: [] ? copy_user_handle_tail+0x5f/0x70 [] pmem_do_bvec.isra.11+0x70/0xf0 [nd_pmem] [] pmem_make_request+0xd1/0x200 [nd_pmem] [] ? mempool_alloc+0x72/0x1a0 [] generic_make_request+0xd6/0x110 [] submit_bio+0x76/0x170 [] submit_bh_wbc+0x12f/0x160 [] submit_bh+0x12/0x20 [] jbd2_write_superblock+0x8d/0x170 [] jbd2_mark_journal_empty+0x5d/0x90 [] jbd2_journal_destroy+0x24b/0x270 [] ? put_pwq_unlocked+0x2a/0x30 [] ? destroy_workqueue+0x225/0x250 [] ext4_put_super+0x64/0x360 [] generic_shutdown_super+0x6a/0xf0 Cc: Jens Axboe Cc: Keith Busch Cc: Ross Zwisler Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Tested-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- block/blk-core.c | 71 ++++++++++++++++++++++++++++++++++++++------ block/blk-mq-sysfs.c | 6 ---- block/blk-mq.c | 80 +++++++++++++++----------------------------------- block/blk-sysfs.c | 3 +- block/blk.h | 14 +++++++++ include/linux/blk-mq.h | 1 - include/linux/blkdev.h | 2 +- 7 files changed, 102 insertions(+), 75 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 2eb722d48773..9b4d735cb5b8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -554,13 +554,10 @@ void blk_cleanup_queue(struct request_queue *q) * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that q->request_fn() gets invoked after draining finished. */ - if (q->mq_ops) { - blk_mq_freeze_queue(q); - spin_lock_irq(lock); - } else { - spin_lock_irq(lock); + blk_freeze_queue(q); + spin_lock_irq(lock); + if (!q->mq_ops) __blk_drain_queue(q, true); - } queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); @@ -570,6 +567,7 @@ void blk_cleanup_queue(struct request_queue *q) if (q->mq_ops) blk_mq_free_queue(q); + percpu_ref_exit(&q->q_usage_counter); spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) @@ -629,6 +627,40 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); +int blk_queue_enter(struct request_queue *q, gfp_t gfp) +{ + while (true) { + int ret; + + if (percpu_ref_tryget_live(&q->q_usage_counter)) + return 0; + + if (!(gfp & __GFP_WAIT)) + return -EBUSY; + + ret = wait_event_interruptible(q->mq_freeze_wq, + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); + if (blk_queue_dying(q)) + return -ENODEV; + if (ret) + return ret; + } +} + +void blk_queue_exit(struct request_queue *q) +{ + percpu_ref_put(&q->q_usage_counter); +} + +static void blk_queue_usage_counter_release(struct percpu_ref *ref) +{ + struct request_queue *q = + container_of(ref, struct request_queue, q_usage_counter); + + wake_up_all(&q->mq_freeze_wq); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -690,11 +722,22 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_waitqueue_head(&q->mq_freeze_wq); - if (blkcg_init_queue(q)) + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ + if (percpu_ref_init(&q->q_usage_counter, + blk_queue_usage_counter_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; + if (blkcg_init_queue(q)) + goto fail_ref; + return q; +fail_ref: + percpu_ref_exit(&q->q_usage_counter); fail_bdi: bdi_destroy(&q->backing_dev_info); fail_split: @@ -1966,9 +2009,19 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - q->make_request_fn(q, bio); + if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) { + + q->make_request_fn(q, bio); + + blk_queue_exit(q); - bio = bio_list_pop(current->bio_list); + bio = bio_list_pop(current->bio_list); + } else { + struct bio *bio_next = bio_list_pop(current->bio_list); + + bio_io_error(bio); + bio = bio_next; + } } while (bio); current->bio_list = NULL; /* deactivate */ } diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 788fffd9b409..6f57a110289c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -413,12 +413,6 @@ static void blk_mq_sysfs_init(struct request_queue *q) kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } -/* see blk_register_queue() */ -void blk_mq_finish_init(struct request_queue *q) -{ - percpu_ref_switch_to_percpu(&q->mq_usage_counter); -} - int blk_mq_register_disk(struct gendisk *disk) { struct device *dev = disk_to_dev(disk); diff --git a/block/blk-mq.c b/block/blk-mq.c index d921cd5177f5..6c240712553a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -78,47 +78,13 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) -{ - while (true) { - int ret; - - if (percpu_ref_tryget_live(&q->mq_usage_counter)) - return 0; - - if (!(gfp & __GFP_WAIT)) - return -EBUSY; - - ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || - blk_queue_dying(q)); - if (blk_queue_dying(q)) - return -ENODEV; - if (ret) - return ret; - } -} - -static void blk_mq_queue_exit(struct request_queue *q) -{ - percpu_ref_put(&q->mq_usage_counter); -} - -static void blk_mq_usage_counter_release(struct percpu_ref *ref) -{ - struct request_queue *q = - container_of(ref, struct request_queue, mq_usage_counter); - - wake_up_all(&q->mq_freeze_wq); -} - void blk_mq_freeze_queue_start(struct request_queue *q) { int freeze_depth; freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { - percpu_ref_kill(&q->mq_usage_counter); + percpu_ref_kill(&q->q_usage_counter); blk_mq_run_hw_queues(q, false); } } @@ -126,18 +92,34 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); static void blk_mq_freeze_queue_wait(struct request_queue *q) { - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. */ -void blk_mq_freeze_queue(struct request_queue *q) +void blk_freeze_queue(struct request_queue *q) { + /* + * In the !blk_mq case we are only calling this to kill the + * q_usage_counter, otherwise this increases the freeze depth + * and waits for it to return to zero. For this reason there is + * no blk_unfreeze_queue(), and blk_freeze_queue() is not + * exported to drivers as the only user for unfreeze is blk_mq. + */ blk_mq_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } + +void blk_mq_freeze_queue(struct request_queue *q) +{ + /* + * ...just an alias to keep freeze and unfreeze actions balanced + * in the blk_mq_* namespace + */ + blk_freeze_queue(q); +} EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) @@ -147,7 +129,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q) freeze_depth = atomic_dec_return(&q->mq_freeze_depth); WARN_ON_ONCE(freeze_depth < 0); if (!freeze_depth) { - percpu_ref_reinit(&q->mq_usage_counter); + percpu_ref_reinit(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } } @@ -256,7 +238,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_mq_queue_enter(q, gfp); + ret = blk_queue_enter(q, gfp); if (ret) return ERR_PTR(ret); @@ -279,7 +261,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, } blk_mq_put_ctx(ctx); if (!rq) { - blk_mq_queue_exit(q); + blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } return rq; @@ -298,7 +280,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx, tag, &ctx->last_tag); - blk_mq_queue_exit(q); + blk_queue_exit(q); } void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) @@ -1177,11 +1159,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, int rw = bio_data_dir(bio); struct blk_mq_alloc_data alloc_data; - if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) { - bio_io_error(bio); - return NULL; - } - + blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -2000,14 +1978,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - /* - * Init percpu_ref in atomic mode so that it's faster to shutdown. - * See blk_register_queue() for details. - */ - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto err_hctxs; - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); @@ -2088,8 +2058,6 @@ void blk_mq_free_queue(struct request_queue *q) blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); blk_mq_free_hw_queues(q, set); - - percpu_ref_exit(&q->mq_usage_counter); } /* Basically redo blk_mq_init_queue with queue frozen */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3e44a9da2a13..61fc2633bbea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -599,9 +599,8 @@ int blk_register_queue(struct gendisk *disk) */ if (!blk_queue_init_done(q)) { queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); blk_queue_bypass_end(q); - if (q->mq_ops) - blk_mq_finish_init(q); } ret = blk_trace_init_sysfs(dev); diff --git a/block/blk.h b/block/blk.h index 98614ad37c81..5b2cd393afbe 100644 --- a/block/blk.h +++ b/block/blk.h @@ -72,6 +72,20 @@ void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); +int blk_queue_enter(struct request_queue *q, gfp_t gfp); +void blk_queue_exit(struct request_queue *q); +void blk_freeze_queue(struct request_queue *q); + +static inline void blk_queue_enter_live(struct request_queue *q) +{ + /* + * Given that running in generic_make_request() context + * guarantees that a live reference against q_usage_counter has + * been established, further references under that same context + * need not check that the queue has been frozen (marked dead). + */ + percpu_ref_get(&q->q_usage_counter); +} void blk_rq_timed_out_timer(unsigned long data); unsigned long blk_rq_timeout(unsigned long timeout); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5e7d43ab61c0..83cc9d4e5455 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -166,7 +166,6 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); -void blk_mq_finish_init(struct request_queue *q); int blk_mq_register_disk(struct gendisk *); void blk_mq_unregister_disk(struct gendisk *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60669c20190f..3e0465257d68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -450,7 +450,7 @@ struct request_queue { #endif struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; - struct percpu_ref mq_usage_counter; + struct percpu_ref q_usage_counter; struct list_head all_q_node; struct blk_mq_tag_set *tag_set; -- cgit v1.3-6-gb490 From ac6fc48c9fb7d3220ec4e0be0c29bb314ea75f9f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Oct 2015 13:20:18 -0400 Subject: block: move blk_integrity to request_queue A trace like the following proceeds a crash in bio_integrity_process() when it goes to use an already freed blk_integrity profile. BUG: unable to handle kernel paging request at ffff8800d31b10d8 IP: [] 0xffff8800d31b10d8 PGD 2f65067 PUD 21fffd067 PMD 80000000d30001e3 Oops: 0011 [#1] SMP Dumping ftrace buffer: --------------------------------- ndctl-2222 2.... 44526245us : disk_release: pmem1s systemd--2223 4.... 44573945us : bio_integrity_endio: pmem1s <...>-409 4.... 44574005us : bio_integrity_process: pmem1s --------------------------------- [..] Call Trace: [] ? bio_integrity_process+0x159/0x2d0 [] bio_integrity_verify_fn+0x36/0x60 [] process_one_work+0x1cc/0x4e0 Given that a request_queue is pinned while i/o is in flight and that a gendisk is allowed to have a shorter lifetime, move blk_integrity to request_queue to satisfy requests arriving after the gendisk has been torn down. Cc: Christoph Hellwig Cc: Martin K. Petersen [martin: fix the CONFIG_BLK_DEV_INTEGRITY=n case] Tested-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- block/blk-integrity.c | 14 +++++++------- drivers/nvme/host/pci.c | 2 +- include/linux/blkdev.h | 6 +++++- include/linux/genhd.h | 1 - 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 4615a3386798..5d339ae64d56 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -142,8 +142,8 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); */ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) { - struct blk_integrity *b1 = &gd1->integrity; - struct blk_integrity *b2 = &gd2->integrity; + struct blk_integrity *b1 = &gd1->queue->integrity; + struct blk_integrity *b2 = &gd2->queue->integrity; if (!b1->profile && !b2->profile) return 0; @@ -246,7 +246,7 @@ static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = &disk->integrity; + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); @@ -258,7 +258,7 @@ static ssize_t integrity_attr_store(struct kobject *kobj, size_t count) { struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = &disk->integrity; + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); ssize_t ret = 0; @@ -397,7 +397,7 @@ static struct kobj_type integrity_ktype = { */ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { - struct blk_integrity *bi = &disk->integrity; + struct blk_integrity *bi = &disk->queue->integrity; bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | template->flags; @@ -420,13 +420,13 @@ EXPORT_SYMBOL(blk_integrity_register); void blk_integrity_unregister(struct gendisk *disk) { blk_integrity_revalidate(disk); - memset(&disk->integrity, 0, sizeof(struct blk_integrity)); + memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); } EXPORT_SYMBOL(blk_integrity_unregister); void blk_integrity_revalidate(struct gendisk *disk) { - struct blk_integrity *bi = &disk->integrity; + struct blk_integrity *bi = &disk->queue->integrity; if (!(disk->flags & GENHD_FL_UP)) return; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4757a2c9366e..2fa28680ad0f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -538,7 +538,7 @@ static void nvme_dif_remap(struct request *req, virt = bip_get_seed(bip); phys = nvme_block_nr(ns, blk_rq_pos(req)); nlb = (blk_rq_bytes(req) >> ns->lba_shift); - ts = ns->disk->integrity.tuple_size; + ts = ns->disk->queue->integrity.tuple_size; for (i = 0; i < nlb; i++, virt++, phys++) { pi = (struct t10_pi_tuple *)p; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3e0465257d68..cf57884db4b7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -369,6 +369,10 @@ struct request_queue { */ struct kobject mq_kobj; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity integrity; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #ifdef CONFIG_PM struct device *dev; int rpm_status; @@ -1481,7 +1485,7 @@ extern bool blk_integrity_merge_bio(struct request_queue *, struct request *, static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { - struct blk_integrity *bi = &disk->integrity; + struct blk_integrity *bi = &disk->queue->integrity; if (!bi->profile) return NULL; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 82f4911e0ad8..847cc1d91634 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -210,7 +210,6 @@ struct gendisk { atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_INTEGRITY - struct blk_integrity integrity; struct kobject integrity_kobj; #endif /* CONFIG_BLK_DEV_INTEGRITY */ int node_id; -- cgit v1.3-6-gb490 From bbd3e064362e5057cc4799ba2e4d68c7593e490b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Oct 2015 14:10:48 +0200 Subject: block: add an API for Persistent Reservations This commits adds a driver API and ioctls for controlling Persistent Reservations s/genericly/generically/ at the block layer. Persistent Reservations are supported by SCSI and NVMe and allow controlling who gets access to a device in a shared storage setup. Note that we add a pr_ops structure to struct block_device_operations instead of adding the members directly to avoid bloating all instances of devices that will never support Persistent Reservations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/pr.txt | 119 +++++++++++++++++++++++++++++++++++++++++++++ block/ioctl.c | 103 +++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 2 + include/linux/pr.h | 18 +++++++ include/uapi/linux/pr.h | 48 ++++++++++++++++++ 5 files changed, 290 insertions(+) create mode 100644 Documentation/block/pr.txt create mode 100644 include/linux/pr.h create mode 100644 include/uapi/linux/pr.h (limited to 'include/linux/blkdev.h') diff --git a/Documentation/block/pr.txt b/Documentation/block/pr.txt new file mode 100644 index 000000000000..d3eb1ca65051 --- /dev/null +++ b/Documentation/block/pr.txt @@ -0,0 +1,119 @@ + +Block layer support for Persistent Reservations +=============================================== + +The Linux kernel supports a user space interface for simplified +Persistent Reservations which map to block devices that support +these (like SCSI). Persistent Reservations allow restricting +access to block devices to specific initiators in a shared storage +setup. + +This document gives a general overview of the support ioctl commands. +For a more detailed reference please refer the the SCSI Primary +Commands standard, specifically the section on Reservations and the +"PERSISTENT RESERVE IN" and "PERSISTENT RESERVE OUT" commands. + +All implementations are expected to ensure the reservations survive +a power loss and cover all connections in a multi path environment. +These behaviors are optional in SPC but will be automatically applied +by Linux. + + +The following types of reservations are supported: +-------------------------------------------------- + + - PR_WRITE_EXCLUSIVE + + Only the initiator that owns the reservation can write to the + device. Any initiator can read from the device. + + - PR_EXCLUSIVE_ACCESS + + Only the initiator that owns the reservation can access the + device. + + - PR_WRITE_EXCLUSIVE_REG_ONLY + + Only initiators with a registered key can write to the device, + Any initiator can read from the device. + + - PR_EXCLUSIVE_ACCESS_REG_ONLY + + Only initiators with a registered key can access the device. + + - PR_WRITE_EXCLUSIVE_ALL_REGS + + Only initiators with a registered key can write to the device, + Any initiator can read from the device. + All initiators with a registered key are considered reservation + holders. + Please reference the SPC spec on the meaning of a reservation + holder if you want to use this type. + + - PR_EXCLUSIVE_ACCESS_ALL_REGS + + Only initiators with a registered key can access the device. + All initiators with a registered key are considered reservation + holders. + Please reference the SPC spec on the meaning of a reservation + holder if you want to use this type. + + +The following ioctl are supported: +---------------------------------- + +1. IOC_PR_REGISTER + +This ioctl command registers a new reservation if the new_key argument +is non-null. If no existing reservation exists old_key must be zero, +if an existing reservation should be replaced old_key must contain +the old reservation key. + +If the new_key argument is 0 it unregisters the existing reservation passed +in old_key. + + +2. IOC_PR_RESERVE + +This ioctl command reserves the device and thus restricts access for other +devices based on the type argument. The key argument must be the existing +reservation key for the device as acquired by the IOC_PR_REGISTER, +IOC_PR_REGISTER_IGNORE, IOC_PR_PREEMPT or IOC_PR_PREEMPT_ABORT commands. + + +3. IOC_PR_RELEASE + +This ioctl command releases the reservation specified by key and flags +and thus removes any access restriction implied by it. + + +4. IOC_PR_PREEMPT + +This ioctl command releases the existing reservation referred to by +old_key and replaces it with a a new reservation of type for the +reservation key new_key. + + +5. IOC_PR_PREEMPT_ABORT + +This ioctl command works like IOC_PR_PREEMPT except that it also aborts +any outstanding command sent over a connection identified by old_key. + +6. IOC_PR_CLEAR + +This ioctl command unregisters both key and any other reservation key +registered with the device and drops any existing reservation. + + +Flags +----- + +All the ioctls have a flag field. Currently only one flag is supported: + + - PR_FL_IGNORE_KEY + + Ignore the existing reservation key. This is commonly supported for + IOC_PR_REGISTER, and some implementation may support the flag for + IOC_PR_RESERVE. + +For all unknown flags the kernel will return -EOPNOTSUPP. diff --git a/block/ioctl.c b/block/ioctl.c index df62b47d2379..0918aed2d847 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) @@ -295,6 +296,96 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, */ EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); +static int blkdev_pr_register(struct block_device *bdev, + struct pr_registration __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_registration reg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_register) + return -EOPNOTSUPP; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); +} + +static int blkdev_pr_reserve(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_reserve) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); +} + +static int blkdev_pr_release(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_release) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags) + return -EOPNOTSUPP; + return ops->pr_release(bdev, rsv.key, rsv.type); +} + +static int blkdev_pr_preempt(struct block_device *bdev, + struct pr_preempt __user *arg, bool abort) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_preempt p; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_preempt) + return -EOPNOTSUPP; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + + if (p.flags) + return -EOPNOTSUPP; + return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); +} + +static int blkdev_pr_clear(struct block_device *bdev, + struct pr_clear __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_clear c; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_clear) + return -EOPNOTSUPP; + if (copy_from_user(&c, arg, sizeof(c))) + return -EFAULT; + + if (c.flags) + return -EOPNOTSUPP; + return ops->pr_clear(bdev, c.key); +} + /* * Is it an unrecognized ioctl? The correct returns are either * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a @@ -477,6 +568,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case IOC_PR_REGISTER: + return blkdev_pr_register(bdev, argp); + case IOC_PR_RESERVE: + return blkdev_pr_reserve(bdev, argp); + case IOC_PR_RELEASE: + return blkdev_pr_release(bdev, argp); + case IOC_PR_PREEMPT: + return blkdev_pr_preempt(bdev, argp, false); + case IOC_PR_PREEMPT_ABORT: + return blkdev_pr_preempt(bdev, argp, true); + case IOC_PR_CLEAR: + return blkdev_pr_clear(bdev, argp); default: return __blkdev_driver_ioctl(bdev, mode, cmd, arg); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 19c2e947d4d1..fe25da05e823 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -35,6 +35,7 @@ struct sg_io_hdr; struct bsg_job; struct blkcg_gq; struct blk_flush_queue; +struct pr_ops; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -1633,6 +1634,7 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); struct module *owner; + const struct pr_ops *pr_ops; }; extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, diff --git a/include/linux/pr.h b/include/linux/pr.h new file mode 100644 index 000000000000..65c01c10b335 --- /dev/null +++ b/include/linux/pr.h @@ -0,0 +1,18 @@ +#ifndef LINUX_PR_H +#define LINUX_PR_H + +#include + +struct pr_ops { + int (*pr_register)(struct block_device *bdev, u64 old_key, u64 new_key, + u32 flags); + int (*pr_reserve)(struct block_device *bdev, u64 key, + enum pr_type type, u32 flags); + int (*pr_release)(struct block_device *bdev, u64 key, + enum pr_type type); + int (*pr_preempt)(struct block_device *bdev, u64 old_key, u64 new_key, + enum pr_type type, bool abort); + int (*pr_clear)(struct block_device *bdev, u64 key); +}; + +#endif /* LINUX_PR_H */ diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h new file mode 100644 index 000000000000..57d7c0f916b6 --- /dev/null +++ b/include/uapi/linux/pr.h @@ -0,0 +1,48 @@ +#ifndef _UAPI_PR_H +#define _UAPI_PR_H + +enum pr_type { + PR_WRITE_EXCLUSIVE = 1, + PR_EXCLUSIVE_ACCESS = 2, + PR_WRITE_EXCLUSIVE_REG_ONLY = 3, + PR_EXCLUSIVE_ACCESS_REG_ONLY = 4, + PR_WRITE_EXCLUSIVE_ALL_REGS = 5, + PR_EXCLUSIVE_ACCESS_ALL_REGS = 6, +}; + +struct pr_reservation { + __u64 key; + __u32 type; + __u32 flags; +}; + +struct pr_registration { + __u64 old_key; + __u64 new_key; + __u32 flags; + __u32 __pad; +}; + +struct pr_preempt { + __u64 old_key; + __u64 new_key; + __u32 type; + __u32 flags; +}; + +struct pr_clear { + __u64 key; + __u32 flags; + __u32 __pad; +}; + +#define PR_FL_IGNORE_KEY (1 << 0) /* ignore existing key */ + +#define IOC_PR_REGISTER _IOW('p', 200, struct pr_registration) +#define IOC_PR_RESERVE _IOW('p', 201, struct pr_reservation) +#define IOC_PR_RELEASE _IOW('p', 202, struct pr_reservation) +#define IOC_PR_PREEMPT _IOW('p', 203, struct pr_preempt) +#define IOC_PR_PREEMPT_ABORT _IOW('p', 204, struct pr_preempt) +#define IOC_PR_CLEAR _IOW('p', 205, struct pr_clear) + +#endif /* _UAPI_PR_H */ -- cgit v1.3-6-gb490 From dece16353ef47d8d33f5302bc158072a9d65e26f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Nov 2015 10:41:16 -0700 Subject: block: change ->make_request_fn() and users to return a queue cookie No functional changes in this patch, but it prepares us for returning a more useful cookie related to the IO that was queued up. Signed-off-by: Jens Axboe Acked-by: Christoph Hellwig Acked-by: Keith Busch --- arch/m68k/emu/nfblock.c | 3 ++- arch/powerpc/sysdev/axonram.c | 5 +++-- arch/xtensa/platforms/iss/simdisk.c | 3 ++- block/blk-core.c | 26 ++++++++++++++++---------- block/blk-mq.c | 26 ++++++++++++++------------ drivers/block/brd.c | 5 +++-- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_req.c | 3 ++- drivers/block/null_blk.c | 3 ++- drivers/block/pktcdvd.c | 9 ++++----- drivers/block/ps3vram.c | 6 ++++-- drivers/block/rsxx/dev.c | 5 +++-- drivers/block/umem.c | 4 ++-- drivers/block/zram/zram_drv.c | 5 +++-- drivers/lightnvm/rrpc.c | 9 +++++---- drivers/md/bcache/request.c | 11 ++++++++--- drivers/md/dm.c | 6 +++--- drivers/md/md.c | 8 +++++--- drivers/nvdimm/blk.c | 3 ++- drivers/nvdimm/btt.c | 3 ++- drivers/nvdimm/pmem.c | 3 ++- drivers/s390/block/dcssblk.c | 8 +++++--- drivers/s390/block/xpram.c | 5 +++-- drivers/staging/lustre/lustre/llite/lloop.c | 5 +++-- include/linux/blk_types.h | 24 ++++++++++++++++++++++++ include/linux/blkdev.h | 4 ++-- include/linux/fs.h | 2 +- include/linux/lightnvm.h | 2 +- 28 files changed, 127 insertions(+), 71 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index f2a00c591bf7..e9110b9b8bcd 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -59,7 +59,7 @@ struct nfhd_device { struct gendisk *disk; }; -static void nfhd_make_request(struct request_queue *queue, struct bio *bio) +static blk_qc_t nfhd_make_request(struct request_queue *queue, struct bio *bio) { struct nfhd_device *dev = queue->queuedata; struct bio_vec bvec; @@ -77,6 +77,7 @@ static void nfhd_make_request(struct request_queue *queue, struct bio *bio) sec += len; } bio_endio(bio); + return BLK_QC_T_NONE; } static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index d2b79bc336c1..7a399b4d60a0 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -103,7 +103,7 @@ axon_ram_irq_handler(int irq, void *dev) * axon_ram_make_request - make_request() method for block device * @queue, @bio: see blk_queue_make_request() */ -static void +static blk_qc_t axon_ram_make_request(struct request_queue *queue, struct bio *bio) { struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data; @@ -120,7 +120,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) bio_for_each_segment(vec, bio, iter) { if (unlikely(phys_mem + vec.bv_len > phys_end)) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } user_mem = page_address(vec.bv_page) + vec.bv_offset; @@ -133,6 +133,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) transfered += vec.bv_len; } bio_endio(bio); + return BLK_QC_T_NONE; } /** diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index fa84ca990caa..3c3ace2c46b6 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -101,7 +101,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector, spin_unlock(&dev->lock); } -static void simdisk_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) { struct simdisk *dev = q->queuedata; struct bio_vec bvec; @@ -119,6 +119,7 @@ static void simdisk_make_request(struct request_queue *q, struct bio *bio) } bio_endio(bio); + return BLK_QC_T_NONE; } static int simdisk_open(struct block_device *bdev, fmode_t mode) diff --git a/block/blk-core.c b/block/blk-core.c index 89eec7965870..e93df6d386a0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -809,7 +809,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) } EXPORT_SYMBOL(blk_init_queue_node); -static void blk_queue_bio(struct request_queue *q, struct bio *bio); +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, @@ -1678,7 +1678,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -static void blk_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1698,7 +1698,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio->bi_error = -EIO; bio_endio(bio); - return; + return BLK_QC_T_NONE; } if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { @@ -1713,7 +1713,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) */ if (!blk_queue_nomerges(q)) { if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + return BLK_QC_T_NONE; } else request_count = blk_plug_queued_count(q); @@ -1791,6 +1791,8 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } + + return BLK_QC_T_NONE; } /* @@ -1996,12 +1998,13 @@ end_io: * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ -void generic_make_request(struct bio *bio) +blk_qc_t generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) - return; + goto out; /* * We only want one ->make_request_fn to be active at a time, else @@ -2015,7 +2018,7 @@ void generic_make_request(struct bio *bio) */ if (current->bio_list) { bio_list_add(current->bio_list, bio); - return; + goto out; } /* following loop may be a bit non-obvious, and so deserves some @@ -2040,7 +2043,7 @@ void generic_make_request(struct bio *bio) if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) { - q->make_request_fn(q, bio); + ret = q->make_request_fn(q, bio); blk_queue_exit(q); @@ -2053,6 +2056,9 @@ void generic_make_request(struct bio *bio) } } while (bio); current->bio_list = NULL; /* deactivate */ + +out: + return ret; } EXPORT_SYMBOL(generic_make_request); @@ -2066,7 +2072,7 @@ EXPORT_SYMBOL(generic_make_request); * interfaces; @bio must be presetup and ready for I/O. * */ -void submit_bio(int rw, struct bio *bio) +blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; @@ -2100,7 +2106,7 @@ void submit_bio(int rw, struct bio *bio) } } - generic_make_request(bio); + return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c27b3eaef64..65f43bd696a0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1235,7 +1235,7 @@ static int blk_mq_direct_issue_request(struct request *rq) * but will attempt to bypass the hctx queueing if we can go straight to * hardware for SYNC IO. */ -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); @@ -1249,7 +1249,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } blk_queue_split(q, &bio, q->bio_split); @@ -1257,13 +1257,13 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) if (!is_flush_fua && !blk_queue_nomerges(q)) { if (blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) - return; + return BLK_QC_T_NONE; } else request_count = blk_plug_queued_count(q); rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1302,11 +1302,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) old_rq = rq; blk_mq_put_ctx(data.ctx); if (!old_rq) - return; + return BLK_QC_T_NONE; if (!blk_mq_direct_issue_request(old_rq)) - return; + return BLK_QC_T_NONE; blk_mq_insert_request(old_rq, false, true, true); - return; + return BLK_QC_T_NONE; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1320,13 +1320,14 @@ run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } blk_mq_put_ctx(data.ctx); + return BLK_QC_T_NONE; } /* * Single hardware queue variant. This will attempt to use any per-process * plug for merging and IO deferral. */ -static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); @@ -1339,18 +1340,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } blk_queue_split(q, &bio, q->bio_split); if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + return BLK_QC_T_NONE; rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1374,7 +1375,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) } list_add_tail(&rq->queuelist, &plug->mq_list); blk_mq_put_ctx(data.ctx); - return; + return BLK_QC_T_NONE; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1389,6 +1390,7 @@ run_queue: } blk_mq_put_ctx(data.ctx); + return BLK_QC_T_NONE; } /* diff --git a/drivers/block/brd.c b/drivers/block/brd.c index b9794aeeb878..c9f9c30d6467 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -323,7 +323,7 @@ out: return err; } -static void brd_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct brd_device *brd = bdev->bd_disk->private_data; @@ -358,9 +358,10 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) out: bio_endio(bio); - return; + return BLK_QC_T_NONE; io_error: bio_io_error(bio); + return BLK_QC_T_NONE; } static int brd_rw_page(struct block_device *bdev, sector_t sector, diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 015c6e91b756..e66d453a5f2b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1448,7 +1448,7 @@ extern int proc_details; /* drbd_req */ extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long); -extern void drbd_make_request(struct request_queue *q, struct bio *bio); +extern blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); extern int is_valid_ar_handle(struct drbd_request *, sector_t); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 211592682169..3ae2c0086563 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1494,7 +1494,7 @@ void do_submit(struct work_struct *ws) } } -void drbd_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) { struct drbd_device *device = (struct drbd_device *) q->queuedata; unsigned long start_jif; @@ -1510,6 +1510,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) inc_ap_bio(device); __drbd_make_request(device, bio, start_jif); + return BLK_QC_T_NONE; } void request_timer_fn(unsigned long data) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 1c9e4fe5aa44..6255d1c4bba4 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -321,7 +321,7 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb) return &nullb->queues[index]; } -static void null_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) { struct nullb *nullb = q->queuedata; struct nullb_queue *nq = nullb_to_queue(nullb); @@ -331,6 +331,7 @@ static void null_queue_bio(struct request_queue *q, struct bio *bio) cmd->bio = bio; null_handle_cmd(cmd); + return BLK_QC_T_NONE; } static int null_rq_prep_fn(struct request_queue *q, struct request *req) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7be2375db7f2..a7f4abcedee1 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2441,7 +2441,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) } } -static void pkt_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) { struct pktcdvd_device *pd; char b[BDEVNAME_SIZE]; @@ -2467,7 +2467,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) */ if (bio_data_dir(bio) == READ) { pkt_make_request_read(pd, bio); - return; + return BLK_QC_T_NONE; } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { @@ -2499,13 +2499,12 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) pkt_make_request_write(q, split); } while (split != bio); - return; + return BLK_QC_T_NONE; end_io: bio_io_error(bio); + return BLK_QC_T_NONE; } - - static void pkt_init_queue(struct pktcdvd_device *pd) { struct request_queue *q = pd->disk->queue; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index d89fcac59515..56847fcda086 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -598,7 +598,7 @@ out: return next; } -static void ps3vram_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio) { struct ps3_system_bus_device *dev = q->queuedata; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -614,11 +614,13 @@ static void ps3vram_make_request(struct request_queue *q, struct bio *bio) spin_unlock_irq(&priv->lock); if (busy) - return; + return BLK_QC_T_NONE; do { bio = ps3vram_do_bio(dev, bio); } while (bio); + + return BLK_QC_T_NONE; } static int ps3vram_probe(struct ps3_system_bus_device *dev) diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 3163e4cdc2cc..e1b8b7061d2f 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -145,7 +145,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, } } -static void rsxx_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) { struct rsxx_cardinfo *card = q->queuedata; struct rsxx_bio_meta *bio_meta; @@ -199,7 +199,7 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) if (st) goto queue_err; - return; + return BLK_QC_T_NONE; queue_err: kmem_cache_free(bio_meta_pool, bio_meta); @@ -207,6 +207,7 @@ req_err: if (st) bio->bi_error = st; bio_endio(bio); + return BLK_QC_T_NONE; } /*----------------- Device Setup -------------------*/ diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 04d65790a886..7939b9f87441 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -524,7 +524,7 @@ static int mm_check_plugged(struct cardinfo *card) return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); } -static void mm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio) { struct cardinfo *card = q->queuedata; pr_debug("mm_make_request %llu %u\n", @@ -541,7 +541,7 @@ static void mm_make_request(struct request_queue *q, struct bio *bio) activate(card); spin_unlock_irq(&card->lock); - return; + return BLK_QC_T_NONE; } static irqreturn_t mm_interrupt(int irq, void *__card) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9fa15bb9d118..4c99b6ba8681 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -894,7 +894,7 @@ out: /* * Handler function for all zram I/O requests. */ -static void zram_make_request(struct request_queue *queue, struct bio *bio) +static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; @@ -911,11 +911,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) __zram_make_request(zram, bio); zram_meta_put(zram); - return; + return BLK_QC_T_NONE; put_zram: zram_meta_put(zram); error: bio_io_error(bio); + return BLK_QC_T_NONE; } static void zram_slot_free_notify(struct block_device *bdev, diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 64a888a5e9b3..7ba64c87ba1c 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -803,7 +803,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, return NVM_IO_OK; } -static void rrpc_make_rq(struct request_queue *q, struct bio *bio) +static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio) { struct rrpc *rrpc = q->queuedata; struct nvm_rq *rqd; @@ -811,21 +811,21 @@ static void rrpc_make_rq(struct request_queue *q, struct bio *bio) if (bio->bi_rw & REQ_DISCARD) { rrpc_discard(rrpc, bio); - return; + return BLK_QC_T_NONE; } rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL); if (!rqd) { pr_err_ratelimited("rrpc: not able to queue bio."); bio_io_error(bio); - return; + return BLK_QC_T_NONE; } memset(rqd, 0, sizeof(struct nvm_rq)); err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE); switch (err) { case NVM_IO_OK: - return; + return BLK_QC_T_NONE; case NVM_IO_ERR: bio_io_error(bio); break; @@ -841,6 +841,7 @@ static void rrpc_make_rq(struct request_queue *q, struct bio *bio) } mempool_free(rqd, rrpc->rq_pool); + return BLK_QC_T_NONE; } static void rrpc_requeue(struct work_struct *work) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 8e9877b04637..25fa8445bb24 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -958,7 +958,8 @@ static void cached_dev_nodata(struct closure *cl) /* Cached devices - read & write stuff */ -static void cached_dev_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t cached_dev_make_request(struct request_queue *q, + struct bio *bio) { struct search *s; struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; @@ -997,6 +998,8 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) else generic_make_request(bio); } + + return BLK_QC_T_NONE; } static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, @@ -1070,7 +1073,8 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -static void flash_dev_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t flash_dev_make_request(struct request_queue *q, + struct bio *bio) { struct search *s; struct closure *cl; @@ -1093,7 +1097,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) continue_at_nobarrier(&s->cl, flash_dev_nodata, bcache_wq); - return; + return BLK_QC_T_NONE; } else if (rw) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), @@ -1109,6 +1113,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) } continue_at(cl, search_free, NULL); + return BLK_QC_T_NONE; } static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 32440ad5f684..6e15f3565892 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1755,7 +1755,7 @@ static void __split_and_process_bio(struct mapped_device *md, * The request function that just remaps the bio built up by * dm_merge_bvec. */ -static void dm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) { int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; @@ -1774,12 +1774,12 @@ static void dm_make_request(struct request_queue *q, struct bio *bio) queue_io(md, bio); else bio_io_error(bio); - return; + return BLK_QC_T_NONE; } __split_and_process_bio(md, map, bio); dm_put_live_table(md, srcu_idx); - return; + return BLK_QC_T_NONE; } int dm_request_based(struct mapped_device *md) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3f9a514b5b9d..807095f4c793 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -250,7 +250,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ -static void md_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); struct mddev *mddev = q->queuedata; @@ -262,13 +262,13 @@ static void md_make_request(struct request_queue *q, struct bio *bio) if (mddev == NULL || mddev->pers == NULL || !mddev->ready) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_error = -EROFS; bio_endio(bio); - return; + return BLK_QC_T_NONE; } smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); @@ -302,6 +302,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio) if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) wake_up(&mddev->sb_wait); + + return BLK_QC_T_NONE; } /* mddev_suspend makes sure no new requests are submitted diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 0df77cb07df6..91a336ea8c4f 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -161,7 +161,7 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, return err; } -static void nd_blk_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct gendisk *disk = bdev->bd_disk; @@ -208,6 +208,7 @@ static void nd_blk_make_request(struct request_queue *q, struct bio *bio) out: bio_endio(bio); + return BLK_QC_T_NONE; } static int nd_blk_rw_bytes(struct nd_namespace_common *ndns, diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index eae93ab8ffcd..efb2c1ceef98 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1150,7 +1150,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, return ret; } -static void btt_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct btt *btt = q->queuedata; @@ -1198,6 +1198,7 @@ static void btt_make_request(struct request_queue *q, struct bio *bio) out: bio_endio(bio); + return BLK_QC_T_NONE; } static int btt_rw_page(struct block_device *bdev, sector_t sector, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 0ba6a978f227..3963b7533b65 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -64,7 +64,7 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, kunmap_atomic(mem); } -static void pmem_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { bool do_acct; unsigned long start; @@ -84,6 +84,7 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio) wmb_pmem(); bio_endio(bio); + return BLK_QC_T_NONE; } static int pmem_rw_page(struct block_device *bdev, sector_t sector, diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 5ed44fe21380..94a8f4ab57bc 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -27,7 +27,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); -static void dcssblk_make_request(struct request_queue *q, struct bio *bio); +static blk_qc_t dcssblk_make_request(struct request_queue *q, + struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, void __pmem **kaddr, unsigned long *pfn); @@ -815,7 +816,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode) up_write(&dcssblk_devices_sem); } -static void +static blk_qc_t dcssblk_make_request(struct request_queue *q, struct bio *bio) { struct dcssblk_dev_info *dev_info; @@ -874,9 +875,10 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) bytes_done += bvec.bv_len; } bio_endio(bio); - return; + return BLK_QC_T_NONE; fail: bio_io_error(bio); + return BLK_QC_T_NONE; } static long diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 02871f1db562..288f59a4147b 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -181,7 +181,7 @@ static unsigned long xpram_highest_page_index(void) /* * Block device make request function. */ -static void xpram_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio) { xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; @@ -223,9 +223,10 @@ static void xpram_make_request(struct request_queue *q, struct bio *bio) } } bio_endio(bio); - return; + return BLK_QC_T_NONE; fail: bio_io_error(bio); + return BLK_QC_T_NONE; } static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo) diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index e6974c36276d..fed50d538a41 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -333,7 +333,7 @@ static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req) return count; } -static void loop_make_request(struct request_queue *q, struct bio *old_bio) +static blk_qc_t loop_make_request(struct request_queue *q, struct bio *old_bio) { struct lloop_device *lo = q->queuedata; int rw = bio_rw(old_bio); @@ -364,9 +364,10 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio) goto err; } loop_add_bio(lo, old_bio); - return; + return BLK_QC_T_NONE; err: bio_io_error(old_bio); + return BLK_QC_T_NONE; } static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e8130138f29d..641e5a3ed58c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -244,4 +244,28 @@ enum rq_flag_bits { #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) +typedef unsigned int blk_qc_t; +#define BLK_QC_T_NONE -1U +#define BLK_QC_T_SHIFT 16 + +static inline bool blk_qc_t_valid(blk_qc_t cookie) +{ + return cookie != BLK_QC_T_NONE; +} + +static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num) +{ + return tag | (queue_num << BLK_QC_T_SHIFT); +} + +static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) +{ + return cookie >> BLK_QC_T_SHIFT; +} + +static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) +{ + return cookie & 0xffff; +} + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d045ca8487af..5ee0f5243025 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -209,7 +209,7 @@ static inline unsigned short req_get_ioprio(struct request *req) struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); -typedef void (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -761,7 +761,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern void generic_make_request(struct bio *bio); +extern blk_qc_t generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 72d8a844c692..bcca36e4bc1e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2625,7 +2625,7 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK -extern void submit_bio(int, struct bio *); +extern blk_qc_t submit_bio(int, struct bio *); extern int bdev_read_only(struct block_device *); #endif extern int set_blocksize(struct block_device *, int); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 5ebd70d12f35..69c9057e1ab8 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -426,7 +426,7 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, return ppa; } -typedef void (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); +typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef int (nvm_tgt_end_io_fn)(struct nvm_rq *, int); typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); -- cgit v1.3-6-gb490 From 05229beeddf7e75e2e616ddaad4b70e7fca9528d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Nov 2015 10:44:55 -0700 Subject: block: add block polling support Add basic support for polling for specific IO to complete. This uses the cookie that blk-mq passes back, which enables the block layer to pass this cookie to the driver to spin for a specific request. This will be combined with request latency tracking, so we can make qualified decisions about when to poll and when not to. For now, for benchmark purposes, we add a sysfs file that controls whether polling is enabled or not. Signed-off-by: Jens Axboe Acked-by: Christoph Hellwig Acked-by: Keith Busch --- block/blk-core.c | 41 +++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sysfs.c | 10 ++++++++++ block/blk-sysfs.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 10 ++++++++++ include/linux/blkdev.h | 3 +++ 5 files changed, 99 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index e93df6d386a0..fa36b4ff7d63 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3312,6 +3312,47 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_plug *plug; + long state; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + state = current->state; + while (!need_resched()) { + unsigned int queue_num = blk_qc_t_to_queue_num(cookie); + struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 6f57a110289c..1cf18784c5cf 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -174,6 +174,11 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) return ret; } +static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); +} + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -295,6 +300,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { + .attr = {.name = "io_poll", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_poll_show, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -304,6 +313,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, + &blk_mq_hw_sysfs_poll.attr, NULL, }; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 31849e328b45..565b8dac5782 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -317,6 +317,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_show(struct request_queue *q, char *page) +{ + return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); +} + +static ssize_t queue_poll_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_on; + ssize_t ret; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + ret = queue_var_store(&poll_on, page, count); + if (ret < 0) + return ret; + + spin_lock_irq(q->queue_lock); + if (poll_on) + queue_flag_set(QUEUE_FLAG_POLL, q); + else + queue_flag_clear(QUEUE_FLAG_POLL, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -442,6 +470,12 @@ static struct queue_sysfs_entry queue_random_entry = { .store = queue_store_random, }; +static struct queue_sysfs_entry queue_poll_entry = { + .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_show, + .store = queue_poll_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -466,6 +500,7 @@ static struct attribute *default_attrs[] = { &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, &queue_random_entry.attr, + &queue_poll_entry.attr, NULL, }; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 83cc9d4e5455..daf17d70aeca 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -59,6 +59,9 @@ struct blk_mq_hw_ctx { struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; + + unsigned long poll_invoked; + unsigned long poll_success; }; struct blk_mq_tag_set { @@ -97,6 +100,8 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int, typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); typedef void (busy_tag_iter_fn)(struct request *, void *, bool); +typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int); + struct blk_mq_ops { /* @@ -114,6 +119,11 @@ struct blk_mq_ops { */ timeout_fn *timeout; + /* + * Called to poll for completion of a specific tag. + */ + poll_fn *poll; + softirq_done_fn *complete; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5ee0f5243025..3fe27f8d91f0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -487,6 +487,7 @@ struct request_queue { #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ +#define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -814,6 +815,8 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); + static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { return bdev->bd_disk->queue; /* this is never NULL */ -- cgit v1.3-6-gb490 From 2e6edc95382cc36423aff18a237173ad62d5ab52 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 19 Nov 2015 13:29:28 -0800 Subject: block: protect rw_page against device teardown Fix use after free crashes like the following: general protection fault: 0000 [#1] SMP Call Trace: [] ? pmem_do_bvec.isra.12+0xa6/0xf0 [nd_pmem] [] pmem_rw_page+0x42/0x80 [nd_pmem] [] bdev_read_page+0x50/0x60 [] do_mpage_readpage+0x510/0x770 [] ? I_BDEV+0x20/0x20 [] ? lru_cache_add+0x1c/0x50 [] mpage_readpages+0x107/0x170 [] ? I_BDEV+0x20/0x20 [] ? I_BDEV+0x20/0x20 [] blkdev_readpages+0x1d/0x20 [] __do_page_cache_readahead+0x28f/0x310 [] ? __do_page_cache_readahead+0x169/0x310 [] ? pagecache_get_page+0x2d/0x1d0 [] filemap_fault+0x396/0x530 [] __do_fault+0x4e/0xf0 [] handle_mm_fault+0x11bd/0x1b50 Cc: Cc: Jens Axboe Cc: Alexander Viro Reported-by: kbuild test robot Acked-by: Matthew Wilcox [willy: symmetry fixups] Signed-off-by: Dan Williams --- block/blk.h | 2 -- fs/block_dev.c | 18 ++++++++++++++++-- include/linux/blkdev.h | 2 ++ 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk.h b/block/blk.h index da722eb786df..c43926d3d74d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -72,8 +72,6 @@ void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); -int blk_queue_enter(struct request_queue *q, gfp_t gfp); -void blk_queue_exit(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); static inline void blk_queue_enter_live(struct request_queue *q) diff --git a/fs/block_dev.c b/fs/block_dev.c index bb0dfb1c7af1..c25639e907bd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -390,9 +390,17 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; + int result = -EOPNOTSUPP; + if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + return result; + + result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + if (result) + return result; + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + blk_queue_exit(bdev->bd_queue); + return result; } EXPORT_SYMBOL_GPL(bdev_read_page); @@ -421,14 +429,20 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; + result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + if (result) + return result; + set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); if (result) end_page_writeback(page); else unlock_page(page); + blk_queue_exit(bdev->bd_queue); return result; } EXPORT_SYMBOL_GPL(bdev_write_page); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3fe27f8d91f0..c0d2b7927c1f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -794,6 +794,8 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); +extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); +extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -- cgit v1.3-6-gb490