From e806402130c9c494e22c73ae9ead4e79d2a5811c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:13 +0200 Subject: block: split out request-only flags into a new namespace A lot of the REQ_* flags are only used on struct requests, and only of use to the block layer and a few drivers that dig into struct request internals. This patch adds a new req_flags_t rq_flags field to struct request for them, and thus dramatically shrinks the number of common requests. It also removes the unfortunate situation where we have to fit the fields from the same enum into 32 bits for struct bio and 64 bits for struct request. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block/blk-merge.c') diff --git a/block/blk-merge.c b/block/blk-merge.c index 2642e5fc8b69..fda6a12fc776 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -456,7 +456,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); - if (unlikely(rq->cmd_flags & REQ_COPY_USER) && + if (unlikely(rq->rq_flags & RQF_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; @@ -634,7 +634,7 @@ void blk_rq_set_mixed_merge(struct request *rq) unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; - if (rq->cmd_flags & REQ_MIXED_MERGE) + if (rq->rq_flags & RQF_MIXED_MERGE) return; /* @@ -647,7 +647,7 @@ void blk_rq_set_mixed_merge(struct request *rq) (bio->bi_opf & REQ_FAILFAST_MASK) != ff); bio->bi_opf |= ff; } - rq->cmd_flags |= REQ_MIXED_MERGE; + rq->rq_flags |= RQF_MIXED_MERGE; } static void blk_account_io_merge(struct request *req) @@ -709,7 +709,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, * makes sure that all involved bios have mixable attributes * set properly. */ - if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) || (req->cmd_flags & REQ_FAILFAST_MASK) != (next->cmd_flags & REQ_FAILFAST_MASK)) { blk_rq_set_mixed_merge(req); -- cgit v1.2.3-59-g8ed1b From a6f0788ec2881ac14e97ff7fa6a78a807f87b5ba Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:59 -0800 Subject: block: add support for REQ_OP_WRITE_ZEROES This adds a new block layer operation to zero out a range of LBAs. This allows to implement zeroing for devices that don't use either discard with a predictable zero pattern or WRITE SAME of zeroes. The prominent example of that is NVMe with the Write Zeroes command, but in the future, this should also help with improving the way zeroing discards work. For this operation, suitable entry is exported in sysfs which indicate the number of maximum bytes allowed in one write zeroes operation by the device. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 13 ++++++++ block/bio.c | 1 + block/blk-core.c | 4 +++ block/blk-lib.c | 58 +++++++++++++++++++++++++++++++++-- block/blk-merge.c | 17 +++++++--- block/blk-settings.c | 17 ++++++++++ block/blk-sysfs.c | 11 +++++++ block/blk-wbt.c | 5 +-- include/linux/bio.h | 25 ++++++++------- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 19 ++++++++++++ 11 files changed, 153 insertions(+), 19 deletions(-) (limited to 'block/blk-merge.c') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index ee2d5cd26bfe..2da04ce6aeef 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -235,6 +235,19 @@ Description: write_same_max_bytes is 0, write same is not supported by the device. +What: /sys/block//queue/write_zeroes_max_bytes +Date: November 2016 +Contact: Chaitanya Kulkarni +Description: + Devices that support write zeroes operation in which a + single request can be issued to zero out the range of + contiguous blocks on storage without having any payload + in the request. This can be used to optimize writing zeroes + to the devices. write_zeroes_max_bytes indicates how many + bytes can be written in a single write zeroes command. If + write_zeroes_max_bytes is 0, write zeroes is not supported + by the device. + What: /sys/block//queue/zoned Date: September 2016 Contact: Damien Le Moal diff --git a/block/bio.c b/block/bio.c index de257ced69b1..83db1f37fd0b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -674,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: break; case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; diff --git a/block/blk-core.c b/block/blk-core.c index 6c4a425690fc..3f2eb8d80189 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1950,6 +1950,10 @@ generic_make_request_checks(struct bio *bio) if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; + case REQ_OP_WRITE_ZEROES: + if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + goto not_supported; + break; default: break; } diff --git a/block/blk-lib.c b/block/blk-lib.c index bfb28b03765e..510a6fb15318 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,6 +226,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_write_same); +/** + * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * + * Description: + * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages. + */ +static int __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct bio *bio = *biop; + unsigned int max_write_zeroes_sectors; + struct request_queue *q = bdev_get_queue(bdev); + + if (!q) + return -ENXIO; + + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ + max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + + if (max_write_zeroes_sectors == 0) + return -EOPNOTSUPP; + + while (nr_sects) { + bio = next_bio(bio, 0, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); + + if (nr_sects > max_write_zeroes_sectors) { + bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; + nr_sects -= max_write_zeroes_sectors; + sector += max_write_zeroes_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, goto out; } + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, + biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, ZERO_PAGE(0), biop); if (ret == 0 || (ret && ret != -EOPNOTSUPP)) @@ -304,8 +358,8 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); * the discard request fail, if the discard flag is not set, or if * discard_zeroes_data is not supported, this function will resort to * zeroing the blocks manually, thus provisioning (allocating, - * anchoring) them. If the block device supports the WRITE SAME command - * blkdev_issue_zeroout() will use it to optimize the process of + * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME + * command(s), blkdev_issue_zeroout() will use it to optimize the process of * clearing the block range. Otherwise the zeroing will be performed * using regular WRITE calls. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index fda6a12fc776..cf2848cb91d8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, case REQ_OP_SECURE_ERASE: split = blk_bio_discard_split(q, *bio, bs, &nsegs); break; + case REQ_OP_WRITE_ZEROES: + split = NULL; + nsegs = (*bio)->bi_phys_segments; + break; case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); break; @@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, * This should probably be returning 0, but blk_add_request_payload() * (Christoph!!!!) */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: return 1; + default: + break; + } fbio = bio; cluster = blk_queue_cluster(q); @@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: /* * This is a hack - drivers should be neither modifying the * biovec, nor relying on bi_vcnt - but because of diff --git a/block/blk-settings.c b/block/blk-settings.c index c7ccabc0ec3e..8a2bc124a684 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; + lim->max_write_zeroes_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -132,6 +133,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; + lim->max_write_zeroes_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -299,6 +301,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_same_sectors); +/** + * blk_queue_max_write_zeroes_sectors - set max sectors for a single + * write zeroes + * @q: the request queue for the device + * @max_write_zeroes_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_zeroes_sectors) +{ + q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); + /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device @@ -527,6 +542,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); + t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, + b->max_write_zeroes_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a97841491769..706b27bd73a1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -211,6 +211,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_same_sectors << 9); } +static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_zeroes_sectors << 9); +} static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) @@ -611,6 +616,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = { .show = queue_write_same_max_show, }; +static struct queue_sysfs_entry queue_write_zeroes_max_entry = { + .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO }, + .show = queue_write_zeroes_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_show_nonrot, @@ -700,6 +710,7 @@ static struct attribute *default_attrs[] = { &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, + &queue_write_zeroes_max_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nomerges_entry.attr, diff --git a/block/blk-wbt.c b/block/blk-wbt.c index b8647343141f..d500e43da5d9 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) const int op = bio_op(bio); /* - * If not a WRITE (or a discard), do nothing + * If not a WRITE (or a discard or write zeroes), do nothing */ - if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) + if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_OP_WRITE_ZEROES)) return false; /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 70a7244f08a7..b15323934a29 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -76,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio) if (bio && bio->bi_iter.bi_size && bio_op(bio) != REQ_OP_DISCARD && - bio_op(bio) != REQ_OP_SECURE_ERASE) + bio_op(bio) != REQ_OP_SECURE_ERASE && + bio_op(bio) != REQ_OP_WRITE_ZEROES) return true; return false; @@ -86,7 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || - bio_op(bio) == REQ_OP_WRITE_SAME; + bio_op(bio) == REQ_OP_WRITE_SAME || + bio_op(bio) == REQ_OP_WRITE_ZEROES; } static inline bool bio_mergeable(struct bio *bio) @@ -188,18 +190,19 @@ static inline unsigned bio_segments(struct bio *bio) struct bvec_iter iter; /* - * We special case discard/write same, because they interpret bi_size - * differently: + * We special case discard/write same/write zeroes, because they + * interpret bi_size differently: */ - if (bio_op(bio) == REQ_OP_DISCARD) - return 1; - - if (bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: return 1; + default: + break; + } bio_for_each_segment(bv, bio, iter) segs++; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f57458a6a93b..519ea2c9df61 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -159,6 +159,8 @@ enum req_opf { REQ_OP_ZONE_RESET = 6, /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, + /* write the zero filled sector many times */ + REQ_OP_WRITE_ZEROES = 8, REQ_OP_LAST, }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e9d8a0895be..ebeef2b79c5a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -323,6 +323,7 @@ struct queue_limits { unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; + unsigned int max_write_zeroes_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -774,6 +775,9 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_FLUSH) return false; + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) @@ -1004,6 +1008,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_SAME)) return q->limits.max_write_same_sectors; + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; + return q->limits.max_sectors; } @@ -1107,6 +1114,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_same_sectors(struct request_queue *q, unsigned int max_write_same_sectors); +extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, @@ -1475,6 +1484,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return q->limits.max_write_zeroes_sectors; + + return 0; +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3-59-g8ed1b From e0c723000966ae285295caaa8cda914dfa177fa4 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 1 Dec 2016 08:36:16 -0700 Subject: block: factor out req_set_nomerge Factor out common code for setting REQ_NOMERGE flag which is being used out at certain places and make it a helper instead, req_set_nomerge(). Signed-off-by: Ritesh Harjani Get rid of the inline. Signed-off-by: Jens Axboe --- block/blk-merge.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'block/blk-merge.c') diff --git a/block/blk-merge.c b/block/blk-merge.c index cf2848cb91d8..1002afdfee99 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -501,6 +501,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); +static void req_set_nomerge(struct request_queue *q, struct request *req) +{ + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; +} + static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) @@ -521,9 +528,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, return 1; no_merge: - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } @@ -537,9 +542,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(req->biotail, BIO_SEG_VALID)) @@ -561,9 +564,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(bio, BIO_SEG_VALID)) -- cgit v1.2.3-59-g8ed1b From f9d03f96b988002027d4b28ea1b7a24729a4c9b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Dec 2016 15:20:32 -0700 Subject: block: improve handling of the magic discard payload Instead of allocating a single unused biovec for discard requests, send them down without any payload. Instead we allow the driver to add a "special" payload using a biovec embedded into struct request (unioned over other fields never used while in the driver), and overloading the number of segments for this case. This has a couple of advantages: - we don't have to allocate the bio_vec - the amount of special casing for discard requests in the block layer is significantly reduced - using this same scheme for other request types is trivial, which will be important for implementing the new WRITE_ZEROES op on devices where it actually requires a payload (e.g. SCSI) - we can get rid of playing games with the request length, as we'll never touch it and completions will work just fine - it will allow us to support ranged discard operations in the future by merging non-contiguous discard bios into a single request - last but not least it removes a lot of code This patch is the common base for my WIP series for ranges discards and to remove discard_zeroes_data in favor of always using REQ_OP_WRITE_ZEROES, so it would be good to get it in quickly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 10 +-------- block/blk-core.c | 34 ++--------------------------- block/blk-lib.c | 2 +- block/blk-merge.c | 53 +++++++++++++++------------------------------- drivers/nvme/host/core.c | 17 ++++----------- drivers/nvme/host/nvme.h | 6 ++++-- drivers/nvme/host/pci.c | 27 +++++++++++------------ drivers/nvme/host/rdma.c | 13 +++++------- drivers/nvme/target/loop.c | 4 ++-- drivers/scsi/scsi_lib.c | 6 +++--- drivers/scsi/sd.c | 24 ++++++++------------- include/linux/bio.h | 3 ++- include/linux/blkdev.h | 15 ++++++++++--- 13 files changed, 76 insertions(+), 138 deletions(-) (limited to 'block/blk-merge.c') diff --git a/block/bio.c b/block/bio.c index 83db1f37fd0b..2b375020fc49 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1840,15 +1840,7 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); - /* - * Discards need a mutable bio_vec to accommodate the payload - * required by the DSM TRIM and UNMAP commands. - */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - split = bio_clone_bioset(bio, gfp, bs); - else - split = bio_clone_fast(bio, gfp, bs); - + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 4b7ec5958055..bd642a43b98b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1475,38 +1475,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -/** - * blk_add_request_payload - add a payload to a request - * @rq: request to update - * @page: page backing the payload - * @offset: offset in page - * @len: length of the payload. - * - * This allows to later add a payload to an already submitted request by - * a block driver. The driver needs to take care of freeing the payload - * itself. - * - * Note that this is a quite horrible hack and nothing but handling of - * discard requests should ever use it. - */ -void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len) -{ - struct bio *bio = rq->bio; - - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = offset; - bio->bi_io_vec->bv_len = len; - - bio->bi_iter.bi_size = len; - bio->bi_vcnt = 1; - bio->bi_phys_segments = 1; - - rq->__data_len = rq->resid_len = len; - rq->nr_phys_segments = 1; -} -EXPORT_SYMBOL_GPL(blk_add_request_payload); - bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) { @@ -2642,6 +2610,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) return false; } + WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD); + req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ diff --git a/block/blk-lib.c b/block/blk-lib.c index 510a6fb15318..ed89c8f4b2a0 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -80,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; } - bio = next_bio(bio, 1, gfp_mask); + bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); diff --git a/block/blk-merge.c b/block/blk-merge.c index 1002afdfee99..182398cb1524 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -241,18 +241,13 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (!bio) return 0; - /* - * This should probably be returning 0, but blk_add_request_payload() - * (Christoph!!!!) - */ switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; - default: - break; } fbio = bio; @@ -410,39 +405,21 @@ new_segment: *bvprv = *bvec; } +static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv, + struct scatterlist *sglist, struct scatterlist **sg) +{ + *sg = sglist; + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + return 1; +} + static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; - int nsegs, cluster; - - nsegs = 0; - cluster = blk_queue_cluster(q); - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - /* - * This is a hack - drivers should be neither modifying the - * biovec, nor relying on bi_vcnt - but because of - * blk_add_request_payload(), a discard bio may or may not have - * a payload we need to set up here (thank you Christoph) and - * bi_vcnt is really the only way of telling if we need to. - */ - if (!bio->bi_vcnt) - return 0; - /* Fall through */ - case REQ_OP_WRITE_SAME: - *sg = sglist; - bvec = bio_iovec(bio); - sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); - return 1; - default: - break; - } + int cluster = blk_queue_cluster(q), nsegs = 0; for_each_bio(bio) bio_for_each_segment(bvec, bio, iter) @@ -462,7 +439,11 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sg = NULL; int nsegs = 0; - if (rq->bio) + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg); + else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) + nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg); + else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); if (unlikely(rq->rq_flags & RQF_COPY_USER) && @@ -495,7 +476,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, * Something must have been wrong if the figured number of * segment is bigger than number of req's physical segments */ - WARN_ON(nsegs > rq->nr_phys_segments); + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1b48514fbe99..3b1d6478dcfb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -239,8 +239,6 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { struct nvme_dsm_range *range; - struct page *page; - int offset; unsigned int nr_bytes = blk_rq_bytes(req); range = kmalloc(sizeof(*range), GFP_ATOMIC); @@ -257,17 +255,10 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, cmnd->dsm.nr = 0; cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - req->completion_data = range; - page = virt_to_page(range); - offset = offset_in_page(range); - blk_add_request_payload(req, page, offset, sizeof(*range)); - - /* - * we set __data_len back to the size of the area to be discarded - * on disk. This allows us to report completion on the full amount - * of blocks described by the request. - */ - req->__data_len = nr_bytes; + req->special_vec.bv_page = virt_to_page(range); + req->special_vec.bv_offset = offset_in_page(range); + req->special_vec.bv_len = sizeof(*range); + req->rq_flags |= RQF_SPECIAL_PAYLOAD; return BLK_MQ_RQ_QUEUE_OK; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a3d6ffd874af..bd5321441d12 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -236,8 +236,10 @@ static inline unsigned nvme_map_len(struct request *rq) static inline void nvme_cleanup_cmd(struct request *req) { - if (req_op(req) == REQ_OP_DISCARD) - kfree(req->completion_data); + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + kfree(page_address(req->special_vec.bv_page) + + req->special_vec.bv_offset); + } } static inline int nvme_error_status(u16 status) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 82b9b3f1f21d..717d6ea47ee4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -302,14 +302,14 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, static __le64 **iod_list(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (__le64 **)(iod->sg + req->nr_phys_segments); + return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); } static int nvme_init_iod(struct request *rq, unsigned size, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); - int nseg = rq->nr_phys_segments; + int nseg = blk_rq_nr_phys_segments(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); @@ -339,8 +339,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; - nvme_cleanup_cmd(req); - if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, list[0], prp_dma); for (i = 0; i < iod->npages; i++) { @@ -510,7 +508,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_TO_DEVICE : DMA_FROM_DEVICE; int ret = BLK_MQ_RQ_QUEUE_ERROR; - sg_init_table(iod->sg, req->nr_phys_segments); + sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg); if (!iod->nents) goto out; @@ -566,6 +564,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) } } + nvme_cleanup_cmd(req); nvme_free_iod(dev, req); } @@ -596,20 +595,20 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } } - map_len = nvme_map_len(req); - ret = nvme_init_iod(req, map_len, dev); + ret = nvme_setup_cmd(ns, req, &cmnd); if (ret != BLK_MQ_RQ_QUEUE_OK) return ret; - ret = nvme_setup_cmd(ns, req, &cmnd); + map_len = nvme_map_len(req); + ret = nvme_init_iod(req, map_len, dev); if (ret != BLK_MQ_RQ_QUEUE_OK) - goto out; + goto out_free_cmd; - if (req->nr_phys_segments) + if (blk_rq_nr_phys_segments(req)) ret = nvme_map_data(dev, req, map_len, &cmnd); if (ret != BLK_MQ_RQ_QUEUE_OK) - goto out; + goto out_cleanup_iod; blk_mq_start_request(req); @@ -620,14 +619,16 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, else ret = BLK_MQ_RQ_QUEUE_ERROR; spin_unlock_irq(&nvmeq->q_lock); - goto out; + goto out_cleanup_iod; } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); return BLK_MQ_RQ_QUEUE_OK; -out: +out_cleanup_iod: nvme_free_iod(dev, req); +out_free_cmd: + nvme_cleanup_cmd(req); return ret; } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b037d0cb2a7e..251101bf982f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -952,8 +952,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; - int nents, count; - int ret; + int count, ret; req->num_sge = 1; req->inline_data = false; @@ -965,16 +964,14 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, return nvme_rdma_set_sg_null(c); req->sg_table.sgl = req->first_sgl; - ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments, - req->sg_table.sgl); + ret = sg_alloc_table_chained(&req->sg_table, + blk_rq_nr_phys_segments(rq), req->sg_table.sgl); if (ret) return -ENOMEM; - nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); - BUG_ON(nents > rq->nr_phys_segments); - req->nents = nents; + req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); - count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents, + count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (unlikely(count <= 0)) { sg_free_table_chained(&req->sg_table, true); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 57ded6b3ed8a..9aaa70071ae5 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -185,13 +185,13 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (blk_rq_bytes(req)) { iod->sg_table.sgl = iod->first_sgl; ret = sg_alloc_table_chained(&iod->sg_table, - req->nr_phys_segments, iod->sg_table.sgl); + blk_rq_nr_phys_segments(req), + iod->sg_table.sgl); if (ret) return BLK_MQ_RQ_QUEUE_BUSY; iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); - BUG_ON(iod->req.sg_cnt > req->nr_phys_segments); } blk_mq_start_request(req); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 47a5c8783b89..9a8ccff1121f 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1007,8 +1007,8 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) /* * If sg table allocation fails, requeue request later. */ - if (unlikely(sg_alloc_table_chained(&sdb->table, req->nr_phys_segments, - sdb->table.sgl))) + if (unlikely(sg_alloc_table_chained(&sdb->table, + blk_rq_nr_phys_segments(req), sdb->table.sgl))) return BLKPREP_DEFER; /* @@ -1040,7 +1040,7 @@ int scsi_init_io(struct scsi_cmnd *cmd) bool is_mq = (rq->mq_ctx != NULL); int error; - BUG_ON(!rq->nr_phys_segments); + BUG_ON(!blk_rq_nr_phys_segments(rq)); error = scsi_init_sgtable(rq, &cmd->sdb); if (error) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65738b0aad36..079c2d9759fb 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -716,7 +716,6 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd) struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); sector_t sector = blk_rq_pos(rq); unsigned int nr_sectors = blk_rq_sectors(rq); - unsigned int nr_bytes = blk_rq_bytes(rq); unsigned int len; int ret; char *buf; @@ -772,24 +771,19 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd) goto out; } - rq->completion_data = page; rq->timeout = SD_TIMEOUT; cmd->transfersize = len; cmd->allowed = SD_MAX_RETRIES; - /* - * Initially __data_len is set to the amount of data that needs to be - * transferred to the target. This amount depends on whether WRITE SAME - * or UNMAP is being used. After the scatterlist has been mapped by - * scsi_init_io() we set __data_len to the size of the area to be - * discarded on disk. This allows us to report completion on the full - * amount of blocks described by the request. - */ - blk_add_request_payload(rq, page, 0, len); - ret = scsi_init_io(cmd); - rq->__data_len = nr_bytes; + rq->special_vec.bv_page = page; + rq->special_vec.bv_offset = 0; + rq->special_vec.bv_len = len; + + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->resid_len = len; + ret = scsi_init_io(cmd); out: if (ret != BLKPREP_OK) __free_page(page); @@ -1182,8 +1176,8 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; - if (req_op(rq) == REQ_OP_DISCARD) - __free_page(rq->completion_data); + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + __free_page(rq->special_vec.bv_page); if (SCpnt->cmnd != rq->cmd) { mempool_free(SCpnt->cmnd, sd_cdb_pool); diff --git a/include/linux/bio.h b/include/linux/bio.h index b15323934a29..7cf8a6c70a3f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -197,8 +197,9 @@ static inline unsigned bio_segments(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; default: break; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ebeef2b79c5a..c5393766909d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -120,10 +120,13 @@ typedef __u32 __bitwise req_flags_t; #define RQF_HASHED ((__force req_flags_t)(1 << 16)) /* IO stats tracking on */ #define RQF_STATS ((__force req_flags_t)(1 << 17)) +/* Look at ->special_vec for the actual data payload instead of the + bio chain. */ +#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ - (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ) + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) #define BLK_MAX_CDB 16 @@ -175,6 +178,7 @@ struct request { */ union { struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; void *completion_data; }; @@ -909,8 +913,6 @@ extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); -extern void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, @@ -1153,6 +1155,13 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); +static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return 1; + return rq->nr_phys_segments; +} + extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit v1.2.3-59-g8ed1b